From b57c9da1f8effd9521156b21f67c788f05cb1579 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 5 Feb 2009 15:15:13 +0000 Subject: [PATCH] - fixes to doc, ppt, xls parser: better title - fixes to httpd server response header generation - fixes to a server date computation bug - new Button in indexControl to view content of url in ViewFile git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5576 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexControlURLs_p.html | 6 ++++ htroot/ViewFile.java | 2 +- htroot/api/yacydoc.java | 2 +- source/de/anomic/http/httpHeader.java | 10 +++++- source/de/anomic/http/httpResponseHeader.java | 16 ++------- source/de/anomic/http/httpd.java | 33 +++---------------- source/de/anomic/http/httpdFileHandler.java | 7 ++-- .../anomic/kelondro/order/DateFormatter.java | 2 +- .../anomic/plasma/parser/doc/docParser.java | 17 ++++++---- .../anomic/plasma/parser/ppt/pptParser.java | 16 +++++---- .../anomic/plasma/parser/xls/xlsParser.java | 8 ++--- 11 files changed, 50 insertions(+), 69 deletions(-) diff --git a/htroot/IndexControlURLs_p.html b/htroot/IndexControlURLs_p.html index c0af4ee46..9c7c88f46 100644 --- a/htroot/IndexControlURLs_p.html +++ b/htroot/IndexControlURLs_p.html @@ -123,6 +123,12 @@ Click the API icon to see an example call to the search rss API. To see a list of all APIs, please visit the API wiki page. +
+ + + +
+
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 06fc45971..344f41b9d 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -102,7 +102,7 @@ public class ViewFile { return prop; } - // gettin the url that belongs to the entry + // getting the url that belongs to the entry final indexURLReference.Components comp = urlEntry.comp(); if ((comp == null) || (comp.url() == null)) { prop.put("error", "3"); diff --git a/htroot/api/yacydoc.java b/htroot/api/yacydoc.java index 5bb7de363..ca4262b83 100644 --- a/htroot/api/yacydoc.java +++ b/htroot/api/yacydoc.java @@ -84,7 +84,7 @@ public class yacydoc { prop.putXML("dc_publisher", comp.url().toNormalform(false, true)); prop.putXML("dc_contributor", ""); prop.putXML("dc_date", entry.moddate().toString()); - prop.put("dc_type", entry.doctype()); + prop.putXML("dc_type", "" + entry.doctype()); prop.putXML("dc_identifier", urlhash); prop.putXML("dc_language", entry.language()); diff --git a/source/de/anomic/http/httpHeader.java b/source/de/anomic/http/httpHeader.java index 5afb2bb14..760801674 100644 --- a/source/de/anomic/http/httpHeader.java +++ b/source/de/anomic/http/httpHeader.java @@ -73,7 +73,6 @@ public class httpHeader extends TreeMap implements Map implements Map implements Map reverseMappingCache, final Map othermap) { super(reverseMappingCache, othermap); } - - public Date date() { - return headerDate(httpHeader.DATE); + Date d = headerDate(httpHeader.DATE); + if (d == null) return new Date(); else return d; } public Date expires() { diff --git a/source/de/anomic/http/httpd.java b/source/de/anomic/http/httpd.java index 3f1d5125b..bd7fee6a0 100644 --- a/source/de/anomic/http/httpd.java +++ b/source/de/anomic/http/httpd.java @@ -1236,34 +1236,6 @@ public final class httpd implements serverHandler, Cloneable { if (o != null) try { o.close(); } catch (final Exception e) { e.printStackTrace(); } } } - - public static final void sendRespondHeader( - final Properties conProp, - final OutputStream respond, - final String httpVersion, - final int httpStatusCode, - final String httpStatusText, - final long contentLength - ) throws IOException { - sendRespondHeader(conProp,respond,httpVersion,httpStatusCode,httpStatusText,null,contentLength,null,null,null,null,null); - } - - public static final void sendRespondHeader( - final Properties conProp, - final OutputStream respond, - final String httpVersion, - final int httpStatusCode, - final String httpStatusText, - final String contentType, - final long contentLength, - final Date moddate, - final Date expires, - final httpResponseHeader headers, - final String contentEnc, - final String transferEnc - ) throws IOException { - sendRespondHeader(conProp,respond,httpVersion,httpStatusCode,httpStatusText,contentType,contentLength,moddate,expires,headers,contentEnc,transferEnc,true); - } public static final void sendRespondHeader( final Properties conProp, @@ -1303,7 +1275,10 @@ public final class httpd implements serverHandler, Cloneable { headers.put(httpResponseHeader.SERVER, "AnomicHTTPD (www.anomic.de)"); headers.put(httpResponseHeader.DATE, DateFormatter.formatRFC1123(now)); - if (moddate.after(now)) moddate = now; + if (moddate.after(now)) { + System.out.println("*** DEBUG: correcting moddate = " + moddate.toString() + " to now = " + now.toString()); + moddate = now; + } headers.put(httpResponseHeader.LAST_MODIFIED, DateFormatter.formatRFC1123(moddate)); if (nocache) { diff --git a/source/de/anomic/http/httpdFileHandler.java b/source/de/anomic/http/httpdFileHandler.java index 13f7f95f6..de7ddc91c 100644 --- a/source/de/anomic/http/httpdFileHandler.java +++ b/source/de/anomic/http/httpdFileHandler.java @@ -742,9 +742,7 @@ public final class httpdFileHandler { // call rewrite-class - if (targetClass == null) { - targetDate = new Date(targetFile.lastModified()); - } else { + if (targetClass != null) { // CGI-class: call the class to create a property for rewriting try { requestHeader.put(httpHeader.CONNECTION_PROP_CLIENTIP, conProp.getProperty(httpHeader.CONNECTION_PROP_CLIENTIP)); @@ -804,9 +802,10 @@ public final class httpdFileHandler { targetClass = null; throw e; } - targetDate = new Date(System.currentTimeMillis()); nocache = true; } + + targetDate = new Date(targetFile.lastModified()); // rewrite the file InputStream fis = null; diff --git a/source/de/anomic/kelondro/order/DateFormatter.java b/source/de/anomic/kelondro/order/DateFormatter.java index 4f5e9d47a..a221ee771 100644 --- a/source/de/anomic/kelondro/order/DateFormatter.java +++ b/source/de/anomic/kelondro/order/DateFormatter.java @@ -147,7 +147,7 @@ public final class DateFormatter { public static final String formatRFC1123(final Date date) { if (date == null) return ""; - if (date.getTime() - lastRFC1123long < 1000) { + if (Math.abs(date.getTime() - lastRFC1123long) < 1000) { //System.out.println("date cache hit - " + lastRFC1123string); return lastRFC1123string; } diff --git a/source/de/anomic/plasma/parser/doc/docParser.java b/source/de/anomic/plasma/parser/doc/docParser.java index 2e3cee06e..a7d7f7e09 100644 --- a/source/de/anomic/plasma/parser/doc/docParser.java +++ b/source/de/anomic/plasma/parser/doc/docParser.java @@ -65,19 +65,22 @@ public class docParser extends AbstractParser implements Parser { try { final WordTextExtractorFactory extractorFactory = new WordTextExtractorFactory(); final TextExtractor extractor = extractorFactory.textExtractor(source); - final String contents = extractor.getText(); - + final String contents = extractor.getText().trim(); + String title = contents.replaceAll("\r"," ").replaceAll("\n"," ").replaceAll("\t"," ").trim(); + if (title.length() > 80) title = title.substring(0, 80); + int l = title.length(); + while (true) { + title = title.replaceAll(" ", " "); + if (title.length() == l) break; + l = title.length(); + } final plasmaParserDocument theDoc = new plasmaParserDocument( location, mimeType, "UTF-8", null, null, - ((contents.length() > 80)? contents.substring(0, 80):contents.trim()). - replaceAll("\r\n"," "). - replaceAll("\n"," "). - replaceAll("\r"," "). - replaceAll("\t"," "), + title, "", // TODO: AUTHOR null, null, diff --git a/source/de/anomic/plasma/parser/ppt/pptParser.java b/source/de/anomic/plasma/parser/ppt/pptParser.java index 9256201fe..6bd3a3481 100644 --- a/source/de/anomic/plasma/parser/ppt/pptParser.java +++ b/source/de/anomic/plasma/parser/ppt/pptParser.java @@ -77,7 +77,15 @@ public class pptParser extends AbstractParser implements Parser { * of the document */ final PowerPointExtractor pptExtractor = new PowerPointExtractor(new BufferedInputStream(source)); - final String contents = pptExtractor.getText(true, true); + final String contents = pptExtractor.getText(true, true).trim(); + String title = contents.replaceAll("\r"," ").replaceAll("\n"," ").replaceAll("\t"," ").trim(); + if (title.length() > 80) title = title.substring(0, 80); + int l = title.length(); + while (true) { + title = title.replaceAll(" ", " "); + if (title.length() == l) break; + l = title.length(); + } /* * create the plasmaParserDocument for the database @@ -89,11 +97,7 @@ public class pptParser extends AbstractParser implements Parser { "UTF-8", null, null, - ((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()). - replaceAll("\r\n"," "). - replaceAll("\n"," "). - replaceAll("\r"," "). - replaceAll("\t"," "), + title, "", // TODO: AUTHOR null, null, diff --git a/source/de/anomic/plasma/parser/xls/xlsParser.java b/source/de/anomic/plasma/parser/xls/xlsParser.java index a7633a311..a538d270d 100644 --- a/source/de/anomic/plasma/parser/xls/xlsParser.java +++ b/source/de/anomic/plasma/parser/xls/xlsParser.java @@ -105,7 +105,7 @@ public class xlsParser extends AbstractParser implements Parser, HSSFListener { din.close(); //now the parsed strings are in the StringBuilder, now convert them to a String - final String contents = sbFoundStrings.toString(); + final String contents = sbFoundStrings.toString().trim(); /* * create the plasmaParserDocument for the database @@ -117,11 +117,7 @@ public class xlsParser extends AbstractParser implements Parser, HSSFListener { "UTF-8", null, null, - ((contents.length() > 80) ? contents.substring(0, 80) : contents.trim()). - replaceAll("\r\n"," "). - replaceAll("\n"," "). - replaceAll("\r"," "). - replaceAll("\t"," "), + location.getFile(), "", // TODO: AUTHOR null, null,