diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 995395a6e..b1d854387 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -46,6 +46,7 @@ package de.anomic.htmlFilter; import de.anomic.server.serverByteBuffer; import de.anomic.net.URL; +import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.nio.charset.Charset; import java.nio.charset.UnsupportedCharsetException; @@ -123,6 +124,10 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen this.charset = charset; } + public String getCharset() { + return this.charset; + } + public void scrapeText(byte[] newtext) { // System.out.println("SCRAPE: " + new String(newtext)); if ((content.length() != 0) && (content.byteAt(content.length() - 1) != 32)) content.append(32); @@ -243,8 +248,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen if (s.length() > 0) return s; // extract headline from content - if (content.length() > 80) return cleanLine(new String(content.getBytes(), 0, 80)); - return cleanLine(content.trim().toString()); + if (content.length() > 80) { + try { + return cleanLine(new String(content.getBytes(), 0, 80,this.charset)); + } catch (UnsupportedEncodingException e) { + return cleanLine(new String(content.getBytes(), 0, 80)); + } + } + return cleanLine(content.trim().toString(this.charset)); } public String[] getHeadlines(int i) { diff --git a/source/de/anomic/plasma/parser/doc/docParser.java b/source/de/anomic/plasma/parser/doc/docParser.java index 4e4cd0044..46aa1196a 100644 --- a/source/de/anomic/plasma/parser/doc/docParser.java +++ b/source/de/anomic/plasma/parser/doc/docParser.java @@ -89,6 +89,7 @@ implements Parser { plasmaParserDocument theDoc = new plasmaParserDocument( location, mimeType, + "UTF-8", null, ((contents.length() > 80)? contents.substring(0, 80):contents.trim()). replaceAll("\r\n"," "). diff --git a/source/de/anomic/plasma/parser/odt/odtParser.java b/source/de/anomic/plasma/parser/odt/odtParser.java index b6a530d69..31c2e5091 100644 --- a/source/de/anomic/plasma/parser/odt/odtParser.java +++ b/source/de/anomic/plasma/parser/odt/odtParser.java @@ -152,6 +152,7 @@ public class odtParser extends AbstractParser implements Parser { return new plasmaParserDocument( location, mimeType, + "UTF-8", docKeywords, docShortTitle, docLongTitle, diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java index df3e49d1e..b394b2e83 100644 --- a/source/de/anomic/plasma/parser/pdf/pdfParser.java +++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java @@ -152,6 +152,7 @@ public class pdfParser extends AbstractParser implements Parser { plasmaParserDocument theDoc = new plasmaParserDocument( location, mimeType, + "UTF-8", docKeyWords, docSubject, docTitle, diff --git a/source/de/anomic/plasma/parser/rpm/rpmParser.java b/source/de/anomic/plasma/parser/rpm/rpmParser.java index a3f62dc8a..7e117f4f5 100644 --- a/source/de/anomic/plasma/parser/rpm/rpmParser.java +++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java @@ -147,6 +147,7 @@ public class rpmParser extends AbstractParser implements Parser { plasmaParserDocument theDoc = new plasmaParserDocument( location, mimeType, + "UTF-8", null, name, summary, diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java index 38fdbad1e..05209cc75 100644 --- a/source/de/anomic/plasma/parser/rss/rssParser.java +++ b/source/de/anomic/plasma/parser/rss/rssParser.java @@ -190,6 +190,7 @@ public class rssParser extends AbstractParser implements Parser { plasmaParserDocument theDoc = new plasmaParserDocument( location, mimeType, + "UTF-8", null, null, feedTitle, diff --git a/source/de/anomic/plasma/parser/rtf/rtfParser.java b/source/de/anomic/plasma/parser/rtf/rtfParser.java index 12b305687..fdef82b99 100644 --- a/source/de/anomic/plasma/parser/rtf/rtfParser.java +++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java @@ -95,6 +95,7 @@ implements Parser { plasmaParserDocument theDoc = new plasmaParserDocument( location, mimeType, + "UTF-8", null, ((bodyText.length() > 80)? bodyText.substring(0, 80):bodyText.trim()). replaceAll("\r\n"," "). diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java index 6b8871012..402ecee8c 100644 --- a/source/de/anomic/plasma/parser/tar/tarParser.java +++ b/source/de/anomic/plasma/parser/tar/tarParser.java @@ -189,6 +189,7 @@ public class tarParser extends AbstractParser implements Parser { return new plasmaParserDocument( location, mimeType, + null, docKeywords.toString(), docShortTitle.toString(), docLongTitle.toString(), diff --git a/source/de/anomic/plasma/parser/vcf/vcfParser.java b/source/de/anomic/plasma/parser/vcf/vcfParser.java index 829d00441..1dc963e95 100644 --- a/source/de/anomic/plasma/parser/vcf/vcfParser.java +++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java @@ -243,6 +243,7 @@ public class vcfParser extends AbstractParser implements Parser { plasmaParserDocument theDoc = new plasmaParserDocument( location, // url of the source document mimeType, // the documents mime type + null, null, // a list of extracted keywords null, // a short document title parsedTitle.toString(), // a long document title diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java index 146f85006..a4367ebd1 100644 --- a/source/de/anomic/plasma/parser/zip/zipParser.java +++ b/source/de/anomic/plasma/parser/zip/zipParser.java @@ -165,6 +165,7 @@ public class zipParser extends AbstractParser implements Parser { return new plasmaParserDocument( location, mimeType, + null, docKeywords.toString(), docShortTitle.toString(), docLongTitle.toString(), diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 6102f8766..9b3ffd6ab 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -588,7 +588,7 @@ public final class plasmaParser { int p = 0; for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j]; plasmaParserDocument ppd = new plasmaParserDocument(new URL(location.toNormalform()), - mimeType, null, null, scraper.getTitle(), + mimeType, scraper.getCharset(), null, null, scraper.getTitle(), sections, null, scraper.getText(), scraper.getAnchors(), scraper.getImages()); //scraper.close(); @@ -749,7 +749,12 @@ public final class plasmaParser { if (document != null) { // found text String[] sentences = document.getSentences(); - if (sentences != null) for (int i = 0; i < sentences.length; i++) System.out.println("line " + i + ":" + sentences[i]); + if (sentences != null) { + for (int i = 0; i < sentences.length; i++) { + System.out.print("line " + i + ": "); + System.out.println(sentences[i]); + } + } // found links int anchorNr = 0; diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index b3e3feb95..63a90bae6 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -56,6 +56,7 @@ public class plasmaParserDocument { URL location; // the source url String mimeType; // mimeType as taken from http header + String charset; // the charset of the document String keywords; // most resources provide a keyword field String shortTitle; // a shortTitle mostly appears in the window header (border) String longTitle; // the real title of the document, commonly h1-tags @@ -73,12 +74,13 @@ public class plasmaParserDocument { plasmaCondenser condenser; boolean resorted; - public plasmaParserDocument(URL location, String mimeType, + public plasmaParserDocument(URL location, String mimeType, String charset, String keywords, String shortTitle, String longTitle, String[] sections, String abstrct, byte[] text, Map anchors, TreeSet images) { this.location = location; this.mimeType = (mimeType==null)?"application/octet-stream":mimeType; + this.charset = charset; this.keywords = (keywords==null)?"":keywords; this.shortTitle = (shortTitle==null)?"":shortTitle; this.longTitle = (longTitle==null)?"":longTitle; @@ -98,6 +100,13 @@ public class plasmaParserDocument { return this.mimeType; } + /** + * @return the supposed charset of this document or null if unknown + */ + public String getCharset() { + return this.charset; + } + public String getMainShortTitle() { if (shortTitle != null) return shortTitle; else return longTitle; } diff --git a/source/de/anomic/server/serverByteBuffer.java b/source/de/anomic/server/serverByteBuffer.java index 0943327df..b7ae27edf 100644 --- a/source/de/anomic/server/serverByteBuffer.java +++ b/source/de/anomic/server/serverByteBuffer.java @@ -346,6 +346,14 @@ public final class serverByteBuffer extends OutputStream { public String toString() { return new String(buffer, offset, length); } + + public String toString(String charsetName) { + try { + return new String(this.getBytes(),charsetName); + } catch (UnsupportedEncodingException e) { + return new String(this.getBytes()); + } + } public String toString(int left, int rightbound) { return new String(buffer, offset + left, rightbound - left);