diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index 995395a6e..b1d854387 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -46,6 +46,7 @@ package de.anomic.htmlFilter;
import de.anomic.server.serverByteBuffer;
import de.anomic.net.URL;
+import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
@@ -123,6 +124,10 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
this.charset = charset;
}
+ public String getCharset() { // returns the charset value currently stored in this.charset
+ return this.charset;
+ }
+
public void scrapeText(byte[] newtext) {
// System.out.println("SCRAPE: " + new String(newtext));
if ((content.length() != 0) && (content.byteAt(content.length() - 1) != 32)) content.append(32);
@@ -243,8 +248,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
if (s.length() > 0) return s;
// extract headline from content
- if (content.length() > 80) return cleanLine(new String(content.getBytes(), 0, 80));
- return cleanLine(content.trim().toString());
+ if (content.length() > 80) {
+ try {
+ return cleanLine(new String(content.getBytes(), 0, 80,this.charset));
+ } catch (UnsupportedEncodingException e) {
+ return cleanLine(new String(content.getBytes(), 0, 80));
+ }
+ }
+ return cleanLine(content.trim().toString(this.charset));
}
public String[] getHeadlines(int i) {
diff --git a/source/de/anomic/plasma/parser/doc/docParser.java b/source/de/anomic/plasma/parser/doc/docParser.java
index 4e4cd0044..46aa1196a 100644
--- a/source/de/anomic/plasma/parser/doc/docParser.java
+++ b/source/de/anomic/plasma/parser/doc/docParser.java
@@ -89,6 +89,7 @@ implements Parser {
plasmaParserDocument theDoc = new plasmaParserDocument(
location,
mimeType,
+ "UTF-8",
null,
((contents.length() > 80)? contents.substring(0, 80):contents.trim()).
replaceAll("\r\n"," ").
diff --git a/source/de/anomic/plasma/parser/odt/odtParser.java b/source/de/anomic/plasma/parser/odt/odtParser.java
index b6a530d69..31c2e5091 100644
--- a/source/de/anomic/plasma/parser/odt/odtParser.java
+++ b/source/de/anomic/plasma/parser/odt/odtParser.java
@@ -152,6 +152,7 @@ public class odtParser extends AbstractParser implements Parser {
return new plasmaParserDocument(
location,
mimeType,
+ "UTF-8",
docKeywords,
docShortTitle,
docLongTitle,
diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java
index df3e49d1e..b394b2e83 100644
--- a/source/de/anomic/plasma/parser/pdf/pdfParser.java
+++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java
@@ -152,6 +152,7 @@ public class pdfParser extends AbstractParser implements Parser {
plasmaParserDocument theDoc = new plasmaParserDocument(
location,
mimeType,
+ "UTF-8",
docKeyWords,
docSubject,
docTitle,
diff --git a/source/de/anomic/plasma/parser/rpm/rpmParser.java b/source/de/anomic/plasma/parser/rpm/rpmParser.java
index a3f62dc8a..7e117f4f5 100644
--- a/source/de/anomic/plasma/parser/rpm/rpmParser.java
+++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java
@@ -147,6 +147,7 @@ public class rpmParser extends AbstractParser implements Parser {
plasmaParserDocument theDoc = new plasmaParserDocument(
location,
mimeType,
+ "UTF-8",
null,
name,
summary,
diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java
index 38fdbad1e..05209cc75 100644
--- a/source/de/anomic/plasma/parser/rss/rssParser.java
+++ b/source/de/anomic/plasma/parser/rss/rssParser.java
@@ -190,6 +190,7 @@ public class rssParser extends AbstractParser implements Parser {
plasmaParserDocument theDoc = new plasmaParserDocument(
location,
mimeType,
+ "UTF-8",
null,
null,
feedTitle,
diff --git a/source/de/anomic/plasma/parser/rtf/rtfParser.java b/source/de/anomic/plasma/parser/rtf/rtfParser.java
index 12b305687..fdef82b99 100644
--- a/source/de/anomic/plasma/parser/rtf/rtfParser.java
+++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java
@@ -95,6 +95,7 @@ implements Parser {
plasmaParserDocument theDoc = new plasmaParserDocument(
location,
mimeType,
+ "UTF-8",
null,
((bodyText.length() > 80)? bodyText.substring(0, 80):bodyText.trim()).
replaceAll("\r\n"," ").
diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java
index 6b8871012..402ecee8c 100644
--- a/source/de/anomic/plasma/parser/tar/tarParser.java
+++ b/source/de/anomic/plasma/parser/tar/tarParser.java
@@ -189,6 +189,7 @@ public class tarParser extends AbstractParser implements Parser {
return new plasmaParserDocument(
location,
mimeType,
+ null,
docKeywords.toString(),
docShortTitle.toString(),
docLongTitle.toString(),
diff --git a/source/de/anomic/plasma/parser/vcf/vcfParser.java b/source/de/anomic/plasma/parser/vcf/vcfParser.java
index 829d00441..1dc963e95 100644
--- a/source/de/anomic/plasma/parser/vcf/vcfParser.java
+++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java
@@ -243,6 +243,7 @@ public class vcfParser extends AbstractParser implements Parser {
plasmaParserDocument theDoc = new plasmaParserDocument(
location, // url of the source document
mimeType, // the documents mime type
+ null,
null, // a list of extracted keywords
null, // a short document title
parsedTitle.toString(), // a long document title
diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java
index 146f85006..a4367ebd1 100644
--- a/source/de/anomic/plasma/parser/zip/zipParser.java
+++ b/source/de/anomic/plasma/parser/zip/zipParser.java
@@ -165,6 +165,7 @@ public class zipParser extends AbstractParser implements Parser {
return new plasmaParserDocument(
location,
mimeType,
+ null,
docKeywords.toString(),
docShortTitle.toString(),
docLongTitle.toString(),
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index 6102f8766..9b3ffd6ab 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -588,7 +588,7 @@ public final class plasmaParser {
int p = 0;
for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];
plasmaParserDocument ppd = new plasmaParserDocument(new URL(location.toNormalform()),
- mimeType, null, null, scraper.getTitle(),
+ mimeType, scraper.getCharset(), null, null, scraper.getTitle(),
sections, null,
scraper.getText(), scraper.getAnchors(), scraper.getImages());
//scraper.close();
@@ -749,7 +749,12 @@ public final class plasmaParser {
if (document != null) {
// found text
String[] sentences = document.getSentences();
- if (sentences != null) for (int i = 0; i < sentences.length; i++) System.out.println("line " + i + ":" + sentences[i]);
+ if (sentences != null) {
+ for (int i = 0; i < sentences.length; i++) {
+ System.out.print("line " + i + ": ");
+ System.out.println(sentences[i]);
+ }
+ }
// found links
int anchorNr = 0;
diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java
index b3e3feb95..63a90bae6 100644
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@@ -56,6 +56,7 @@ public class plasmaParserDocument {
URL location; // the source url
String mimeType; // mimeType as taken from http header
+ String charset; // the charset of the document
String keywords; // most resources provide a keyword field
String shortTitle; // a shortTitle mostly appears in the window header (border)
String longTitle; // the real title of the document, commonly h1-tags
@@ -73,12 +74,13 @@ public class plasmaParserDocument {
plasmaCondenser condenser;
boolean resorted;
- public plasmaParserDocument(URL location, String mimeType,
+ public plasmaParserDocument(URL location, String mimeType, String charset,
String keywords, String shortTitle, String longTitle,
String[] sections, String abstrct,
byte[] text, Map anchors, TreeSet images) {
this.location = location;
this.mimeType = (mimeType==null)?"application/octet-stream":mimeType;
+ this.charset = charset;
this.keywords = (keywords==null)?"":keywords;
this.shortTitle = (shortTitle==null)?"":shortTitle;
this.longTitle = (longTitle==null)?"":longTitle;
@@ -98,6 +100,13 @@ public class plasmaParserDocument {
return this.mimeType;
}
+ /**
+ * @return the supposed charset of this document, or <code>null</code> if unknown
+ */
+ public String getCharset() {
+ return this.charset;
+ }
+
public String getMainShortTitle() {
if (shortTitle != null) return shortTitle; else return longTitle;
}
diff --git a/source/de/anomic/server/serverByteBuffer.java b/source/de/anomic/server/serverByteBuffer.java
index 0943327df..b7ae27edf 100644
--- a/source/de/anomic/server/serverByteBuffer.java
+++ b/source/de/anomic/server/serverByteBuffer.java
@@ -346,6 +346,14 @@ public final class serverByteBuffer extends OutputStream {
public String toString() {
return new String(buffer, offset, length);
}
+
+    public String toString(String charsetName) { // decodes the buffer bytes with charsetName, else with the platform default charset
+        try { // a null name means "charset unknown"; handing it to the String constructor would throw NullPointerException
+            return (charsetName == null) ? new String(this.getBytes()) : new String(this.getBytes(), charsetName);
+        } catch (UnsupportedEncodingException e) { // the named charset is not supported: fall back to the platform default
+            return new String(this.getBytes());
+        }
+    }
public String toString(int left, int rightbound) {
return new String(buffer, offset + left, rightbound - left);