*) storing the document charset in the plasmaParserDocument object (it is needed later by the condenser)

*) htmlFilterContentScraper.java: using the proper charset for the document title
*) serverByteBuffer.java: adding a new toString method that allows specifying the charset used to decode the bytes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2593 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago · 74c3e7cf29
parent c5d3020941
commit 74c3e7cf29
13 changed files with 47 additions and 5 deletions
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@ -46,6 +46,7 @@ package de.anomic.htmlFilter;
 import de.anomic.server.serverByteBuffer;
 import de.anomic.net.URL;

+import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.nio.charset.Charset;
 import java.nio.charset.UnsupportedCharsetException;
@ -123,6 +124,10 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
        this.charset = charset;
    }
    
+    /**
+     * Returns the charset name currently held in this scraper's
+     * <code>charset</code> field.
+     *
+     * @return the stored charset name, exactly as it was assigned
+     */
+    public String getCharset() {
+        return charset;
+    }
+    
    public void scrapeText(byte[] newtext) {
        // System.out.println("SCRAPE: " + new String(newtext));
        if ((content.length() != 0) && (content.byteAt(content.length() - 1) != 32)) content.append(32);
@ -243,8 +248,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
        if (s.length() > 0) return s;
        
        // extract headline from content
-        if (content.length() > 80) return cleanLine(new String(content.getBytes(), 0, 80));
-        return cleanLine(content.trim().toString());
+        if (content.length() > 80) {
+            try {
+                return cleanLine(new String(content.getBytes(), 0, 80,this.charset));
+            } catch (UnsupportedEncodingException e) {
+                return cleanLine(new String(content.getBytes(), 0, 80));
+            }
+        }
+        return cleanLine(content.trim().toString(this.charset));
    }
    
    public String[] getHeadlines(int i) {
--- a/source/de/anomic/plasma/parser/doc/docParser.java
+++ b/source/de/anomic/plasma/parser/doc/docParser.java
@ -89,6 +89,7 @@ implements Parser {
              plasmaParserDocument theDoc = new plasmaParserDocument(
                      location,
                      mimeType,
+                      "UTF-8",
                      null,
                      ((contents.length() > 80)? contents.substring(0, 80):contents.trim()).
                          replaceAll("\r\n"," ").
--- a/source/de/anomic/plasma/parser/odt/odtParser.java
+++ b/source/de/anomic/plasma/parser/odt/odtParser.java
@ -152,6 +152,7 @@ public class odtParser extends AbstractParser implements Parser {
            return new plasmaParserDocument(
                    location,
                    mimeType,
+                    "UTF-8",
                    docKeywords,
                    docShortTitle, 
                    docLongTitle,
--- a/source/de/anomic/plasma/parser/pdf/pdfParser.java
+++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java
@ -152,6 +152,7 @@ public class pdfParser extends AbstractParser implements Parser {
            plasmaParserDocument theDoc = new plasmaParserDocument(
                    location,
                    mimeType,
+                    "UTF-8",
                    docKeyWords,
                    docSubject,
                    docTitle,
--- a/source/de/anomic/plasma/parser/rpm/rpmParser.java
+++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java
@ -147,6 +147,7 @@ public class rpmParser extends AbstractParser implements Parser {
            plasmaParserDocument theDoc = new plasmaParserDocument(
                    location,
                    mimeType,
+                    "UTF-8",
                    null,
                    name,
                    summary,
--- a/source/de/anomic/plasma/parser/rss/rssParser.java
+++ b/source/de/anomic/plasma/parser/rss/rssParser.java
@ -190,6 +190,7 @@ public class rssParser extends AbstractParser implements Parser {
            plasmaParserDocument theDoc = new plasmaParserDocument(
                    location,
                    mimeType,
+                    "UTF-8",
                    null,
                    null,
                    feedTitle,
--- a/source/de/anomic/plasma/parser/rtf/rtfParser.java
+++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java
@ -95,6 +95,7 @@ implements Parser {
            plasmaParserDocument theDoc = new plasmaParserDocument(
                    location,
                    mimeType,
+                    "UTF-8",
                    null,
                    ((bodyText.length() > 80)? bodyText.substring(0, 80):bodyText.trim()).
                        replaceAll("\r\n"," ").
--- a/source/de/anomic/plasma/parser/tar/tarParser.java
+++ b/source/de/anomic/plasma/parser/tar/tarParser.java
@ -189,6 +189,7 @@ public class tarParser extends AbstractParser implements Parser {
            return new plasmaParserDocument(
                    location,
                    mimeType,
+                    null,
                    docKeywords.toString(),
                    docShortTitle.toString(), 
                    docLongTitle.toString(),
--- a/source/de/anomic/plasma/parser/vcf/vcfParser.java
+++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java
@ -243,6 +243,7 @@ public class vcfParser extends AbstractParser implements Parser {
            plasmaParserDocument theDoc = new plasmaParserDocument(
                    location,                   // url of the source document
                    mimeType,                   // the documents mime type
+                    null,
                    null,                       // a list of extracted keywords
                    null,                       // a short document title
                    parsedTitle.toString(),     // a long document title
--- a/source/de/anomic/plasma/parser/zip/zipParser.java
+++ b/source/de/anomic/plasma/parser/zip/zipParser.java
@ -165,6 +165,7 @@ public class zipParser extends AbstractParser implements Parser {
            return new plasmaParserDocument(
                    location,
                    mimeType,
+                    null,
                    docKeywords.toString(),
                    docShortTitle.toString(), 
                    docLongTitle.toString(),
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@ -588,7 +588,7 @@ public final class plasmaParser {
            int p = 0;
            for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];
            plasmaParserDocument ppd =  new plasmaParserDocument(new URL(location.toNormalform()),
-                                mimeType, null, null, scraper.getTitle(),
+                                mimeType, scraper.getCharset(), null, null, scraper.getTitle(),
                                sections, null,
                                scraper.getText(), scraper.getAnchors(), scraper.getImages());
            //scraper.close();
@ -749,7 +749,12 @@ public final class plasmaParser {
            if (document != null) {
                // found text
                String[] sentences = document.getSentences();
-                if (sentences != null) for (int i = 0; i < sentences.length; i++) System.out.println("line " + i + ":" + sentences[i]);
+                if (sentences != null) {
+                    for (int i = 0; i < sentences.length; i++) {
+                        System.out.print("line " + i + ": ");
+                        System.out.println(sentences[i]);
+                    }
+                }
                
                // found links
                int anchorNr = 0;
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@ -56,6 +56,7 @@ public class plasmaParserDocument {
    
    URL location;       // the source url
    String mimeType;    // mimeType as taken from http header
+    String charset;     // the charset of the document
    String keywords;    // most resources provide a keyword field
    String shortTitle;  // a shortTitle mostly appears in the window header (border)
    String longTitle;   // the real title of the document, commonly h1-tags
@ -73,12 +74,13 @@ public class plasmaParserDocument {
    plasmaCondenser condenser;
    boolean resorted;
                    
-    public plasmaParserDocument(URL location, String mimeType,
+    public plasmaParserDocument(URL location, String mimeType, String charset,
                    String keywords, String shortTitle, String longTitle,
                    String[] sections, String abstrct,
                    byte[] text, Map anchors, TreeSet images) {
        this.location = location;
        this.mimeType = (mimeType==null)?"application/octet-stream":mimeType;
+        this.charset = charset;
        this.keywords = (keywords==null)?"":keywords;
        this.shortTitle = (shortTitle==null)?"":shortTitle;
        this.longTitle = (longTitle==null)?"":longTitle;
@ -98,6 +100,13 @@ public class plasmaParserDocument {
        return this.mimeType;
    }
    
+    /**
+     * Returns the charset name stored for this document. The constructor
+     * stores the value it receives without modification.
+     *
+     * @return the supposed charset of this document or <code>null</code> if unknown
+     */
+    public String getCharset() {
+        return charset;
+    }
+    
    /**
     * Returns the short title of this document, or the long title when
     * <code>shortTitle</code> is <code>null</code>.
     *
     * NOTE(review): the constructor stores "" when it is given a null
     * shortTitle, so unless the field is reassigned elsewhere the long-title
     * fallback is presumably never taken - confirm whether an empty
     * shortTitle was meant to fall back to the long title as well.
     *
     * @return <code>shortTitle</code> if it is not null, otherwise <code>longTitle</code>
     */
    public String getMainShortTitle() {
        if (shortTitle != null) return shortTitle; else return longTitle;
    }
--- a/source/de/anomic/server/serverByteBuffer.java
+++ b/source/de/anomic/server/serverByteBuffer.java
@ -346,6 +346,14 @@ public final class serverByteBuffer extends OutputStream {
    /**
     * Decodes <code>length</code> bytes of <code>buffer</code>, starting at
     * <code>offset</code>, into a String.
     * No charset is passed to the String constructor, so the bytes are decoded
     * with the platform's default charset.
     *
     * @return the decoded bytes as a String
     */
    public String toString() {
        return new String(buffer, offset, length);
    }
+    
+    /**
+     * Decodes the bytes returned by <code>getBytes()</code> into a String
+     * using the given charset.
+     * <p>
+     * If <code>charsetName</code> is <code>null</code> or names a charset that
+     * is not supported, the bytes are decoded with the platform's default
+     * charset instead.
+     *
+     * @param charsetName name of the charset to decode with; may be <code>null</code>
+     * @return the decoded bytes as a String
+     */
+    public String toString(String charsetName) {
+        final byte[] bytes = this.getBytes();
+        // new String(byte[], String) throws a NullPointerException for a null
+        // charset name; handle an unknown charset like an unsupported one
+        if (charsetName == null) return new String(bytes);
+        try {
+            return new String(bytes, charsetName);
+        } catch (UnsupportedEncodingException e) {
+            // unsupported charset name: fall back to the platform default charset
+            return new String(bytes);
+        }
+    }

    public String toString(int left, int rightbound) {
        return new String(buffer, offset + left, rightbound - left);