From d0a5a53789f57dad0ea199f327e868cb426fb1d9 Mon Sep 17 00:00:00 2001 From: theli Date: Fri, 15 Sep 2006 12:52:46 +0000 Subject: [PATCH] *) changes needed for multi-language support - parsers may need to know the charset of the byte stream git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2591 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../htmlFilter/htmlFilterContentScraper.java | 12 ++++++ .../de/anomic/plasma/cache/IResourceInfo.java | 6 +++ .../anomic/plasma/cache/ftp/ResourceInfo.java | 4 ++ .../plasma/cache/http/ResourceInfo.java | 9 +++++ .../anomic/plasma/parser/AbstractParser.java | 19 +++++++--- source/de/anomic/plasma/parser/Parser.java | 9 +++-- .../anomic/plasma/parser/bzip/bzipParser.java | 4 +- .../anomic/plasma/parser/doc/docParser.java | 2 +- .../anomic/plasma/parser/gzip/gzipParser.java | 4 +- .../parser/mimeType/mimeTypeParser.java | 8 ++-- .../anomic/plasma/parser/odt/odtParser.java | 8 ++-- .../anomic/plasma/parser/pdf/pdfParser.java | 2 +- .../anomic/plasma/parser/rpm/rpmParser.java | 8 ++-- .../anomic/plasma/parser/rss/rssParser.java | 2 +- .../anomic/plasma/parser/rtf/rtfParser.java | 2 +- .../anomic/plasma/parser/tar/tarParser.java | 4 +- source/de/anomic/plasma/parser/vcf/build.xml | 25 ++++++++----- .../anomic/plasma/parser/vcf/vcfParser.java | 37 +++++++++++-------- .../anomic/plasma/parser/zip/zipParser.java | 4 +- source/de/anomic/plasma/plasmaParser.java | 17 ++++++--- .../de/anomic/plasma/plasmaSnippetCache.java | 4 +- .../de/anomic/plasma/plasmaSwitchboard.java | 3 +- .../anomic/plasma/plasmaSwitchboardQueue.java | 5 +++ 23 files changed, 133 insertions(+), 65 deletions(-) diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 01a82ddb5..9fb48ffd2 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -47,6 +47,8 @@ import de.anomic.server.serverByteBuffer; import de.anomic.net.URL; import java.net.MalformedURLException; +import java.nio.charset.Charset; +import java.nio.charset.UnsupportedCharsetException; import java.text.Collator; import java.util.ArrayList; import java.util.HashMap; @@ -93,7 +95,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen //private String headline; private List[] headlines; private serverByteBuffer content; + private URL root; + private String charset = "UTF-8"; public htmlFilterContentScraper(URL root) { // the root value here will not be used to load the resource. @@ -109,6 +113,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen this.content = new serverByteBuffer(1024); } + public void setCharset(String charset) throws UnsupportedCharsetException { + // testing if charset exists + Charset.forName(charset); + + // remember it + this.charset = charset; + } + public void scrapeText(byte[] newtext) { // System.out.println("SCRAPE: " + new String(newtext)); if ((content.length() != 0) && (content.byteAt(content.length() - 1) != 32)) content.append(32); diff --git a/source/de/anomic/plasma/cache/IResourceInfo.java b/source/de/anomic/plasma/cache/IResourceInfo.java index 8cabfe366..81cfc2491 100644 --- a/source/de/anomic/plasma/cache/IResourceInfo.java +++ b/source/de/anomic/plasma/cache/IResourceInfo.java @@ -82,6 +82,12 @@ public interface IResourceInfo { */ public String getMimeType(); + /** + * Returns the charset of the resource + * @return returns the name of the charset or null if unknown + */ + public String getCharSet(); + /** * Returns the modification date of the cached object * @return the modifiaction date diff --git a/source/de/anomic/plasma/cache/ftp/ResourceInfo.java b/source/de/anomic/plasma/cache/ftp/ResourceInfo.java index 9d04e6646..dfbc32ccd 100644 --- a/source/de/anomic/plasma/cache/ftp/ResourceInfo.java +++ b/source/de/anomic/plasma/cache/ftp/ResourceInfo.java @@ -161,4 +161,8 @@ public class ResourceInfo implements IResourceInfo { return responseStatus != null && responseStatus.equalsIgnoreCase("OK"); } + public String getCharSet() { + return null; + } + } diff --git a/source/de/anomic/plasma/cache/http/ResourceInfo.java b/source/de/anomic/plasma/cache/http/ResourceInfo.java index 396304572..ec3768fd1 100644 --- a/source/de/anomic/plasma/cache/http/ResourceInfo.java +++ b/source/de/anomic/plasma/cache/http/ResourceInfo.java @@ -110,6 +110,15 @@ public class ResourceInfo implements IResourceInfo { int pos = mimeType.indexOf(';'); return ((pos < 0) ? mimeType : mimeType.substring(0, pos)); } + + public String getCharSet() { + if (this.responseHeader == null) return null; + + String mimeType = this.responseHeader.mime(); + + int pos = mimeType.indexOf(';'); + return ((pos < 0) ? null : mimeType.substring(pos)); + } /** * @see de.anomic.plasma.cache.IResourceInfo#getModificationDate() diff --git a/source/de/anomic/plasma/parser/AbstractParser.java b/source/de/anomic/plasma/parser/AbstractParser.java index e8632ce93..c69c60496 100644 --- a/source/de/anomic/plasma/parser/AbstractParser.java +++ b/source/de/anomic/plasma/parser/AbstractParser.java @@ -103,6 +103,7 @@ public abstract class AbstractParser implements Parser{ * Parsing a document available as byte array. * @param location the origin of the document * @param mimeType the mimetype of the document + * @param charset the supposed charset of the document or null if unkown * @param source the content byte array * @return a {@link plasmaParserDocument} containing the extracted plain text of the document * and some additional metadata. @@ -113,12 +114,13 @@ public abstract class AbstractParser implements Parser{ public plasmaParserDocument parse( URL location, String mimeType, + String charset, byte[] source ) throws ParserException, InterruptedException { ByteArrayInputStream contentInputStream = null; try { contentInputStream = new ByteArrayInputStream(source); - return this.parse(location,mimeType,contentInputStream); + return this.parse(location,mimeType,charset,contentInputStream); } finally { if (contentInputStream != null) { try { @@ -133,6 +135,7 @@ public abstract class AbstractParser implements Parser{ * Parsing a document stored in a {@link File} * @param location the origin of the document * @param mimeType the mimetype of the document + * @param charset the supposed charset of the document or null if unkown * @param sourceFile the file containing the content of the document * @return a {@link plasmaParserDocument} containing the extracted plain text of the document * and some additional metadata. @@ -140,12 +143,16 @@ public abstract class AbstractParser implements Parser{ * * @see de.anomic.plasma.parser.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.File) */ - public plasmaParserDocument parse(URL location, String mimeType, - File sourceFile) throws ParserException, InterruptedException { + public plasmaParserDocument parse( + URL location, + String mimeType, + String charset, + File sourceFile + ) throws ParserException, InterruptedException { BufferedInputStream contentInputStream = null; try { contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile)); - return this.parse(location, mimeType, contentInputStream); + return this.parse(location, mimeType, charset, contentInputStream); } catch (FileNotFoundException e) { throw new ParserException(e.getMessage()); } finally { @@ -157,6 +164,7 @@ public abstract class AbstractParser implements Parser{ * Parsing a document available as {@link InputStream} * @param location the origin of the document * @param mimeType the mimetype of the document + * @param charset the supposed charset of the document or null if unkown * @param source the {@link InputStream} containing the document content * @return a {@link plasmaParserDocument} containing the extracted plain text of the document * and some additional metadata. @@ -164,8 +172,7 @@ public abstract class AbstractParser implements Parser{ * * @see de.anomic.plasma.parser.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.InputStream) */ - public abstract plasmaParserDocument parse(URL location, String mimeType, - InputStream source) throws ParserException, InterruptedException; + public abstract plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException; /** * @return Returns a list of library names that are needed by this parser diff --git a/source/de/anomic/plasma/parser/Parser.java b/source/de/anomic/plasma/parser/Parser.java index c44b1d84c..6ffa7662e 100644 --- a/source/de/anomic/plasma/parser/Parser.java +++ b/source/de/anomic/plasma/parser/Parser.java @@ -64,39 +64,42 @@ public interface Parser { * Parsing a document available as byte array * @param location the origin of the document * @param mimeType the mimetype of the document + * @param charset the supposed charset of the document or null if unkown * @param source the content byte array * @return a {@link plasmaParserDocument} containing the extracted plain text of the document * and some additional metadata. * * @throws ParserException if the content could not be parsed properly */ - public plasmaParserDocument parse(URL location, String mimeType, byte[] source) + public plasmaParserDocument parse(URL location, String mimeType, String charset, byte[] source) throws ParserException, InterruptedException; /** * Parsing a document stored in a {@link File} * @param location the origin of the document * @param mimeType the mimetype of the document + * @param charset the supposed charset of the document or null if unkown * @param sourceFile the file containing the content of the document * @return a {@link plasmaParserDocument} containing the extracted plain text of the document * and some additional metadata. * * @throws ParserException if the content could not be parsed properly */ - public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) + public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile) throws ParserException, InterruptedException; /** * Parsing a document available as {@link InputStream} * @param location the origin of the document * @param mimeType the mimetype of the document + * @param charset the supposed charset of the document or null if unkown * @param source the {@link InputStream} containing the document content * @return a {@link plasmaParserDocument} containing the extracted plain text of the document * and some additional metadata. * * @throws ParserException if the content could not be parsed properly */ - public plasmaParserDocument parse(URL location, String mimeType, InputStream source) + public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException; /** diff --git a/source/de/anomic/plasma/parser/bzip/bzipParser.java b/source/de/anomic/plasma/parser/bzip/bzipParser.java index 7ce87893f..8b2020c81 100644 --- a/source/de/anomic/plasma/parser/bzip/bzipParser.java +++ b/source/de/anomic/plasma/parser/bzip/bzipParser.java @@ -87,7 +87,7 @@ public class bzipParser extends AbstractParser implements Parser { return SUPPORTED_MIME_TYPES; } - public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException { + public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { File tempFile = null; try { @@ -126,7 +126,7 @@ public class bzipParser extends AbstractParser implements Parser { // creating a new parser class to parse the unzipped content plasmaParser theParser = new plasmaParser(); - return theParser.parseSource(location,null,tempFile); + return theParser.parseSource(location,null,null,tempFile); } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; throw new ParserException("Unable to parse the gzip content. " + e.getMessage()); diff --git a/source/de/anomic/plasma/parser/doc/docParser.java b/source/de/anomic/plasma/parser/doc/docParser.java index 8cf1bb32c..4e4cd0044 100644 --- a/source/de/anomic/plasma/parser/doc/docParser.java +++ b/source/de/anomic/plasma/parser/doc/docParser.java @@ -78,7 +78,7 @@ implements Parser { parserName = "Word Document Parser"; } - public plasmaParserDocument parse(URL location, String mimeType, + public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { diff --git a/source/de/anomic/plasma/parser/gzip/gzipParser.java b/source/de/anomic/plasma/parser/gzip/gzipParser.java index b9db9827b..abc58e26e 100644 --- a/source/de/anomic/plasma/parser/gzip/gzipParser.java +++ b/source/de/anomic/plasma/parser/gzip/gzipParser.java @@ -83,7 +83,7 @@ public class gzipParser extends AbstractParser implements Parser { return SUPPORTED_MIME_TYPES; } - public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException { + public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { File tempFile = null; try { @@ -110,7 +110,7 @@ public class gzipParser extends AbstractParser implements Parser { // creating a new parser class to parse the unzipped content plasmaParser theParser = new plasmaParser(); - return theParser.parseSource(location,null,tempFile); + return theParser.parseSource(location,null,null,tempFile); } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; throw new ParserException("Unable to parse the gzip content. " + e.getMessage()); diff --git a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java index 738018dd2..f2b86124f 100644 --- a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java +++ b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java @@ -125,7 +125,7 @@ implements Parser { return null; } - public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException, InterruptedException { + public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile) throws ParserException, InterruptedException { String orgMimeType = mimeType; @@ -168,7 +168,7 @@ implements Parser { // parsing the content using the determined mimetype plasmaParser theParser = new plasmaParser(); - return theParser.parseSource(location,mimeType,sourceFile); + return theParser.parseSource(location,mimeType,charset,sourceFile); } return null; @@ -185,13 +185,13 @@ implements Parser { } } - public plasmaParserDocument parse(URL location, String mimeType, + public plasmaParserDocument parse(URL location, String mimeType,String charset, InputStream source) throws ParserException { File dstFile = null; try { dstFile = File.createTempFile("mimeTypeParser",".tmp"); serverFileUtils.copy(source,dstFile); - return parse(location,mimeType,dstFile); + return parse(location,mimeType,charset,dstFile); } catch (Exception e) { return null; } finally { diff --git a/source/de/anomic/plasma/parser/odt/odtParser.java b/source/de/anomic/plasma/parser/odt/odtParser.java index 5089bf6a5..b6a530d69 100644 --- a/source/de/anomic/plasma/parser/odt/odtParser.java +++ b/source/de/anomic/plasma/parser/odt/odtParser.java @@ -91,7 +91,7 @@ public class odtParser extends AbstractParser implements Parser { return SUPPORTED_MIME_TYPES; } - public plasmaParserDocument parse(URL location, String mimeType, File dest) throws ParserException, InterruptedException { + public plasmaParserDocument parse(URL location, String mimeType, String charset, File dest) throws ParserException, InterruptedException { try { byte[] docContent = null; @@ -168,7 +168,7 @@ public class odtParser extends AbstractParser implements Parser { } } - public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException { + public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException { File dest = null; try { // creating a tempfile @@ -179,7 +179,7 @@ public class odtParser extends AbstractParser implements Parser { serverFileUtils.copy(source, dest); // parsing the content - return parse(location, mimeType, dest); + return parse(location, mimeType, charset, dest); } catch (Exception e) { throw new ParserException("Unable to parse the odt document. " + e.getMessage()); } finally { @@ -210,7 +210,7 @@ public class odtParser extends AbstractParser implements Parser { ByteArrayInputStream input = new ByteArrayInputStream(content); // parsing the document - testParser.parse(contentUrl, "application/vnd.oasis.opendocument.text", input); + testParser.parse(contentUrl, "application/vnd.oasis.opendocument.text", null, input); } catch (Exception e) { e.printStackTrace(); } diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java index c513aee76..df3e49d1e 100644 --- a/source/de/anomic/plasma/parser/pdf/pdfParser.java +++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java @@ -85,7 +85,7 @@ public class pdfParser extends AbstractParser implements Parser { return SUPPORTED_MIME_TYPES; } - public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException { + public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { PDDocument theDocument = null; diff --git a/source/de/anomic/plasma/parser/rpm/rpmParser.java b/source/de/anomic/plasma/parser/rpm/rpmParser.java index 5070007bf..a3f62dc8a 100644 --- a/source/de/anomic/plasma/parser/rpm/rpmParser.java +++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java @@ -91,13 +91,13 @@ public class rpmParser extends AbstractParser implements Parser { return SUPPORTED_MIME_TYPES; } - public plasmaParserDocument parse(URL location, String mimeType, + public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException { File dstFile = null; try { dstFile = File.createTempFile("rpmParser",".tmp"); serverFileUtils.copy(source,dstFile); - return parse(location,mimeType,dstFile); + return parse(location,mimeType,charset,dstFile); } catch (Exception e) { return null; } finally { @@ -105,7 +105,7 @@ public class rpmParser extends AbstractParser implements Parser { } } - public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException, InterruptedException { + public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile) throws ParserException, InterruptedException { RPMFile rpmFile = null; try { String summary = null, description = null, name = sourceFile.getName(); @@ -177,7 +177,7 @@ public class rpmParser extends AbstractParser implements Parser { rpmParser testParser = new rpmParser(); byte[] content = httpc.singleGET(contentUrl, contentUrl.getHost(), 10000, null, null, null); ByteArrayInputStream input = new ByteArrayInputStream(content); - testParser.parse(contentUrl, "application/x-rpm", input); + testParser.parse(contentUrl, "application/x-rpm", null, input); } catch (Exception e) { e.printStackTrace(); } diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java index d5985a533..38fdbad1e 100644 --- a/source/de/anomic/plasma/parser/rss/rssParser.java +++ b/source/de/anomic/plasma/parser/rss/rssParser.java @@ -100,7 +100,7 @@ public class rssParser extends AbstractParser implements Parser { parserName = "Rich Site Summary/Atom Feed Parser"; } - public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException { + public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { try { LinkedList feedSections = new LinkedList(); diff --git a/source/de/anomic/plasma/parser/rtf/rtfParser.java b/source/de/anomic/plasma/parser/rtf/rtfParser.java index 9998d62fd..12b305687 100644 --- a/source/de/anomic/plasma/parser/rtf/rtfParser.java +++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java @@ -80,7 +80,7 @@ implements Parser { parserName = "Rich Text Format Parser"; } - public plasmaParserDocument parse(URL location, String mimeType, + public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java index f24f55a12..6b8871012 100644 --- a/source/de/anomic/plasma/parser/tar/tarParser.java +++ b/source/de/anomic/plasma/parser/tar/tarParser.java @@ -94,7 +94,7 @@ public class tarParser extends AbstractParser implements Parser { return SUPPORTED_MIME_TYPES; } - public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException { + public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { try { // creating a new parser class to parse the unzipped content @@ -153,7 +153,7 @@ public class tarParser extends AbstractParser implements Parser { checkInterruption(); // parsing the content - theDoc = theParser.parseSource(new URL(tempFile),entryMime,tempFile); + theDoc = theParser.parseSource(new URL(tempFile),entryMime,null,tempFile); } finally { if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){} } diff --git a/source/de/anomic/plasma/parser/vcf/build.xml b/source/de/anomic/plasma/parser/vcf/build.xml index 5270df9f2..6f1e4df85 100644 --- a/source/de/anomic/plasma/parser/vcf/build.xml +++ b/source/de/anomic/plasma/parser/vcf/build.xml @@ -1,55 +1,62 @@ - - A class to parse vCard files - + A class to parse vCard files - + + + - + + + + + + - + + + + + - - + - diff --git a/source/de/anomic/plasma/parser/vcf/vcfParser.java b/source/de/anomic/plasma/parser/vcf/vcfParser.java index ecd16d8be..829d00441 100644 --- a/source/de/anomic/plasma/parser/vcf/vcfParser.java +++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java @@ -90,14 +90,14 @@ public class vcfParser extends AbstractParser implements Parser { public vcfParser() { super(LIBX_DEPENDENCIES); - parserName = "vCard Parser"; + this.parserName = "vCard Parser"; } public Hashtable getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; } - public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException { + public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { try { StringBuffer parsedTitle = new StringBuffer(); @@ -109,7 +109,9 @@ public class vcfParser extends AbstractParser implements Parser { boolean useLastLine = false; int lineNr = 0; String line = null; - BufferedReader inputReader = new BufferedReader(new InputStreamReader(source)); + BufferedReader inputReader = (charset!=null) + ? new BufferedReader(new InputStreamReader(source,charset)) + : new BufferedReader(new InputStreamReader(source)); while (true) { // check for interruption checkInterruption(); @@ -236,21 +238,26 @@ public class vcfParser extends AbstractParser implements Parser { } } + String[] sections = (String[]) parsedNames.toArray(new String[parsedNames.size()]); + byte[] text = parsedDataText.toString().getBytes(); plasmaParserDocument theDoc = new plasmaParserDocument( - location, - mimeType, - null, - null, - parsedTitle.toString(), - (String[]) parsedNames.toArray(new String[parsedNames.size()]), - "vCard", - parsedDataText.toString().getBytes(), - anchors, - null); + location, // url of the source document + mimeType, // the documents mime type + null, // a list of extracted keywords + null, // a short document title + parsedTitle.toString(), // a long document title + sections, // an array of section headlines + "vCard", // an abstract + text, // the parsed document text + anchors, // a map of extracted anchors + null); // a treeset of image URLs return theDoc; } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; - throw new ParserException("Unable to parse the vcard content. " + e.getMessage()); + + String errorMsg = "Unable to parse the vcard content. " + e.getMessage(); + this.theLogger.logSevere(errorMsg); + throw new ParserException(errorMsg); } finally { } } @@ -267,7 +274,7 @@ public class vcfParser extends AbstractParser implements Parser { vcfParser testParser = new vcfParser(); byte[] content = httpc.singleGET(contentUrl, contentUrl.getHost(), 10000, null, null, null); ByteArrayInputStream input = new ByteArrayInputStream(content); - testParser.parse(contentUrl, "text/x-vcard", input); + testParser.parse(contentUrl, "text/x-vcard", "UTF-8",input); } catch (Exception e) { e.printStackTrace(); } diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java index f1da328c7..146f85006 100644 --- a/source/de/anomic/plasma/parser/zip/zipParser.java +++ b/source/de/anomic/plasma/parser/zip/zipParser.java @@ -91,7 +91,7 @@ public class zipParser extends AbstractParser implements Parser { return SUPPORTED_MIME_TYPES; } - public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException { + public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { try { StringBuffer docKeywords = new StringBuffer(); @@ -132,7 +132,7 @@ public class zipParser extends AbstractParser implements Parser { checkInterruption(); // parsing the content - plasmaParserDocument theDoc = theParser.parseSource(location,entryMime,ut); + plasmaParserDocument theDoc = theParser.parseSource(location,entryMime,null, ut); if (theDoc == null) continue; // merging all documents together diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 092d12440..b97b68aa5 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -465,12 +465,12 @@ public final class plasmaParser { } catch (Exception e) { } } - public plasmaParserDocument parseSource(URL location, String mimeType, byte[] source) throws InterruptedException { + public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] source) throws InterruptedException { File tempFile = null; try { tempFile = File.createTempFile("parseSource", ".tmp"); serverFileUtils.write(source, tempFile); - return parseSource(location, mimeType, tempFile); + return parseSource(location, mimeType, charset, tempFile); } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; serverLog.logSevere("PARSER", "parseSource1: " + e.getMessage(), e); @@ -481,7 +481,7 @@ public final class plasmaParser { } - public plasmaParserDocument parseSource(URL location, String mimeType, File sourceFile) throws InterruptedException { + public plasmaParserDocument parseSource(URL location, String mimeType, String charset, File sourceFile) throws InterruptedException { Parser theParser = null; try { @@ -546,10 +546,12 @@ public final class plasmaParser { // if a parser was found we use it ... if (theParser != null) { - return theParser.parse(location, mimeType,sourceFile); + return theParser.parse(location, mimeType,charset,sourceFile); } else if (realtimeParsableMimeTypesContains(mimeType)) { // ...otherwise we make a scraper and transformer htmlFilterContentScraper scraper = new htmlFilterContentScraper(location); + scraper.setCharset(PARSER_MODE_URLREDIRECTOR); + OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false); serverFileUtils.copy(sourceFile, hfos); hfos.close(); @@ -691,6 +693,7 @@ public final class plasmaParser { File contentFile = null; URL contentURL = null; String contentMimeType = "application/octet-stream"; + String charSet = "UTF-8"; if (args.length < 2) { System.err.println("Usage: java de.anomic.plasma.plasmaParser (-f filename|-u URL) [-m mimeType]"); @@ -715,6 +718,10 @@ public final class plasmaParser { contentMimeType = args[3]; } + if ((args.length == 6)&&(args[4].equalsIgnoreCase("-c"))) { + charSet = args[5]; + } + // creating a plasma parser plasmaParser theParser = new plasmaParser(); @@ -725,7 +732,7 @@ public final class plasmaParser { plasmaParser.enableAllParsers(PARSER_MODE_PROXY); // parsing the content - plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, contentFile); + plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, charSet, contentFile); // printing out all parsed sentences if (document != null) { diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index dfc87e157..5c0a78fa2 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -389,12 +389,12 @@ public class plasmaSnippetCache { supposedMime = plasmaParser.getMimeTypeByFileExt(filename.substring(p + 1)); } - return this.parser.parseSource(url, supposedMime, resource); + return this.parser.parseSource(url, supposedMime, null, resource); } return null; } if (plasmaParser.supportedMimeTypesContains(docInfo.getMimeType())) { - return this.parser.parseSource(url, docInfo.getMimeType(), resource); + return this.parser.parseSource(url, docInfo.getMimeType(), docInfo.getCharSet(), resource); } return null; } catch (InterruptedException e) { diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 908c6b6e2..0a167b881 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1398,6 +1398,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // the mimetype of this entry String mimeType = entry.getMimeType(); + String charset = entry.getCharSet(); // the parser logger serverLog parserLogger = parser.getLogger(); @@ -1409,7 +1410,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser ){ if ((entry.cacheFile().exists()) && (entry.cacheFile().length() > 0)) { parserLogger.logFine("'" + entry.normalizedURLString() + "' is not parsed yet, parsing now from File"); - document = parser.parseSource(entry.url(), mimeType, entry.cacheFile()); + document = parser.parseSource(entry.url(), mimeType, charset, entry.cacheFile()); } else { parserLogger.logFine("'" + entry.normalizedURLString() + "' cannot be parsed, no resource available"); addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT, new bitfield(indexURL.urlFlagLength)); diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java index ae8b7af1c..ff77eb002 100644 --- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java +++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java @@ -320,6 +320,11 @@ public class plasmaSwitchboardQueue { return (info == null) ? null : info.getMimeType(); } + public String getCharSet() { + IResourceInfo info = this.getCachedObjectInfo(); + return (info == null) ? null : info.getCharSet(); + } + public Date getModificationDate() { IResourceInfo info = this.getCachedObjectInfo(); return (info == null) ? new Date() : info.getModificationDate();