diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index 01a82ddb5..9fb48ffd2 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -47,6 +47,8 @@ import de.anomic.server.serverByteBuffer;
import de.anomic.net.URL;
import java.net.MalformedURLException;
+import java.nio.charset.Charset;
+import java.nio.charset.UnsupportedCharsetException;
import java.text.Collator;
import java.util.ArrayList;
import java.util.HashMap;
@@ -93,7 +95,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
//private String headline;
private List[] headlines;
private serverByteBuffer content;
+
private URL root;
+ private String charset = "UTF-8";
public htmlFilterContentScraper(URL root) {
// the root value here will not be used to load the resource.
@@ -109,6 +113,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
this.content = new serverByteBuffer(1024);
}
+    /**
+     * Sets the charset that is assumed for the scraped content.
+     * A <code>null</code> value denotes an unknown charset and leaves the
+     * current setting (initially UTF-8) untouched.
+     * @param charset the name of the charset, or <code>null</code> if unknown
+     * @throws UnsupportedCharsetException if the named charset is not available
+     *         in this JVM; a syntactically illegal name is rejected by
+     *         {@link Charset#forName(String)} with an IllegalCharsetNameException
+     */
+    public void setCharset(String charset) throws UnsupportedCharsetException {
+        // an unknown charset must not fail the lookup below: keep the current one
+        if (charset == null) return;
+
+        // testing if charset exists (the lookup throws if it does not)
+        Charset.forName(charset);
+
+        // remember it
+        this.charset = charset;
+    }
+
public void scrapeText(byte[] newtext) {
// System.out.println("SCRAPE: " + new String(newtext));
if ((content.length() != 0) && (content.byteAt(content.length() - 1) != 32)) content.append(32);
diff --git a/source/de/anomic/plasma/cache/IResourceInfo.java b/source/de/anomic/plasma/cache/IResourceInfo.java
index 8cabfe366..81cfc2491 100644
--- a/source/de/anomic/plasma/cache/IResourceInfo.java
+++ b/source/de/anomic/plasma/cache/IResourceInfo.java
@@ -82,6 +82,12 @@ public interface IResourceInfo {
*/
public String getMimeType();
+ /**
+ * Returns the charset of the resource
+ * @return the name of the charset or <code>null</code> if unknown
+ */
+ public String getCharSet();
+
/**
* Returns the modification date of the cached object
* @return the modifiaction date
diff --git a/source/de/anomic/plasma/cache/ftp/ResourceInfo.java b/source/de/anomic/plasma/cache/ftp/ResourceInfo.java
index 9d04e6646..dfbc32ccd 100644
--- a/source/de/anomic/plasma/cache/ftp/ResourceInfo.java
+++ b/source/de/anomic/plasma/cache/ftp/ResourceInfo.java
@@ -161,4 +161,8 @@ public class ResourceInfo implements IResourceInfo {
return responseStatus != null && responseStatus.equalsIgnoreCase("OK");
}
+ /**
+ * Always returns <code>null</code>: this implementation does not
+ * provide a charset for the resource.
+ * @see de.anomic.plasma.cache.IResourceInfo#getCharSet()
+ */
+ public String getCharSet() {
+ return null;
+ }
+
}
diff --git a/source/de/anomic/plasma/cache/http/ResourceInfo.java b/source/de/anomic/plasma/cache/http/ResourceInfo.java
index 396304572..ec3768fd1 100644
--- a/source/de/anomic/plasma/cache/http/ResourceInfo.java
+++ b/source/de/anomic/plasma/cache/http/ResourceInfo.java
@@ -110,6 +110,15 @@ public class ResourceInfo implements IResourceInfo {
int pos = mimeType.indexOf(';');
return ((pos < 0) ? mimeType : mimeType.substring(0, pos));
}
+
+    /**
+     * Returns the charset named by the <code>charset</code> parameter of the
+     * content type stored in the response header,
+     * e.g. <code>UTF-8</code> for <code>text/html; charset=UTF-8</code>.
+     * @return the charset name without the parameter key, separators or
+     *         surrounding quotes, or <code>null</code> if there is no response
+     *         header or the content type carries no charset parameter
+     * @see de.anomic.plasma.cache.IResourceInfo#getCharSet()
+     */
+    public String getCharSet() {
+        if (this.responseHeader == null) return null;
+
+        String mimeType = this.responseHeader.mime();
+        if (mimeType == null) return null;
+
+        // parameters follow the media type, each one introduced by a ';'
+        int pos = mimeType.indexOf(';');
+        while (pos >= 0) {
+            int end = mimeType.indexOf(';', pos + 1);
+            String param = ((end < 0)
+                    ? mimeType.substring(pos + 1)
+                    : mimeType.substring(pos + 1, end)).trim();
+
+            // parameter names are matched case-insensitively
+            if (param.regionMatches(true, 0, "charset=", 0, 8)) {
+                String value = param.substring(8).trim();
+
+                // the value may be written as a quoted string
+                if ((value.length() >= 2) && value.startsWith("\"") && value.endsWith("\"")) {
+                    value = value.substring(1, value.length() - 1).trim();
+                }
+                return (value.length() == 0) ? null : value;
+            }
+            pos = end;
+        }
+        return null;
+    }
/**
* @see de.anomic.plasma.cache.IResourceInfo#getModificationDate()
diff --git a/source/de/anomic/plasma/parser/AbstractParser.java b/source/de/anomic/plasma/parser/AbstractParser.java
index e8632ce93..c69c60496 100644
--- a/source/de/anomic/plasma/parser/AbstractParser.java
+++ b/source/de/anomic/plasma/parser/AbstractParser.java
@@ -103,6 +103,7 @@ public abstract class AbstractParser implements Parser{
* Parsing a document available as byte array.
* @param location the origin of the document
* @param mimeType the mimetype of the document
+ * @param charset the supposed charset of the document or <code>null</code> if unknown
* @param source the content byte array
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata.
@@ -113,12 +114,13 @@ public abstract class AbstractParser implements Parser{
public plasmaParserDocument parse(
URL location,
String mimeType,
+ String charset,
byte[] source
) throws ParserException, InterruptedException {
ByteArrayInputStream contentInputStream = null;
try {
contentInputStream = new ByteArrayInputStream(source);
- return this.parse(location,mimeType,contentInputStream);
+ return this.parse(location,mimeType,charset,contentInputStream);
} finally {
if (contentInputStream != null) {
try {
@@ -133,6 +135,7 @@ public abstract class AbstractParser implements Parser{
* Parsing a document stored in a {@link File}
* @param location the origin of the document
* @param mimeType the mimetype of the document
+ * @param charset the supposed charset of the document or <code>null</code> if unknown
* @param sourceFile the file containing the content of the document
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata.
@@ -140,12 +143,16 @@ public abstract class AbstractParser implements Parser{
*
* @see de.anomic.plasma.parser.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.File)
*/
- public plasmaParserDocument parse(URL location, String mimeType,
- File sourceFile) throws ParserException, InterruptedException {
+ public plasmaParserDocument parse(
+ URL location,
+ String mimeType,
+ String charset,
+ File sourceFile
+ ) throws ParserException, InterruptedException {
BufferedInputStream contentInputStream = null;
try {
contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
- return this.parse(location, mimeType, contentInputStream);
+ return this.parse(location, mimeType, charset, contentInputStream);
} catch (FileNotFoundException e) {
throw new ParserException(e.getMessage());
} finally {
@@ -157,6 +164,7 @@ public abstract class AbstractParser implements Parser{
* Parsing a document available as {@link InputStream}
* @param location the origin of the document
* @param mimeType the mimetype of the document
+ * @param charset the supposed charset of the document or <code>null</code> if unknown
* @param source the {@link InputStream} containing the document content
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata.
@@ -164,8 +172,7 @@ public abstract class AbstractParser implements Parser{
*
* @see de.anomic.plasma.parser.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.InputStream)
*/
- public abstract plasmaParserDocument parse(URL location, String mimeType,
- InputStream source) throws ParserException, InterruptedException;
+ public abstract plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException;
/**
* @return Returns a list of library names that are needed by this parser
diff --git a/source/de/anomic/plasma/parser/Parser.java b/source/de/anomic/plasma/parser/Parser.java
index c44b1d84c..6ffa7662e 100644
--- a/source/de/anomic/plasma/parser/Parser.java
+++ b/source/de/anomic/plasma/parser/Parser.java
@@ -64,39 +64,42 @@ public interface Parser {
* Parsing a document available as byte array
* @param location the origin of the document
* @param mimeType the mimetype of the document
+ * @param charset the supposed charset of the document or <code>null</code> if unknown
* @param source the content byte array
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata.
*
* @throws ParserException if the content could not be parsed properly
*/
- public plasmaParserDocument parse(URL location, String mimeType, byte[] source)
+ public plasmaParserDocument parse(URL location, String mimeType, String charset, byte[] source)
throws ParserException, InterruptedException;
/**
* Parsing a document stored in a {@link File}
* @param location the origin of the document
* @param mimeType the mimetype of the document
+ * @param charset the supposed charset of the document or <code>null</code> if unknown
* @param sourceFile the file containing the content of the document
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata.
*
* @throws ParserException if the content could not be parsed properly
*/
- public plasmaParserDocument parse(URL location, String mimeType, File sourceFile)
+ public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile)
throws ParserException, InterruptedException;
/**
* Parsing a document available as {@link InputStream}
* @param location the origin of the document
* @param mimeType the mimetype of the document
+ * @param charset the supposed charset of the document or <code>null</code> if unknown
* @param source the {@link InputStream} containing the document content
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata.
*
* @throws ParserException if the content could not be parsed properly
*/
- public plasmaParserDocument parse(URL location, String mimeType, InputStream source)
+ public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source)
throws ParserException, InterruptedException;
/**
diff --git a/source/de/anomic/plasma/parser/bzip/bzipParser.java b/source/de/anomic/plasma/parser/bzip/bzipParser.java
index 7ce87893f..8b2020c81 100644
--- a/source/de/anomic/plasma/parser/bzip/bzipParser.java
+++ b/source/de/anomic/plasma/parser/bzip/bzipParser.java
@@ -87,7 +87,7 @@ public class bzipParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES;
}
- public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+ public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
File tempFile = null;
try {
@@ -126,7 +126,7 @@ public class bzipParser extends AbstractParser implements Parser {
// creating a new parser class to parse the unzipped content
plasmaParser theParser = new plasmaParser();
- return theParser.parseSource(location,null,tempFile);
+ return theParser.parseSource(location,null,null,tempFile);
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
throw new ParserException("Unable to parse the gzip content. " + e.getMessage());
diff --git a/source/de/anomic/plasma/parser/doc/docParser.java b/source/de/anomic/plasma/parser/doc/docParser.java
index 8cf1bb32c..4e4cd0044 100644
--- a/source/de/anomic/plasma/parser/doc/docParser.java
+++ b/source/de/anomic/plasma/parser/doc/docParser.java
@@ -78,7 +78,7 @@ implements Parser {
parserName = "Word Document Parser";
}
- public plasmaParserDocument parse(URL location, String mimeType,
+ public plasmaParserDocument parse(URL location, String mimeType, String charset,
InputStream source) throws ParserException, InterruptedException {
diff --git a/source/de/anomic/plasma/parser/gzip/gzipParser.java b/source/de/anomic/plasma/parser/gzip/gzipParser.java
index b9db9827b..abc58e26e 100644
--- a/source/de/anomic/plasma/parser/gzip/gzipParser.java
+++ b/source/de/anomic/plasma/parser/gzip/gzipParser.java
@@ -83,7 +83,7 @@ public class gzipParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES;
}
- public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+ public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
File tempFile = null;
try {
@@ -110,7 +110,7 @@ public class gzipParser extends AbstractParser implements Parser {
// creating a new parser class to parse the unzipped content
plasmaParser theParser = new plasmaParser();
- return theParser.parseSource(location,null,tempFile);
+ return theParser.parseSource(location,null,null,tempFile);
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
throw new ParserException("Unable to parse the gzip content. " + e.getMessage());
diff --git a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
index 738018dd2..f2b86124f 100644
--- a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
+++ b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
@@ -125,7 +125,7 @@ implements Parser {
return null;
}
- public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException, InterruptedException {
+ public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile) throws ParserException, InterruptedException {
String orgMimeType = mimeType;
@@ -168,7 +168,7 @@ implements Parser {
// parsing the content using the determined mimetype
plasmaParser theParser = new plasmaParser();
- return theParser.parseSource(location,mimeType,sourceFile);
+ return theParser.parseSource(location,mimeType,charset,sourceFile);
}
return null;
@@ -185,13 +185,13 @@ implements Parser {
}
}
- public plasmaParserDocument parse(URL location, String mimeType,
+ public plasmaParserDocument parse(URL location, String mimeType,String charset,
InputStream source) throws ParserException {
File dstFile = null;
try {
dstFile = File.createTempFile("mimeTypeParser",".tmp");
serverFileUtils.copy(source,dstFile);
- return parse(location,mimeType,dstFile);
+ return parse(location,mimeType,charset,dstFile);
} catch (Exception e) {
return null;
} finally {
diff --git a/source/de/anomic/plasma/parser/odt/odtParser.java b/source/de/anomic/plasma/parser/odt/odtParser.java
index 5089bf6a5..b6a530d69 100644
--- a/source/de/anomic/plasma/parser/odt/odtParser.java
+++ b/source/de/anomic/plasma/parser/odt/odtParser.java
@@ -91,7 +91,7 @@ public class odtParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES;
}
- public plasmaParserDocument parse(URL location, String mimeType, File dest) throws ParserException, InterruptedException {
+ public plasmaParserDocument parse(URL location, String mimeType, String charset, File dest) throws ParserException, InterruptedException {
try {
byte[] docContent = null;
@@ -168,7 +168,7 @@ public class odtParser extends AbstractParser implements Parser {
}
}
- public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException {
+ public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException {
File dest = null;
try {
// creating a tempfile
@@ -179,7 +179,7 @@ public class odtParser extends AbstractParser implements Parser {
serverFileUtils.copy(source, dest);
// parsing the content
- return parse(location, mimeType, dest);
+ return parse(location, mimeType, charset, dest);
} catch (Exception e) {
throw new ParserException("Unable to parse the odt document. " + e.getMessage());
} finally {
@@ -210,7 +210,7 @@ public class odtParser extends AbstractParser implements Parser {
ByteArrayInputStream input = new ByteArrayInputStream(content);
// parsing the document
- testParser.parse(contentUrl, "application/vnd.oasis.opendocument.text", input);
+ testParser.parse(contentUrl, "application/vnd.oasis.opendocument.text", null, input);
} catch (Exception e) {
e.printStackTrace();
}
diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java
index c513aee76..df3e49d1e 100644
--- a/source/de/anomic/plasma/parser/pdf/pdfParser.java
+++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java
@@ -85,7 +85,7 @@ public class pdfParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES;
}
- public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+ public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
PDDocument theDocument = null;
diff --git a/source/de/anomic/plasma/parser/rpm/rpmParser.java b/source/de/anomic/plasma/parser/rpm/rpmParser.java
index 5070007bf..a3f62dc8a 100644
--- a/source/de/anomic/plasma/parser/rpm/rpmParser.java
+++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java
@@ -91,13 +91,13 @@ public class rpmParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES;
}
- public plasmaParserDocument parse(URL location, String mimeType,
+ public plasmaParserDocument parse(URL location, String mimeType, String charset,
InputStream source) throws ParserException {
File dstFile = null;
try {
dstFile = File.createTempFile("rpmParser",".tmp");
serverFileUtils.copy(source,dstFile);
- return parse(location,mimeType,dstFile);
+ return parse(location,mimeType,charset,dstFile);
} catch (Exception e) {
return null;
} finally {
@@ -105,7 +105,7 @@ public class rpmParser extends AbstractParser implements Parser {
}
}
- public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException, InterruptedException {
+ public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile) throws ParserException, InterruptedException {
RPMFile rpmFile = null;
try {
String summary = null, description = null, name = sourceFile.getName();
@@ -177,7 +177,7 @@ public class rpmParser extends AbstractParser implements Parser {
rpmParser testParser = new rpmParser();
byte[] content = httpc.singleGET(contentUrl, contentUrl.getHost(), 10000, null, null, null);
ByteArrayInputStream input = new ByteArrayInputStream(content);
- testParser.parse(contentUrl, "application/x-rpm", input);
+ testParser.parse(contentUrl, "application/x-rpm", null, input);
} catch (Exception e) {
e.printStackTrace();
}
diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java
index d5985a533..38fdbad1e 100644
--- a/source/de/anomic/plasma/parser/rss/rssParser.java
+++ b/source/de/anomic/plasma/parser/rss/rssParser.java
@@ -100,7 +100,7 @@ public class rssParser extends AbstractParser implements Parser {
parserName = "Rich Site Summary/Atom Feed Parser";
}
- public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+ public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
try {
LinkedList feedSections = new LinkedList();
diff --git a/source/de/anomic/plasma/parser/rtf/rtfParser.java b/source/de/anomic/plasma/parser/rtf/rtfParser.java
index 9998d62fd..12b305687 100644
--- a/source/de/anomic/plasma/parser/rtf/rtfParser.java
+++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java
@@ -80,7 +80,7 @@ implements Parser {
parserName = "Rich Text Format Parser";
}
- public plasmaParserDocument parse(URL location, String mimeType,
+ public plasmaParserDocument parse(URL location, String mimeType, String charset,
InputStream source) throws ParserException, InterruptedException {
diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java
index f24f55a12..6b8871012 100644
--- a/source/de/anomic/plasma/parser/tar/tarParser.java
+++ b/source/de/anomic/plasma/parser/tar/tarParser.java
@@ -94,7 +94,7 @@ public class tarParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES;
}
- public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+ public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
try {
// creating a new parser class to parse the unzipped content
@@ -153,7 +153,7 @@ public class tarParser extends AbstractParser implements Parser {
checkInterruption();
// parsing the content
- theDoc = theParser.parseSource(new URL(tempFile),entryMime,tempFile);
+ theDoc = theParser.parseSource(new URL(tempFile),entryMime,null,tempFile);
} finally {
if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){}
}
diff --git a/source/de/anomic/plasma/parser/vcf/build.xml b/source/de/anomic/plasma/parser/vcf/build.xml
index 5270df9f2..6f1e4df85 100644
--- a/source/de/anomic/plasma/parser/vcf/build.xml
+++ b/source/de/anomic/plasma/parser/vcf/build.xml
@@ -1,55 +1,62 @@
-
- A class to parse vCard files
-
+ A class to parse vCard files
-
+
+
+
-
+
+
+
+
+
+
-
+
+
+
+
+
-
-
+
-
diff --git a/source/de/anomic/plasma/parser/vcf/vcfParser.java b/source/de/anomic/plasma/parser/vcf/vcfParser.java
index ecd16d8be..829d00441 100644
--- a/source/de/anomic/plasma/parser/vcf/vcfParser.java
+++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java
@@ -90,14 +90,14 @@ public class vcfParser extends AbstractParser implements Parser {
public vcfParser() {
super(LIBX_DEPENDENCIES);
- parserName = "vCard Parser";
+ this.parserName = "vCard Parser";
}
public Hashtable getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
- public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+ public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
try {
StringBuffer parsedTitle = new StringBuffer();
@@ -109,7 +109,9 @@ public class vcfParser extends AbstractParser implements Parser {
boolean useLastLine = false;
int lineNr = 0;
String line = null;
- BufferedReader inputReader = new BufferedReader(new InputStreamReader(source));
+ BufferedReader inputReader = (charset!=null)
+ ? new BufferedReader(new InputStreamReader(source,charset))
+ : new BufferedReader(new InputStreamReader(source));
while (true) {
// check for interruption
checkInterruption();
@@ -236,21 +238,26 @@ public class vcfParser extends AbstractParser implements Parser {
}
}
+ String[] sections = (String[]) parsedNames.toArray(new String[parsedNames.size()]);
+ byte[] text = parsedDataText.toString().getBytes();
plasmaParserDocument theDoc = new plasmaParserDocument(
- location,
- mimeType,
- null,
- null,
- parsedTitle.toString(),
- (String[]) parsedNames.toArray(new String[parsedNames.size()]),
- "vCard",
- parsedDataText.toString().getBytes(),
- anchors,
- null);
+ location, // url of the source document
+ mimeType, // the documents mime type
+ null, // a list of extracted keywords
+ null, // a short document title
+ parsedTitle.toString(), // a long document title
+ sections, // an array of section headlines
+ "vCard", // an abstract
+ text, // the parsed document text
+ anchors, // a map of extracted anchors
+ null); // a treeset of image URLs
return theDoc;
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
- throw new ParserException("Unable to parse the vcard content. " + e.getMessage());
+
+ String errorMsg = "Unable to parse the vcard content. " + e.getMessage();
+ this.theLogger.logSevere(errorMsg);
+ throw new ParserException(errorMsg);
} finally {
}
}
@@ -267,7 +274,7 @@ public class vcfParser extends AbstractParser implements Parser {
vcfParser testParser = new vcfParser();
byte[] content = httpc.singleGET(contentUrl, contentUrl.getHost(), 10000, null, null, null);
ByteArrayInputStream input = new ByteArrayInputStream(content);
- testParser.parse(contentUrl, "text/x-vcard", input);
+ testParser.parse(contentUrl, "text/x-vcard", "UTF-8",input);
} catch (Exception e) {
e.printStackTrace();
}
diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java
index f1da328c7..146f85006 100644
--- a/source/de/anomic/plasma/parser/zip/zipParser.java
+++ b/source/de/anomic/plasma/parser/zip/zipParser.java
@@ -91,7 +91,7 @@ public class zipParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES;
}
- public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+ public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
try {
StringBuffer docKeywords = new StringBuffer();
@@ -132,7 +132,7 @@ public class zipParser extends AbstractParser implements Parser {
checkInterruption();
// parsing the content
- plasmaParserDocument theDoc = theParser.parseSource(location,entryMime,ut);
+ plasmaParserDocument theDoc = theParser.parseSource(location,entryMime,null, ut);
if (theDoc == null) continue;
// merging all documents together
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index 092d12440..b97b68aa5 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -465,12 +465,12 @@ public final class plasmaParser {
} catch (Exception e) { }
}
- public plasmaParserDocument parseSource(URL location, String mimeType, byte[] source) throws InterruptedException {
+ public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] source) throws InterruptedException {
File tempFile = null;
try {
tempFile = File.createTempFile("parseSource", ".tmp");
serverFileUtils.write(source, tempFile);
- return parseSource(location, mimeType, tempFile);
+ return parseSource(location, mimeType, charset, tempFile);
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
serverLog.logSevere("PARSER", "parseSource1: " + e.getMessage(), e);
@@ -481,7 +481,7 @@ public final class plasmaParser {
}
- public plasmaParserDocument parseSource(URL location, String mimeType, File sourceFile) throws InterruptedException {
+ public plasmaParserDocument parseSource(URL location, String mimeType, String charset, File sourceFile) throws InterruptedException {
Parser theParser = null;
try {
@@ -546,10 +546,12 @@ public final class plasmaParser {
// if a parser was found we use it ...
if (theParser != null) {
- return theParser.parse(location, mimeType,sourceFile);
+ return theParser.parse(location, mimeType,charset,sourceFile);
} else if (realtimeParsableMimeTypesContains(mimeType)) {
// ...otherwise we make a scraper and transformer
htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
+                // hand the charset supplied by the caller to the scraper;
+                // it is null if the charset of the document is unknown
+                if (charset != null) {
+                    try {
+                        scraper.setCharset(charset);
+                    } catch (IllegalArgumentException ignored) {
+                        // unknown or malformed charset name: keep the scraper's
+                        // default charset instead of failing the whole parse
+                    }
+                }
+
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
serverFileUtils.copy(sourceFile, hfos);
hfos.close();
@@ -691,6 +693,7 @@ public final class plasmaParser {
File contentFile = null;
URL contentURL = null;
String contentMimeType = "application/octet-stream";
+ String charSet = "UTF-8";
if (args.length < 2) {
System.err.println("Usage: java de.anomic.plasma.plasmaParser (-f filename|-u URL) [-m mimeType]");
@@ -715,6 +718,10 @@ public final class plasmaParser {
contentMimeType = args[3];
}
+ if ((args.length == 6)&&(args[4].equalsIgnoreCase("-c"))) {
+ charSet = args[5];
+ }
+
// creating a plasma parser
plasmaParser theParser = new plasmaParser();
@@ -725,7 +732,7 @@ public final class plasmaParser {
plasmaParser.enableAllParsers(PARSER_MODE_PROXY);
// parsing the content
- plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, contentFile);
+ plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, charSet, contentFile);
// printing out all parsed sentences
if (document != null) {
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index dfc87e157..5c0a78fa2 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -389,12 +389,12 @@ public class plasmaSnippetCache {
supposedMime = plasmaParser.getMimeTypeByFileExt(filename.substring(p + 1));
}
- return this.parser.parseSource(url, supposedMime, resource);
+ return this.parser.parseSource(url, supposedMime, null, resource);
}
return null;
}
if (plasmaParser.supportedMimeTypesContains(docInfo.getMimeType())) {
- return this.parser.parseSource(url, docInfo.getMimeType(), resource);
+ return this.parser.parseSource(url, docInfo.getMimeType(), docInfo.getCharSet(), resource);
}
return null;
} catch (InterruptedException e) {
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 908c6b6e2..0a167b881 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -1398,6 +1398,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// the mimetype of this entry
String mimeType = entry.getMimeType();
+ String charset = entry.getCharSet();
// the parser logger
serverLog parserLogger = parser.getLogger();
@@ -1409,7 +1410,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
){
if ((entry.cacheFile().exists()) && (entry.cacheFile().length() > 0)) {
parserLogger.logFine("'" + entry.normalizedURLString() + "' is not parsed yet, parsing now from File");
- document = parser.parseSource(entry.url(), mimeType, entry.cacheFile());
+ document = parser.parseSource(entry.url(), mimeType, charset, entry.cacheFile());
} else {
parserLogger.logFine("'" + entry.normalizedURLString() + "' cannot be parsed, no resource available");
addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT, new bitfield(indexURL.urlFlagLength));
diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java
index ae8b7af1c..ff77eb002 100644
--- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java
+++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java
@@ -320,6 +320,11 @@ public class plasmaSwitchboardQueue {
return (info == null) ? null : info.getMimeType();
}
+    /**
+     * Returns the charset reported for the cached object.
+     * @return the value of {@link IResourceInfo#getCharSet()} for the cached
+     *         object info, or <code>null</code> if no such info is available
+     */
+    public String getCharSet() {
+        final IResourceInfo resourceInfo = this.getCachedObjectInfo();
+        if (resourceInfo == null) {
+            return null;
+        }
+        return resourceInfo.getCharSet();
+    }
+
public Date getModificationDate() {
IResourceInfo info = this.getCachedObjectInfo();
return (info == null) ? new Date() : info.getModificationDate();