*) changes needed for multi-language support

- parsers may need to know the charset of the byte stream

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2591 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago · d0a5a53789
parent 31d6cdea53
commit d0a5a53789
23 changed files with 133 additions and 65 deletions
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@ -47,6 +47,8 @@ import de.anomic.server.serverByteBuffer;
 import de.anomic.net.URL;

 import java.net.MalformedURLException;
+import java.nio.charset.Charset;
+import java.nio.charset.UnsupportedCharsetException;
 import java.text.Collator;
 import java.util.ArrayList;
 import java.util.HashMap;
@ -93,7 +95,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
    //private String headline;
    private List[] headlines;
    private serverByteBuffer content;
+    
    private URL root;
+    private String charset = "UTF-8";

    public htmlFilterContentScraper(URL root) {
        // the root value here will not be used to load the resource.
@ -109,6 +113,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
        this.content = new serverByteBuffer(1024);
    }

+    /**
+     * Remembers the name of the charset for this scraper.
+     * The name is only stored if the running JVM can resolve it.
+     *
+     * @param charset the name of the charset
+     * @throws UnsupportedCharsetException if the JVM has no support for the named charset
+     */
+    public void setCharset(String charset) throws UnsupportedCharsetException {
+        // the lookup result is not needed; the call is made only because it
+        // throws when the charset name cannot be resolved
+        Charset.forName(charset);
+
+        // the lookup succeeded, so the name can be stored
+        this.charset = charset;
+    }
+    
    public void scrapeText(byte[] newtext) {
        // System.out.println("SCRAPE: " + new String(newtext));
        if ((content.length() != 0) && (content.byteAt(content.length() - 1) != 32)) content.append(32);
--- a/source/de/anomic/plasma/cache/IResourceInfo.java
+++ b/source/de/anomic/plasma/cache/IResourceInfo.java
@ -82,6 +82,12 @@ public interface IResourceInfo {
     */
    public String getMimeType();
    
+    /**
+     * Returns the charset of the resource
+     * @return returns the name of the charset or <code>null</code> if unknown
+     */
+    public String getCharSet();
+    
    /**
     * Returns the modification date of the cached object
     * @return the modification date
--- a/source/de/anomic/plasma/cache/ftp/ResourceInfo.java
+++ b/source/de/anomic/plasma/cache/ftp/ResourceInfo.java
@ -161,4 +161,8 @@ public class ResourceInfo implements IResourceInfo {
        return responseStatus != null && responseStatus.equalsIgnoreCase("OK");
    }

+    /**
+     * Returns the charset of the resource.
+     * This implementation always reports the charset as unknown.
+     *
+     * @return always <code>null</code>
+     */
+    public String getCharSet() {
+        return null;
+    }
+
 }
--- a/source/de/anomic/plasma/cache/http/ResourceInfo.java
+++ b/source/de/anomic/plasma/cache/http/ResourceInfo.java
@ -110,6 +110,15 @@ public class ResourceInfo implements IResourceInfo {
        int pos = mimeType.indexOf(';');
        return ((pos < 0) ? mimeType : mimeType.substring(0, pos));          
    }
+    
+    /**
+     * Returns the charset named by the <code>charset</code> parameter of the
+     * value delivered by <code>responseHeader.mime()</code>
+     * (presumably of the form <code>text/html; charset=UTF-8</code>).
+     * Only the bare charset name is returned, without the leading
+     * semicolon, the parameter name or surrounding quotes.
+     *
+     * @return the name of the charset or <code>null</code> if no response header
+     *         is available or it does not carry a charset parameter
+     */
+    public String getCharSet() {
+        if (this.responseHeader == null) return null;
+        
+        String mimeType = this.responseHeader.mime();
+        if (mimeType == null) return null;
+        
+        // parameters follow the first semicolon; without one there is no charset
+        int paramStart = mimeType.indexOf(';');
+        if (paramStart < 0) return null;
+        
+        // locate "charset=" case-insensitively without depending on the default locale
+        final String key = "charset=";
+        int pos = -1;
+        for (int i = paramStart + 1; i + key.length() <= mimeType.length(); i++) {
+            if (mimeType.regionMatches(true, i, key, 0, key.length())) {
+                pos = i;
+                break;
+            }
+        }
+        if (pos < 0) return null;
+        
+        // the value ends at the next parameter separator or at the end of the string
+        String charset = mimeType.substring(pos + key.length());
+        int end = charset.indexOf(';');
+        if (end >= 0) charset = charset.substring(0, end);
+        charset = charset.trim();
+        
+        // the value may be written as a quoted string
+        if ((charset.length() >= 2) && (charset.charAt(0) == '"') && (charset.charAt(charset.length() - 1) == '"')) {
+            charset = charset.substring(1, charset.length() - 1).trim();
+        }
+        
+        // an empty value is treated like a missing charset
+        return (charset.length() == 0) ? null : charset;
+    }

    /**
     * @see de.anomic.plasma.cache.IResourceInfo#getModificationDate()
--- a/source/de/anomic/plasma/parser/AbstractParser.java
+++ b/source/de/anomic/plasma/parser/AbstractParser.java
@ -103,6 +103,7 @@ public abstract class AbstractParser implements Parser{
 	 * Parsing a document available as byte array.
     * @param location the origin of the document 
     * @param mimeType the mimetype of the document
+     * @param charset the supposed charset of the document or <code>null</code> if unknown
     * @param source the content byte array
     * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
     * and some additional metadata.
@ -113,12 +114,13 @@ public abstract class AbstractParser implements Parser{
 	public plasmaParserDocument parse(
            URL location, 
            String mimeType,
+            String charset,
            byte[] source
    ) throws ParserException, InterruptedException {
        ByteArrayInputStream contentInputStream = null;
        try {
            contentInputStream = new ByteArrayInputStream(source);
-            return this.parse(location,mimeType,contentInputStream); 
+            return this.parse(location,mimeType,charset,contentInputStream); 
        } finally {
            if (contentInputStream != null) {
                try {
@ -133,6 +135,7 @@ public abstract class AbstractParser implements Parser{
 	 * Parsing a document stored in a {@link File}
     * @param location the origin of the document 
     * @param mimeType the mimetype of the document
+     * @param charset the supposed charset of the document or <code>null</code> if unknown
     * @param sourceFile the file containing the content of the document
     * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
     * and some additional metadata.
@ -140,12 +143,16 @@ public abstract class AbstractParser implements Parser{
 	 * 
 	 * @see de.anomic.plasma.parser.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.File)
 	 */
-	public plasmaParserDocument parse(URL location, String mimeType,
-			File sourceFile) throws ParserException, InterruptedException {
+	public plasmaParserDocument parse(
+            URL location, 
+            String mimeType,
+            String charset,
+			File sourceFile
+	) throws ParserException, InterruptedException {
        BufferedInputStream contentInputStream = null;
        try {
            contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
-            return this.parse(location, mimeType, contentInputStream);
+            return this.parse(location, mimeType, charset, contentInputStream);
        } catch (FileNotFoundException e) {
            throw new ParserException(e.getMessage());
        } finally {
@ -157,6 +164,7 @@ public abstract class AbstractParser implements Parser{
     * Parsing a document available as {@link InputStream}
     * @param location the origin of the document 
     * @param mimeType the mimetype of the document
+     * @param charset the supposed charset of the document or <code>null</code> if unknown
     * @param source the {@link InputStream} containing the document content
     * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
     * and some additional metadata.
@ -164,8 +172,7 @@ public abstract class AbstractParser implements Parser{
     * 
     * @see de.anomic.plasma.parser.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.InputStream)
     */
-    public abstract plasmaParserDocument parse(URL location, String mimeType,
-			InputStream source) throws ParserException, InterruptedException;
+    public abstract plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException;

    /**
     * @return Returns a list of library names that are needed by this parser
--- a/source/de/anomic/plasma/parser/Parser.java
+++ b/source/de/anomic/plasma/parser/Parser.java
@ -64,39 +64,42 @@ public interface Parser {
     * Parsing a document available as byte array
     * @param location the origin of the document 
     * @param mimeType the mimetype of the document
+     * @param charset the supposed charset of the document or <code>null</code> if unknown
     * @param source the content byte array
     * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
     * and some additional metadata.
     *  
     * @throws ParserException if the content could not be parsed properly 
     */
-    public plasmaParserDocument parse(URL location, String mimeType, byte[] source)
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, byte[] source)
    throws ParserException, InterruptedException;
    
    /**
     * Parsing a document stored in a {@link File}
     * @param location the origin of the document 
     * @param mimeType the mimetype of the document
+     * @param charset the supposed charset of the document or <code>null</code> if unknown 
     * @param sourceFile the file containing the content of the document
     * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
     * and some additional metadata.
     *  
     * @throws ParserException if the content could not be parsed properly 
     */    
-    public plasmaParserDocument parse(URL location, String mimeType, File sourceFile)
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile)
    throws ParserException, InterruptedException;
    
    /**
     * Parsing a document available as {@link InputStream}
     * @param location the origin of the document 
     * @param mimeType the mimetype of the document
+     * @param charset the supposed charset of the document or <code>null</code> if unknown 
     * @param source the {@link InputStream} containing the document content
     * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
     * and some additional metadata.
     *  
     * @throws ParserException if the content could not be parsed properly 
     */    
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) 
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) 
    throws ParserException, InterruptedException;
            
    /**
--- a/source/de/anomic/plasma/parser/bzip/bzipParser.java
+++ b/source/de/anomic/plasma/parser/bzip/bzipParser.java
@ -87,7 +87,7 @@ public class bzipParser extends AbstractParser implements Parser {
        return SUPPORTED_MIME_TYPES;
    }
    
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
        
        File tempFile = null;
        try {           
@ -126,7 +126,7 @@ public class bzipParser extends AbstractParser implements Parser {
            
            // creating a new parser class to parse the unzipped content
            plasmaParser theParser = new plasmaParser();
-            return theParser.parseSource(location,null,tempFile);
+            return theParser.parseSource(location,null,null,tempFile);
        } catch (Exception e) {  
            if (e instanceof InterruptedException) throw (InterruptedException) e;
            throw new ParserException("Unable to parse the gzip content. " + e.getMessage());
--- a/source/de/anomic/plasma/parser/doc/docParser.java
+++ b/source/de/anomic/plasma/parser/doc/docParser.java
@ -78,7 +78,7 @@ implements Parser {
        parserName = "Word Document Parser";
 	}

-	public plasmaParserDocument parse(URL location, String mimeType,
+	public plasmaParserDocument parse(URL location, String mimeType, String charset,
 			InputStream source) throws ParserException, InterruptedException {

        
--- a/source/de/anomic/plasma/parser/gzip/gzipParser.java
+++ b/source/de/anomic/plasma/parser/gzip/gzipParser.java
@ -83,7 +83,7 @@ public class gzipParser extends AbstractParser implements Parser {
        return SUPPORTED_MIME_TYPES;
    }
    
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
        
        File tempFile = null;
        try {           
@ -110,7 +110,7 @@ public class gzipParser extends AbstractParser implements Parser {
            
            // creating a new parser class to parse the unzipped content
            plasmaParser theParser = new plasmaParser();
-            return theParser.parseSource(location,null,tempFile);
+            return theParser.parseSource(location,null,null,tempFile);
        } catch (Exception e) {    
            if (e instanceof InterruptedException) throw (InterruptedException) e;
            throw new ParserException("Unable to parse the gzip content. " + e.getMessage());
--- a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
+++ b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
@ -125,7 +125,7 @@ implements Parser {
        return null;        
    }
    
-    public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile) throws ParserException, InterruptedException {
        
        String orgMimeType = mimeType;
        
@ -168,7 +168,7 @@ implements Parser {
                
                // parsing the content using the determined mimetype
                plasmaParser theParser = new plasmaParser();
-                return theParser.parseSource(location,mimeType,sourceFile);
+                return theParser.parseSource(location,mimeType,charset,sourceFile);
            }
            return null;
            
@ -185,13 +185,13 @@ implements Parser {
        }
    }
    
-    public plasmaParserDocument parse(URL location, String mimeType,
+    public plasmaParserDocument parse(URL location, String mimeType,String charset,
            InputStream source) throws ParserException {
        File dstFile = null;
        try {
            dstFile = File.createTempFile("mimeTypeParser",".tmp");
            serverFileUtils.copy(source,dstFile);
-            return parse(location,mimeType,dstFile);
+            return parse(location,mimeType,charset,dstFile);
        } catch (Exception e) {            
            return null;
        } finally {
--- a/source/de/anomic/plasma/parser/odt/odtParser.java
+++ b/source/de/anomic/plasma/parser/odt/odtParser.java
@ -91,7 +91,7 @@ public class odtParser extends AbstractParser implements Parser {
        return SUPPORTED_MIME_TYPES;
    }
    
-    public plasmaParserDocument parse(URL location, String mimeType, File dest) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, File dest) throws ParserException, InterruptedException {
        
        try {          
            byte[] docContent     = null;
@ -168,7 +168,7 @@ public class odtParser extends AbstractParser implements Parser {
        }
    }
    
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException {
        File dest = null;
        try {
            // creating a tempfile
@ -179,7 +179,7 @@ public class odtParser extends AbstractParser implements Parser {
            serverFileUtils.copy(source, dest);
            
            // parsing the content
-            return parse(location, mimeType, dest);
+            return parse(location, mimeType, charset, dest);
        } catch (Exception e) {
            throw new ParserException("Unable to parse the odt document. " + e.getMessage());
        } finally {
@ -210,7 +210,7 @@ public class odtParser extends AbstractParser implements Parser {
            ByteArrayInputStream input = new ByteArrayInputStream(content);
            
            // parsing the document
-            testParser.parse(contentUrl, "application/vnd.oasis.opendocument.text", input);            
+            testParser.parse(contentUrl, "application/vnd.oasis.opendocument.text", null, input);            
        } catch (Exception e) {
            e.printStackTrace();
        }
--- a/source/de/anomic/plasma/parser/pdf/pdfParser.java
+++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java
@ -85,7 +85,7 @@ public class pdfParser extends AbstractParser implements Parser {
        return SUPPORTED_MIME_TYPES;
    }
    
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
        
        
        PDDocument theDocument = null;
--- a/source/de/anomic/plasma/parser/rpm/rpmParser.java
+++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java
@ -91,13 +91,13 @@ public class rpmParser extends AbstractParser implements Parser {
        return SUPPORTED_MIME_TYPES;
    }
    
-    public plasmaParserDocument parse(URL location, String mimeType,
+    public plasmaParserDocument parse(URL location, String mimeType, String charset,
            InputStream source) throws ParserException {
        File dstFile = null;
        try {
            dstFile = File.createTempFile("rpmParser",".tmp");
            serverFileUtils.copy(source,dstFile);
-            return parse(location,mimeType,dstFile);
+            return parse(location,mimeType,charset,dstFile);
        } catch (Exception e) {            
            return null;
        } finally {
@ -105,7 +105,7 @@ public class rpmParser extends AbstractParser implements Parser {
        }        
    }    
    
-    public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile) throws ParserException, InterruptedException {
        RPMFile rpmFile = null;        
        try {
            String summary = null, description = null, name = sourceFile.getName();
@ -177,7 +177,7 @@ public class rpmParser extends AbstractParser implements Parser {
            rpmParser testParser = new rpmParser();
            byte[] content = httpc.singleGET(contentUrl, contentUrl.getHost(), 10000, null, null, null);
            ByteArrayInputStream input = new ByteArrayInputStream(content);
-            testParser.parse(contentUrl, "application/x-rpm", input);
+            testParser.parse(contentUrl, "application/x-rpm", null, input);
        } catch (Exception e) {
            e.printStackTrace();
        }
--- a/source/de/anomic/plasma/parser/rss/rssParser.java
+++ b/source/de/anomic/plasma/parser/rss/rssParser.java
@ -100,7 +100,7 @@ public class rssParser extends AbstractParser implements Parser {
        parserName = "Rich Site Summary/Atom Feed Parser"; 
 	}

-	public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+	public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {

        try {
            LinkedList feedSections = new LinkedList();
--- a/source/de/anomic/plasma/parser/rtf/rtfParser.java
+++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java
@ -80,7 +80,7 @@ implements Parser {
        parserName = "Rich Text Format Parser";  
 	}

-	public plasmaParserDocument parse(URL location, String mimeType,
+	public plasmaParserDocument parse(URL location, String mimeType, String charset,
 			InputStream source) throws ParserException, InterruptedException {

        
--- a/source/de/anomic/plasma/parser/tar/tarParser.java
+++ b/source/de/anomic/plasma/parser/tar/tarParser.java
@ -94,7 +94,7 @@ public class tarParser extends AbstractParser implements Parser {
        return SUPPORTED_MIME_TYPES;
    }
    
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
        
        try {           
            // creating a new parser class to parse the unzipped content
@ -153,7 +153,7 @@ public class tarParser extends AbstractParser implements Parser {
                    checkInterruption();
                    
                    // parsing the content                    
-                    theDoc = theParser.parseSource(new URL(tempFile),entryMime,tempFile);
+                    theDoc = theParser.parseSource(new URL(tempFile),entryMime,null,tempFile);
                } finally {
                    if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){}
                }
--- a/source/de/anomic/plasma/parser/vcf/build.xml
+++ b/source/de/anomic/plasma/parser/vcf/build.xml
@ -1,55 +1,62 @@
 <?xml version="1.0"?>
 <project name="YACY - vcfParser" default="dist">
-    <description>
-            A class to parse vCard files
-    </description>
+    <description>A class to parse vCard files</description>

    <property name="parserShortName" value="vcf"/>
 	<property name="parserVersion" value="0.1"/>
-
    <property name="parserLongName" value="yacyContentParser_${parserShortName}"/>    	
   	<property name="parserArchive" location="${release}/${parserLongName}_${parserVersion}.tgz"/>	    	
    	
+	<!-- compile the sources of this parser -->
    <target name="compile">
  	  <javac srcdir="${src}/de/anomic/plasma/parser/${parserShortName}" destdir="${build}" source="${javacSource}" target="${javacTarget}" debug="true" debuglevel="lines,vars,source">
  	  	<classpath>
  	  	  <pathelement location="${build}" />	
+  	  	  
+  	  	  <!-- libraries needed by this parser -->
    	  <pathelement location="${libx}/commons-codec-1.3.jar" />
  	  	</classpath>
  	  </javac>    	
    </target>
 	
-
+	<!-- add all parts of this parser into a single zip file -->
    <target name="zip" depends="compile">
  	  <tar destfile="${parserArchive}" compression="gzip">
+  	  	<!-- needed libraries -->
  	  	<tarfileset dir="${libx}" 
  	  				includes="commons-codec-1.3.*" 
  	  				prefix="${releaseFileParentDir}/libx/"
 			  	  	dirmode="755" mode="644"/>  	  	
+		<!-- source files of this parser -->
  	  	<tarfileset dir="${src}/de/anomic/plasma/parser/${parserShortName}" 
  	  				prefix="${releaseFileParentDir}/source/de/anomic/plasma/parser/${parserShortName}"
 			  	  	dirmode="755" mode="644"/>
+  	  	<!-- class files of this parser -->
  	  	<tarfileset dir="${build}/de/anomic/plasma/parser/${parserShortName}" 
  	  				prefix="${releaseFileParentDir}/classes/de/anomic/plasma/parser/${parserShortName}"
 			  	  	dirmode="755" mode="644"/>	  	
  	  </tar>    	
    </target>	

+	<!-- just copy all parts of this parser into the release directory -->
    <target name="copy" depends="compile">
+		<!-- copy needed libs -->
        <copy todir="${release}/libx/">
             <fileset dir="${libx}" includes="commons-codec-1.3.*"/> 
-        </copy>    	
+        </copy>  
+	
+		<!-- copy source code files -->
        <copy todir="${release}/source/de/anomic/plasma/parser/${parserShortName}">
             <fileset dir="${src}/de/anomic/plasma/parser/${parserShortName}" includes="**/*"/> 
        </copy>        
+	
+		<!-- copy compiled classes -->
        <copy todir="${release}/classes/de/anomic/plasma/parser/${parserShortName}">
             <fileset dir="${build}/de/anomic/plasma/parser/${parserShortName}" includes="**/*"/> 
        </copy>         
    </target> 
- 
-    
+     
    <target name="dist" depends="compile,zip" description="Compile and zip the parser"/>        
 	
-	
 </project>

--- a/source/de/anomic/plasma/parser/vcf/vcfParser.java
+++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java
@ -90,14 +90,14 @@ public class vcfParser extends AbstractParser implements Parser {
    
    public vcfParser() {        
        super(LIBX_DEPENDENCIES);
-        parserName = "vCard Parser"; 
+        this.parserName = "vCard Parser"; 
    }
    
    public Hashtable getSupportedMimeTypes() {
        return SUPPORTED_MIME_TYPES;
    }
    
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
        
        try {
            StringBuffer parsedTitle = new StringBuffer();
@ -109,7 +109,9 @@ public class vcfParser extends AbstractParser implements Parser {
            boolean useLastLine = false;
            int lineNr = 0;
            String line = null;            
-            BufferedReader inputReader = new BufferedReader(new InputStreamReader(source));
+            BufferedReader inputReader = (charset!=null)
+                                       ? new BufferedReader(new InputStreamReader(source,charset))
+                                       : new BufferedReader(new InputStreamReader(source));
            while (true) {
                // check for interruption
                checkInterruption();
@ -236,21 +238,26 @@ public class vcfParser extends AbstractParser implements Parser {
                }
            }

+            String[] sections = (String[]) parsedNames.toArray(new String[parsedNames.size()]);
+            byte[] text = parsedDataText.toString().getBytes();
            plasmaParserDocument theDoc = new plasmaParserDocument(
-                    location,
-                    mimeType,
-                    null,
-                    null,
-                    parsedTitle.toString(),
-                    (String[]) parsedNames.toArray(new String[parsedNames.size()]),
-                    "vCard",
-                    parsedDataText.toString().getBytes(),
-                    anchors,
-                    null);    
+                    location,                   // url of the source document
+                    mimeType,                   // the documents mime type
+                    null,                       // a list of extracted keywords
+                    null,                       // a short document title
+                    parsedTitle.toString(),     // a long document title
+                    sections,                   // an array of section headlines
+                    "vCard",                    // an abstract
+                    text,                       // the parsed document text
+                    anchors,                    // a map of extracted anchors
+                    null);                      // a treeset of image URLs
            return theDoc;
        } catch (Exception e) { 
            if (e instanceof InterruptedException) throw (InterruptedException) e;
-            throw new ParserException("Unable to parse the vcard content. " + e.getMessage());
+            
+            String errorMsg = "Unable to parse the vcard content. " + e.getMessage();
+            this.theLogger.logSevere(errorMsg);            
+            throw new ParserException(errorMsg);
        } finally {
        }
    }
@ -267,7 +274,7 @@ public class vcfParser extends AbstractParser implements Parser {
            vcfParser testParser = new vcfParser();
            byte[] content = httpc.singleGET(contentUrl, contentUrl.getHost(), 10000, null, null, null);
            ByteArrayInputStream input = new ByteArrayInputStream(content);
-            testParser.parse(contentUrl, "text/x-vcard", input);
+            testParser.parse(contentUrl, "text/x-vcard", "UTF-8",input);
        } catch (Exception e) {
            e.printStackTrace();
        }
--- a/source/de/anomic/plasma/parser/zip/zipParser.java
+++ b/source/de/anomic/plasma/parser/zip/zipParser.java
@ -91,7 +91,7 @@ public class zipParser extends AbstractParser implements Parser {
        return SUPPORTED_MIME_TYPES;
    }
    
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
        
        try {           
            StringBuffer docKeywords = new StringBuffer();
@ -132,7 +132,7 @@ public class zipParser extends AbstractParser implements Parser {
                checkInterruption();
                
                // parsing the content
-                plasmaParserDocument theDoc = theParser.parseSource(location,entryMime,ut);
+                plasmaParserDocument theDoc = theParser.parseSource(location,entryMime,null, ut);
                if (theDoc == null) continue;
                
                // merging all documents together
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@ -465,12 +465,12 @@ public final class plasmaParser {
        } catch (Exception e) { }
    }    
    
-    public plasmaParserDocument parseSource(URL location, String mimeType, byte[] source) throws InterruptedException {
+    public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] source) throws InterruptedException {
        File tempFile = null;
        try {
            tempFile = File.createTempFile("parseSource", ".tmp");
            serverFileUtils.write(source, tempFile);
-            return parseSource(location, mimeType, tempFile);
+            return parseSource(location, mimeType, charset, tempFile);
        } catch (Exception e) {
            if (e instanceof InterruptedException) throw (InterruptedException) e;
            serverLog.logSevere("PARSER", "parseSource1: " + e.getMessage(), e);
@ -481,7 +481,7 @@ public final class plasmaParser {
        
    }

-    public plasmaParserDocument parseSource(URL location, String mimeType, File sourceFile) throws InterruptedException {
+    public plasmaParserDocument parseSource(URL location, String mimeType, String charset, File sourceFile) throws InterruptedException {

        Parser theParser = null;
        try {
@ -546,10 +546,12 @@ public final class plasmaParser {
            
            // if a parser was found we use it ...
            if (theParser != null) {
-                return theParser.parse(location, mimeType,sourceFile);
+                return theParser.parse(location, mimeType,charset,sourceFile);
            } else if (realtimeParsableMimeTypesContains(mimeType)) {                      
                // ...otherwise we make a scraper and transformer
                htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
+                // hand the charset passed to parseSource on to the scraper;
+                // callers pass null when the charset is unknown, in which case
+                // setCharset is not called at all
+                if (charset != null) scraper.setCharset(charset);
+                
                OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);            
                serverFileUtils.copy(sourceFile, hfos);
                hfos.close();
@ -691,6 +693,7 @@ public final class plasmaParser {
            File contentFile = null;
            URL contentURL = null;
            String contentMimeType = "application/octet-stream";
+            String charSet = "UTF-8";
            
            if (args.length < 2) {
                System.err.println("Usage: java de.anomic.plasma.plasmaParser (-f filename|-u URL) [-m mimeType]");
@ -715,6 +718,10 @@ public final class plasmaParser {
                contentMimeType = args[3];
            }
            
+            if ((args.length == 6)&&(args[4].equalsIgnoreCase("-c"))) {
+                charSet = args[5];
+            }            
+            
            // creating a plasma parser
            plasmaParser theParser = new plasmaParser();
            
@ -725,7 +732,7 @@ public final class plasmaParser {
            plasmaParser.enableAllParsers(PARSER_MODE_PROXY);

            // parsing the content
-            plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, contentFile);
+            plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, charSet, contentFile);

            // printing out all parsed sentences
            if (document != null) {
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@ -389,12 +389,12 @@ public class plasmaSnippetCache {
                        supposedMime = plasmaParser.getMimeTypeByFileExt(filename.substring(p + 1));
                    }

-                    return this.parser.parseSource(url, supposedMime, resource);
+                    return this.parser.parseSource(url, supposedMime, null, resource);
                }
                return null;
            }
            if (plasmaParser.supportedMimeTypesContains(docInfo.getMimeType())) {
-                return this.parser.parseSource(url, docInfo.getMimeType(), resource);
+                return this.parser.parseSource(url, docInfo.getMimeType(), docInfo.getCharSet(), resource);
            }
            return null;
        } catch (InterruptedException e) {
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@ -1398,6 +1398,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser

        // the mimetype of this entry
        String mimeType = entry.getMimeType();
+        String charset = entry.getCharSet();        

        // the parser logger
        serverLog parserLogger = parser.getLogger();
@ -1409,7 +1410,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
        ){
            if ((entry.cacheFile().exists()) && (entry.cacheFile().length() > 0)) {
                parserLogger.logFine("'" + entry.normalizedURLString() + "' is not parsed yet, parsing now from File");
-                document = parser.parseSource(entry.url(), mimeType, entry.cacheFile());
+                document = parser.parseSource(entry.url(), mimeType, charset, entry.cacheFile());
            } else {
                parserLogger.logFine("'" + entry.normalizedURLString() + "' cannot be parsed, no resource available");
                addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT, new bitfield(indexURL.urlFlagLength));
--- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java
+++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java
@ -320,6 +320,11 @@ public class plasmaSwitchboardQueue {
            return (info == null) ? null : info.getMimeType();
        }
        
+        /**
+         * Returns the charset reported by the cached object info of this entry.
+         *
+         * @return the charset name, or <code>null</code> if no cached object info
+         *         is available or the info itself reports <code>null</code>
+         */
+        public String getCharSet() {
+            final IResourceInfo cachedInfo = this.getCachedObjectInfo();
+            if (cachedInfo == null) {
+                // no info about the cached object, so the charset is unknown
+                return null;
+            }
+            return cachedInfo.getCharSet();
+        }
+        
        public Date getModificationDate() {
            IResourceInfo info = this.getCachedObjectInfo();
            return (info == null) ? new Date() : info.getModificationDate();