From d0a5a53789f57dad0ea199f327e868cb426fb1d9 Mon Sep 17 00:00:00 2001
From: theli <theli@6c8d7289-2bf4-0310-a012-ef5d649a1542>
Date: Fri, 15 Sep 2006 12:52:46 +0000
Subject: [PATCH] *) changes needed for multi-language support    - parsers may
 need to know the charset of the byte stream

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2591 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 .../htmlFilter/htmlFilterContentScraper.java  | 12 ++++++
 .../de/anomic/plasma/cache/IResourceInfo.java |  6 +++
 .../anomic/plasma/cache/ftp/ResourceInfo.java |  4 ++
 .../plasma/cache/http/ResourceInfo.java       |  9 +++++
 .../anomic/plasma/parser/AbstractParser.java  | 19 +++++++---
 source/de/anomic/plasma/parser/Parser.java    |  9 +++--
 .../anomic/plasma/parser/bzip/bzipParser.java |  4 +-
 .../anomic/plasma/parser/doc/docParser.java   |  2 +-
 .../anomic/plasma/parser/gzip/gzipParser.java |  4 +-
 .../parser/mimeType/mimeTypeParser.java       |  8 ++--
 .../anomic/plasma/parser/odt/odtParser.java   |  8 ++--
 .../anomic/plasma/parser/pdf/pdfParser.java   |  2 +-
 .../anomic/plasma/parser/rpm/rpmParser.java   |  8 ++--
 .../anomic/plasma/parser/rss/rssParser.java   |  2 +-
 .../anomic/plasma/parser/rtf/rtfParser.java   |  2 +-
 .../anomic/plasma/parser/tar/tarParser.java   |  4 +-
 source/de/anomic/plasma/parser/vcf/build.xml  | 25 ++++++++-----
 .../anomic/plasma/parser/vcf/vcfParser.java   | 37 +++++++++++--------
 .../anomic/plasma/parser/zip/zipParser.java   |  4 +-
 source/de/anomic/plasma/plasmaParser.java     | 17 ++++++---
 .../de/anomic/plasma/plasmaSnippetCache.java  |  4 +-
 .../de/anomic/plasma/plasmaSwitchboard.java   |  3 +-
 .../anomic/plasma/plasmaSwitchboardQueue.java |  5 +++
 23 files changed, 133 insertions(+), 65 deletions(-)
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index 01a82ddb5..9fb48ffd2 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -47,6 +47,8 @@ import de.anomic.server.serverByteBuffer;
 import de.anomic.net.URL;
 
 import java.net.MalformedURLException;
+import java.nio.charset.Charset;
+import java.nio.charset.UnsupportedCharsetException;
 import java.text.Collator;
 import java.util.ArrayList;
 import java.util.HashMap;
@@ -93,7 +95,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
     //private String headline;
     private List[] headlines;
     private serverByteBuffer content;
+    
     private URL root;
+    private String charset = "UTF-8";
 
     public htmlFilterContentScraper(URL root) {
         // the root value here will not be used to load the resource.
@@ -109,6 +113,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         this.content = new serverByteBuffer(1024);
     }
 
+    /** Validates the given charset name and, if it is accepted, remembers it in this scraper. */
+    public void setCharset(String charset) throws UnsupportedCharsetException {
+        // Charset.forName throws if the name cannot be resolved, so the
+        // assignment below is only reached for a charset that exists
+        Charset.forName(charset);
+        this.charset = charset;
+    }
+    
     public void scrapeText(byte[] newtext) {
         // System.out.println("SCRAPE: " + new String(newtext));
         if ((content.length() != 0) && (content.byteAt(content.length() - 1) != 32)) content.append(32);
diff --git a/source/de/anomic/plasma/cache/IResourceInfo.java b/source/de/anomic/plasma/cache/IResourceInfo.java
index 8cabfe366..81cfc2491 100644
--- a/source/de/anomic/plasma/cache/IResourceInfo.java
+++ b/source/de/anomic/plasma/cache/IResourceInfo.java
@@ -82,6 +82,12 @@ public interface IResourceInfo {
      */
     public String getMimeType();
     
+    /**
+     * Returns the charset of the resource
+     * @return returns the name of the charset or <code>null</code> if unknown
+     */
+    public String getCharSet();
+    
     /**
      * Returns the modification date of the cached object
      * @return the modifiaction date
diff --git a/source/de/anomic/plasma/cache/ftp/ResourceInfo.java b/source/de/anomic/plasma/cache/ftp/ResourceInfo.java
index 9d04e6646..dfbc32ccd 100644
--- a/source/de/anomic/plasma/cache/ftp/ResourceInfo.java
+++ b/source/de/anomic/plasma/cache/ftp/ResourceInfo.java
@@ -161,4 +161,8 @@ public class ResourceInfo implements IResourceInfo {
         return responseStatus != null && responseStatus.equalsIgnoreCase("OK");
     }
 
+    /** Always returns <code>null</code>: this implementation does not determine a charset. */
+    public String getCharSet() {
+        return null;
+    }
 }
diff --git a/source/de/anomic/plasma/cache/http/ResourceInfo.java b/source/de/anomic/plasma/cache/http/ResourceInfo.java
index 396304572..ec3768fd1 100644
--- a/source/de/anomic/plasma/cache/http/ResourceInfo.java
+++ b/source/de/anomic/plasma/cache/http/ResourceInfo.java
@@ -110,6 +110,15 @@ public class ResourceInfo implements IResourceInfo {
         int pos = mimeType.indexOf(';');
         return ((pos < 0) ? mimeType : mimeType.substring(0, pos));          
     }
+    /** @return the value of the charset parameter of the content type, or <code>null</code> if there is none */
+    public String getCharSet() {
+        String mimeType = (this.responseHeader == null) ? null : this.responseHeader.mime();
+        int pos = (mimeType == null) ? -1 : mimeType.toLowerCase(java.util.Locale.ENGLISH).indexOf("charset=");
+        if (pos < 0) return null;
+        int end = mimeType.indexOf(';', pos); // the value ends at the next parameter or at the end of the header
+        String name = mimeType.substring(pos + 8, (end < 0) ? mimeType.length() : end).replace('"', ' ').trim();
+        return (name.length() == 0) ? null : name; // only the bare name, e.g. "ISO-8859-1", never "; charset=..."
+    }
 
     /**
      * @see de.anomic.plasma.cache.IResourceInfo#getModificationDate()
diff --git a/source/de/anomic/plasma/parser/AbstractParser.java b/source/de/anomic/plasma/parser/AbstractParser.java
index e8632ce93..c69c60496 100644
--- a/source/de/anomic/plasma/parser/AbstractParser.java
+++ b/source/de/anomic/plasma/parser/AbstractParser.java
@@ -103,6 +103,7 @@ public abstract class AbstractParser implements Parser{
 	 * Parsing a document available as byte array.
      * @param location the origin of the document 
      * @param mimeType the mimetype of the document
+     * @param charset the supposed charset of the document or <code>null</code> if unknown
      * @param source the content byte array
      * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
      * and some additional metadata.
@@ -113,12 +114,13 @@ public abstract class AbstractParser implements Parser{
 	public plasmaParserDocument parse(
             URL location, 
             String mimeType,
+            String charset,
             byte[] source
     ) throws ParserException, InterruptedException {
         ByteArrayInputStream contentInputStream = null;
         try {
             contentInputStream = new ByteArrayInputStream(source);
-            return this.parse(location,mimeType,contentInputStream); 
+            return this.parse(location,mimeType,charset,contentInputStream); 
         } finally {
             if (contentInputStream != null) {
                 try {
@@ -133,6 +135,7 @@ public abstract class AbstractParser implements Parser{
 	 * Parsing a document stored in a {@link File}
      * @param location the origin of the document 
      * @param mimeType the mimetype of the document
+     * @param charset the supposed charset of the document or <code>null</code> if unknown
      * @param sourceFile the file containing the content of the document
      * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
      * and some additional metadata.
@@ -140,12 +143,16 @@ public abstract class AbstractParser implements Parser{
 	 * 
 	 * @see de.anomic.plasma.parser.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.File)
 	 */
-	public plasmaParserDocument parse(URL location, String mimeType,
-			File sourceFile) throws ParserException, InterruptedException {
+	public plasmaParserDocument parse(
+            URL location, 
+            String mimeType,
+            String charset,
+			File sourceFile
+	) throws ParserException, InterruptedException {
         BufferedInputStream contentInputStream = null;
         try {
             contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
-            return this.parse(location, mimeType, contentInputStream);
+            return this.parse(location, mimeType, charset, contentInputStream);
         } catch (FileNotFoundException e) {
             throw new ParserException(e.getMessage());
         } finally {
@@ -157,6 +164,7 @@ public abstract class AbstractParser implements Parser{
      * Parsing a document available as {@link InputStream}
      * @param location the origin of the document 
      * @param mimeType the mimetype of the document
+     * @param charset the supposed charset of the document or <code>null</code> if unknown
      * @param source the {@link InputStream} containing the document content
      * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
      * and some additional metadata.
@@ -164,8 +172,7 @@ public abstract class AbstractParser implements Parser{
      * 
      * @see de.anomic.plasma.parser.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.InputStream)
      */
-    public abstract plasmaParserDocument parse(URL location, String mimeType,
-			InputStream source) throws ParserException, InterruptedException;
+    public abstract plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException;
 
     /**
      * @return Returns a list of library names that are needed by this parser
diff --git a/source/de/anomic/plasma/parser/Parser.java b/source/de/anomic/plasma/parser/Parser.java
index c44b1d84c..6ffa7662e 100644
--- a/source/de/anomic/plasma/parser/Parser.java
+++ b/source/de/anomic/plasma/parser/Parser.java
@@ -64,39 +64,42 @@ public interface Parser {
      * Parsing a document available as byte array
      * @param location the origin of the document 
      * @param mimeType the mimetype of the document
+     * @param charset the supposed charset of the document or <code>null</code> if unknown
      * @param source the content byte array
      * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
      * and some additional metadata.
      *  
      * @throws ParserException if the content could not be parsed properly 
      */
-    public plasmaParserDocument parse(URL location, String mimeType, byte[] source)
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, byte[] source)
     throws ParserException, InterruptedException;
     
     /**
      * Parsing a document stored in a {@link File}
      * @param location the origin of the document 
      * @param mimeType the mimetype of the document
+     * @param charset the supposed charset of the document or <code>null</code> if unknown 
      * @param sourceFile the file containing the content of the document
      * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
      * and some additional metadata.
      *  
      * @throws ParserException if the content could not be parsed properly 
      */    
-    public plasmaParserDocument parse(URL location, String mimeType, File sourceFile)
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile)
     throws ParserException, InterruptedException;
     
     /**
      * Parsing a document available as {@link InputStream}
      * @param location the origin of the document 
      * @param mimeType the mimetype of the document
+     * @param charset the supposed charset of the document or <code>null</code> if unknown 
      * @param source the {@link InputStream} containing the document content
      * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
      * and some additional metadata.
      *  
      * @throws ParserException if the content could not be parsed properly 
      */    
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) 
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) 
     throws ParserException, InterruptedException;
             
     /**
diff --git a/source/de/anomic/plasma/parser/bzip/bzipParser.java b/source/de/anomic/plasma/parser/bzip/bzipParser.java
index 7ce87893f..8b2020c81 100644
--- a/source/de/anomic/plasma/parser/bzip/bzipParser.java
+++ b/source/de/anomic/plasma/parser/bzip/bzipParser.java
@@ -87,7 +87,7 @@ public class bzipParser extends AbstractParser implements Parser {
         return SUPPORTED_MIME_TYPES;
     }
     
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
         
         File tempFile = null;
         try {           
@@ -126,7 +126,7 @@ public class bzipParser extends AbstractParser implements Parser {
             
             // creating a new parser class to parse the unzipped content
             plasmaParser theParser = new plasmaParser();
-            return theParser.parseSource(location,null,tempFile);
+            return theParser.parseSource(location,null,null,tempFile);
         } catch (Exception e) {  
             if (e instanceof InterruptedException) throw (InterruptedException) e;
             throw new ParserException("Unable to parse the gzip content. " + e.getMessage());
diff --git a/source/de/anomic/plasma/parser/doc/docParser.java b/source/de/anomic/plasma/parser/doc/docParser.java
index 8cf1bb32c..4e4cd0044 100644
--- a/source/de/anomic/plasma/parser/doc/docParser.java
+++ b/source/de/anomic/plasma/parser/doc/docParser.java
@@ -78,7 +78,7 @@ implements Parser {
         parserName = "Word Document Parser";
 	}
 
-	public plasmaParserDocument parse(URL location, String mimeType,
+	public plasmaParserDocument parse(URL location, String mimeType, String charset,
 			InputStream source) throws ParserException, InterruptedException {
 
         
diff --git a/source/de/anomic/plasma/parser/gzip/gzipParser.java b/source/de/anomic/plasma/parser/gzip/gzipParser.java
index b9db9827b..abc58e26e 100644
--- a/source/de/anomic/plasma/parser/gzip/gzipParser.java
+++ b/source/de/anomic/plasma/parser/gzip/gzipParser.java
@@ -83,7 +83,7 @@ public class gzipParser extends AbstractParser implements Parser {
         return SUPPORTED_MIME_TYPES;
     }
     
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
         
         File tempFile = null;
         try {           
@@ -110,7 +110,7 @@ public class gzipParser extends AbstractParser implements Parser {
             
             // creating a new parser class to parse the unzipped content
             plasmaParser theParser = new plasmaParser();
-            return theParser.parseSource(location,null,tempFile);
+            return theParser.parseSource(location,null,null,tempFile);
         } catch (Exception e) {    
             if (e instanceof InterruptedException) throw (InterruptedException) e;
             throw new ParserException("Unable to parse the gzip content. " + e.getMessage());
diff --git a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
index 738018dd2..f2b86124f 100644
--- a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
+++ b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
@@ -125,7 +125,7 @@ implements Parser {
         return null;        
     }
     
-    public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile) throws ParserException, InterruptedException {
         
         String orgMimeType = mimeType;
         
@@ -168,7 +168,7 @@ implements Parser {
                 
                 // parsing the content using the determined mimetype
                 plasmaParser theParser = new plasmaParser();
-                return theParser.parseSource(location,mimeType,sourceFile);
+                return theParser.parseSource(location,mimeType,charset,sourceFile);
             }
             return null;
             
@@ -185,13 +185,13 @@ implements Parser {
         }
     }
     
-    public plasmaParserDocument parse(URL location, String mimeType,
+    public plasmaParserDocument parse(URL location, String mimeType,String charset,
             InputStream source) throws ParserException {
         File dstFile = null;
         try {
             dstFile = File.createTempFile("mimeTypeParser",".tmp");
             serverFileUtils.copy(source,dstFile);
-            return parse(location,mimeType,dstFile);
+            return parse(location,mimeType,charset,dstFile);
         } catch (Exception e) {            
             return null;
         } finally {
diff --git a/source/de/anomic/plasma/parser/odt/odtParser.java b/source/de/anomic/plasma/parser/odt/odtParser.java
index 5089bf6a5..b6a530d69 100644
--- a/source/de/anomic/plasma/parser/odt/odtParser.java
+++ b/source/de/anomic/plasma/parser/odt/odtParser.java
@@ -91,7 +91,7 @@ public class odtParser extends AbstractParser implements Parser {
         return SUPPORTED_MIME_TYPES;
     }
     
-    public plasmaParserDocument parse(URL location, String mimeType, File dest) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, File dest) throws ParserException, InterruptedException {
         
         try {          
             byte[] docContent     = null;
@@ -168,7 +168,7 @@ public class odtParser extends AbstractParser implements Parser {
         }
     }
     
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException {
         File dest = null;
         try {
             // creating a tempfile
@@ -179,7 +179,7 @@ public class odtParser extends AbstractParser implements Parser {
             serverFileUtils.copy(source, dest);
             
             // parsing the content
-            return parse(location, mimeType, dest);
+            return parse(location, mimeType, charset, dest);
         } catch (Exception e) {
             throw new ParserException("Unable to parse the odt document. " + e.getMessage());
         } finally {
@@ -210,7 +210,7 @@ public class odtParser extends AbstractParser implements Parser {
             ByteArrayInputStream input = new ByteArrayInputStream(content);
             
             // parsing the document
-            testParser.parse(contentUrl, "application/vnd.oasis.opendocument.text", input);            
+            testParser.parse(contentUrl, "application/vnd.oasis.opendocument.text", null, input);            
         } catch (Exception e) {
             e.printStackTrace();
         }
diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java
index c513aee76..df3e49d1e 100644
--- a/source/de/anomic/plasma/parser/pdf/pdfParser.java
+++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java
@@ -85,7 +85,7 @@ public class pdfParser extends AbstractParser implements Parser {
         return SUPPORTED_MIME_TYPES;
     }
     
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
         
         
         PDDocument theDocument = null;
diff --git a/source/de/anomic/plasma/parser/rpm/rpmParser.java b/source/de/anomic/plasma/parser/rpm/rpmParser.java
index 5070007bf..a3f62dc8a 100644
--- a/source/de/anomic/plasma/parser/rpm/rpmParser.java
+++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java
@@ -91,13 +91,13 @@ public class rpmParser extends AbstractParser implements Parser {
         return SUPPORTED_MIME_TYPES;
     }
     
-    public plasmaParserDocument parse(URL location, String mimeType,
+    public plasmaParserDocument parse(URL location, String mimeType, String charset,
             InputStream source) throws ParserException {
         File dstFile = null;
         try {
             dstFile = File.createTempFile("rpmParser",".tmp");
             serverFileUtils.copy(source,dstFile);
-            return parse(location,mimeType,dstFile);
+            return parse(location,mimeType,charset,dstFile);
         } catch (Exception e) {            
             return null;
         } finally {
@@ -105,7 +105,7 @@ public class rpmParser extends AbstractParser implements Parser {
         }        
     }    
     
-    public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile) throws ParserException, InterruptedException {
         RPMFile rpmFile = null;        
         try {
             String summary = null, description = null, name = sourceFile.getName();
@@ -177,7 +177,7 @@ public class rpmParser extends AbstractParser implements Parser {
             rpmParser testParser = new rpmParser();
             byte[] content = httpc.singleGET(contentUrl, contentUrl.getHost(), 10000, null, null, null);
             ByteArrayInputStream input = new ByteArrayInputStream(content);
-            testParser.parse(contentUrl, "application/x-rpm", input);
+            testParser.parse(contentUrl, "application/x-rpm", null, input);
         } catch (Exception e) {
             e.printStackTrace();
         }
diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java
index d5985a533..38fdbad1e 100644
--- a/source/de/anomic/plasma/parser/rss/rssParser.java
+++ b/source/de/anomic/plasma/parser/rss/rssParser.java
@@ -100,7 +100,7 @@ public class rssParser extends AbstractParser implements Parser {
         parserName = "Rich Site Summary/Atom Feed Parser"; 
 	}
 
-	public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+	public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
 
         try {
             LinkedList feedSections = new LinkedList();
diff --git a/source/de/anomic/plasma/parser/rtf/rtfParser.java b/source/de/anomic/plasma/parser/rtf/rtfParser.java
index 9998d62fd..12b305687 100644
--- a/source/de/anomic/plasma/parser/rtf/rtfParser.java
+++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java
@@ -80,7 +80,7 @@ implements Parser {
         parserName = "Rich Text Format Parser";  
 	}
 
-	public plasmaParserDocument parse(URL location, String mimeType,
+	public plasmaParserDocument parse(URL location, String mimeType, String charset,
 			InputStream source) throws ParserException, InterruptedException {
 
         
diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java
index f24f55a12..6b8871012 100644
--- a/source/de/anomic/plasma/parser/tar/tarParser.java
+++ b/source/de/anomic/plasma/parser/tar/tarParser.java
@@ -94,7 +94,7 @@ public class tarParser extends AbstractParser implements Parser {
         return SUPPORTED_MIME_TYPES;
     }
     
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
         
         try {           
             // creating a new parser class to parse the unzipped content
@@ -153,7 +153,7 @@ public class tarParser extends AbstractParser implements Parser {
                     checkInterruption();
                     
                     // parsing the content                    
-                    theDoc = theParser.parseSource(new URL(tempFile),entryMime,tempFile);
+                    theDoc = theParser.parseSource(new URL(tempFile),entryMime,null,tempFile);
                 } finally {
                     if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){}
                 }
diff --git a/source/de/anomic/plasma/parser/vcf/build.xml b/source/de/anomic/plasma/parser/vcf/build.xml
index 5270df9f2..6f1e4df85 100644
--- a/source/de/anomic/plasma/parser/vcf/build.xml
+++ b/source/de/anomic/plasma/parser/vcf/build.xml
@@ -1,55 +1,62 @@
 <?xml version="1.0"?>
 <project name="YACY - vcfParser" default="dist">
-    <description>
-            A class to parse vCard files
-    </description>
+    <description>A class to parse vCard files</description>
 
     <property name="parserShortName" value="vcf"/>
 	<property name="parserVersion" value="0.1"/>
-
     <property name="parserLongName" value="yacyContentParser_${parserShortName}"/>    	
    	<property name="parserArchive" location="${release}/${parserLongName}_${parserVersion}.tgz"/>	    	
     	
+	<!-- compile the sources of this parser -->
     <target name="compile">
   	  <javac srcdir="${src}/de/anomic/plasma/parser/${parserShortName}" destdir="${build}" source="${javacSource}" target="${javacTarget}" debug="true" debuglevel="lines,vars,source">
   	  	<classpath>
   	  	  <pathelement location="${build}" />	
+  	  	  
+  	  	  <!-- libraries needed by this parser -->
     	  <pathelement location="${libx}/commons-codec-1.3.jar" />
   	  	</classpath>
   	  </javac>    	
     </target>
 	
-
+	<!-- add all parts of this parser into a single gzip-compressed tar archive -->
     <target name="zip" depends="compile">
   	  <tar destfile="${parserArchive}" compression="gzip">
+  	  	<!-- needed libraries -->
   	  	<tarfileset dir="${libx}" 
   	  				includes="commons-codec-1.3.*" 
   	  				prefix="${releaseFileParentDir}/libx/"
 			  	  	dirmode="755" mode="644"/>  	  	
+		<!-- source files of this parser -->
   	  	<tarfileset dir="${src}/de/anomic/plasma/parser/${parserShortName}" 
   	  				prefix="${releaseFileParentDir}/source/de/anomic/plasma/parser/${parserShortName}"
 			  	  	dirmode="755" mode="644"/>
+  	  	<!-- class files of this parser -->
   	  	<tarfileset dir="${build}/de/anomic/plasma/parser/${parserShortName}" 
   	  				prefix="${releaseFileParentDir}/classes/de/anomic/plasma/parser/${parserShortName}"
 			  	  	dirmode="755" mode="644"/>	  	
   	  </tar>    	
     </target>	
 
+	<!-- just copy all parts of this parser into the release directory -->
     <target name="copy" depends="compile">
+		<!-- copy needed libs -->
         <copy todir="${release}/libx/">
              <fileset dir="${libx}" includes="commons-codec-1.3.*"/> 
-        </copy>    	
+        </copy>  
+	
+		<!-- copy source code files -->
         <copy todir="${release}/source/de/anomic/plasma/parser/${parserShortName}">
              <fileset dir="${src}/de/anomic/plasma/parser/${parserShortName}" includes="**/*"/> 
         </copy>        
+	
+		<!-- copy compiled classes -->
         <copy todir="${release}/classes/de/anomic/plasma/parser/${parserShortName}">
              <fileset dir="${build}/de/anomic/plasma/parser/${parserShortName}" includes="**/*"/> 
         </copy>         
     </target> 
- 
-    
+     
     <target name="dist" depends="compile,zip" description="Compile and zip the parser"/>        
 	
-	
 </project>
 
diff --git a/source/de/anomic/plasma/parser/vcf/vcfParser.java b/source/de/anomic/plasma/parser/vcf/vcfParser.java
index ecd16d8be..829d00441 100644
--- a/source/de/anomic/plasma/parser/vcf/vcfParser.java
+++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java
@@ -90,14 +90,14 @@ public class vcfParser extends AbstractParser implements Parser {
     
     public vcfParser() {        
         super(LIBX_DEPENDENCIES);
-        parserName = "vCard Parser"; 
+        this.parserName = "vCard Parser"; 
     }
     
     public Hashtable getSupportedMimeTypes() {
         return SUPPORTED_MIME_TYPES;
     }
     
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
         
         try {
             StringBuffer parsedTitle = new StringBuffer();
@@ -109,7 +109,9 @@ public class vcfParser extends AbstractParser implements Parser {
             boolean useLastLine = false;
             int lineNr = 0;
             String line = null;            
-            BufferedReader inputReader = new BufferedReader(new InputStreamReader(source));
+            BufferedReader inputReader = (charset!=null)
+                                       ? new BufferedReader(new InputStreamReader(source,charset))
+                                       : new BufferedReader(new InputStreamReader(source));
             while (true) {
                 // check for interruption
                 checkInterruption();
@@ -236,21 +238,26 @@ public class vcfParser extends AbstractParser implements Parser {
                 }
             }
 
+            String[] sections = (String[]) parsedNames.toArray(new String[parsedNames.size()]);
+            byte[] text = parsedDataText.toString().getBytes();
             plasmaParserDocument theDoc = new plasmaParserDocument(
-                    location,
-                    mimeType,
-                    null,
-                    null,
-                    parsedTitle.toString(),
-                    (String[]) parsedNames.toArray(new String[parsedNames.size()]),
-                    "vCard",
-                    parsedDataText.toString().getBytes(),
-                    anchors,
-                    null);    
+                    location,                   // url of the source document
+                    mimeType,                   // the documents mime type
+                    null,                       // a list of extracted keywords
+                    null,                       // a short document title
+                    parsedTitle.toString(),     // a long document title
+                    sections,                   // an array of section headlines
+                    "vCard",                    // an abstract
+                    text,                       // the parsed document text
+                    anchors,                    // a map of extracted anchors
+                    null);                      // a treeset of image URLs
             return theDoc;
         } catch (Exception e) { 
             if (e instanceof InterruptedException) throw (InterruptedException) e;
-            throw new ParserException("Unable to parse the vcard content. " + e.getMessage());
+            
+            String errorMsg = "Unable to parse the vcard content. " + e.getMessage();
+            this.theLogger.logSevere(errorMsg);            
+            throw new ParserException(errorMsg);
         } finally {
         }
     }
@@ -267,7 +274,7 @@ public class vcfParser extends AbstractParser implements Parser {
             vcfParser testParser = new vcfParser();
             byte[] content = httpc.singleGET(contentUrl, contentUrl.getHost(), 10000, null, null, null);
             ByteArrayInputStream input = new ByteArrayInputStream(content);
-            testParser.parse(contentUrl, "text/x-vcard", input);
+            testParser.parse(contentUrl, "text/x-vcard", "UTF-8",input);
         } catch (Exception e) {
             e.printStackTrace();
         }
diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java
index f1da328c7..146f85006 100644
--- a/source/de/anomic/plasma/parser/zip/zipParser.java
+++ b/source/de/anomic/plasma/parser/zip/zipParser.java
@@ -91,7 +91,7 @@ public class zipParser extends AbstractParser implements Parser {
         return SUPPORTED_MIME_TYPES;
     }
     
-    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
+    public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
         
         try {           
             StringBuffer docKeywords = new StringBuffer();
@@ -132,7 +132,7 @@ public class zipParser extends AbstractParser implements Parser {
                 checkInterruption();
                 
                 // parsing the content
-                plasmaParserDocument theDoc = theParser.parseSource(location,entryMime,ut);
+                plasmaParserDocument theDoc = theParser.parseSource(location,entryMime,null, ut);
                 if (theDoc == null) continue;
                 
                 // merging all documents together
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index 092d12440..b97b68aa5 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -465,12 +465,12 @@ public final class plasmaParser {
         } catch (Exception e) { }
     }    
     
-    public plasmaParserDocument parseSource(URL location, String mimeType, byte[] source) throws InterruptedException {
+    public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] source) throws InterruptedException {
         File tempFile = null;
         try {
             tempFile = File.createTempFile("parseSource", ".tmp");
             serverFileUtils.write(source, tempFile);
-            return parseSource(location, mimeType, tempFile);
+            return parseSource(location, mimeType, charset, tempFile);
         } catch (Exception e) {
             if (e instanceof InterruptedException) throw (InterruptedException) e;
             serverLog.logSevere("PARSER", "parseSource1: " + e.getMessage(), e);
@@ -481,7 +481,7 @@ public final class plasmaParser {
         
     }
 
-    public plasmaParserDocument parseSource(URL location, String mimeType, File sourceFile) throws InterruptedException {
+    public plasmaParserDocument parseSource(URL location, String mimeType, String charset, File sourceFile) throws InterruptedException {
 
         Parser theParser = null;
         try {
@@ -546,10 +546,12 @@ public final class plasmaParser {
             
             // if a parser was found we use it ...
             if (theParser != null) {
-                return theParser.parse(location, mimeType,sourceFile);
+                return theParser.parse(location, mimeType,charset,sourceFile);
             } else if (realtimeParsableMimeTypesContains(mimeType)) {                      
                 // ...otherwise we make a scraper and transformer
                 htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
+                if (charset != null) scraper.setCharset(charset); // pass the caller-supplied charset (not a parser-mode constant); skipped when the caller gave none
+                
                 OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);            
                 serverFileUtils.copy(sourceFile, hfos);
                 hfos.close();
@@ -691,6 +693,7 @@ public final class plasmaParser {
             File contentFile = null;
             URL contentURL = null;
             String contentMimeType = "application/octet-stream";
+            String charSet = "UTF-8";
             
             if (args.length < 2) {
                 System.err.println("Usage: java de.anomic.plasma.plasmaParser (-f filename|-u URL) [-m mimeType]");
@@ -715,6 +718,10 @@ public final class plasmaParser {
                 contentMimeType = args[3];
             }
             
+            if ((args.length == 6)&&(args[4].equalsIgnoreCase("-c"))) {
+                charSet = args[5];
+            }            
+            
             // creating a plasma parser
             plasmaParser theParser = new plasmaParser();
             
@@ -725,7 +732,7 @@ public final class plasmaParser {
             plasmaParser.enableAllParsers(PARSER_MODE_PROXY);
 
             // parsing the content
-            plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, contentFile);
+            plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, charSet, contentFile);
 
             // printing out all parsed sentences
             if (document != null) {
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index dfc87e157..5c0a78fa2 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -389,12 +389,12 @@ public class plasmaSnippetCache {
                         supposedMime = plasmaParser.getMimeTypeByFileExt(filename.substring(p + 1));
                     }
 
-                    return this.parser.parseSource(url, supposedMime, resource);
+                    return this.parser.parseSource(url, supposedMime, null, resource);
                 }
                 return null;
             }
             if (plasmaParser.supportedMimeTypesContains(docInfo.getMimeType())) {
-                return this.parser.parseSource(url, docInfo.getMimeType(), resource);
+                return this.parser.parseSource(url, docInfo.getMimeType(), docInfo.getCharSet(), resource);
             }
             return null;
         } catch (InterruptedException e) {
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 908c6b6e2..0a167b881 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -1398,6 +1398,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
 
         // the mimetype of this entry
         String mimeType = entry.getMimeType();
+        String charset = entry.getCharSet();        
 
         // the parser logger
         serverLog parserLogger = parser.getLogger();
@@ -1409,7 +1410,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         ){
             if ((entry.cacheFile().exists()) && (entry.cacheFile().length() > 0)) {
                 parserLogger.logFine("'" + entry.normalizedURLString() + "' is not parsed yet, parsing now from File");
-                document = parser.parseSource(entry.url(), mimeType, entry.cacheFile());
+                document = parser.parseSource(entry.url(), mimeType, charset, entry.cacheFile());
             } else {
                 parserLogger.logFine("'" + entry.normalizedURLString() + "' cannot be parsed, no resource available");
                 addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT, new bitfield(indexURL.urlFlagLength));
diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java
index ae8b7af1c..ff77eb002 100644
--- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java
+++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java
@@ -320,6 +320,11 @@ public class plasmaSwitchboardQueue {
             return (info == null) ? null : info.getMimeType();
         }
         
+        public String getCharSet() { // charset reported by the cached object info, or null when no info is available
+            final IResourceInfo cachedInfo = this.getCachedObjectInfo();
+            return (cachedInfo != null) ? cachedInfo.getCharSet() : null;
+        }
+        
         public Date getModificationDate() {
             IResourceInfo info = this.getCachedObjectInfo();
             return (info == null) ? new Date() : info.getModificationDate();