*) changes needed for multi-language support

- parsers may need to know the charset of the byte stream 

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2591 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent 31d6cdea53
commit d0a5a53789

@ -47,6 +47,8 @@ import de.anomic.server.serverByteBuffer;
import de.anomic.net.URL; import de.anomic.net.URL;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.text.Collator; import java.text.Collator;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
@ -93,7 +95,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
//private String headline; //private String headline;
private List[] headlines; private List[] headlines;
private serverByteBuffer content; private serverByteBuffer content;
private URL root; private URL root;
private String charset = "UTF-8";
public htmlFilterContentScraper(URL root) { public htmlFilterContentScraper(URL root) {
// the root value here will not be used to load the resource. // the root value here will not be used to load the resource.
@ -109,6 +113,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
this.content = new serverByteBuffer(1024); this.content = new serverByteBuffer(1024);
} }
/**
 * Sets the name of the charset stored in this scraper.
 * The name is validated first; if validation fails the previously
 * stored charset name is left untouched.
 *
 * @param charset the name of the charset to remember
 * @throws UnsupportedCharsetException if no support for the named charset is available
 */
public void setCharset(String charset) throws UnsupportedCharsetException {
    // reject names the running JVM has no charset implementation for
    if (!Charset.isSupported(charset)) {
        throw new UnsupportedCharsetException(charset);
    }

    // the name is usable, keep it
    this.charset = charset;
}
public void scrapeText(byte[] newtext) { public void scrapeText(byte[] newtext) {
// System.out.println("SCRAPE: " + new String(newtext)); // System.out.println("SCRAPE: " + new String(newtext));
if ((content.length() != 0) && (content.byteAt(content.length() - 1) != 32)) content.append(32); if ((content.length() != 0) && (content.byteAt(content.length() - 1) != 32)) content.append(32);

@ -82,6 +82,12 @@ public interface IResourceInfo {
*/ */
public String getMimeType(); public String getMimeType();
/**
* Returns the charset of the resource
* @return the name of the charset or <code>null</code> if unknown
*/
public String getCharSet();
/** /**
* Returns the modification date of the cached object * Returns the modification date of the cached object
* @return the modification date * @return the modification date

@ -161,4 +161,8 @@ public class ResourceInfo implements IResourceInfo {
return responseStatus != null && responseStatus.equalsIgnoreCase("OK"); return responseStatus != null && responseStatus.equalsIgnoreCase("OK");
} }
/**
 * Returns the charset of the resource.
 * This implementation does not determine a charset and therefore
 * always reports it as unknown.
 *
 * @return always <code>null</code>
 */
public String getCharSet() {
return null;
}
} }

@ -111,6 +111,15 @@ public class ResourceInfo implements IResourceInfo {
return ((pos < 0) ? mimeType : mimeType.substring(0, pos)); return ((pos < 0) ? mimeType : mimeType.substring(0, pos));
} }
/**
 * Returns the charset of the resource, taken from the <code>charset</code>
 * parameter of the mime type stored in the response header
 * (e.g. <code>text/html; charset=ISO-8859-1</code> yields <code>ISO-8859-1</code>).
 *
 * @return the name of the charset or <code>null</code> if there is no response
 * header, no mime type, or the mime type carries no non-empty charset parameter
 */
public String getCharSet() {
    if (this.responseHeader == null) return null;

    String mimeType = this.responseHeader.mime();
    if (mimeType == null) return null;

    // walk over all ';'-separated parameters that follow the media type
    int pos = mimeType.indexOf(';');
    while (pos >= 0) {
        int end = mimeType.indexOf(';', pos + 1);
        if (end < 0) end = mimeType.length();

        String parameter = mimeType.substring(pos + 1, end).trim();
        int eq = parameter.indexOf('=');
        // parameter names are compared case-insensitively
        if ((eq >= 0) && parameter.substring(0, eq).trim().equalsIgnoreCase("charset")) {
            String value = parameter.substring(eq + 1).trim();
            // the value may be given as a quoted string
            if ((value.length() >= 2) && (value.charAt(0) == '"') && (value.charAt(value.length() - 1) == '"')) {
                value = value.substring(1, value.length() - 1).trim();
            }
            // an empty value carries no information, treat it as unknown
            return (value.length() == 0) ? null : value;
        }

        pos = (end < mimeType.length()) ? end : -1;
    }

    // no charset parameter present
    return null;
}
/** /**
* @see de.anomic.plasma.cache.IResourceInfo#getModificationDate() * @see de.anomic.plasma.cache.IResourceInfo#getModificationDate()
*/ */

@ -103,6 +103,7 @@ public abstract class AbstractParser implements Parser{
* Parsing a document available as byte array. * Parsing a document available as byte array.
* @param location the origin of the document * @param location the origin of the document
* @param mimeType the mimetype of the document * @param mimeType the mimetype of the document
* @param charset the supposed charset of the document or <code>null</code> if unknown
* @param source the content byte array * @param source the content byte array
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata. * and some additional metadata.
@ -113,12 +114,13 @@ public abstract class AbstractParser implements Parser{
public plasmaParserDocument parse( public plasmaParserDocument parse(
URL location, URL location,
String mimeType, String mimeType,
String charset,
byte[] source byte[] source
) throws ParserException, InterruptedException { ) throws ParserException, InterruptedException {
ByteArrayInputStream contentInputStream = null; ByteArrayInputStream contentInputStream = null;
try { try {
contentInputStream = new ByteArrayInputStream(source); contentInputStream = new ByteArrayInputStream(source);
return this.parse(location,mimeType,contentInputStream); return this.parse(location,mimeType,charset,contentInputStream);
} finally { } finally {
if (contentInputStream != null) { if (contentInputStream != null) {
try { try {
@ -133,6 +135,7 @@ public abstract class AbstractParser implements Parser{
* Parsing a document stored in a {@link File} * Parsing a document stored in a {@link File}
* @param location the origin of the document * @param location the origin of the document
* @param mimeType the mimetype of the document * @param mimeType the mimetype of the document
* @param charset the supposed charset of the document or <code>null</code> if unknown
* @param sourceFile the file containing the content of the document * @param sourceFile the file containing the content of the document
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata. * and some additional metadata.
@ -140,12 +143,16 @@ public abstract class AbstractParser implements Parser{
* *
* @see de.anomic.plasma.parser.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.File) * @see de.anomic.plasma.parser.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.File)
*/ */
public plasmaParserDocument parse(URL location, String mimeType, public plasmaParserDocument parse(
File sourceFile) throws ParserException, InterruptedException { URL location,
String mimeType,
String charset,
File sourceFile
) throws ParserException, InterruptedException {
BufferedInputStream contentInputStream = null; BufferedInputStream contentInputStream = null;
try { try {
contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile)); contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
return this.parse(location, mimeType, contentInputStream); return this.parse(location, mimeType, charset, contentInputStream);
} catch (FileNotFoundException e) { } catch (FileNotFoundException e) {
throw new ParserException(e.getMessage()); throw new ParserException(e.getMessage());
} finally { } finally {
@ -157,6 +164,7 @@ public abstract class AbstractParser implements Parser{
* Parsing a document available as {@link InputStream} * Parsing a document available as {@link InputStream}
* @param location the origin of the document * @param location the origin of the document
* @param mimeType the mimetype of the document * @param mimeType the mimetype of the document
* @param charset the supposed charset of the document or <code>null</code> if unknown
* @param source the {@link InputStream} containing the document content * @param source the {@link InputStream} containing the document content
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata. * and some additional metadata.
@ -164,8 +172,7 @@ public abstract class AbstractParser implements Parser{
* *
* @see de.anomic.plasma.parser.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.InputStream) * @see de.anomic.plasma.parser.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.InputStream)
*/ */
public abstract plasmaParserDocument parse(URL location, String mimeType, public abstract plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException;
InputStream source) throws ParserException, InterruptedException;
/** /**
* @return Returns a list of library names that are needed by this parser * @return Returns a list of library names that are needed by this parser

@ -64,39 +64,42 @@ public interface Parser {
* Parsing a document available as byte array * Parsing a document available as byte array
* @param location the origin of the document * @param location the origin of the document
* @param mimeType the mimetype of the document * @param mimeType the mimetype of the document
* @param charset the supposed charset of the document or <code>null</code> if unknown
* @param source the content byte array * @param source the content byte array
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata. * and some additional metadata.
* *
* @throws ParserException if the content could not be parsed properly * @throws ParserException if the content could not be parsed properly
*/ */
public plasmaParserDocument parse(URL location, String mimeType, byte[] source) public plasmaParserDocument parse(URL location, String mimeType, String charset, byte[] source)
throws ParserException, InterruptedException; throws ParserException, InterruptedException;
/** /**
* Parsing a document stored in a {@link File} * Parsing a document stored in a {@link File}
* @param location the origin of the document * @param location the origin of the document
* @param mimeType the mimetype of the document * @param mimeType the mimetype of the document
* @param charset the supposed charset of the document or <code>null</code> if unknown
* @param sourceFile the file containing the content of the document * @param sourceFile the file containing the content of the document
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata. * and some additional metadata.
* *
* @throws ParserException if the content could not be parsed properly * @throws ParserException if the content could not be parsed properly
*/ */
public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile)
throws ParserException, InterruptedException; throws ParserException, InterruptedException;
/** /**
* Parsing a document available as {@link InputStream} * Parsing a document available as {@link InputStream}
* @param location the origin of the document * @param location the origin of the document
* @param mimeType the mimetype of the document * @param mimeType the mimetype of the document
* @param charset the supposed charset of the document or <code>null</code> if unknown
* @param source the {@link InputStream} containing the document content * @param source the {@link InputStream} containing the document content
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document * @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata. * and some additional metadata.
* *
* @throws ParserException if the content could not be parsed properly * @throws ParserException if the content could not be parsed properly
*/ */
public plasmaParserDocument parse(URL location, String mimeType, InputStream source) public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source)
throws ParserException, InterruptedException; throws ParserException, InterruptedException;
/** /**

@ -87,7 +87,7 @@ public class bzipParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException { public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
File tempFile = null; File tempFile = null;
try { try {
@ -126,7 +126,7 @@ public class bzipParser extends AbstractParser implements Parser {
// creating a new parser class to parse the unzipped content // creating a new parser class to parse the unzipped content
plasmaParser theParser = new plasmaParser(); plasmaParser theParser = new plasmaParser();
return theParser.parseSource(location,null,tempFile); return theParser.parseSource(location,null,null,tempFile);
} catch (Exception e) { } catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
throw new ParserException("Unable to parse the gzip content. " + e.getMessage()); throw new ParserException("Unable to parse the gzip content. " + e.getMessage());

@ -78,7 +78,7 @@ implements Parser {
parserName = "Word Document Parser"; parserName = "Word Document Parser";
} }
public plasmaParserDocument parse(URL location, String mimeType, public plasmaParserDocument parse(URL location, String mimeType, String charset,
InputStream source) throws ParserException, InterruptedException { InputStream source) throws ParserException, InterruptedException {

@ -83,7 +83,7 @@ public class gzipParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException { public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
File tempFile = null; File tempFile = null;
try { try {
@ -110,7 +110,7 @@ public class gzipParser extends AbstractParser implements Parser {
// creating a new parser class to parse the unzipped content // creating a new parser class to parse the unzipped content
plasmaParser theParser = new plasmaParser(); plasmaParser theParser = new plasmaParser();
return theParser.parseSource(location,null,tempFile); return theParser.parseSource(location,null,null,tempFile);
} catch (Exception e) { } catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
throw new ParserException("Unable to parse the gzip content. " + e.getMessage()); throw new ParserException("Unable to parse the gzip content. " + e.getMessage());

@ -125,7 +125,7 @@ implements Parser {
return null; return null;
} }
public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException, InterruptedException { public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile) throws ParserException, InterruptedException {
String orgMimeType = mimeType; String orgMimeType = mimeType;
@ -168,7 +168,7 @@ implements Parser {
// parsing the content using the determined mimetype // parsing the content using the determined mimetype
plasmaParser theParser = new plasmaParser(); plasmaParser theParser = new plasmaParser();
return theParser.parseSource(location,mimeType,sourceFile); return theParser.parseSource(location,mimeType,charset,sourceFile);
} }
return null; return null;
@ -185,13 +185,13 @@ implements Parser {
} }
} }
public plasmaParserDocument parse(URL location, String mimeType, public plasmaParserDocument parse(URL location, String mimeType,String charset,
InputStream source) throws ParserException { InputStream source) throws ParserException {
File dstFile = null; File dstFile = null;
try { try {
dstFile = File.createTempFile("mimeTypeParser",".tmp"); dstFile = File.createTempFile("mimeTypeParser",".tmp");
serverFileUtils.copy(source,dstFile); serverFileUtils.copy(source,dstFile);
return parse(location,mimeType,dstFile); return parse(location,mimeType,charset,dstFile);
} catch (Exception e) { } catch (Exception e) {
return null; return null;
} finally { } finally {

@ -91,7 +91,7 @@ public class odtParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public plasmaParserDocument parse(URL location, String mimeType, File dest) throws ParserException, InterruptedException { public plasmaParserDocument parse(URL location, String mimeType, String charset, File dest) throws ParserException, InterruptedException {
try { try {
byte[] docContent = null; byte[] docContent = null;
@ -168,7 +168,7 @@ public class odtParser extends AbstractParser implements Parser {
} }
} }
public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException { public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException {
File dest = null; File dest = null;
try { try {
// creating a tempfile // creating a tempfile
@ -179,7 +179,7 @@ public class odtParser extends AbstractParser implements Parser {
serverFileUtils.copy(source, dest); serverFileUtils.copy(source, dest);
// parsing the content // parsing the content
return parse(location, mimeType, dest); return parse(location, mimeType, charset, dest);
} catch (Exception e) { } catch (Exception e) {
throw new ParserException("Unable to parse the odt document. " + e.getMessage()); throw new ParserException("Unable to parse the odt document. " + e.getMessage());
} finally { } finally {
@ -210,7 +210,7 @@ public class odtParser extends AbstractParser implements Parser {
ByteArrayInputStream input = new ByteArrayInputStream(content); ByteArrayInputStream input = new ByteArrayInputStream(content);
// parsing the document // parsing the document
testParser.parse(contentUrl, "application/vnd.oasis.opendocument.text", input); testParser.parse(contentUrl, "application/vnd.oasis.opendocument.text", null, input);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
} }

@ -85,7 +85,7 @@ public class pdfParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException { public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
PDDocument theDocument = null; PDDocument theDocument = null;

@ -91,13 +91,13 @@ public class rpmParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public plasmaParserDocument parse(URL location, String mimeType, public plasmaParserDocument parse(URL location, String mimeType, String charset,
InputStream source) throws ParserException { InputStream source) throws ParserException {
File dstFile = null; File dstFile = null;
try { try {
dstFile = File.createTempFile("rpmParser",".tmp"); dstFile = File.createTempFile("rpmParser",".tmp");
serverFileUtils.copy(source,dstFile); serverFileUtils.copy(source,dstFile);
return parse(location,mimeType,dstFile); return parse(location,mimeType,charset,dstFile);
} catch (Exception e) { } catch (Exception e) {
return null; return null;
} finally { } finally {
@ -105,7 +105,7 @@ public class rpmParser extends AbstractParser implements Parser {
} }
} }
public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException, InterruptedException { public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile) throws ParserException, InterruptedException {
RPMFile rpmFile = null; RPMFile rpmFile = null;
try { try {
String summary = null, description = null, name = sourceFile.getName(); String summary = null, description = null, name = sourceFile.getName();
@ -177,7 +177,7 @@ public class rpmParser extends AbstractParser implements Parser {
rpmParser testParser = new rpmParser(); rpmParser testParser = new rpmParser();
byte[] content = httpc.singleGET(contentUrl, contentUrl.getHost(), 10000, null, null, null); byte[] content = httpc.singleGET(contentUrl, contentUrl.getHost(), 10000, null, null, null);
ByteArrayInputStream input = new ByteArrayInputStream(content); ByteArrayInputStream input = new ByteArrayInputStream(content);
testParser.parse(contentUrl, "application/x-rpm", input); testParser.parse(contentUrl, "application/x-rpm", null, input);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
} }

@ -100,7 +100,7 @@ public class rssParser extends AbstractParser implements Parser {
parserName = "Rich Site Summary/Atom Feed Parser"; parserName = "Rich Site Summary/Atom Feed Parser";
} }
public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException { public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
try { try {
LinkedList feedSections = new LinkedList(); LinkedList feedSections = new LinkedList();

@ -80,7 +80,7 @@ implements Parser {
parserName = "Rich Text Format Parser"; parserName = "Rich Text Format Parser";
} }
public plasmaParserDocument parse(URL location, String mimeType, public plasmaParserDocument parse(URL location, String mimeType, String charset,
InputStream source) throws ParserException, InterruptedException { InputStream source) throws ParserException, InterruptedException {

@ -94,7 +94,7 @@ public class tarParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException { public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
try { try {
// creating a new parser class to parse the unzipped content // creating a new parser class to parse the unzipped content
@ -153,7 +153,7 @@ public class tarParser extends AbstractParser implements Parser {
checkInterruption(); checkInterruption();
// parsing the content // parsing the content
theDoc = theParser.parseSource(new URL(tempFile),entryMime,tempFile); theDoc = theParser.parseSource(new URL(tempFile),entryMime,null,tempFile);
} finally { } finally {
if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){} if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){}
} }

@ -1,55 +1,62 @@
<?xml version="1.0"?> <?xml version="1.0"?>
<project name="YACY - vcfParser" default="dist"> <project name="YACY - vcfParser" default="dist">
<description> <description>A class to parse vCard files</description>
A class to parse vCard files
</description>
<property name="parserShortName" value="vcf"/> <property name="parserShortName" value="vcf"/>
<property name="parserVersion" value="0.1"/> <property name="parserVersion" value="0.1"/>
<property name="parserLongName" value="yacyContentParser_${parserShortName}"/> <property name="parserLongName" value="yacyContentParser_${parserShortName}"/>
<property name="parserArchive" location="${release}/${parserLongName}_${parserVersion}.tgz"/> <property name="parserArchive" location="${release}/${parserLongName}_${parserVersion}.tgz"/>
<!-- compile the sources of this parser -->
<target name="compile"> <target name="compile">
<javac srcdir="${src}/de/anomic/plasma/parser/${parserShortName}" destdir="${build}" source="${javacSource}" target="${javacTarget}" debug="true" debuglevel="lines,vars,source"> <javac srcdir="${src}/de/anomic/plasma/parser/${parserShortName}" destdir="${build}" source="${javacSource}" target="${javacTarget}" debug="true" debuglevel="lines,vars,source">
<classpath> <classpath>
<pathelement location="${build}" /> <pathelement location="${build}" />
<!-- libraries needed by this parser -->
<pathelement location="${libx}/commons-codec-1.3.jar" /> <pathelement location="${libx}/commons-codec-1.3.jar" />
</classpath> </classpath>
</javac> </javac>
</target> </target>
<!-- add all parts of this parser into a single zip file -->
<target name="zip" depends="compile"> <target name="zip" depends="compile">
<tar destfile="${parserArchive}" compression="gzip"> <tar destfile="${parserArchive}" compression="gzip">
<!-- needed libraries -->
<tarfileset dir="${libx}" <tarfileset dir="${libx}"
includes="commons-codec-1.3.*" includes="commons-codec-1.3.*"
prefix="${releaseFileParentDir}/libx/" prefix="${releaseFileParentDir}/libx/"
dirmode="755" mode="644"/> dirmode="755" mode="644"/>
<!-- source files of this parser -->
<tarfileset dir="${src}/de/anomic/plasma/parser/${parserShortName}" <tarfileset dir="${src}/de/anomic/plasma/parser/${parserShortName}"
prefix="${releaseFileParentDir}/source/de/anomic/plasma/parser/${parserShortName}" prefix="${releaseFileParentDir}/source/de/anomic/plasma/parser/${parserShortName}"
dirmode="755" mode="644"/> dirmode="755" mode="644"/>
<!-- class files of this parser -->
<tarfileset dir="${build}/de/anomic/plasma/parser/${parserShortName}" <tarfileset dir="${build}/de/anomic/plasma/parser/${parserShortName}"
prefix="${releaseFileParentDir}/classes/de/anomic/plasma/parser/${parserShortName}" prefix="${releaseFileParentDir}/classes/de/anomic/plasma/parser/${parserShortName}"
dirmode="755" mode="644"/> dirmode="755" mode="644"/>
</tar> </tar>
</target> </target>
<!-- just copy all parts of this parser into the release directory -->
<target name="copy" depends="compile"> <target name="copy" depends="compile">
<!-- copy needed libs -->
<copy todir="${release}/libx/"> <copy todir="${release}/libx/">
<fileset dir="${libx}" includes="commons-codec-1.3.*"/> <fileset dir="${libx}" includes="commons-codec-1.3.*"/>
</copy> </copy>
<!-- copy source code files -->
<copy todir="${release}/source/de/anomic/plasma/parser/${parserShortName}"> <copy todir="${release}/source/de/anomic/plasma/parser/${parserShortName}">
<fileset dir="${src}/de/anomic/plasma/parser/${parserShortName}" includes="**/*"/> <fileset dir="${src}/de/anomic/plasma/parser/${parserShortName}" includes="**/*"/>
</copy> </copy>
<!-- copy compiled classes -->
<copy todir="${release}/classes/de/anomic/plasma/parser/${parserShortName}"> <copy todir="${release}/classes/de/anomic/plasma/parser/${parserShortName}">
<fileset dir="${build}/de/anomic/plasma/parser/${parserShortName}" includes="**/*"/> <fileset dir="${build}/de/anomic/plasma/parser/${parserShortName}" includes="**/*"/>
</copy> </copy>
</target> </target>
<target name="dist" depends="compile,zip" description="Compile and zip the parser"/> <target name="dist" depends="compile,zip" description="Compile and zip the parser"/>
</project> </project>

@ -90,14 +90,14 @@ public class vcfParser extends AbstractParser implements Parser {
public vcfParser() { public vcfParser() {
super(LIBX_DEPENDENCIES); super(LIBX_DEPENDENCIES);
parserName = "vCard Parser"; this.parserName = "vCard Parser";
} }
public Hashtable getSupportedMimeTypes() { public Hashtable getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException { public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
try { try {
StringBuffer parsedTitle = new StringBuffer(); StringBuffer parsedTitle = new StringBuffer();
@ -109,7 +109,9 @@ public class vcfParser extends AbstractParser implements Parser {
boolean useLastLine = false; boolean useLastLine = false;
int lineNr = 0; int lineNr = 0;
String line = null; String line = null;
BufferedReader inputReader = new BufferedReader(new InputStreamReader(source)); BufferedReader inputReader = (charset!=null)
? new BufferedReader(new InputStreamReader(source,charset))
: new BufferedReader(new InputStreamReader(source));
while (true) { while (true) {
// check for interruption // check for interruption
checkInterruption(); checkInterruption();
@ -236,21 +238,26 @@ public class vcfParser extends AbstractParser implements Parser {
} }
} }
String[] sections = (String[]) parsedNames.toArray(new String[parsedNames.size()]);
byte[] text = parsedDataText.toString().getBytes();
plasmaParserDocument theDoc = new plasmaParserDocument( plasmaParserDocument theDoc = new plasmaParserDocument(
location, location, // url of the source document
mimeType, mimeType, // the documents mime type
null, null, // a list of extracted keywords
null, null, // a short document title
parsedTitle.toString(), parsedTitle.toString(), // a long document title
(String[]) parsedNames.toArray(new String[parsedNames.size()]), sections, // an array of section headlines
"vCard", "vCard", // an abstract
parsedDataText.toString().getBytes(), text, // the parsed document text
anchors, anchors, // a map of extracted anchors
null); null); // a treeset of image URLs
return theDoc; return theDoc;
} catch (Exception e) { } catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
throw new ParserException("Unable to parse the vcard content. " + e.getMessage());
String errorMsg = "Unable to parse the vcard content. " + e.getMessage();
this.theLogger.logSevere(errorMsg);
throw new ParserException(errorMsg);
} finally { } finally {
} }
} }
@ -267,7 +274,7 @@ public class vcfParser extends AbstractParser implements Parser {
vcfParser testParser = new vcfParser(); vcfParser testParser = new vcfParser();
byte[] content = httpc.singleGET(contentUrl, contentUrl.getHost(), 10000, null, null, null); byte[] content = httpc.singleGET(contentUrl, contentUrl.getHost(), 10000, null, null, null);
ByteArrayInputStream input = new ByteArrayInputStream(content); ByteArrayInputStream input = new ByteArrayInputStream(content);
testParser.parse(contentUrl, "text/x-vcard", input); testParser.parse(contentUrl, "text/x-vcard", "UTF-8",input);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
} }

@ -91,7 +91,7 @@ public class zipParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES; return SUPPORTED_MIME_TYPES;
} }
public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException { public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
try { try {
StringBuffer docKeywords = new StringBuffer(); StringBuffer docKeywords = new StringBuffer();
@ -132,7 +132,7 @@ public class zipParser extends AbstractParser implements Parser {
checkInterruption(); checkInterruption();
// parsing the content // parsing the content
plasmaParserDocument theDoc = theParser.parseSource(location,entryMime,ut); plasmaParserDocument theDoc = theParser.parseSource(location,entryMime,null, ut);
if (theDoc == null) continue; if (theDoc == null) continue;
// merging all documents together // merging all documents together

@ -465,12 +465,12 @@ public final class plasmaParser {
} catch (Exception e) { } } catch (Exception e) { }
} }
public plasmaParserDocument parseSource(URL location, String mimeType, byte[] source) throws InterruptedException { public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] source) throws InterruptedException {
File tempFile = null; File tempFile = null;
try { try {
tempFile = File.createTempFile("parseSource", ".tmp"); tempFile = File.createTempFile("parseSource", ".tmp");
serverFileUtils.write(source, tempFile); serverFileUtils.write(source, tempFile);
return parseSource(location, mimeType, tempFile); return parseSource(location, mimeType, charset, tempFile);
} catch (Exception e) { } catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
serverLog.logSevere("PARSER", "parseSource1: " + e.getMessage(), e); serverLog.logSevere("PARSER", "parseSource1: " + e.getMessage(), e);
@ -481,7 +481,7 @@ public final class plasmaParser {
} }
public plasmaParserDocument parseSource(URL location, String mimeType, File sourceFile) throws InterruptedException { public plasmaParserDocument parseSource(URL location, String mimeType, String charset, File sourceFile) throws InterruptedException {
Parser theParser = null; Parser theParser = null;
try { try {
@ -546,10 +546,12 @@ public final class plasmaParser {
// if a parser was found we use it ... // if a parser was found we use it ...
if (theParser != null) { if (theParser != null) {
return theParser.parse(location, mimeType,sourceFile); return theParser.parse(location, mimeType,charset,sourceFile);
} else if (realtimeParsableMimeTypesContains(mimeType)) { } else if (realtimeParsableMimeTypesContains(mimeType)) {
// ...otherwise we make a scraper and transformer // ...otherwise we make a scraper and transformer
htmlFilterContentScraper scraper = new htmlFilterContentScraper(location); htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
// Give the scraper the charset that was passed to parseSource. Callers may pass null when the
// charset is unknown; in that case the scraper is left on its own default charset.
// setCharset throws UnsupportedCharsetException if the given charset name is not available.
if (charset != null) scraper.setCharset(charset);
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false); OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
serverFileUtils.copy(sourceFile, hfos); serverFileUtils.copy(sourceFile, hfos);
hfos.close(); hfos.close();
@ -691,6 +693,7 @@ public final class plasmaParser {
File contentFile = null; File contentFile = null;
URL contentURL = null; URL contentURL = null;
String contentMimeType = "application/octet-stream"; String contentMimeType = "application/octet-stream";
String charSet = "UTF-8";
if (args.length < 2) { if (args.length < 2) {
System.err.println("Usage: java de.anomic.plasma.plasmaParser (-f filename|-u URL) [-m mimeType]"); System.err.println("Usage: java de.anomic.plasma.plasmaParser (-f filename|-u URL) [-m mimeType]");
@ -715,6 +718,10 @@ public final class plasmaParser {
contentMimeType = args[3]; contentMimeType = args[3];
} }
if ((args.length == 6)&&(args[4].equalsIgnoreCase("-c"))) {
charSet = args[5];
}
// creating a plasma parser // creating a plasma parser
plasmaParser theParser = new plasmaParser(); plasmaParser theParser = new plasmaParser();
@ -725,7 +732,7 @@ public final class plasmaParser {
plasmaParser.enableAllParsers(PARSER_MODE_PROXY); plasmaParser.enableAllParsers(PARSER_MODE_PROXY);
// parsing the content // parsing the content
plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, contentFile); plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, charSet, contentFile);
// printing out all parsed sentences // printing out all parsed sentences
if (document != null) { if (document != null) {

@ -389,12 +389,12 @@ public class plasmaSnippetCache {
supposedMime = plasmaParser.getMimeTypeByFileExt(filename.substring(p + 1)); supposedMime = plasmaParser.getMimeTypeByFileExt(filename.substring(p + 1));
} }
return this.parser.parseSource(url, supposedMime, resource); return this.parser.parseSource(url, supposedMime, null, resource);
} }
return null; return null;
} }
if (plasmaParser.supportedMimeTypesContains(docInfo.getMimeType())) { if (plasmaParser.supportedMimeTypesContains(docInfo.getMimeType())) {
return this.parser.parseSource(url, docInfo.getMimeType(), resource); return this.parser.parseSource(url, docInfo.getMimeType(), docInfo.getCharSet(), resource);
} }
return null; return null;
} catch (InterruptedException e) { } catch (InterruptedException e) {

@ -1398,6 +1398,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// the mimetype of this entry // the mimetype of this entry
String mimeType = entry.getMimeType(); String mimeType = entry.getMimeType();
String charset = entry.getCharSet();
// the parser logger // the parser logger
serverLog parserLogger = parser.getLogger(); serverLog parserLogger = parser.getLogger();
@ -1409,7 +1410,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
){ ){
if ((entry.cacheFile().exists()) && (entry.cacheFile().length() > 0)) { if ((entry.cacheFile().exists()) && (entry.cacheFile().length() > 0)) {
parserLogger.logFine("'" + entry.normalizedURLString() + "' is not parsed yet, parsing now from File"); parserLogger.logFine("'" + entry.normalizedURLString() + "' is not parsed yet, parsing now from File");
document = parser.parseSource(entry.url(), mimeType, entry.cacheFile()); document = parser.parseSource(entry.url(), mimeType, charset, entry.cacheFile());
} else { } else {
parserLogger.logFine("'" + entry.normalizedURLString() + "' cannot be parsed, no resource available"); parserLogger.logFine("'" + entry.normalizedURLString() + "' cannot be parsed, no resource available");
addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT, new bitfield(indexURL.urlFlagLength)); addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT, new bitfield(indexURL.urlFlagLength));

@ -320,6 +320,11 @@ public class plasmaSwitchboardQueue {
return (info == null) ? null : info.getMimeType(); return (info == null) ? null : info.getMimeType();
} }
/**
 * Returns the character set recorded in the cached resource info of this entry.
 *
 * @return the charset reported by the cached resource info, or <code>null</code>
 *         if no cached resource info is available
 */
public String getCharSet() {
    final IResourceInfo cachedInfo = this.getCachedObjectInfo();
    if (cachedInfo == null) {
        // no cached info for this entry, so the charset is unknown
        return null;
    }
    return cachedInfo.getCharSet();
}
public Date getModificationDate() { public Date getModificationDate() {
IResourceInfo info = this.getCachedObjectInfo(); IResourceInfo info = this.getCachedObjectInfo();
return (info == null) ? new Date() : info.getModificationDate(); return (info == null) ? new Date() : info.getModificationDate();

Loading…
Cancel
Save