*) changes needed for multi-language support

- parsers may need to know the charset of the byte stream 

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2591 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent 31d6cdea53
commit d0a5a53789

@ -47,6 +47,8 @@ import de.anomic.server.serverByteBuffer;
import de.anomic.net.URL;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.text.Collator;
import java.util.ArrayList;
import java.util.HashMap;
@ -93,7 +95,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
//private String headline;
private List[] headlines;
private serverByteBuffer content;
private URL root;
private String charset = "UTF-8";
public htmlFilterContentScraper(URL root) {
// the root value here will not be used to load the resource.
@ -109,6 +113,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
this.content = new serverByteBuffer(1024);
}
/**
 * Sets the name of the charset associated with the scraped content.
 * <p>
 * A <code>null</code> name means "charset unknown"; in that case the
 * currently configured charset is left untouched instead of failing.
 *
 * @param charset the name of the charset, or <code>null</code> if unknown
 * @throws UnsupportedCharsetException if the named charset is not available
 *         in this Java virtual machine
 */
public void setCharset(String charset) throws UnsupportedCharsetException {
    // Charset.forName(null) would throw an IllegalArgumentException;
    // an unknown charset simply keeps the current setting
    if (charset == null) return;

    // testing if charset exists (throws if it does not)
    Charset.forName(charset);

    // remember it
    this.charset = charset;
}
public void scrapeText(byte[] newtext) {
// System.out.println("SCRAPE: " + new String(newtext));
if ((content.length() != 0) && (content.byteAt(content.length() - 1) != 32)) content.append(32);

@ -82,6 +82,12 @@ public interface IResourceInfo {
*/
public String getMimeType();
/**
 * Returns the charset of the resource.
 * @return the name of the charset or <code>null</code> if unknown
 */
public String getCharSet();
/**
* Returns the modification date of the cached object
* @return the modification date

@ -161,4 +161,8 @@ public class ResourceInfo implements IResourceInfo {
return responseStatus != null && responseStatus.equalsIgnoreCase("OK");
}
/**
 * This implementation does not determine a charset and always
 * reports it as unknown.
 * @return always <code>null</code>
 */
public String getCharSet() {
return null;
}
}

@ -110,6 +110,15 @@ public class ResourceInfo implements IResourceInfo {
int pos = mimeType.indexOf(';');
return ((pos < 0) ? mimeType : mimeType.substring(0, pos));
}
/**
 * Extracts the charset name from the content-type value of the response
 * header, e.g. <code>UTF-8</code> for <code>text/html; charset=UTF-8</code>.
 *
 * @return the name of the charset, or <code>null</code> if there is no
 *         response header or it carries no (non-empty) charset parameter
 */
public String getCharSet() {
    if (this.responseHeader == null) return null;

    String mimeType = this.responseHeader.mime();
    if (mimeType == null) return null;

    // everything behind the first ';' is a list of ';'-separated
    // parameters, e.g. "text/html; charset=UTF-8; boundary=xyz"
    int pos = mimeType.indexOf(';');
    while (pos >= 0) {
        int end = mimeType.indexOf(';', pos + 1);
        String param = ((end < 0)
                ? mimeType.substring(pos + 1)
                : mimeType.substring(pos + 1, end)).trim();

        int eq = param.indexOf('=');
        if ((eq > 0) && param.substring(0, eq).trim().equalsIgnoreCase("charset")) {
            String value = param.substring(eq + 1).trim();

            // the parameter value may be given as a quoted string
            if ((value.length() >= 2)
                    && (value.charAt(0) == '"')
                    && (value.charAt(value.length() - 1) == '"')) {
                value = value.substring(1, value.length() - 1).trim();
            }
            return (value.length() == 0) ? null : value;
        }
        pos = end;
    }

    // no charset parameter present
    return null;
}
/**
* @see de.anomic.plasma.cache.IResourceInfo#getModificationDate()

@ -103,6 +103,7 @@ public abstract class AbstractParser implements Parser{
* Parsing a document available as byte array.
* @param location the origin of the document
* @param mimeType the mimetype of the document
* @param charset the supposed charset of the document or <code>null</code> if unknown
* @param source the content byte array
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata.
@ -113,12 +114,13 @@ public abstract class AbstractParser implements Parser{
public plasmaParserDocument parse(
URL location,
String mimeType,
String charset,
byte[] source
) throws ParserException, InterruptedException {
ByteArrayInputStream contentInputStream = null;
try {
contentInputStream = new ByteArrayInputStream(source);
return this.parse(location,mimeType,contentInputStream);
return this.parse(location,mimeType,charset,contentInputStream);
} finally {
if (contentInputStream != null) {
try {
@ -133,6 +135,7 @@ public abstract class AbstractParser implements Parser{
* Parsing a document stored in a {@link File}
* @param location the origin of the document
* @param mimeType the mimetype of the document
* @param charset the supposed charset of the document or <code>null</code> if unknown
* @param sourceFile the file containing the content of the document
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata.
@ -140,12 +143,16 @@ public abstract class AbstractParser implements Parser{
*
* @see de.anomic.plasma.parser.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.File)
*/
public plasmaParserDocument parse(URL location, String mimeType,
File sourceFile) throws ParserException, InterruptedException {
public plasmaParserDocument parse(
URL location,
String mimeType,
String charset,
File sourceFile
) throws ParserException, InterruptedException {
BufferedInputStream contentInputStream = null;
try {
contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
return this.parse(location, mimeType, contentInputStream);
return this.parse(location, mimeType, charset, contentInputStream);
} catch (FileNotFoundException e) {
throw new ParserException(e.getMessage());
} finally {
@ -157,6 +164,7 @@ public abstract class AbstractParser implements Parser{
* Parsing a document available as {@link InputStream}
* @param location the origin of the document
* @param mimeType the mimetype of the document
* @param charset the supposed charset of the document or <code>null</code> if unknown
* @param source the {@link InputStream} containing the document content
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata.
@ -164,8 +172,7 @@ public abstract class AbstractParser implements Parser{
*
* @see de.anomic.plasma.parser.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.InputStream)
*/
public abstract plasmaParserDocument parse(URL location, String mimeType,
InputStream source) throws ParserException, InterruptedException;
public abstract plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException;
/**
* @return Returns a list of library names that are needed by this parser

@ -64,39 +64,42 @@ public interface Parser {
* Parsing a document available as byte array
* @param location the origin of the document
* @param mimeType the mimetype of the document
* @param charset the supposed charset of the document or <code>null</code> if unknown
* @param source the content byte array
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata.
*
* @throws ParserException if the content could not be parsed properly
*/
public plasmaParserDocument parse(URL location, String mimeType, byte[] source)
public plasmaParserDocument parse(URL location, String mimeType, String charset, byte[] source)
throws ParserException, InterruptedException;
/**
* Parsing a document stored in a {@link File}
* @param location the origin of the document
* @param mimeType the mimetype of the document
* @param charset the supposed charset of the document or <code>null</code> if unknown
* @param sourceFile the file containing the content of the document
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata.
*
* @throws ParserException if the content could not be parsed properly
*/
public plasmaParserDocument parse(URL location, String mimeType, File sourceFile)
public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile)
throws ParserException, InterruptedException;
/**
* Parsing a document available as {@link InputStream}
* @param location the origin of the document
* @param mimeType the mimetype of the document
* @param charset the supposed charset of the document or <code>null</code> if unknown
* @param source the {@link InputStream} containing the document content
* @return a {@link plasmaParserDocument} containing the extracted plain text of the document
* and some additional metadata.
*
* @throws ParserException if the content could not be parsed properly
*/
public plasmaParserDocument parse(URL location, String mimeType, InputStream source)
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source)
throws ParserException, InterruptedException;
/**

@ -87,7 +87,7 @@ public class bzipParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES;
}
public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
File tempFile = null;
try {
@ -126,7 +126,7 @@ public class bzipParser extends AbstractParser implements Parser {
// creating a new parser class to parse the unzipped content
plasmaParser theParser = new plasmaParser();
return theParser.parseSource(location,null,tempFile);
return theParser.parseSource(location,null,null,tempFile);
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
throw new ParserException("Unable to parse the gzip content. " + e.getMessage());

@ -78,7 +78,7 @@ implements Parser {
parserName = "Word Document Parser";
}
public plasmaParserDocument parse(URL location, String mimeType,
public plasmaParserDocument parse(URL location, String mimeType, String charset,
InputStream source) throws ParserException, InterruptedException {

@ -83,7 +83,7 @@ public class gzipParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES;
}
public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
File tempFile = null;
try {
@ -110,7 +110,7 @@ public class gzipParser extends AbstractParser implements Parser {
// creating a new parser class to parse the unzipped content
plasmaParser theParser = new plasmaParser();
return theParser.parseSource(location,null,tempFile);
return theParser.parseSource(location,null,null,tempFile);
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
throw new ParserException("Unable to parse the gzip content. " + e.getMessage());

@ -125,7 +125,7 @@ implements Parser {
return null;
}
public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException, InterruptedException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile) throws ParserException, InterruptedException {
String orgMimeType = mimeType;
@ -168,7 +168,7 @@ implements Parser {
// parsing the content using the determined mimetype
plasmaParser theParser = new plasmaParser();
return theParser.parseSource(location,mimeType,sourceFile);
return theParser.parseSource(location,mimeType,charset,sourceFile);
}
return null;
@ -185,13 +185,13 @@ implements Parser {
}
}
public plasmaParserDocument parse(URL location, String mimeType,
public plasmaParserDocument parse(URL location, String mimeType,String charset,
InputStream source) throws ParserException {
File dstFile = null;
try {
dstFile = File.createTempFile("mimeTypeParser",".tmp");
serverFileUtils.copy(source,dstFile);
return parse(location,mimeType,dstFile);
return parse(location,mimeType,charset,dstFile);
} catch (Exception e) {
return null;
} finally {

@ -91,7 +91,7 @@ public class odtParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES;
}
public plasmaParserDocument parse(URL location, String mimeType, File dest) throws ParserException, InterruptedException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, File dest) throws ParserException, InterruptedException {
try {
byte[] docContent = null;
@ -168,7 +168,7 @@ public class odtParser extends AbstractParser implements Parser {
}
}
public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException {
File dest = null;
try {
// creating a tempfile
@ -179,7 +179,7 @@ public class odtParser extends AbstractParser implements Parser {
serverFileUtils.copy(source, dest);
// parsing the content
return parse(location, mimeType, dest);
return parse(location, mimeType, charset, dest);
} catch (Exception e) {
throw new ParserException("Unable to parse the odt document. " + e.getMessage());
} finally {
@ -210,7 +210,7 @@ public class odtParser extends AbstractParser implements Parser {
ByteArrayInputStream input = new ByteArrayInputStream(content);
// parsing the document
testParser.parse(contentUrl, "application/vnd.oasis.opendocument.text", input);
testParser.parse(contentUrl, "application/vnd.oasis.opendocument.text", null, input);
} catch (Exception e) {
e.printStackTrace();
}

@ -85,7 +85,7 @@ public class pdfParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES;
}
public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
PDDocument theDocument = null;

@ -91,13 +91,13 @@ public class rpmParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES;
}
public plasmaParserDocument parse(URL location, String mimeType,
public plasmaParserDocument parse(URL location, String mimeType, String charset,
InputStream source) throws ParserException {
File dstFile = null;
try {
dstFile = File.createTempFile("rpmParser",".tmp");
serverFileUtils.copy(source,dstFile);
return parse(location,mimeType,dstFile);
return parse(location,mimeType,charset,dstFile);
} catch (Exception e) {
return null;
} finally {
@ -105,7 +105,7 @@ public class rpmParser extends AbstractParser implements Parser {
}
}
public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException, InterruptedException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, File sourceFile) throws ParserException, InterruptedException {
RPMFile rpmFile = null;
try {
String summary = null, description = null, name = sourceFile.getName();
@ -177,7 +177,7 @@ public class rpmParser extends AbstractParser implements Parser {
rpmParser testParser = new rpmParser();
byte[] content = httpc.singleGET(contentUrl, contentUrl.getHost(), 10000, null, null, null);
ByteArrayInputStream input = new ByteArrayInputStream(content);
testParser.parse(contentUrl, "application/x-rpm", input);
testParser.parse(contentUrl, "application/x-rpm", null, input);
} catch (Exception e) {
e.printStackTrace();
}

@ -100,7 +100,7 @@ public class rssParser extends AbstractParser implements Parser {
parserName = "Rich Site Summary/Atom Feed Parser";
}
public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
try {
LinkedList feedSections = new LinkedList();

@ -80,7 +80,7 @@ implements Parser {
parserName = "Rich Text Format Parser";
}
public plasmaParserDocument parse(URL location, String mimeType,
public plasmaParserDocument parse(URL location, String mimeType, String charset,
InputStream source) throws ParserException, InterruptedException {

@ -94,7 +94,7 @@ public class tarParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES;
}
public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
try {
// creating a new parser class to parse the unzipped content
@ -153,7 +153,7 @@ public class tarParser extends AbstractParser implements Parser {
checkInterruption();
// parsing the content
theDoc = theParser.parseSource(new URL(tempFile),entryMime,tempFile);
theDoc = theParser.parseSource(new URL(tempFile),entryMime,null,tempFile);
} finally {
if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){}
}

@ -1,55 +1,62 @@
<?xml version="1.0"?>
<project name="YACY - vcfParser" default="dist">
<description>
A class to parse vCard files
</description>
<description>A class to parse vCard files</description>
<property name="parserShortName" value="vcf"/>
<property name="parserVersion" value="0.1"/>
<property name="parserLongName" value="yacyContentParser_${parserShortName}"/>
<property name="parserArchive" location="${release}/${parserLongName}_${parserVersion}.tgz"/>
<!-- compile the sources of this parser -->
<target name="compile">
<javac srcdir="${src}/de/anomic/plasma/parser/${parserShortName}" destdir="${build}" source="${javacSource}" target="${javacTarget}" debug="true" debuglevel="lines,vars,source">
<classpath>
<pathelement location="${build}" />
<!-- libraries needed by this parser -->
<pathelement location="${libx}/commons-codec-1.3.jar" />
</classpath>
</javac>
</target>
<!-- add all parts of this parser into a single zip file -->
<target name="zip" depends="compile">
<tar destfile="${parserArchive}" compression="gzip">
<!-- needed libraries -->
<tarfileset dir="${libx}"
includes="commons-codec-1.3.*"
prefix="${releaseFileParentDir}/libx/"
dirmode="755" mode="644"/>
<!-- source files of this parser -->
<tarfileset dir="${src}/de/anomic/plasma/parser/${parserShortName}"
prefix="${releaseFileParentDir}/source/de/anomic/plasma/parser/${parserShortName}"
dirmode="755" mode="644"/>
<!-- class files of this parser -->
<tarfileset dir="${build}/de/anomic/plasma/parser/${parserShortName}"
prefix="${releaseFileParentDir}/classes/de/anomic/plasma/parser/${parserShortName}"
dirmode="755" mode="644"/>
</tar>
</target>
<!-- just copy all parts of this parser into the release directory -->
<target name="copy" depends="compile">
<!-- copy needed libs -->
<copy todir="${release}/libx/">
<fileset dir="${libx}" includes="commons-codec-1.3.*"/>
</copy>
</copy>
<!-- copy source code files -->
<copy todir="${release}/source/de/anomic/plasma/parser/${parserShortName}">
<fileset dir="${src}/de/anomic/plasma/parser/${parserShortName}" includes="**/*"/>
</copy>
<!-- copy compiled classes -->
<copy todir="${release}/classes/de/anomic/plasma/parser/${parserShortName}">
<fileset dir="${build}/de/anomic/plasma/parser/${parserShortName}" includes="**/*"/>
</copy>
</target>
<target name="dist" depends="compile,zip" description="Compile and zip the parser"/>
</project>

@ -90,14 +90,14 @@ public class vcfParser extends AbstractParser implements Parser {
public vcfParser() {
super(LIBX_DEPENDENCIES);
parserName = "vCard Parser";
this.parserName = "vCard Parser";
}
public Hashtable getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
try {
StringBuffer parsedTitle = new StringBuffer();
@ -109,7 +109,9 @@ public class vcfParser extends AbstractParser implements Parser {
boolean useLastLine = false;
int lineNr = 0;
String line = null;
BufferedReader inputReader = new BufferedReader(new InputStreamReader(source));
BufferedReader inputReader = (charset!=null)
? new BufferedReader(new InputStreamReader(source,charset))
: new BufferedReader(new InputStreamReader(source));
while (true) {
// check for interruption
checkInterruption();
@ -236,21 +238,26 @@ public class vcfParser extends AbstractParser implements Parser {
}
}
String[] sections = (String[]) parsedNames.toArray(new String[parsedNames.size()]);
byte[] text = parsedDataText.toString().getBytes();
plasmaParserDocument theDoc = new plasmaParserDocument(
location,
mimeType,
null,
null,
parsedTitle.toString(),
(String[]) parsedNames.toArray(new String[parsedNames.size()]),
"vCard",
parsedDataText.toString().getBytes(),
anchors,
null);
location, // url of the source document
mimeType, // the documents mime type
null, // a list of extracted keywords
null, // a short document title
parsedTitle.toString(), // a long document title
sections, // an array of section headlines
"vCard", // an abstract
text, // the parsed document text
anchors, // a map of extracted anchors
null); // a treeset of image URLs
return theDoc;
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
throw new ParserException("Unable to parse the vcard content. " + e.getMessage());
String errorMsg = "Unable to parse the vcard content. " + e.getMessage();
this.theLogger.logSevere(errorMsg);
throw new ParserException(errorMsg);
} finally {
}
}
@ -267,7 +274,7 @@ public class vcfParser extends AbstractParser implements Parser {
vcfParser testParser = new vcfParser();
byte[] content = httpc.singleGET(contentUrl, contentUrl.getHost(), 10000, null, null, null);
ByteArrayInputStream input = new ByteArrayInputStream(content);
testParser.parse(contentUrl, "text/x-vcard", input);
testParser.parse(contentUrl, "text/x-vcard", "UTF-8",input);
} catch (Exception e) {
e.printStackTrace();
}

@ -91,7 +91,7 @@ public class zipParser extends AbstractParser implements Parser {
return SUPPORTED_MIME_TYPES;
}
public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException, InterruptedException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
try {
StringBuffer docKeywords = new StringBuffer();
@ -132,7 +132,7 @@ public class zipParser extends AbstractParser implements Parser {
checkInterruption();
// parsing the content
plasmaParserDocument theDoc = theParser.parseSource(location,entryMime,ut);
plasmaParserDocument theDoc = theParser.parseSource(location,entryMime,null, ut);
if (theDoc == null) continue;
// merging all documents together

@ -465,12 +465,12 @@ public final class plasmaParser {
} catch (Exception e) { }
}
public plasmaParserDocument parseSource(URL location, String mimeType, byte[] source) throws InterruptedException {
public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] source) throws InterruptedException {
File tempFile = null;
try {
tempFile = File.createTempFile("parseSource", ".tmp");
serverFileUtils.write(source, tempFile);
return parseSource(location, mimeType, tempFile);
return parseSource(location, mimeType, charset, tempFile);
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
serverLog.logSevere("PARSER", "parseSource1: " + e.getMessage(), e);
@ -481,7 +481,7 @@ public final class plasmaParser {
}
public plasmaParserDocument parseSource(URL location, String mimeType, File sourceFile) throws InterruptedException {
public plasmaParserDocument parseSource(URL location, String mimeType, String charset, File sourceFile) throws InterruptedException {
Parser theParser = null;
try {
@ -546,10 +546,12 @@ public final class plasmaParser {
// if a parser was found we use it ...
if (theParser != null) {
return theParser.parse(location, mimeType,sourceFile);
return theParser.parse(location, mimeType,charset,sourceFile);
} else if (realtimeParsableMimeTypesContains(mimeType)) {
// ...otherwise we make a scraper and transformer
htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
scraper.setCharset(PARSER_MODE_URLREDIRECTOR);
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
serverFileUtils.copy(sourceFile, hfos);
hfos.close();
@ -691,6 +693,7 @@ public final class plasmaParser {
File contentFile = null;
URL contentURL = null;
String contentMimeType = "application/octet-stream";
String charSet = "UTF-8";
if (args.length < 2) {
System.err.println("Usage: java de.anomic.plasma.plasmaParser (-f filename|-u URL) [-m mimeType]");
@ -715,6 +718,10 @@ public final class plasmaParser {
contentMimeType = args[3];
}
if ((args.length == 6)&&(args[4].equalsIgnoreCase("-c"))) {
charSet = args[5];
}
// creating a plasma parser
plasmaParser theParser = new plasmaParser();
@ -725,7 +732,7 @@ public final class plasmaParser {
plasmaParser.enableAllParsers(PARSER_MODE_PROXY);
// parsing the content
plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, contentFile);
plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, charSet, contentFile);
// printing out all parsed sentences
if (document != null) {

@ -389,12 +389,12 @@ public class plasmaSnippetCache {
supposedMime = plasmaParser.getMimeTypeByFileExt(filename.substring(p + 1));
}
return this.parser.parseSource(url, supposedMime, resource);
return this.parser.parseSource(url, supposedMime, null, resource);
}
return null;
}
if (plasmaParser.supportedMimeTypesContains(docInfo.getMimeType())) {
return this.parser.parseSource(url, docInfo.getMimeType(), resource);
return this.parser.parseSource(url, docInfo.getMimeType(), docInfo.getCharSet(), resource);
}
return null;
} catch (InterruptedException e) {

@ -1398,6 +1398,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// the mimetype of this entry
String mimeType = entry.getMimeType();
String charset = entry.getCharSet();
// the parser logger
serverLog parserLogger = parser.getLogger();
@ -1409,7 +1410,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
){
if ((entry.cacheFile().exists()) && (entry.cacheFile().length() > 0)) {
parserLogger.logFine("'" + entry.normalizedURLString() + "' is not parsed yet, parsing now from File");
document = parser.parseSource(entry.url(), mimeType, entry.cacheFile());
document = parser.parseSource(entry.url(), mimeType, charset, entry.cacheFile());
} else {
parserLogger.logFine("'" + entry.normalizedURLString() + "' cannot be parsed, no resource available");
addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT, new bitfield(indexURL.urlFlagLength));

@ -320,6 +320,11 @@ public class plasmaSwitchboardQueue {
return (info == null) ? null : info.getMimeType();
}
/**
 * Looks up the charset recorded in the cached object info of this entry.
 * @return the charset reported by the cached object info, or
 *         <code>null</code> if no cached object info is available
 */
public String getCharSet() {
    final IResourceInfo cachedInfo = this.getCachedObjectInfo();
    if (cachedInfo != null) {
        return cachedInfo.getCharSet();
    }
    return null;
}
public Date getModificationDate() {
IResourceInfo info = this.getCachedObjectInfo();
return (info == null) ? new Date() : info.getModificationDate();

Loading…
Cancel
Save