- more refactoring

- fixed problem with parsers

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6433 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent e48f3dfb1e
commit 26fafd85a5

@ -41,8 +41,8 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.data.bookmarksDB;
import de.anomic.data.listManager;
import de.anomic.data.userDB;

@ -43,8 +43,8 @@ import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.crawler.retrieval.Response;
import de.anomic.http.client.Client;
import de.anomic.http.client.Cache;

@ -5,9 +5,9 @@ import java.util.Set;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;

@ -49,8 +49,8 @@ import net.yacy.kelondro.util.MemoryControl;
import net.yacy.kelondro.util.MemoryTracker;
import net.yacy.kelondro.util.SetTools;
import net.yacy.kelondro.util.ISO639;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.data.DidYouMean;
import de.anomic.data.LibraryProvider;
import de.anomic.http.server.HeaderFramework;

@ -33,9 +33,9 @@ import java.net.MalformedURLException;
import net.yacy.document.content.DCEntry;
import net.yacy.document.content.file.SurrogateReader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;

@ -33,8 +33,8 @@ import net.yacy.document.Document;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.retrieval.LoaderDispatcher;
public class MediaSnippet {
public int type;

@ -139,6 +139,7 @@ import net.yacy.kelondro.workflow.InstantBusyThread;
import net.yacy.kelondro.workflow.WorkflowJob;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.kelondro.workflow.WorkflowThread;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlQueues;
@ -154,7 +155,6 @@ import de.anomic.crawler.ZURL;
import de.anomic.crawler.CrawlProfile.entry;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.crawler.retrieval.Response;
import de.anomic.data.Blacklist;
import de.anomic.data.DefaultBlacklist;
@ -1618,7 +1618,7 @@ public final class Switchboard extends serverSwitch {
document = TextParser.parseSource(entry.url(), entry.getMimeType(), entry.getCharacterEncoding(), b);
assert(document != null) : "Unexpected error. Parser returned null.";
} catch (final ParserException e) {
this.log.logWarning("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage());
this.log.logWarning("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage(), e);
addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.name(), e.getMessage());
if (document != null) {
document.close();

@ -47,8 +47,8 @@ import net.yacy.kelondro.index.ConcurrentARC;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.util.SetTools;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.crawler.retrieval.Response;
import de.anomic.http.client.Cache;
import de.anomic.http.server.ResponseHeader;

@ -51,10 +51,10 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.OS;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.http.client.Client;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;

@ -32,7 +32,9 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.text.Collator;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
@ -72,8 +74,9 @@ public final class TextParser {
insensitiveCollator.setStrength(Collator.SECONDARY);
insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
}
private static final Map<String, Idiom> mime2parser = new TreeMap<String, Idiom>(insensitiveCollator);
private static final Map<String, Idiom> ext2parser = new TreeMap<String, Idiom>(insensitiveCollator);
private static final Map<String, String> ext2mime = new TreeMap<String, String>(insensitiveCollator);
private static final Set<String> denyMime = new TreeSet<String>(insensitiveCollator);
private static final Set<String> denyExtension = new TreeSet<String>(insensitiveCollator);
@ -123,6 +126,15 @@ public final class TextParser {
if (s != null) log.logSevere("parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'.");
ext2mime.put(ext, prototypeMime);
}
for (String ext: parser.supportedExtensions()) {
// process the extensions
Idiom p0 = ext2parser.get(ext);
if (p0 != null) log.logSevere("parser for extension '" + ext + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'.");
ext2parser.put(ext, parser);
Log.logInfo("PARSER", "Parser for extension '" + ext + "': " + parser.getName());
}
}
public static Document parseSource(
@ -190,36 +202,37 @@ public final class TextParser {
final long contentLength,
final InputStream sourceStream
) throws InterruptedException, ParserException {
try {
if (log.isFine()) log.logFine("Parsing '" + location + "' from stream");
mimeType = normalizeMimeType(mimeType);
final String fileExt = location.getFileExtension();
final String documentCharset = htmlParser.patchCharsetEncoding(charset);
Idiom parser = idiomParser(location, mimeType);
if (parser == null) {
final String errorMsg = "No parser available to parse extension '" + location.getFileExtension() + "' or mimetype '" + mimeType + "'";
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location);
}
if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
if (log.isFine()) log.logFine("Parsing '" + location + "' from stream");
mimeType = normalizeMimeType(mimeType);
final String fileExt = location.getFileExtension();
final String documentCharset = htmlParser.patchCharsetEncoding(charset);
List<Idiom> idioms = idiomParser(location, mimeType);
if (idioms.size() == 0) {
final String errorMsg = "No parser available to parse extension '" + location.getFileExtension() + "' or mimetype '" + mimeType + "'";
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location);
}
if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
Document doc = null;
for (Idiom parser: idioms) {
parser.setContentLength(contentLength);
Document doc = parser.parse(location, mimeType, documentCharset, sourceStream);
if (doc == null) {
final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed: document == null";
log.logWarning("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location);
try {
doc = parser.parse(location, mimeType, documentCharset, sourceStream);
} catch (ParserException e) {
log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
}
return doc;
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
final String errorMsg = "Unexpected exception. " + e.getMessage();
log.logSevere("Unable to parse '" + location + "'. " + errorMsg, e);
if (doc != null) break;
}
if (doc == null) {
final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed.";
log.logWarning("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location);
}
return doc;
}
/**
@ -239,28 +252,46 @@ public final class TextParser {
}
}
private static Idiom idiomParser(final DigestURI url, String mimeType) throws ParserException {
// check mime type
if (mimeType != null) {
mimeType = normalizeMimeType(mimeType);
if (denyMime.contains(mimeType)) throw new ParserException("mime type '" + mimeType + "' is denied", url);
} else {
mimeType = normalizeMimeType(mimeType);
}
Idiom idiom = mime2parser.get(mimeType);
if (idiom != null) return idiom;
/**
* find a parser for a given url and mime type
* because mime types returned by web servers are sometimes wrong, we also compute the mime type again
* from the extension that can be extracted from the url path. That means that there are 3 criteria
* that can be used to select a parser:
* - the given extension
* - the given mime type
* - the mime type computed from the extension
* @param url the given url
* @param mimeType1 the given mime type
* @return a list of Idiom parsers that may be appropriate for the given criteria
* @throws ParserException if the file extension or the given mime type is denied
*/
private static List<Idiom> idiomParser(final DigestURI url, String mimeType1) throws ParserException {
List<Idiom> idioms = new ArrayList<Idiom>(2);
// check extension
String ext = url.getFileExtension();
if (ext == null || ext.length() == 0) throw new ParserException("no file extension", url);
if (denyExtension.contains(ext)) throw new ParserException("file extension '" + ext + "' is denied", url);
mimeType = ext2mime.get(ext);
if (mimeType == null) throw new ParserException("no parser available", url);
idiom = mime2parser.get(mimeType);
assert idiom != null;
if (idiom == null) throw new ParserException("no parser available (internal error!)", url);
return idiom;
Idiom idiom;
if (ext != null && ext.length() > 0) {
if (denyExtension.contains(ext)) throw new ParserException("file extension '" + ext + "' is denied", url);
idiom = ext2parser.get(ext);
if (idiom != null) idioms.add(idiom);
}
// check given mime type
if (mimeType1 != null) {
mimeType1 = normalizeMimeType(mimeType1);
if (denyMime.contains(mimeType1)) throw new ParserException("mime type '" + mimeType1 + "' is denied", url);
idiom = mime2parser.get(mimeType1);
if (idiom != null && !idioms.contains(idiom)) idioms.add(idiom);
}
// check mime type computed from extension
String mimeType2 = ext2mime.get(ext);
if (mimeType2 == null || denyMime.contains(mimeType2)) return idioms; // in this case we are a bit more lazy
idiom = mime2parser.get(mimeType2);
if (idiom != null && !idioms.contains(idiom)) idioms.add(idiom);
return idioms;
}
public static String supportsMime(String mimeType) {

@ -40,7 +40,6 @@ import net.yacy.kelondro.data.meta.DigestURI;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
public class pptParser extends AbstractParser implements Idiom {
/**
@ -112,7 +111,8 @@ public class pptParser extends AbstractParser implements Idiom {
/*
* an unexpected error occurred, log it and throw a ParserException
*/
*/
e.printStackTrace();
final String errorMsg = "Unable to parse the ppt document '" + location + "':" + e.getMessage();
this.theLogger.logSevere(errorMsg);
throw new ParserException(errorMsg, location);

@ -130,7 +130,8 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener {
/*
* an unexpected error occurred, log it and throw a ParserException
*/
*/
e.printStackTrace();
final String errorMsg = "Unable to parse the xls document '" + location + "':" + e.getMessage();
this.theLogger.logSevere(errorMsg);
throw new ParserException(errorMsg, location);

@ -24,7 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.crawler.retrieval;
package net.yacy.repository;
import java.io.ByteArrayInputStream;
import java.io.IOException;
@ -46,6 +46,10 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.FTPLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.http.client.Cache;
import de.anomic.http.client.Client;
import de.anomic.http.server.HeaderFramework;
Loading…
Cancel
Save