From 26fafd85a5fa8195a095e21f50acb84ec1485062 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 21 Oct 2009 15:12:34 +0000 Subject: [PATCH] - more refactoring - fixed problem with parsers git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6433 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Bookmarks.java | 2 +- htroot/ViewFile.java | 2 +- htroot/api/util/getpageinfo_p.java | 2 +- htroot/yacysearch.java | 2 +- source/de/anomic/crawler/PMHReader.java | 2 +- source/de/anomic/search/MediaSnippet.java | 2 +- source/de/anomic/search/Switchboard.java | 4 +- source/de/anomic/search/TextSnippet.java | 2 +- source/de/anomic/yacy/yacyRelease.java | 2 +- source/net/yacy/document/TextParser.java | 123 +++++++++++------- .../net/yacy/document/parser/pptParser.java | 4 +- .../net/yacy/document/parser/xlsParser.java | 3 +- .../yacy/repository}/LoaderDispatcher.java | 6 +- 13 files changed, 96 insertions(+), 60 deletions(-) rename source/{de/anomic/crawler/retrieval => net/yacy/repository}/LoaderDispatcher.java (96%) diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index b0de4efda..3407b23cc 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -41,8 +41,8 @@ import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.DateFormatter; +import net.yacy.repository.LoaderDispatcher; -import de.anomic.crawler.retrieval.LoaderDispatcher; import de.anomic.data.bookmarksDB; import de.anomic.data.listManager; import de.anomic.data.userDB; diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 1eb0381b6..d57319a00 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -43,8 +43,8 @@ import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.util.FileUtils; +import net.yacy.repository.LoaderDispatcher; -import de.anomic.crawler.retrieval.LoaderDispatcher; import de.anomic.crawler.retrieval.Response; import de.anomic.http.client.Client; import de.anomic.http.client.Cache; diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java index ceadbf74f..aea04309b 100755 --- a/htroot/api/util/getpageinfo_p.java +++ b/htroot/api/util/getpageinfo_p.java @@ -5,9 +5,9 @@ import java.util.Set; import net.yacy.document.parser.html.ContentScraper; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.repository.LoaderDispatcher; import de.anomic.crawler.CrawlProfile; -import de.anomic.crawler.retrieval.LoaderDispatcher; import de.anomic.http.server.RequestHeader; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 42f59446d..4116b610d 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -49,8 +49,8 @@ import net.yacy.kelondro.util.MemoryControl; import net.yacy.kelondro.util.MemoryTracker; import net.yacy.kelondro.util.SetTools; import net.yacy.kelondro.util.ISO639; +import net.yacy.repository.LoaderDispatcher; -import de.anomic.crawler.retrieval.LoaderDispatcher; import de.anomic.data.DidYouMean; import de.anomic.data.LibraryProvider; import de.anomic.http.server.HeaderFramework; diff --git a/source/de/anomic/crawler/PMHReader.java b/source/de/anomic/crawler/PMHReader.java index ead620653..533d8d7b1 100644 --- a/source/de/anomic/crawler/PMHReader.java +++ b/source/de/anomic/crawler/PMHReader.java @@ -33,9 +33,9 @@ import java.net.MalformedURLException; import net.yacy.document.content.DCEntry; import net.yacy.document.content.file.SurrogateReader; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.repository.LoaderDispatcher; import de.anomic.crawler.retrieval.HTTPLoader; -import de.anomic.crawler.retrieval.LoaderDispatcher; import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Response; diff --git a/source/de/anomic/search/MediaSnippet.java b/source/de/anomic/search/MediaSnippet.java index 13fcc4c15..c03e2ce00 100644 --- a/source/de/anomic/search/MediaSnippet.java +++ b/source/de/anomic/search/MediaSnippet.java @@ -33,8 +33,8 @@ import net.yacy.document.Document; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; +import net.yacy.repository.LoaderDispatcher; -import de.anomic.crawler.retrieval.LoaderDispatcher; public class MediaSnippet { public int type; diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index fda0b324c..f3ae9cbdf 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -139,6 +139,7 @@ import net.yacy.kelondro.workflow.InstantBusyThread; import net.yacy.kelondro.workflow.WorkflowJob; import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.kelondro.workflow.WorkflowThread; +import net.yacy.repository.LoaderDispatcher; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.CrawlQueues; @@ -154,7 +155,6 @@ import de.anomic.crawler.ZURL; import de.anomic.crawler.CrawlProfile.entry; import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.Request; -import de.anomic.crawler.retrieval.LoaderDispatcher; import de.anomic.crawler.retrieval.Response; import de.anomic.data.Blacklist; import de.anomic.data.DefaultBlacklist; @@ -1618,7 +1618,7 @@ public final class Switchboard extends serverSwitch { document = TextParser.parseSource(entry.url(), entry.getMimeType(), entry.getCharacterEncoding(), b); assert(document != null) : "Unexpected error. Parser returned null."; } catch (final ParserException e) { - this.log.logWarning("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage()); + this.log.logWarning("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage(), e); addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.name(), e.getMessage()); if (document != null) { document.close(); diff --git a/source/de/anomic/search/TextSnippet.java b/source/de/anomic/search/TextSnippet.java index da5678838..005e5e8bf 100644 --- a/source/de/anomic/search/TextSnippet.java +++ b/source/de/anomic/search/TextSnippet.java @@ -47,8 +47,8 @@ import net.yacy.kelondro.index.ConcurrentARC; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.util.SetTools; +import net.yacy.repository.LoaderDispatcher; -import de.anomic.crawler.retrieval.LoaderDispatcher; import de.anomic.crawler.retrieval.Response; import de.anomic.http.client.Cache; import de.anomic.http.server.ResponseHeader; diff --git a/source/de/anomic/yacy/yacyRelease.java b/source/de/anomic/yacy/yacyRelease.java index 708d6cb08..072d39bd4 100644 --- a/source/de/anomic/yacy/yacyRelease.java +++ b/source/de/anomic/yacy/yacyRelease.java @@ -51,10 +51,10 @@ import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.OS; +import net.yacy.repository.LoaderDispatcher; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.retrieval.HTTPLoader; -import de.anomic.crawler.retrieval.LoaderDispatcher; import de.anomic.http.client.Client; import de.anomic.http.server.HeaderFramework; import de.anomic.http.server.RequestHeader; diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index bec075a4b..80f84289c 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -32,7 +32,9 @@ import java.io.File; import java.io.FileInputStream; import java.io.InputStream; import java.text.Collator; +import java.util.ArrayList; import java.util.HashSet; +import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; @@ -72,8 +74,9 @@ public final class TextParser { insensitiveCollator.setStrength(Collator.SECONDARY); insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION); } - + private static final Map mime2parser = new TreeMap(insensitiveCollator); + private static final Map ext2parser = new TreeMap(insensitiveCollator); private static final Map ext2mime = new TreeMap(insensitiveCollator); private static final Set denyMime = new TreeSet(insensitiveCollator); private static final Set denyExtension = new TreeSet(insensitiveCollator); @@ -123,6 +126,15 @@ public final class TextParser { if (s != null) log.logSevere("parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'."); ext2mime.put(ext, prototypeMime); } + + for (String ext: parser.supportedExtensions()) { + // process the extensions + Idiom p0 = ext2parser.get(ext); + if (p0 != null) log.logSevere("parser for extension '" + ext + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'."); + ext2parser.put(ext, parser); + Log.logInfo("PARSER", "Parser for extension '" + ext + "': " + parser.getName()); + } + } public static Document parseSource( @@ -190,36 +202,37 @@ public final class TextParser { final long contentLength, final InputStream sourceStream ) throws InterruptedException, ParserException { - try { - if (log.isFine()) log.logFine("Parsing '" + location + "' from stream"); - mimeType = normalizeMimeType(mimeType); - final String fileExt = location.getFileExtension(); - final String documentCharset = htmlParser.patchCharsetEncoding(charset); - Idiom parser = idiomParser(location, mimeType); - - if (parser == null) { - final String errorMsg = "No parser available to parse extension '" + location.getFileExtension() + "' or mimetype '" + mimeType + "'"; - log.logInfo("Unable to parse '" + location + "'. " + errorMsg); - throw new ParserException(errorMsg, location); - } - - if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'."); + if (log.isFine()) log.logFine("Parsing '" + location + "' from stream"); + mimeType = normalizeMimeType(mimeType); + final String fileExt = location.getFileExtension(); + final String documentCharset = htmlParser.patchCharsetEncoding(charset); + List idioms = idiomParser(location, mimeType); + + if (idioms.size() == 0) { + final String errorMsg = "No parser available to parse extension '" + location.getFileExtension() + "' or mimetype '" + mimeType + "'"; + log.logInfo("Unable to parse '" + location + "'. " + errorMsg); + throw new ParserException(errorMsg, location); + } + + if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'."); + + Document doc = null; + for (Idiom parser: idioms) { parser.setContentLength(contentLength); - Document doc = parser.parse(location, mimeType, documentCharset, sourceStream); - - if (doc == null) { - final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed: document == null"; - log.logWarning("Unable to parse '" + location + "'. " + errorMsg); - throw new ParserException(errorMsg, location); + try { + doc = parser.parse(location, mimeType, documentCharset, sourceStream); + } catch (ParserException e) { + log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e); } - return doc; - } catch (final Exception e) { - if (e instanceof InterruptedException) throw (InterruptedException) e; - if (e instanceof ParserException) throw (ParserException) e; - final String errorMsg = "Unexpected exception. " + e.getMessage(); - log.logSevere("Unable to parse '" + location + "'. " + errorMsg, e); + if (doc != null) break; + } + + if (doc == null) { + final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed."; + log.logWarning("Unable to parse '" + location + "'. " + errorMsg); throw new ParserException(errorMsg, location); } + return doc; } /** @@ -239,28 +252,46 @@ public final class TextParser { } } - private static Idiom idiomParser(final DigestURI url, String mimeType) throws ParserException { - // check mime type - if (mimeType != null) { - mimeType = normalizeMimeType(mimeType); - if (denyMime.contains(mimeType)) throw new ParserException("mime type '" + mimeType + "' is denied", url); - } else { - mimeType = normalizeMimeType(mimeType); - } - - Idiom idiom = mime2parser.get(mimeType); - if (idiom != null) return idiom; + /** + * find a parser for a given url and mime type + * because mime types returned by web severs are sometimes wrong, we also compute the mime type again + * from the extension that can be extracted from the url path. That means that there are 3 criteria + * that can be used to select a parser: + * - the given extension + * - the given mime type + * - the mime type computed from the extension + * @param url the given url + * @param mimeType the given mime type + * @return a list of Idiom parsers that may be appropriate for the given criteria + * @throws ParserException + */ + private static List idiomParser(final DigestURI url, String mimeType1) throws ParserException { + List idioms = new ArrayList(2); // check extension String ext = url.getFileExtension(); - if (ext == null || ext.length() == 0) throw new ParserException("no file extension", url); - if (denyExtension.contains(ext)) throw new ParserException("file extension '" + ext + "' is denied", url); - mimeType = ext2mime.get(ext); - if (mimeType == null) throw new ParserException("no parser available", url); - idiom = mime2parser.get(mimeType); - assert idiom != null; - if (idiom == null) throw new ParserException("no parser available (internal error!)", url); - return idiom; + Idiom idiom; + if (ext != null && ext.length() > 0) { + if (denyExtension.contains(ext)) throw new ParserException("file extension '" + ext + "' is denied", url); + idiom = ext2parser.get(ext); + if (idiom != null) idioms.add(idiom); + } + + // check given mime type + if (mimeType1 != null) { + mimeType1 = normalizeMimeType(mimeType1); + if (denyMime.contains(mimeType1)) throw new ParserException("mime type '" + mimeType1 + "' is denied", url); + idiom = mime2parser.get(mimeType1); + if (idiom != null && !idioms.contains(idiom)) idioms.add(idiom); + } + + // check mime type computed from extension + String mimeType2 = ext2mime.get(ext); + if (mimeType2 == null || denyMime.contains(mimeType2)) return idioms; // in this case we are a bit more lazy + idiom = mime2parser.get(mimeType2); + if (idiom != null && !idioms.contains(idiom)) idioms.add(idiom); + + return idioms; } public static String supportsMime(String mimeType) { diff --git a/source/net/yacy/document/parser/pptParser.java b/source/net/yacy/document/parser/pptParser.java index 52e956b77..8c7d6f993 100644 --- a/source/net/yacy/document/parser/pptParser.java +++ b/source/net/yacy/document/parser/pptParser.java @@ -40,7 +40,6 @@ import net.yacy.kelondro.data.meta.DigestURI; import org.apache.poi.hslf.extractor.PowerPointExtractor; - public class pptParser extends AbstractParser implements Idiom { /** @@ -112,7 +111,8 @@ public class pptParser extends AbstractParser implements Idiom { /* * an unexpected error occurred, log it and throw a ParserException - */ + */ + e.printStackTrace(); final String errorMsg = "Unable to parse the ppt document '" + location + "':" + e.getMessage(); this.theLogger.logSevere(errorMsg); throw new ParserException(errorMsg, location); diff --git a/source/net/yacy/document/parser/xlsParser.java b/source/net/yacy/document/parser/xlsParser.java index ba4cb82d8..bc86a2ada 100644 --- a/source/net/yacy/document/parser/xlsParser.java +++ b/source/net/yacy/document/parser/xlsParser.java @@ -130,7 +130,8 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener { /* * an unexpected error occurred, log it and throw a ParserException - */ + */ + e.printStackTrace(); final String errorMsg = "Unable to parse the xls document '" + location + "':" + e.getMessage(); this.theLogger.logSevere(errorMsg); throw new ParserException(errorMsg, location); diff --git a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java similarity index 96% rename from source/de/anomic/crawler/retrieval/LoaderDispatcher.java rename to source/net/yacy/repository/LoaderDispatcher.java index 4d6af8aaa..de4a408cc 100644 --- a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -24,7 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.crawler.retrieval; +package net.yacy.repository; import java.io.ByteArrayInputStream; import java.io.IOException; @@ -46,6 +46,10 @@ import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import de.anomic.crawler.CrawlProfile; +import de.anomic.crawler.retrieval.FTPLoader; +import de.anomic.crawler.retrieval.HTTPLoader; +import de.anomic.crawler.retrieval.Request; +import de.anomic.crawler.retrieval.Response; import de.anomic.http.client.Cache; import de.anomic.http.client.Client; import de.anomic.http.server.HeaderFramework;