using the generic document parser for crawl starts instead of the html

parser. This makes it possible that every type of document can be a
crawl start point, not only text documents or html documents. Tested
this with a pdf document.
Michael Peter Christen 13 years ago
parent 33a71a61fa
commit ef5192f8c9
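
In short, call sites move from the html-only ContentScraper to the generic Document interface. A before/after sketch assembled from the Crawler_p hunk below, not extra code from the commit; the renamed accessors are taken directly from the diff:

// before: a crawl start could only be parsed by the html scraper
final ContentScraper scraper = sb.loader.parseResource(url, CacheStrategy.IFFRESH);
final String title = scraper == null ? url.toNormalform(true, true) : scraper.getTitle();

// after: any parseable document (html, pdf, ...) can seed a crawl
final Document doc = sb.loader.loadDocument(url, CacheStrategy.IFFRESH);
final String title = doc == null ? url.toNormalform(true, true) : doc.dc_title();

// accessor mapping used throughout the commit:
// getTitle() -> dc_title(), getDescription() -> dc_description(), getKeywords() -> dc_subject()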

.classpath
@@ -35,7 +35,7 @@
 <classpathentry kind="lib" path="lib/json-simple-1.1.jar"/>
 <classpathentry kind="lib" path="lib/fontbox-1.6.0.jar"/>
 <classpathentry kind="lib" path="lib/jempbox-1.6.0.jar"/>
-<classpathentry kind="lib" path="lib/pdfbox-1.6.0.jar"/>
+<classpathentry kind="lib" path="lib/pdfbox-1.6.0.jar" sourcepath="/Users/admin/.m2/repository/org/apache/pdfbox/pdfbox/1.6.0/pdfbox-1.6.0-sources.jar"/>
 <classpathentry kind="lib" path="lib/commons-io-2.0.1.jar"/>
 <classpathentry kind="lib" path="lib/xercesImpl.jar"/>
 <classpathentry kind="lib" path="lib/xml-apis.jar"/>

Crawler_p.java
@@ -42,6 +42,7 @@ import java.util.regex.PatternSyntaxException;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
+import net.yacy.document.Document;
 import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.document.parser.html.TransformerWriter;
 import net.yacy.kelondro.data.meta.DigestURI;
@@ -312,9 +313,9 @@ public class Crawler_p {
 sb.crawlQueues.errorURL.remove(urlhash);
 // get a scraper to get the title
-final ContentScraper scraper = sb.loader.parseResource(url, CacheStrategy.IFFRESH);
-final String title = scraper == null ? url.toNormalform(true, true) : scraper.getTitle();
-final String description = scraper.getDescription();
+final Document scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH);
+final String title = scraper == null ? url.toNormalform(true, true) : scraper.dc_title();
+final String description = scraper.dc_description();
 // stack url
 sb.crawler.removePassive(crawlingStartURL.hash()); // if there is an old entry, delete it
@@ -357,7 +358,7 @@ public class Crawler_p {
 //final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
 final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
 tags.add("crawlStart");
-final String[] keywords = scraper.getKeywords();
+final String[] keywords = scraper.dc_subject();
 if (keywords != null) {
 for (final String k: keywords) {
 final String kk = BookmarkHelper.cleanTagsString(k);
@@ -534,8 +535,7 @@ public class Crawler_p {
 try {
 final DigestURI sitelistURL = new DigestURI(crawlingStart);
 // download document
-ContentScraper scraper = null;
-scraper = sb.loader.parseResource(sitelistURL, CacheStrategy.IFFRESH);
+Document scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH);
 // String title = scraper.getTitle();
 // String description = scraper.getDescription();

getpageinfo_p.java
@@ -1,3 +1,28 @@
+// getpageinfo_p
+// (C) 2011 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 11.11.2011 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate$
+// $LastChangedRevision$
+// $LastChangedBy$
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 import java.io.IOException;
 import java.net.MalformedURLException;
@@ -10,7 +35,6 @@ import javax.xml.parsers.ParserConfigurationException;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
-import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.search.Switchboard;
@@ -68,9 +92,9 @@ public class getpageinfo_p {
 } catch (final MalformedURLException e) {
 Log.logException(e);
 }
-ContentScraper scraper = null;
+net.yacy.document.Document scraper = null;
 if (u != null) try {
-scraper = sb.loader.parseResource(u, CacheStrategy.IFEXIST);
+scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST);
 } catch (final IOException e) {
 Log.logException(e);
 // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
@@ -78,13 +102,13 @@ public class getpageinfo_p {
 }
 if (scraper != null) {
 // put the document title
-prop.putXML("title", scraper.getTitle());
+prop.putXML("title", scraper.dc_title());
 // put the favicon that belongs to the document
 prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString());
 // put keywords
-final String list[] = scraper.getKeywords();
+final String list[] = scraper.dc_subject();
 int count = 0;
 for (final String element: list) {
 final String tag = element;
@@ -95,7 +119,7 @@ public class getpageinfo_p {
 }
 prop.put("tags", count);
 // put description
-prop.putXML("desc", scraper.getDescription());
+prop.putXML("desc", scraper.dc_description());
 // put language
 final Set<String> languages = scraper.getContentLanguages();
 prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next());

yacyRelease.java
@@ -52,7 +52,7 @@ import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.protocol.http.HTTPClient;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
-import net.yacy.document.parser.html.ContentScraper;
+import net.yacy.document.Document;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.io.CharBuffer;
 import net.yacy.kelondro.logging.Log;
@@ -234,11 +234,11 @@ public final class yacyRelease extends yacyVersion {
 // this is done by contacting a release location,
 // parsing the content and filtering+parsing links
 // returns the version info if successful, null otherwise
-ContentScraper scraper;
+Document scraper;
 try {
 final DigestURI uri = location.getLocationURL();
 Thread.currentThread().setName("allReleaseFrom - host " + uri.getHost()); // makes it more easy to see which release blocks process in thread dump
-scraper = Switchboard.getSwitchboard().loader.parseResource(uri, CacheStrategy.NOCACHE);
+scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE);
 } catch (final IOException e) {
 return null;
 }

LoaderDispatcher.java
@@ -26,7 +26,6 @@
 package net.yacy.repository;

-import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
 import java.net.MalformedURLException;
@@ -50,21 +49,19 @@ import net.yacy.cora.services.federated.yacy.CacheStrategy;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.document.TextParser;
-import net.yacy.document.parser.htmlParser;
-import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Segments;
 import de.anomic.crawler.CrawlProfile;
+import de.anomic.crawler.ZURL.FailCategory;
 import de.anomic.crawler.retrieval.FTPLoader;
 import de.anomic.crawler.retrieval.FileLoader;
 import de.anomic.crawler.retrieval.HTTPLoader;
 import de.anomic.crawler.retrieval.Request;
 import de.anomic.crawler.retrieval.Response;
 import de.anomic.crawler.retrieval.SMBLoader;
-import de.anomic.crawler.ZURL.FailCategory;
 import de.anomic.http.client.Cache;

 public final class LoaderDispatcher {
@@ -192,7 +189,7 @@ public final class LoaderDispatcher {
 final String host = url.getHost();
 // check if url is in blacklist
-if (checkBlacklist && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, host.toLowerCase(), url.getFile())) {
+if (checkBlacklist && host != null && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, host.toLowerCase(), url.getFile())) {
 this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
 throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
 }
@@ -290,7 +287,7 @@ public final class LoaderDispatcher {
 if (response.getContent() == null) {
 throw new IOException("empty response (code " + response.getStatus() + ") for url " + url);
 }

-// we got something. Now check if we want to store that to the cache
+// first check looks if we want to store the content to the cache
 if (crawlProfile == null || !crawlProfile.storeHTCache()) {
@@ -352,16 +349,22 @@ public final class LoaderDispatcher {
 return response.parse();
 }

-public ContentScraper parseResource(final DigestURI location, final CacheStrategy cachePolicy) throws IOException {
-// load page
-final Response r = this.load(request(location, true, false), cachePolicy, true);
-final byte[] page = (r == null) ? null : r.getContent();
-if (page == null) throw new IOException("no response from url " + location.toString());
+public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy) throws IOException {
+// load resource
+Request request = request(location, true, false);
+final Response response = this.load(request, cachePolicy, 10000, true);
+final DigestURI url = request.url();
+if (response == null) throw new IOException("no Response for url " + url);

+// if it is still not available, report an error
+if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url);

+// parse resource
 try {
-return htmlParser.parseToScraper(location, r.getCharacterEncoding(), new ByteArrayInputStream(page));
+Document[] documents = response.parse();
+return Document.mergeDocuments(location, response.getMimeType(), documents);
 } catch(final Parser.Failure e) {
-throw new IOException(e.getMessage());
+throw new IOException(e.getMessage());
 }
 }
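
Note on this hunk: response.parse() may return several Document objects for one resource (hence Document[]), so the new loadDocument collapses them into the single Document the callers above expect via Document.mergeDocuments. A minimal caller sketch, assuming the classes imported in the hunks above; the URL and helper name are hypothetical, not part of the commit:

// hypothetical caller (sketch): any parseable format can now seed a crawl
static String crawlStartTitle(final LoaderDispatcher loader) throws IOException {
    final DigestURI start = new DigestURI("http://example.org/whitepaper.pdf"); // hypothetical URL
    final Document doc = loader.loadDocument(start, CacheStrategy.IFFRESH);
    // dc_title() is filled by whichever parser handled the mime type, here presumably the pdf parser
    return doc == null ? start.toNormalform(true, true) : doc.dc_title();
}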
