From ef5192f8c9053e7e392f2b3dc21b339f5f3636c7 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Mon, 23 Jan 2012 17:27:29 +0100
Subject: [PATCH] using the generic document parser for crawl starts instead
 of the html parser. This makes it possible for every type of document to be
 a crawl start point, not only text or html documents. Tested this with a
 pdf document.

---
 .classpath                                    |  2 +-
 htroot/Crawler_p.java                         | 12 +++----
 htroot/api/getpageinfo_p.java                 | 36 +++++++++++++++----
 .../net/yacy/peers/operation/yacyRelease.java |  6 ++--
 .../net/yacy/repository/LoaderDispatcher.java | 29 ++++++++-------
 5 files changed, 56 insertions(+), 29 deletions(-)

diff --git a/.classpath b/.classpath
index f03a811ba..70e8f7fc5 100644
--- a/.classpath
+++ b/.classpath
@@ -35,7 +35,7 @@
-
+

diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index ae8814eb1..5a3966c57 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -42,6 +42,7 @@ import java.util.regex.PatternSyntaxException;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
+import net.yacy.document.Document;
 import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.document.parser.html.TransformerWriter;
 import net.yacy.kelondro.data.meta.DigestURI;
@@ -312,9 +313,9 @@ public class Crawler_p {
                 sb.crawlQueues.errorURL.remove(urlhash);

                 // get a scraper to get the title
-                final ContentScraper scraper = sb.loader.parseResource(url, CacheStrategy.IFFRESH);
-                final String title = scraper == null ? url.toNormalform(true, true) : scraper.getTitle();
-                final String description = scraper.getDescription();
+                final Document scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH);
+                final String title = scraper == null ? url.toNormalform(true, true) : scraper.dc_title();
+                final String description = scraper.dc_description();

                 // stack url
                 sb.crawler.removePassive(crawlingStartURL.hash()); // if there is an old entry, delete it
@@ -357,7 +358,7 @@ public class Crawler_p {
                     //final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
                     final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
                     tags.add("crawlStart");
-                    final String[] keywords = scraper.getKeywords();
+                    final String[] keywords = scraper.dc_subject();
                     if (keywords != null) {
                         for (final String k: keywords) {
                             final String kk = BookmarkHelper.cleanTagsString(k);
@@ -534,8 +535,7 @@ public class Crawler_p {

             try {
                 final DigestURI sitelistURL = new DigestURI(crawlingStart);
                 // download document
-                ContentScraper scraper = null;
-                scraper = sb.loader.parseResource(sitelistURL, CacheStrategy.IFFRESH);
+                Document scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH);
                 // String title = scraper.getTitle();
                 // String description = scraper.getDescription();
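The Crawler_p.java hunks above all follow the same caller-side pattern: the HTML-specific ContentScraper is replaced by the parser-neutral net.yacy.document.Document, and metadata is read through its Dublin Core accessors (dc_title, dc_description, dc_subject) instead of getTitle/getDescription/getKeywords. A minimal sketch of that pattern, outside the patch; the class and method names are illustrative, and only the calls visible in the hunks above are assumed to exist:

    // Illustrative sketch, not part of the patch: reading crawl-start metadata
    // through the generic Document API used in the hunks above.
    import java.io.IOException;

    import net.yacy.cora.services.federated.yacy.CacheStrategy;
    import net.yacy.document.Document;
    import net.yacy.kelondro.data.meta.DigestURI;
    import net.yacy.repository.LoaderDispatcher;

    final class CrawlStartInfoSketch {
        static String describeCrawlStart(final LoaderDispatcher loader, final DigestURI url) {
            Document doc = null;
            try {
                // any parseable document type (html, pdf, ...) can now be a crawl start
                doc = loader.loadDocument(url, CacheStrategy.IFFRESH);
            } catch (final IOException e) {
                // loading or parsing failed; fall back to the plain URL below
            }
            if (doc == null) return url.toNormalform(true, true);
            return doc.dc_title() + " - " + doc.dc_description();
        }
    }

The null guard in the sketch mirrors the fallback used for the title in the hunk above; the accessor calls are exactly those introduced by the patch.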
diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java
index 9a99c21d3..a2c483543 100644
--- a/htroot/api/getpageinfo_p.java
+++ b/htroot/api/getpageinfo_p.java
@@ -1,3 +1,28 @@
+// getpageinfo_p
+// (C) 2011 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 11.11.2011 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate$
+// $LastChangedRevision$
+// $LastChangedBy$
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

 import java.io.IOException;
 import java.net.MalformedURLException;
@@ -10,7 +35,6 @@ import javax.xml.parsers.ParserConfigurationException;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
-import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.search.Switchboard;
@@ -68,9 +92,9 @@ public class getpageinfo_p {
         } catch (final MalformedURLException e) {
             Log.logException(e);
         }
-        ContentScraper scraper = null;
+        net.yacy.document.Document scraper = null;
         if (u != null) try {
-            scraper = sb.loader.parseResource(u, CacheStrategy.IFEXIST);
+            scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST);
         } catch (final IOException e) {
             Log.logException(e);
             // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
@@ -78,13 +102,13 @@
         }
         if (scraper != null) {
             // put the document title
-            prop.putXML("title", scraper.getTitle());
+            prop.putXML("title", scraper.dc_title());

             // put the favicon that belongs to the document
             prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString());

             // put keywords
-            final String list[] = scraper.getKeywords();
+            final String list[] = scraper.dc_subject();
             int count = 0;
             for (final String element: list) {
                 final String tag = element;
@@ -95,7 +119,7 @@
             }
             prop.put("tags", count);
             // put description
-            prop.putXML("desc", scraper.getDescription());
+            prop.putXML("desc", scraper.dc_description());
             // put language
             final Set<String> languages = scraper.getContentLanguages();
             prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next());
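In getpageinfo_p the substitution is the same, but more of the Document accessors are exercised: dc_subject() delivers the keyword list that getKeywords() used to provide, while the favicon and content-language lookups keep their names. A rough sketch of that metadata extraction, detached from the servlet; the helper class, method name and map layout are made up for illustration:

    // Illustrative sketch, not part of the patch: collecting the page metadata
    // that getpageinfo_p exposes, using only accessors visible in the hunk above.
    import java.util.LinkedHashMap;
    import java.util.Map;
    import java.util.Set;

    import net.yacy.document.Document;

    final class PageInfoSketch {
        static Map<String, String> pageInfo(final Document doc) {
            final Map<String, String> info = new LinkedHashMap<String, String>();
            info.put("favicon", doc.getFavicon() == null ? "" : doc.getFavicon().toString());
            // dc_subject() replaces the HTML scraper's getKeywords()
            final String[] keywords = doc.dc_subject();
            info.put("tags", keywords == null ? "0" : Integer.toString(keywords.length));
            // first detected content language, "unknown" if the parser found none
            final Set<?> languages = doc.getContentLanguages();
            info.put("lang", (languages == null || languages.isEmpty())
                    ? "unknown" : languages.iterator().next().toString());
            return info;
        }
    }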
"unknown" : languages.iterator().next()); diff --git a/source/net/yacy/peers/operation/yacyRelease.java b/source/net/yacy/peers/operation/yacyRelease.java index 8243cc978..8c0e29ef4 100644 --- a/source/net/yacy/peers/operation/yacyRelease.java +++ b/source/net/yacy/peers/operation/yacyRelease.java @@ -52,7 +52,7 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.services.federated.yacy.CacheStrategy; -import net.yacy.document.parser.html.ContentScraper; +import net.yacy.document.Document; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.logging.Log; @@ -234,11 +234,11 @@ public final class yacyRelease extends yacyVersion { // this is done by contacting a release location, // parsing the content and filtering+parsing links // returns the version info if successful, null otherwise - ContentScraper scraper; + Document scraper; try { final DigestURI uri = location.getLocationURL(); Thread.currentThread().setName("allReleaseFrom - host " + uri.getHost()); // makes it more easy to see which release blocks process in thread dump - scraper = Switchboard.getSwitchboard().loader.parseResource(uri, CacheStrategy.NOCACHE); + scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE); } catch (final IOException e) { return null; } diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 765eb0063..b85eba3a0 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -26,7 +26,6 @@ package net.yacy.repository; -import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; @@ -50,21 +49,19 @@ import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; -import net.yacy.document.parser.htmlParser; -import net.yacy.document.parser.html.ContentScraper; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; import net.yacy.search.Switchboard; import net.yacy.search.index.Segments; import de.anomic.crawler.CrawlProfile; +import de.anomic.crawler.ZURL.FailCategory; import de.anomic.crawler.retrieval.FTPLoader; import de.anomic.crawler.retrieval.FileLoader; import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Response; import de.anomic.crawler.retrieval.SMBLoader; -import de.anomic.crawler.ZURL.FailCategory; import de.anomic.http.client.Cache; public final class LoaderDispatcher { @@ -192,7 +189,7 @@ public final class LoaderDispatcher { final String host = url.getHost(); // check if url is in blacklist - if (checkBlacklist && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, host.toLowerCase(), url.getFile())) { + if (checkBlacklist && host != null && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, host.toLowerCase(), url.getFile())) { this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1); throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. 
             throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
         }
@@ -290,7 +287,7 @@ public final class LoaderDispatcher {
         if (response.getContent() == null) {
             throw new IOException("empty response (code " + response.getStatus() + ") for url " + url);
         }
-
+
         // we got something. Now check if we want to store that to the cache
         // first check looks if we want to store the content to the cache
         if (crawlProfile == null || !crawlProfile.storeHTCache()) {
@@ -352,16 +349,22 @@
         return response.parse();
     }

-    public ContentScraper parseResource(final DigestURI location, final CacheStrategy cachePolicy) throws IOException {
-        // load page
-        final Response r = this.load(request(location, true, false), cachePolicy, true);
-        final byte[] page = (r == null) ? null : r.getContent();
-        if (page == null) throw new IOException("no response from url " + location.toString());
+    public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy) throws IOException {
+        // load resource
+        Request request = request(location, true, false);
+        final Response response = this.load(request, cachePolicy, 10000, true);
+        final DigestURI url = request.url();
+        if (response == null) throw new IOException("no Response for url " + url);
+
+        // if it is still not available, report an error
+        if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url);
+
         // parse resource
         try {
-            return htmlParser.parseToScraper(location, r.getCharacterEncoding(), new ByteArrayInputStream(page));
+            Document[] documents = response.parse();
+            return Document.mergeDocuments(location, response.getMimeType(), documents);
         } catch(final Parser.Failure e) {
-            throw new IOException(e.getMessage());
+            throw new IOException(e.getMessage());
         }
     }
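The new LoaderDispatcher.loadDocument above is what the other three callers rely on: the response is parsed by whatever parser matches the document type (the commit message mentions a test with a pdf), and because a single resource can yield more than one Document, the results are folded into one object with Document.mergeDocuments. A condensed sketch of that parse-and-merge step, with the loading and error handling of the real method left out; the class and method names are illustrative:

    // Illustrative sketch, not part of the patch: the parse-and-merge step that
    // loadDocument performs once a Response has been fetched successfully.
    import java.io.IOException;

    import de.anomic.crawler.retrieval.Response;
    import net.yacy.document.Document;
    import net.yacy.document.Parser;
    import net.yacy.kelondro.data.meta.DigestURI;

    final class ParseAndMergeSketch {
        static Document parseAndMerge(final DigestURI location, final Response response) throws IOException {
            try {
                // the generic parser framework picks a concrete parser for the
                // document type, so html, pdf and other supported formats all land here
                final Document[] documents = response.parse();
                // fold possibly multiple parser results into a single Document
                return Document.mergeDocuments(location, response.getMimeType(), documents);
            } catch (final Parser.Failure e) {
                throw new IOException(e.getMessage());
            }
        }
    }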