diff --git a/.classpath b/.classpath
index f03a811ba..70e8f7fc5 100644
--- a/.classpath
+++ b/.classpath
@@ -35,7 +35,7 @@
-
+
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index ae8814eb1..5a3966c57 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -42,6 +42,7 @@ import java.util.regex.PatternSyntaxException;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
+import net.yacy.document.Document;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.meta.DigestURI;
@@ -312,9 +313,9 @@ public class Crawler_p {
sb.crawlQueues.errorURL.remove(urlhash);
// get a scraper to get the title
- final ContentScraper scraper = sb.loader.parseResource(url, CacheStrategy.IFFRESH);
- final String title = scraper == null ? url.toNormalform(true, true) : scraper.getTitle();
- final String description = scraper.getDescription();
+ final Document scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH);
+ final String title = scraper == null ? url.toNormalform(true, true) : scraper.dc_title();
+ final String description = scraper.dc_description();
// stack url
sb.crawler.removePassive(crawlingStartURL.hash()); // if there is an old entry, delete it
@@ -357,7 +358,7 @@ public class Crawler_p {
//final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
tags.add("crawlStart");
- final String[] keywords = scraper.getKeywords();
+ final String[] keywords = scraper.dc_subject();
if (keywords != null) {
for (final String k: keywords) {
final String kk = BookmarkHelper.cleanTagsString(k);
@@ -534,8 +535,7 @@ public class Crawler_p {
try {
final DigestURI sitelistURL = new DigestURI(crawlingStart);
// download document
- ContentScraper scraper = null;
- scraper = sb.loader.parseResource(sitelistURL, CacheStrategy.IFFRESH);
+ Document scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH);
// String title = scraper.getTitle();
// String description = scraper.getDescription();
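The hunks above replace the HTML-only ContentScraper with the format-independent net.yacy.document.Document. A minimal caller-side sketch of the accessor mapping, using only methods that appear in this diff (sb and url stand for the Switchboard and DigestURI already in scope in Crawler_p; the extra null guards are an assumption that mirrors the title fallback above, not part of the commit itself):

    // ContentScraper (old, HTML-specific)  ->  Document (new, produced by the full parser chain)
    // scraper.getTitle()                   ->  doc.dc_title()
    // scraper.getDescription()             ->  doc.dc_description()
    // scraper.getKeywords()                ->  doc.dc_subject()
    final Document doc = sb.loader.loadDocument(url, CacheStrategy.IFFRESH);
    final String title = doc == null ? url.toNormalform(true, true) : doc.dc_title();
    final String description = doc == null ? "" : doc.dc_description();
    final String[] keywords = doc == null ? new String[0] : doc.dc_subject();
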
diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java
index 9a99c21d3..a2c483543 100644
--- a/htroot/api/getpageinfo_p.java
+++ b/htroot/api/getpageinfo_p.java
@@ -1,3 +1,28 @@
+// getpageinfo_p
+// (C) 2011 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 11.11.2011 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate$
+// $LastChangedRevision$
+// $LastChangedBy$
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.IOException;
import java.net.MalformedURLException;
@@ -10,7 +35,6 @@ import javax.xml.parsers.ParserConfigurationException;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
-import net.yacy.document.parser.html.ContentScraper;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
@@ -68,9 +92,9 @@ public class getpageinfo_p {
} catch (final MalformedURLException e) {
Log.logException(e);
}
- ContentScraper scraper = null;
+ net.yacy.document.Document scraper = null;
if (u != null) try {
- scraper = sb.loader.parseResource(u, CacheStrategy.IFEXIST);
+ scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST);
} catch (final IOException e) {
Log.logException(e);
// bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
@@ -78,13 +102,13 @@ public class getpageinfo_p {
}
if (scraper != null) {
// put the document title
- prop.putXML("title", scraper.getTitle());
+ prop.putXML("title", scraper.dc_title());
// put the favicon that belongs to the document
prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString());
// put keywords
- final String list[] = scraper.getKeywords();
+ final String list[] = scraper.dc_subject();
int count = 0;
for (final String element: list) {
final String tag = element;
@@ -95,7 +119,7 @@ public class getpageinfo_p {
}
prop.put("tags", count);
// put description
- prop.putXML("desc", scraper.getDescription());
+ prop.putXML("desc", scraper.dc_description());
// put language
final Set<String> languages = scraper.getContentLanguages();
prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next());
diff --git a/source/net/yacy/peers/operation/yacyRelease.java b/source/net/yacy/peers/operation/yacyRelease.java
index 8243cc978..8c0e29ef4 100644
--- a/source/net/yacy/peers/operation/yacyRelease.java
+++ b/source/net/yacy/peers/operation/yacyRelease.java
@@ -52,7 +52,7 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
-import net.yacy.document.parser.html.ContentScraper;
+import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.logging.Log;
@@ -234,11 +234,11 @@ public final class yacyRelease extends yacyVersion {
// this is done by contacting a release location,
// parsing the content and filtering+parsing links
// returns the version info if successful, null otherwise
- ContentScraper scraper;
+ Document scraper;
try {
final DigestURI uri = location.getLocationURL();
Thread.currentThread().setName("allReleaseFrom - host " + uri.getHost()); // makes it more easy to see which release blocks process in thread dump
- scraper = Switchboard.getSwitchboard().loader.parseResource(uri, CacheStrategy.NOCACHE);
+ scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE);
} catch (final IOException e) {
return null;
}
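The comment in this hunk describes the release-discovery flow: contact a release location, parse the page, then filter the links it contains for version information. A sketch of just the load-and-parse step under the new API, grounded in the hunk above; the link filtering itself is not part of this diff and is left as a placeholder comment. Note that NOCACHE forces a live fetch here (presumably so a stale cached copy cannot hide new releases), while the other call sites in this commit accept cached copies via IFFRESH and IFEXIST:

    Document scraper;
    try {
        final DigestURI uri = location.getLocationURL();
        // NOCACHE: never satisfy this request from the local HTCache
        scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE);
    } catch (final IOException e) {
        return null; // location unreachable or unparseable: no version info available
    }
    // ... filter and parse the links found in 'scraper' to build the version info (unchanged by this commit)
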
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index 765eb0063..b85eba3a0 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -26,7 +26,6 @@
package net.yacy.repository;
-import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
@@ -50,21 +49,19 @@ import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
-import net.yacy.document.parser.htmlParser;
-import net.yacy.document.parser.html.ContentScraper;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segments;
import de.anomic.crawler.CrawlProfile;
+import de.anomic.crawler.ZURL.FailCategory;
import de.anomic.crawler.retrieval.FTPLoader;
import de.anomic.crawler.retrieval.FileLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.crawler.retrieval.SMBLoader;
-import de.anomic.crawler.ZURL.FailCategory;
import de.anomic.http.client.Cache;
public final class LoaderDispatcher {
@@ -192,7 +189,7 @@ public final class LoaderDispatcher {
final String host = url.getHost();
// check if url is in blacklist
- if (checkBlacklist && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, host.toLowerCase(), url.getFile())) {
+ if (checkBlacklist && host != null && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, host.toLowerCase(), url.getFile())) {
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
}
@@ -290,7 +287,7 @@ public final class LoaderDispatcher {
if (response.getContent() == null) {
throw new IOException("empty response (code " + response.getStatus() + ") for url " + url);
}
-
+
// we got something. Now check if we want to store that to the cache
// first check looks if we want to store the content to the cache
if (crawlProfile == null || !crawlProfile.storeHTCache()) {
@@ -352,16 +349,22 @@ public final class LoaderDispatcher {
return response.parse();
}
- public ContentScraper parseResource(final DigestURI location, final CacheStrategy cachePolicy) throws IOException {
- // load page
- final Response r = this.load(request(location, true, false), cachePolicy, true);
- final byte[] page = (r == null) ? null : r.getContent();
- if (page == null) throw new IOException("no response from url " + location.toString());
+ public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy) throws IOException {
+ // load resource
+ Request request = request(location, true, false);
+ final Response response = this.load(request, cachePolicy, 10000, true);
+ final DigestURI url = request.url();
+ if (response == null) throw new IOException("no Response for url " + url);
+
+ // if it is still not available, report an error
+ if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url);
+ // parse resource
try {
- return htmlParser.parseToScraper(location, r.getCharacterEncoding(), new ByteArrayInputStream(page));
+ Document[] documents = response.parse();
+ return Document.mergeDocuments(location, response.getMimeType(), documents);
} catch(final Parser.Failure e) {
- throw new IOException(e.getMessage());
+ throw new IOException(e.getMessage());
}
}
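
Taken together, the new loadDocument replaces the scraper-based parseResource: it loads a Response under the given cache policy, fails fast when there is no response or no content, parses the response with the regular parser chain, and flattens the resulting Document[] into a single Document via Document.mergeDocuments (some parsers yield several documents for one resource, e.g. container formats). A minimal usage sketch from an arbitrary caller; the URL is a placeholder and the logging calls are illustrative, not taken from this commit:

    // assumes a running Switchboard, as in the call sites above
    final LoaderDispatcher loader = Switchboard.getSwitchboard().loader;
    try {
        final DigestURI url = new DigestURI("http://example.org/"); // placeholder URL
        final Document doc = loader.loadDocument(url, CacheStrategy.IFEXIST);
        if (doc != null) {
            Log.logInfo("loadDocument", doc.dc_title() + " - " + doc.dc_description());
        }
    } catch (final IOException e) {
        // thrown when the URL is malformed, nothing could be loaded, or parsing failed
        Log.logException(e);
    }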