From ef5192f8c9053e7e392f2b3dc21b339f5f3636c7 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Mon, 23 Jan 2012 17:27:29 +0100
Subject: [PATCH] using the generic document parser for crawl starts instead
 of the html parser. This makes it possible for every type of document to be
 a crawl start point, not only text or html documents. Tested this with a
 pdf document.

---
 .classpath                                    |  2 +-
 htroot/Crawler_p.java                         | 12 +++----
 htroot/api/getpageinfo_p.java                 | 36 +++++++++++++++----
 .../net/yacy/peers/operation/yacyRelease.java |  6 ++--
 .../net/yacy/repository/LoaderDispatcher.java | 29 ++++++++-------
 5 files changed, 56 insertions(+), 29 deletions(-)

diff --git a/.classpath b/.classpath
index f03a811ba..70e8f7fc5 100644
--- a/.classpath
+++ b/.classpath
@@ -35,7 +35,7 @@
-
+

diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index ae8814eb1..5a3966c57 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -42,6 +42,7 @@ import java.util.regex.PatternSyntaxException;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
+import net.yacy.document.Document;
 import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.document.parser.html.TransformerWriter;
 import net.yacy.kelondro.data.meta.DigestURI;
@@ -312,9 +313,9 @@ public class Crawler_p {
                 sb.crawlQueues.errorURL.remove(urlhash);

                 // get a scraper to get the title
-                final ContentScraper scraper = sb.loader.parseResource(url, CacheStrategy.IFFRESH);
-                final String title = scraper == null ? url.toNormalform(true, true) : scraper.getTitle();
-                final String description = scraper.getDescription();
+                final Document scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH);
+                final String title = scraper == null ? url.toNormalform(true, true) : scraper.dc_title();
+                final String description = scraper.dc_description();

                 // stack url
                 sb.crawler.removePassive(crawlingStartURL.hash()); // if there is an old entry, delete it
@@ -357,7 +358,7 @@ public class Crawler_p {
                     //final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
                     final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
                     tags.add("crawlStart");
-                    final String[] keywords = scraper.getKeywords();
+                    final String[] keywords = scraper.dc_subject();
                     if (keywords != null) {
                         for (final String k: keywords) {
                             final String kk = BookmarkHelper.cleanTagsString(k);
@@ -534,8 +535,7 @@ public class Crawler_p {

             try {
                 final DigestURI sitelistURL = new DigestURI(crawlingStart);
                 // download document
-                ContentScraper scraper = null;
-                scraper = sb.loader.parseResource(sitelistURL, CacheStrategy.IFFRESH);
+                Document scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH);
                 // String title = scraper.getTitle();
                 // String description = scraper.getDescription();
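The Crawler_p.java hunks above all follow the same caller-side pattern: the HTML-specific ContentScraper is replaced by the parser-neutral net.yacy.document.Document, and metadata is read through its Dublin Core accessors (dc_title, dc_description, dc_subject) instead of getTitle/getDescription/getKeywords. A minimal sketch of that pattern, outside the patch; the class and method names are illustrative, and only the calls visible in the hunks above are assumed to exist:

    // Illustrative sketch, not part of the patch: reading crawl-start metadata
    // through the generic Document API used in the hunks above.
    import java.io.IOException;

    import net.yacy.cora.services.federated.yacy.CacheStrategy;
    import net.yacy.document.Document;
    import net.yacy.kelondro.data.meta.DigestURI;
    import net.yacy.repository.LoaderDispatcher;

    final class CrawlStartInfoSketch {
        static String describeCrawlStart(final LoaderDispatcher loader, final DigestURI url) {
            Document doc = null;
            try {
                // any parseable document type (html, pdf, ...) can now be a crawl start
                doc = loader.loadDocument(url, CacheStrategy.IFFRESH);
            } catch (final IOException e) {
                // loading or parsing failed; fall back to the plain URL below
            }
            if (doc == null) return url.toNormalform(true, true);
            return doc.dc_title() + " - " + doc.dc_description();
        }
    }

The null guard in the sketch mirrors the fallback used for the title in the hunk above; the accessor calls are exactly those introduced by the patch.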
diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java
index 9a99c21d3..a2c483543 100644
--- a/htroot/api/getpageinfo_p.java
+++ b/htroot/api/getpageinfo_p.java
@@ -1,3 +1,28 @@
+// getpageinfo_p
+// (C) 2011 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 11.11.2011 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate$
+// $LastChangedRevision$
+// $LastChangedBy$
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

 import java.io.IOException;
 import java.net.MalformedURLException;
@@ -10,7 +35,6 @@ import javax.xml.parsers.ParserConfigurationException;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
-import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.search.Switchboard;
@@ -68,9 +92,9 @@ public class getpageinfo_p {
         } catch (final MalformedURLException e) {
             Log.logException(e);
         }
-        ContentScraper scraper = null;
+        net.yacy.document.Document scraper = null;
         if (u != null) try {
-            scraper = sb.loader.parseResource(u, CacheStrategy.IFEXIST);
+            scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST);
         } catch (final IOException e) {
             Log.logException(e);
             // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
@@ -78,13 +102,13 @@
         }
         if (scraper != null) {
             // put the document title
-            prop.putXML("title", scraper.getTitle());
+            prop.putXML("title", scraper.dc_title());

             // put the favicon that belongs to the document
             prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString());

             // put keywords
-            final String list[] = scraper.getKeywords();
+            final String list[] = scraper.dc_subject();
             int count = 0;
             for (final String element: list) {
                 final String tag = element;
@@ -95,7 +119,7 @@
             }
             prop.put("tags", count);
             // put description
-            prop.putXML("desc", scraper.getDescription());
+            prop.putXML("desc", scraper.dc_description());
             // put language
             final Set<String> languages = scraper.getContentLanguages();
             prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next());
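In getpageinfo_p the substitution is the same, but more of the Document accessors are exercised: dc_subject() delivers the keyword list that getKeywords() used to provide, while the favicon and content-language lookups keep their names. A rough sketch of that metadata extraction, detached from the servlet; the helper class, method name and map layout are made up for illustration:

    // Illustrative sketch, not part of the patch: collecting the page metadata
    // that getpageinfo_p exposes, using only accessors visible in the hunk above.
    import java.util.LinkedHashMap;
    import java.util.Map;
    import java.util.Set;

    import net.yacy.document.Document;

    final class PageInfoSketch {
        static Map<String, String> pageInfo(final Document doc) {
            final Map<String, String> info = new LinkedHashMap<String, String>();
            info.put("favicon", doc.getFavicon() == null ? "" : doc.getFavicon().toString());
            // dc_subject() replaces the HTML scraper's getKeywords()
            final String[] keywords = doc.dc_subject();
            info.put("tags", keywords == null ? "0" : Integer.toString(keywords.length));
            // first detected content language, "unknown" if the parser found none
            final Set<?> languages = doc.getContentLanguages();
            info.put("lang", (languages == null || languages.isEmpty())
                    ? "unknown" : languages.iterator().next().toString());
            return info;
        }
    }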
"unknown" : languages.iterator().next()); diff --git a/source/net/yacy/peers/operation/yacyRelease.java b/source/net/yacy/peers/operation/yacyRelease.java index 8243cc978..8c0e29ef4 100644 --- a/source/net/yacy/peers/operation/yacyRelease.java +++ b/source/net/yacy/peers/operation/yacyRelease.java @@ -52,7 +52,7 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.services.federated.yacy.CacheStrategy; -import net.yacy.document.parser.html.ContentScraper; +import net.yacy.document.Document; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.logging.Log; @@ -234,11 +234,11 @@ public final class yacyRelease extends yacyVersion { // this is done by contacting a release location, // parsing the content and filtering+parsing links // returns the version info if successful, null otherwise - ContentScraper scraper; + Document scraper; try { final DigestURI uri = location.getLocationURL(); Thread.currentThread().setName("allReleaseFrom - host " + uri.getHost()); // makes it more easy to see which release blocks process in thread dump - scraper = Switchboard.getSwitchboard().loader.parseResource(uri, CacheStrategy.NOCACHE); + scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE); } catch (final IOException e) { return null; } diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 765eb0063..b85eba3a0 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -26,7 +26,6 @@ package net.yacy.repository; -import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; @@ -50,21 +49,19 @@ import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; -import net.yacy.document.parser.htmlParser; -import net.yacy.document.parser.html.ContentScraper; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; import net.yacy.search.Switchboard; import net.yacy.search.index.Segments; import de.anomic.crawler.CrawlProfile; +import de.anomic.crawler.ZURL.FailCategory; import de.anomic.crawler.retrieval.FTPLoader; import de.anomic.crawler.retrieval.FileLoader; import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Response; import de.anomic.crawler.retrieval.SMBLoader; -import de.anomic.crawler.ZURL.FailCategory; import de.anomic.http.client.Cache; public final class LoaderDispatcher { @@ -192,7 +189,7 @@ public final class LoaderDispatcher { final String host = url.getHost(); // check if url is in blacklist - if (checkBlacklist && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, host.toLowerCase(), url.getFile())) { + if (checkBlacklist && host != null && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, host.toLowerCase(), url.getFile())) { this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1); throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. 
             throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
         }
@@ -290,7 +287,7 @@ public final class LoaderDispatcher {
         if (response.getContent() == null) {
             throw new IOException("empty response (code " + response.getStatus() + ") for url " + url);
         }
-
+
         // we got something. Now check if we want to store that to the cache
         // first check looks if we want to store the content to the cache
         if (crawlProfile == null || !crawlProfile.storeHTCache()) {
@@ -352,16 +349,22 @@
         return response.parse();
     }

-    public ContentScraper parseResource(final DigestURI location, final CacheStrategy cachePolicy) throws IOException {
-        // load page
-        final Response r = this.load(request(location, true, false), cachePolicy, true);
-        final byte[] page = (r == null) ? null : r.getContent();
-        if (page == null) throw new IOException("no response from url " + location.toString());
+    public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy) throws IOException {
+        // load resource
+        Request request = request(location, true, false);
+        final Response response = this.load(request, cachePolicy, 10000, true);
+        final DigestURI url = request.url();
+        if (response == null) throw new IOException("no Response for url " + url);
+
+        // if it is still not available, report an error
+        if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url);
+
         // parse resource
         try {
-            return htmlParser.parseToScraper(location, r.getCharacterEncoding(), new ByteArrayInputStream(page));
+            Document[] documents = response.parse();
+            return Document.mergeDocuments(location, response.getMimeType(), documents);
         } catch(final Parser.Failure e) {
-            throw new IOException(e.getMessage());
+            throw new IOException(e.getMessage());
         }
     }
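The new LoaderDispatcher.loadDocument above is what the other three callers rely on: the response is parsed by whatever parser matches the document type (the commit message mentions a test with a pdf), and because a single resource can yield more than one Document, the results are folded into one object with Document.mergeDocuments. A condensed sketch of that parse-and-merge step, with the loading and error handling of the real method left out; the class and method names are illustrative:

    // Illustrative sketch, not part of the patch: the parse-and-merge step that
    // loadDocument performs once a Response has been fetched successfully.
    import java.io.IOException;

    import de.anomic.crawler.retrieval.Response;
    import net.yacy.document.Document;
    import net.yacy.document.Parser;
    import net.yacy.kelondro.data.meta.DigestURI;

    final class ParseAndMergeSketch {
        static Document parseAndMerge(final DigestURI location, final Response response) throws IOException {
            try {
                // the generic parser framework picks a concrete parser for the
                // document type, so html, pdf and other supported formats all land here
                final Document[] documents = response.parse();
                // fold possibly multiple parser results into a single Document
                return Document.mergeDocuments(location, response.getMimeType(), documents);
            } catch (final Parser.Failure e) {
                throw new IOException(e.getMessage());
            }
        }
    }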