using the generic document parser for crawl starts instead of the html

parser. This makes it possible that every type of document can be a
crawl start point, not only text documents or html documents. Tested
this with a pdf document.
Michael Peter Christen 13 years ago
parent 33a71a61fa
commit ef5192f8c9
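
In short, call sites move from the html-only ContentScraper to the generic Document interface. A before/after sketch assembled from the Crawler_p hunk below, not extra code from the commit; the renamed accessors are taken directly from the diff:

// before: a crawl start could only be parsed by the html scraper
final ContentScraper scraper = sb.loader.parseResource(url, CacheStrategy.IFFRESH);
final String title = scraper == null ? url.toNormalform(true, true) : scraper.getTitle();

// after: any parseable document (html, pdf, ...) can seed a crawl
final Document doc = sb.loader.loadDocument(url, CacheStrategy.IFFRESH);
final String title = doc == null ? url.toNormalform(true, true) : doc.dc_title();

// accessor mapping used throughout the commit:
// getTitle() -> dc_title(), getDescription() -> dc_description(), getKeywords() -> dc_subject()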

.classpath
@@ -35,7 +35,7 @@
 <classpathentry kind="lib" path="lib/json-simple-1.1.jar"/>
 <classpathentry kind="lib" path="lib/fontbox-1.6.0.jar"/>
 <classpathentry kind="lib" path="lib/jempbox-1.6.0.jar"/>
-<classpathentry kind="lib" path="lib/pdfbox-1.6.0.jar"/>
+<classpathentry kind="lib" path="lib/pdfbox-1.6.0.jar" sourcepath="/Users/admin/.m2/repository/org/apache/pdfbox/pdfbox/1.6.0/pdfbox-1.6.0-sources.jar"/>
 <classpathentry kind="lib" path="lib/commons-io-2.0.1.jar"/>
 <classpathentry kind="lib" path="lib/xercesImpl.jar"/>
 <classpathentry kind="lib" path="lib/xml-apis.jar"/>

Crawler_p.java
@@ -42,6 +42,7 @@ import java.util.regex.PatternSyntaxException;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
+import net.yacy.document.Document;
 import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.document.parser.html.TransformerWriter;
 import net.yacy.kelondro.data.meta.DigestURI;
@@ -312,9 +313,9 @@ public class Crawler_p {
 sb.crawlQueues.errorURL.remove(urlhash);
 // get a scraper to get the title
-final ContentScraper scraper = sb.loader.parseResource(url, CacheStrategy.IFFRESH);
-final String title = scraper == null ? url.toNormalform(true, true) : scraper.getTitle();
-final String description = scraper.getDescription();
+final Document scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH);
+final String title = scraper == null ? url.toNormalform(true, true) : scraper.dc_title();
+final String description = scraper.dc_description();
 // stack url
 sb.crawler.removePassive(crawlingStartURL.hash()); // if there is an old entry, delete it
@@ -357,7 +358,7 @@ public class Crawler_p {
 //final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
 final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
 tags.add("crawlStart");
-final String[] keywords = scraper.getKeywords();
+final String[] keywords = scraper.dc_subject();
 if (keywords != null) {
 for (final String k: keywords) {
 final String kk = BookmarkHelper.cleanTagsString(k);
@@ -534,8 +535,7 @@ public class Crawler_p {
 try {
 final DigestURI sitelistURL = new DigestURI(crawlingStart);
 // download document
-ContentScraper scraper = null;
-scraper = sb.loader.parseResource(sitelistURL, CacheStrategy.IFFRESH);
+Document scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH);
 // String title = scraper.getTitle();
 // String description = scraper.getDescription();

getpageinfo_p.java
@@ -1,3 +1,28 @@
+// getpageinfo_p
+// (C) 2011 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 11.11.2011 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate$
+// $LastChangedRevision$
+// $LastChangedBy$
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 import java.io.IOException;
 import java.net.MalformedURLException;
@@ -10,7 +35,6 @@ import javax.xml.parsers.ParserConfigurationException;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
-import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.search.Switchboard;
@@ -68,9 +92,9 @@ public class getpageinfo_p {
 } catch (final MalformedURLException e) {
 Log.logException(e);
 }
-ContentScraper scraper = null;
+net.yacy.document.Document scraper = null;
 if (u != null) try {
-scraper = sb.loader.parseResource(u, CacheStrategy.IFEXIST);
+scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST);
 } catch (final IOException e) {
 Log.logException(e);
 // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
@@ -78,13 +102,13 @@ public class getpageinfo_p {
 }
 if (scraper != null) {
 // put the document title
-prop.putXML("title", scraper.getTitle());
+prop.putXML("title", scraper.dc_title());
 // put the favicon that belongs to the document
 prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString());
 // put keywords
-final String list[] = scraper.getKeywords();
+final String list[] = scraper.dc_subject();
 int count = 0;
 for (final String element: list) {
 final String tag = element;
@@ -95,7 +119,7 @@ public class getpageinfo_p {
 }
 prop.put("tags", count);
 // put description
-prop.putXML("desc", scraper.getDescription());
+prop.putXML("desc", scraper.dc_description());
 // put language
 final Set<String> languages = scraper.getContentLanguages();
 prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next());

yacyRelease.java
@@ -52,7 +52,7 @@ import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.protocol.http.HTTPClient;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
-import net.yacy.document.parser.html.ContentScraper;
+import net.yacy.document.Document;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.io.CharBuffer;
 import net.yacy.kelondro.logging.Log;
@@ -234,11 +234,11 @@ public final class yacyRelease extends yacyVersion {
 // this is done by contacting a release location,
 // parsing the content and filtering+parsing links
 // returns the version info if successful, null otherwise
-ContentScraper scraper;
+Document scraper;
 try {
 final DigestURI uri = location.getLocationURL();
 Thread.currentThread().setName("allReleaseFrom - host " + uri.getHost()); // makes it more easy to see which release blocks process in thread dump
-scraper = Switchboard.getSwitchboard().loader.parseResource(uri, CacheStrategy.NOCACHE);
+scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE);
 } catch (final IOException e) {
 return null;
 }

LoaderDispatcher.java
@@ -26,7 +26,6 @@
 package net.yacy.repository;

-import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
 import java.net.MalformedURLException;
@@ -50,21 +49,19 @@ import net.yacy.cora.services.federated.yacy.CacheStrategy;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.document.TextParser;
-import net.yacy.document.parser.htmlParser;
-import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Segments;
 import de.anomic.crawler.CrawlProfile;
+import de.anomic.crawler.ZURL.FailCategory;
 import de.anomic.crawler.retrieval.FTPLoader;
 import de.anomic.crawler.retrieval.FileLoader;
 import de.anomic.crawler.retrieval.HTTPLoader;
 import de.anomic.crawler.retrieval.Request;
 import de.anomic.crawler.retrieval.Response;
 import de.anomic.crawler.retrieval.SMBLoader;
-import de.anomic.crawler.ZURL.FailCategory;
 import de.anomic.http.client.Cache;

 public final class LoaderDispatcher {
@@ -192,7 +189,7 @@ public final class LoaderDispatcher {
 final String host = url.getHost();
 // check if url is in blacklist
-if (checkBlacklist && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, host.toLowerCase(), url.getFile())) {
+if (checkBlacklist && host != null && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, host.toLowerCase(), url.getFile())) {
 this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
 throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
 }
@@ -290,7 +287,7 @@ public final class LoaderDispatcher {
 if (response.getContent() == null) {
 throw new IOException("empty response (code " + response.getStatus() + ") for url " + url);
 }

-// we got something. Now check if we want to store that to the cache
+// first check looks if we want to store the content to the cache
 if (crawlProfile == null || !crawlProfile.storeHTCache()) {
@@ -352,16 +349,22 @@ public final class LoaderDispatcher {
 return response.parse();
 }

-public ContentScraper parseResource(final DigestURI location, final CacheStrategy cachePolicy) throws IOException {
-// load page
-final Response r = this.load(request(location, true, false), cachePolicy, true);
-final byte[] page = (r == null) ? null : r.getContent();
-if (page == null) throw new IOException("no response from url " + location.toString());
+public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy) throws IOException {
+// load resource
+Request request = request(location, true, false);
+final Response response = this.load(request, cachePolicy, 10000, true);
+final DigestURI url = request.url();
+if (response == null) throw new IOException("no Response for url " + url);

+// if it is still not available, report an error
+if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url);

+// parse resource
 try {
-return htmlParser.parseToScraper(location, r.getCharacterEncoding(), new ByteArrayInputStream(page));
+Document[] documents = response.parse();
+return Document.mergeDocuments(location, response.getMimeType(), documents);
 } catch(final Parser.Failure e) {
-throw new IOException(e.getMessage());
+throw new IOException(e.getMessage());
 }
 }
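
Note on this hunk: response.parse() may return several Document objects for one resource (hence Document[]), so the new loadDocument collapses them into the single Document the callers above expect via Document.mergeDocuments. A minimal caller sketch, assuming the classes imported in the hunks above; the URL and helper name are hypothetical, not part of the commit:

// hypothetical caller (sketch): any parseable format can now seed a crawl
static String crawlStartTitle(final LoaderDispatcher loader) throws IOException {
    final DigestURI start = new DigestURI("http://example.org/whitepaper.pdf"); // hypothetical URL
    final Document doc = loader.loadDocument(start, CacheStrategy.IFFRESH);
    // dc_title() is filled by whichever parser handled the mime type, here presumably the pdf parser
    return doc == null ? start.toNormalform(true, true) : doc.dc_title();
}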
