Advanced Crawl from local file : better processing of large files.

Applied strategy : when there is no restriction on domains or
sub-path(s), stack anchor links as soon as they are discovered by the
content scraper instead of waiting for the complete parsing of the file.

This makes it possible to handle a crawling start file with thousands of
links in a reasonable amount of time.
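
In essence, the new code registers a listener on the content scraper and pushes every anchor to the crawl stacker as it is found, roughly like this (a minimal sketch, not the exact servlet code below; crawlingFile, profile, crawlStacker and initiatorHash are assumed to be provided by the caller):

final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000,
        new VocabularyScraper(), profile.timezoneOffset());
// push each anchor to the stacker as soon as it is found, instead of waiting for scraper.getAnchors()
scraper.registerHtmlFilterEventListener(
        new CrawlStarterFromSraper(crawlStacker, initiatorHash, profile, true));
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(new FileInputStream(crawlingFile), writer); // anchors are stacked while this parses
writer.close(); // the production code below also closes the input stream in a finally block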

Performance limitation : even though the crawl starts faster with a large
file, the content of the parsed file is still fully loaded into memory.
luccioman 8 years ago
parent ee92082a3b
commit 47af33a04c

@@ -21,6 +21,7 @@
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Writer;
import java.net.MalformedURLException;
@@ -47,6 +48,7 @@ import net.yacy.cora.util.JSONException;
import net.yacy.cora.util.JSONObject;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.FileCrawlStarterTask;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.NoticedURL.StackType;
@@ -483,22 +485,16 @@ public class Crawler_p {
if ("file".equals(crawlingMode) && post.containsKey("crawlingFile") && crawlingFile != null) {
final String crawlingFileContent = post.get("crawlingFile$file", "");
try {
// check if the crawl filter works correctly
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper(), timezoneOffset);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
if (crawlingFile != null && crawlingFile.exists()) {
FileUtils.copy(new FileInputStream(crawlingFile), writer);
} else {
FileUtils.copy(crawlingFileContent, writer);
}
writer.close();
// get links and generate filter
hyperlinks_from_file = scraper.getAnchors();
if (newcrawlingdepth > 0) {
if (fullDomain) {
/* Crawl is restricted to start domains or sub-paths : we have to get all the start links now.
* Otherwise we can get them asynchronously later, which allows large start crawlingFiles to be handled more efficiently */
hyperlinks_from_file = crawlingFileStart(crawlingFile, timezoneOffset, crawlingFileContent);
newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks_from_file);
} else if (subPath) {
/* Crawl is restricted to start domains or sub-paths : we have to get all the start links now.
* Otherwise we can get them asynchronously later, which allows large start crawlingFiles to be handled more efficiently */
hyperlinks_from_file = crawlingFileStart(crawlingFile, timezoneOffset, crawlingFileContent);
newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks_from_file);
}
}
@@ -627,17 +623,24 @@ public class Crawler_p {
ConcurrentLog.logException(e);
}
} else if ("file".equals(crawlingMode)) {
if (post.containsKey("crawlingFile") && crawlingFile != null && hyperlinks_from_file != null) {
try {
if (newcrawlingdepth > 0) {
if (fullDomain) {
newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks_from_file);
} else if (subPath) {
newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks_from_file);
if (post.containsKey("crawlingFile") && crawlingFile != null) {
try {
if(newcrawlingdepth > 0 && (fullDomain || subPath)) {
/* All links must have already been loaded because they are part of the newcrawlingMustMatch filter */
if(hyperlinks_from_file != null) {
sb.crawler.putActive(handle, profile);
sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks_from_file, profile.timezoneOffset());
}
} else {
/* No restriction on domains or sub-path : we now scrape links and asynchronously push them to the crawlStacker */
final String crawlingFileContent = post.get("crawlingFile$file", "");
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000,
new VocabularyScraper(), profile.timezoneOffset());
FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile,
sb.crawlStacker, sb.peers.mySeed().hash.getBytes());
sb.crawler.putActive(handle, profile);
crawlStarterTask.start();
}
sb.crawler.putActive(handle, profile);
sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks_from_file, profile.timezoneOffset());
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); // crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
@@ -756,6 +759,58 @@ public class Crawler_p {
return prop;
}
/**
* Scrape crawlingFile or crawlingFileContent and get all anchor links from it.
* @param crawlingFile crawl start file (must not be null)
* @param timezoneOffset local timezone offset
* @param crawlingFileContent content of the crawling file (optional : used only when crawlingFile does not exist)
* @return all the anchor links from the crawling file
* @throws MalformedURLException
* @throws IOException
* @throws FileNotFoundException
*/
private static List<AnchorURL> crawlingFileStart(final File crawlingFile, int timezoneOffset,
final String crawlingFileContent) throws MalformedURLException, IOException, FileNotFoundException {
List<AnchorURL> hyperlinks_from_file;
// check if the crawl filter works correctly
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper(), timezoneOffset);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
if((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) {
/* Report a detailed error here to help the user when a wrong file was selected */
if(!crawlingFile.exists()) {
throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " does not exist");
}
if(!crawlingFile.isFile()) {
throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " exists but is not a regular file");
}
if(!crawlingFile.canRead()) {
throw new IOException("Can not read : " + crawlingFile.getAbsolutePath());
}
}
if (crawlingFile != null) {
FileInputStream inStream = null;
try {
inStream = new FileInputStream(crawlingFile);
FileUtils.copy(inStream, writer);
} finally {
if(inStream != null) {
try {
inStream.close();
} catch(IOException ignoredException) {
ConcurrentLog.info("Crawler_p", "Could not close crawlingFile : " + crawlingFile.getAbsolutePath());
}
}
}
} else {
FileUtils.copy(crawlingFileContent, writer);
}
writer.close();
// get links and generate filter
hyperlinks_from_file = scraper.getAnchors();
return hyperlinks_from_file;
}
private static Date timeParser(final boolean recrawlIfOlderCheck, final int number, final String unit) {
if (!recrawlIfOlderCheck) return null;
if ("year".equals(unit)) return new Date(System.currentTimeMillis() - number * AbstractFormatter.normalyearMillis);

@@ -151,26 +151,47 @@ public final class CrawlStacker {
if (CrawlStacker.log.isFinest()) CrawlStacker.log.finest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + ((entry.initiator() == null) ? "" : ASCII.String(entry.initiator())) + ", name=" + entry.name() + ", appdate=" + entry.appdate() + ", depth=" + entry.depth());
this.requestQueue.enQueue(entry);
}
public void enqueueEntriesAsynchronous(
final byte[] initiator,
final String profileHandle,
final List<AnchorURL> hyperlinks,
final int timezoneOffset) {
new Thread() {
new Thread("enqueueEntriesAsynchronous") {
@Override
public void run() {
Thread.currentThread().setName("enqueueEntriesAsynchronous");
enqueueEntries(initiator, profileHandle, hyperlinks, true, timezoneOffset);
}
}.start();
}
private void enqueueEntries(
/**
* Enqueue crawl start entries
* @param initiator Hash of the peer initiating the crawl
* @param profileHandle name of the active crawl profile
* @param hyperlinks crawl starting points links to stack
* @param replace Specify whether old indexed entries should be replaced
* @param timezoneOffset local time-zone offset
*/
public void enqueueEntries(
final byte[] initiator,
final String profileHandle,
final List<AnchorURL> hyperlinks,
final boolean replace,
final int timezoneOffset) {
/* Let's check if the profile is still active before removing any existing entry */
byte[] handle = UTF8.getBytes(profileHandle);
final CrawlProfile profile = this.crawler.get(handle);
if (profile == null) {
String error;
if(hyperlinks.size() == 1) {
error = "Rejected URL : " + hyperlinks.get(0).toNormalform(false) + ". Reason : LOST STACKER PROFILE HANDLE '" + profileHandle + "'";
} else {
error = "Rejected " + hyperlinks.size() + " crawl entries. Reason : LOST STACKER PROFILE HANDLE '" + profileHandle + "'";
}
CrawlStacker.log.info(error); // this is NOT an error but a normal effect when terminating a crawl queue
return;
}
if (replace) {
// delete old entries, if they exist, to force a re-load of the url (that's wanted here)
Set<String> hosthashes = new HashSet<String>();

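For reference, pushing start links synchronously through the now-public enqueueEntries looks roughly like this (a sketch only; crawlStacker, initiatorHash and profile are assumed to be in scope, and this mirrors what CrawlStarterFromSraper below does for each discovered anchor):

try {
    final List<AnchorURL> startLinks = new ArrayList<>();
    startLinks.add(new AnchorURL("http://example.org/"));
    // replace = true : existing index entries for these URLs are deleted so they are re-loaded
    crawlStacker.enqueueEntries(initiatorHash, profile.handle(), startLinks, true, profile.timezoneOffset());
} catch (final MalformedURLException e) {
    ConcurrentLog.logException(e);
}
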
@@ -0,0 +1,100 @@
// CrawlStarterFromSraper.java
// ---------------------------
// Copyright 2016 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.crawler;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.document.parser.html.ContentScraperListener;
/**
* Enqueue an entry to the crawlStacker each time an anchor is discovered by the ContentScraper
* @author luccioman
*
*/
public class CrawlStarterFromSraper implements ContentScraperListener {
private final static ConcurrentLog log = new ConcurrentLog(CrawlStarterFromSraper.class.getSimpleName());
/** CrawlStacker instance : will receive anchor links used as crawl starting points */
private CrawlStacker crawlStacker;
/** Hash of the peer initiating the crawl */
private final byte[] initiatorHash;
/** Active crawl profile */
private CrawlProfile profile;
/** Specify whether old indexed entries should be replaced */
private final boolean replace;
/**
* Constructor
* @param crawlStacker CrawlStacker instance : will receive anchor links used as crawl starting points
* @param initiatorHash Hash of the peer initiating the crawl (must not be null)
* @param profile active crawl profile (must not be null)
* @param replace Specify whether old indexed entries should be replaced
* @throws IllegalArgumentException when a required parameter is null
*/
public CrawlStarterFromSraper(final CrawlStacker crawlStacker, final byte[] initiatorHash,
final CrawlProfile profile,
final boolean replace) {
if(crawlStacker == null) {
throw new IllegalArgumentException("crawlStacker parameter must not be null");
}
this.crawlStacker = crawlStacker;
if(initiatorHash == null) {
throw new IllegalArgumentException("initiatorHash parameter must not be null");
}
this.initiatorHash = initiatorHash;
this.replace = replace;
if(profile == null) {
throw new IllegalArgumentException("profile parameter must not be null");
}
this.profile = profile;
}
@Override
public void scrapeTag0(String tagname, Properties tagopts) {
// Nothing to do on this event
}
@Override
public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
// Nothing to do on this event
}
@Override
public void anchorAdded(String anchorURL) {
List<AnchorURL> urls = new ArrayList<>();
try {
urls.add(new AnchorURL(anchorURL));
this.crawlStacker.enqueueEntries(this.initiatorHash, this.profile.handle(), urls, this.replace, this.profile.timezoneOffset());
} catch (MalformedURLException e) {
log.warn("Malformed URL : " + anchorURL);
}
}
}

@@ -0,0 +1,173 @@
// FileCrawlStarterTask.java
// ---------------------------
// Copyright 2016 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.crawler;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Writer;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.util.FileUtils;
/**
* A task used to trigger crawl starts from a file (HTML or any other supported
* text file) containing anchor links. It does not wait for the full file to be
* parsed before sending anchor links to the crawl stacker and thus can handle
* files with many links.
*
* @author luccioman
*/
public class FileCrawlStarterTask extends Thread {
private final static ConcurrentLog log = new ConcurrentLog(FileCrawlStarterTask.class.getSimpleName());
/** A text file containing crawl start links */
private File crawlingFile;
/** Alternative to crawlingFile : holds file content */
private String crawlingFileContent;
/** Content scraper that will scrape file content */
private ContentScraper scraper;
/** Active crawl profile */
private CrawlProfile profile;
/**
* CrawlStacker instance : will receive anchor links used as crawl starting
* points
*/
private CrawlStacker crawlStacker;
/** Hash of the peer initiating the crawl */
private final byte[] initiatorHash;
/**
* Constructor
*
* @param crawlingFile
* a text file containing crawl start links (alternatively,
* crawlingFileContent parameter can be used)
* @param crawlingFileContent
* content of a text file containing crawl start links
* (alternatively, crawlingFile parameter can be used)
* @param scraper
* ContentScraper instance used to scrape links from the file
* @param profile
* active crawl profile (must not be null)
* @param crawlStacker
* CrawlStacker instance : will receive anchor links used as
* crawl starting points (must not be null)
* @param initiatorHash
* Hash of the peer initiating the crawl
* @throws IllegalArgumentException
* when one of the required parameters is null
* @throws IOException
* when crawlingFileContent is null and crawlingFile does not
* exist or can not be read
*/
public FileCrawlStarterTask(final File crawlingFile, final String crawlingFileContent, final ContentScraper scraper,
final CrawlProfile profile, final CrawlStacker crawlStacker, final byte[] initiatorHash)
throws IllegalArgumentException, FileNotFoundException, IOException {
super(FileCrawlStarterTask.class.getSimpleName());
if (crawlingFile == null && crawlingFileContent == null) {
throw new IllegalArgumentException(
"At least one of crawlingFile or crawlingFileContent parameter must not be null");
}
if ((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) {
/*
* Let's check now whether the crawlingFile exists and can be read so the
* error can be reported synchronously to the caller
*/
if (!crawlingFile.exists()) {
throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " does not exist");
}
if (!crawlingFile.isFile()) {
throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " exists but is not a regular file");
}
if (!crawlingFile.canRead()) {
throw new IOException("Can not read : " + crawlingFile.getAbsolutePath());
}
}
this.crawlingFile = crawlingFile;
this.crawlingFileContent = crawlingFileContent;
if (scraper == null) {
throw new IllegalArgumentException("scraper parameter must not be null");
}
this.scraper = scraper;
if (profile == null) {
throw new IllegalArgumentException("profile parameter must not be null");
}
this.profile = profile;
if (crawlStacker == null) {
throw new IllegalArgumentException("crawlStacker parameter must not be null");
}
this.crawlStacker = crawlStacker;
if (initiatorHash == null) {
throw new IllegalArgumentException("initiatorHash parameter must not be null");
}
this.initiatorHash = initiatorHash;
}
/**
* Run the content scraping on the file and push each anchor link to the
* crawlStacker as soon as it is detected.
*/
@Override
public void run() {
/*
* This is the listener which makes it possible to push links to the
* crawl stacker without waiting for content scraping to complete
*/
CrawlStarterFromSraper anchorListener = new CrawlStarterFromSraper(this.crawlStacker, this.initiatorHash,
this.profile, true);
this.scraper.registerHtmlFilterEventListener(anchorListener);
final Writer writer = new TransformerWriter(null, null, this.scraper, null, false);
FileInputStream inStream = null;
try {
if (this.crawlingFile != null && this.crawlingFile.exists()) {
inStream = new FileInputStream(this.crawlingFile);
FileUtils.copy(inStream, writer);
} else {
FileUtils.copy(this.crawlingFileContent, writer);
}
writer.close();
} catch (IOException e) {
log.severe("Error parsing the crawlingFile " + this.crawlingFile.getAbsolutePath(), e);
} catch (Throwable t) {
/* Other errors are likely to occur when the crawl is interrupted : still log them at warning level to avoid polluting the regular error log */
log.warn("Error parsing the crawlingFile " + this.crawlingFile.getAbsolutePath(), t);
} finally {
if (inStream != null) {
try {
inStream.close();
} catch (IOException e) {
log.warn("Could not close crawlingFile : " + this.crawlingFile.getAbsolutePath());
}
}
}
}
}
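
Typical wiring, as done in Crawler_p above (a sketch; sb, handle, profile, crawlingFile and crawlingFileContent come from the servlet context):

final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000,
        new VocabularyScraper(), profile.timezoneOffset());
final FileCrawlStarterTask task = new FileCrawlStarterTask(crawlingFile, crawlingFileContent,
        scraper, profile, sb.crawlStacker, sb.peers.mySeed().hash.getBytes());
sb.crawler.putActive(handle, profile); // activate the profile before links start arriving
task.start(); // returns immediately ; anchors are stacked while the file is parsed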

@@ -356,7 +356,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (u.endsWith(".")) u = u.substring(0, u.length() - 1); // remove the '.' that was appended above
s = p + 6;
try {
this.anchors.add(new AnchorURL(u));
this.addAnchor(new AnchorURL(u));
continue;
} catch (final MalformedURLException e) {}
}
@@ -505,7 +505,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if(src != null) {
tag.opts.put("src", src.toNormalform(true));
src.setAll(tag.opts);
//this.anchors.add(src); // don't add the frame to the anchors because the webgraph should not contain such links (by definition)
//this.addAnchor(src); // don't add the frame to the anchors because the webgraph should not contain such links (by definition)
this.frames.add(src);
this.evaluationScores.match(Element.framepath, src.toNormalform(true));
}
@@ -539,7 +539,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if(url != null) {
tag.opts.put("href", url.toNormalform(true));
url.setAll(tag.opts);
this.anchors.add(url);
this.addAnchor(url);
}
}
} else if (tag.name.equalsIgnoreCase("link")) {
@@ -574,7 +574,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} else if (rel.equalsIgnoreCase("canonical")) {
tag.opts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next());
newLink.setAll(tag.opts);
this.anchors.add(newLink);
this.addAnchor(newLink);
this.canonical = newLink;
} else if (rel.equalsIgnoreCase("publisher")) {
this.publisher = newLink;
@@ -590,7 +590,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) {
tag.opts.put("name", linktitle);
newLink.setAll(tag.opts);
this.anchors.add(newLink);
this.addAnchor(newLink);
}
}
} else if(tag.name.equalsIgnoreCase("embed") || tag.name.equalsIgnoreCase("source")) { //html5 tag
@@ -605,7 +605,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final EmbedEntry ie = new EmbedEntry(url, width, height, tag.opts.getProperty("type", EMPTY_STRING), tag.opts.getProperty("pluginspage", EMPTY_STRING));
this.embeds.put(url, ie);
url.setAll(tag.opts);
// this.anchors.add(url); // don't add the embed to the anchors because the webgraph should not contain such links (by definition)
// this.addAnchor(url); // don't add the embed to the anchors because the webgraph should not contain such links (by definition)
}
}
} catch (final NumberFormatException e) {}
@@ -615,13 +615,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
AnchorURL url = absolutePath(tag.opts.getProperty("value", EMPTY_STRING));
tag.opts.put("value", url.toNormalform(true));
url.setAll(tag.opts);
this.anchors.add(url);
this.addAnchor(url);
}
} else if (tag.name.equalsIgnoreCase("iframe")) {
final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
tag.opts.put("src", src.toNormalform(true));
src.setAll(tag.opts);
//this.anchors.add(src); // don't add the iframe to the anchors because the webgraph should not contain such links (by definition)
//this.addAnchor(src); // don't add the iframe to the anchors because the webgraph should not contain such links (by definition)
this.iframes.add(src);
this.evaluationScores.match(Element.iframepath, src.toNormalform(true));
} else if (tag.name.equalsIgnoreCase("html")) {
@@ -631,7 +631,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
// fire event
fireScrapeTag0(tag.name, tag.opts);
this.fireScrapeTag0(tag.name, tag.opts);
}
@Override
@@ -652,7 +652,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
url.setAll(tag.opts);
recursiveParse(url, tag.content.getChars());
this.anchors.add(url);
this.addAnchor(url);
}
this.evaluationScores.match(Element.apath, href);
}
@@ -727,7 +727,16 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
// fire event
fireScrapeTag1(tag.name, tag.opts, tag.content.getChars());
this.fireScrapeTag1(tag.name, tag.opts, tag.content.getChars());
}
/**
* Add an anchor to the anchors list, and notify any registered listener
* @param anchor anchor to add. Must not be null.
*/
protected void addAnchor(AnchorURL anchor) {
this.anchors.add(anchor);
this.fireAddAnchor(anchor.toNormalform(false));
}
@@ -755,7 +764,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
for (final AnchorURL entry: scraper.getAnchors()) {
this.anchors.add(entry);
this.addAnchor(entry);
}
String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars())));
StringBuilder altakk = new StringBuilder();
@@ -1221,24 +1230,40 @@ public class ContentScraper extends AbstractScraper implements Scraper {
System.out.println("TEXT :" + this.content.toString());
}
/**
* Register a listener for some scrape events
* @param listener ScraperListener implementation
*/
@Override
public void registerHtmlFilterEventListener(final ScraperListener listener) {
if (listener != null) {
this.htmlFilterEventListeners.add(ScraperListener.class, listener);
if(listener instanceof ContentScraperListener) {
this.htmlFilterEventListeners.add(ContentScraperListener.class, (ContentScraperListener)listener);
} else {
this.htmlFilterEventListeners.add(ScraperListener.class, listener);
}
}
}
/**
* Unregister a listener previously registered
* @param listener ScraperListener implementation
*/
@Override
public void deregisterHtmlFilterEventListener(final ScraperListener listener) {
if (listener != null) {
this.htmlFilterEventListeners.remove(ScraperListener.class, listener);
if(listener instanceof ContentScraperListener) {
this.htmlFilterEventListeners.remove(ContentScraperListener.class, (ContentScraperListener)listener);
} else {
this.htmlFilterEventListeners.remove(ScraperListener.class, listener);
}
}
}
private void fireScrapeTag0(final String tagname, final Properties tagopts) {
final Object[] listeners = this.htmlFilterEventListeners.getListenerList();
for (int i=0; i<listeners.length; i+=2) {
if (listeners[i]==ScraperListener.class) {
for (int i = 0; i < listeners.length; i += 2) {
if (listeners[i] == ScraperListener.class || listeners[i] == ContentScraperListener.class) {
((ScraperListener)listeners[i+1]).scrapeTag0(tagname, tagopts);
}
}
@@ -1246,12 +1271,25 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private void fireScrapeTag1(final String tagname, final Properties tagopts, final char[] text) {
final Object[] listeners = this.htmlFilterEventListeners.getListenerList();
for (int i=0; i<listeners.length; i+=2) {
if (listeners[i]==ScraperListener.class) {
for (int i = 0; i < listeners.length; i += 2) {
if (listeners[i] == ScraperListener.class || listeners[i] == ContentScraperListener.class) {
((ScraperListener)listeners[i+1]).scrapeTag1(tagname, tagopts, text);
}
}
}
/**
* Fire the addAnchor event to any listener implementing the {@link ContentScraperListener} interface
* @param url anchor url
*/
private void fireAddAnchor(final String anchorURL) {
final Object[] listeners = this.htmlFilterEventListeners.getListenerList();
for (int i = 0; i < listeners.length; i += 2) {
if (listeners[i] == ContentScraperListener.class) {
((ContentScraperListener)listeners[i+1]).anchorAdded(anchorURL);
}
}
}
public static ContentScraper parseResource(final File file, final int maxLinks, final int timezoneOffset) throws IOException {
// load page

@@ -0,0 +1,34 @@
// ContentScraperListener.java
// ---------------------------
// Copyright 2016 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser.html;
/**
* Listener interface to ContentScraper events
*/
public interface ContentScraperListener extends ScraperListener {
/**
* Triggered by {@link ContentScraper#addAnchor(net.yacy.cora.document.id.AnchorURL)} implementations
* @param anchorURL the anchor normalized URL
*/
public void anchorAdded(String anchorURL);
}
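
A minimal, purely illustrative implementation (hypothetical, not part of this commit) that simply logs each discovered anchor; it can be registered through ContentScraper.registerHtmlFilterEventListener:

package net.yacy.document.parser.html;

import java.util.Properties;

/** Hypothetical example listener : logs every anchor the scraper discovers */
public class LoggingAnchorListener implements ContentScraperListener {

    @Override
    public void scrapeTag0(String tagname, Properties tagopts) {
        // nothing to do for this example
    }

    @Override
    public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
        // nothing to do for this example
    }

    @Override
    public void anchorAdded(String anchorURL) {
        System.out.println("anchor discovered : " + anchorURL);
    }
}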

@@ -26,7 +26,22 @@ package net.yacy.document.parser.html;
import java.util.Properties;
/**
* Listener interface to Scraper events
*/
public interface ScraperListener extends java.util.EventListener {
/**
* Triggered by {@link Scraper#scrapeTag0(net.yacy.document.parser.html.ContentScraper.Tag)} implementations
* @param tagname tag name
* @param tagopts tag attributes
*/
public void scrapeTag0(String tagname, Properties tagopts);
/**
* Triggered by {@link Scraper#scrapeTag1(net.yacy.document.parser.html.ContentScraper.Tag)} implementations
* @param tagname tag name
* @param tagopts tag attributes
* @param text tag content text
*/
public void scrapeTag1(String tagname, Properties tagopts, char[] text);
}
