Applied strategy: when there is no restriction on domains or sub-path(s), stack anchor links as soon as they are discovered by the content scraper instead of waiting for the complete parsing of the file. This makes it possible to handle a crawl start file with thousands of links in a reasonable amount of time. Performance limitation: even if the crawl starts faster with a large file, the content of the parsed file is still fully loaded into memory.
parent ee92082a3b
commit 47af33a04c
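
For context, before the diff itself, here is a minimal sketch (not part of this commit) of how the new task is meant to be wired up; the scraper, profile, crawlStacker and peerHash variables are placeholders for objects the caller is assumed to already hold:

    // Illustrative sketch only - the names below are hypothetical, not part of this change
    final File crawlingFile = new File("crawlStartLinks.html"); // hypothetical crawl start file
    try {
        final FileCrawlStarterTask task = new FileCrawlStarterTask(crawlingFile, null, scraper,
                profile, crawlStacker, peerHash);
        task.start(); // anchor links are stacked while the file is still being parsed
    } catch (final FileNotFoundException e) {
        // the constructor checks the file synchronously, so a missing file is reported here
    } catch (final IOException e) {
        // the file exists but can not be read
    }
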
@@ -0,0 +1,100 @@
// CrawlStarterFromSraper.java
// ---------------------------
// Copyright 2016 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package net.yacy.crawler;

import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.document.parser.html.ContentScraperListener;

/**
 * Enqueues an entry to the crawlStacker each time an anchor is discovered by the ContentScraper
 * @author luccioman
 */
public class CrawlStarterFromSraper implements ContentScraperListener {

    private final static ConcurrentLog log = new ConcurrentLog(CrawlStarterFromSraper.class.getSimpleName());

    /** CrawlStacker instance : will receive anchor links used as crawl starting points */
    private CrawlStacker crawlStacker;
    /** Hash of the peer initiating the crawl */
    private final byte[] initiatorHash;
    /** Active crawl profile */
    private CrawlProfile profile;
    /** Specifies whether old indexed entries should be replaced */
    private final boolean replace;

    /**
     * Constructor
     * @param crawlStacker CrawlStacker instance : will receive anchor links used as crawl starting points (must not be null)
     * @param initiatorHash hash of the peer initiating the crawl (must not be null)
     * @param profile active crawl profile (must not be null)
     * @param replace specifies whether old indexed entries should be replaced
     * @throws IllegalArgumentException when a required parameter is null
     */
    public CrawlStarterFromSraper(final CrawlStacker crawlStacker, final byte[] initiatorHash,
            final CrawlProfile profile, final boolean replace) {
        if (crawlStacker == null) {
            throw new IllegalArgumentException("crawlStacker parameter must not be null");
        }
        this.crawlStacker = crawlStacker;
        if (initiatorHash == null) {
            throw new IllegalArgumentException("initiatorHash parameter must not be null");
        }
        this.initiatorHash = initiatorHash;
        this.replace = replace;
        if (profile == null) {
            throw new IllegalArgumentException("profile parameter must not be null");
        }
        this.profile = profile;
    }

    @Override
    public void scrapeTag0(String tagname, Properties tagopts) {
        // Nothing to do on this event
    }

    @Override
    public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
        // Nothing to do on this event
    }

    @Override
    public void anchorAdded(String anchorURL) {
        List<AnchorURL> urls = new ArrayList<>();
        try {
            urls.add(new AnchorURL(anchorURL));
            this.crawlStacker.enqueueEntries(this.initiatorHash, this.profile.handle(), urls, this.replace, this.profile.timezoneOffset());
        } catch (MalformedURLException e) {
            log.warn("Malformed URL : " + anchorURL);
        }
    }

}
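
The listener above can also be plugged directly into a scraper. A hedged sketch (not part of this commit), again assuming scraper, crawlStacker, peerHash and profile already exist in the caller's context:

    // Each anchor discovered by the scraper triggers anchorAdded(), which wraps the URL
    // in a one-element list and enqueues it to the crawlStacker immediately
    final CrawlStarterFromSraper listener = new CrawlStarterFromSraper(crawlStacker, peerHash, profile, true);
    scraper.registerHtmlFilterEventListener(listener);
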
@@ -0,0 +1,173 @@
// FileCrawlStarterTask.java
// ---------------------------
// Copyright 2016 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package net.yacy.crawler;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Writer;

import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.util.FileUtils;

/**
 * A task used to trigger crawl starts from a file (HTML or any other supported
 * text file) containing anchor links. It does not wait for the complete parsing
 * of the file before sending anchor links to the crawl stacker and thus can
 * handle files with many links.
 *
 * @author luccioman
 */
public class FileCrawlStarterTask extends Thread {

    private final static ConcurrentLog log = new ConcurrentLog(FileCrawlStarterTask.class.getSimpleName());

    /** A text file containing crawl start links */
    private File crawlingFile;
    /** Alternative to crawlingFile : holds the file content */
    private String crawlingFileContent;
    /** Content scraper that will scrape the file content */
    private ContentScraper scraper;
    /** Active crawl profile */
    private CrawlProfile profile;
    /** CrawlStacker instance : will receive anchor links used as crawl starting points */
    private CrawlStacker crawlStacker;
    /** Hash of the peer initiating the crawl */
    private final byte[] initiatorHash;

    /**
     * Constructor
     *
     * @param crawlingFile
     *            a text file containing crawl start links (alternatively, the
     *            crawlingFileContent parameter can be used)
     * @param crawlingFileContent
     *            content of a text file containing crawl start links
     *            (alternatively, the crawlingFile parameter can be used)
     * @param scraper
     *            ContentScraper instance used to scrape links from the file
     * @param profile
     *            active crawl profile (must not be null)
     * @param crawlStacker
     *            CrawlStacker instance : will receive anchor links used as
     *            crawl starting points (must not be null)
     * @param initiatorHash
     *            hash of the peer initiating the crawl
     * @throws IllegalArgumentException
     *             when one of the required parameters is null
     * @throws IOException
     *             when crawlingFileContent is null and crawlingFile does not
     *             exist or can not be read
     */
    public FileCrawlStarterTask(final File crawlingFile, final String crawlingFileContent, final ContentScraper scraper,
            final CrawlProfile profile, final CrawlStacker crawlStacker, final byte[] initiatorHash)
            throws IllegalArgumentException, FileNotFoundException, IOException {
        super(FileCrawlStarterTask.class.getSimpleName());
        if (crawlingFile == null && crawlingFileContent == null) {
            throw new IllegalArgumentException(
                    "At least one of crawlingFile or crawlingFileContent parameter must not be null");
        }
        if ((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) {
            /*
             * Let's check now whether the crawlingFile exists and can be read so the
             * error can be synchronously reported to the caller
             */
            if (!crawlingFile.exists()) {
                throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " does not exist");
            }
            if (!crawlingFile.isFile()) {
                throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " exists but is not a regular file");
            }
            if (!crawlingFile.canRead()) {
                throw new IOException("Can not read : " + crawlingFile.getAbsolutePath());
            }
        }
        this.crawlingFile = crawlingFile;
        this.crawlingFileContent = crawlingFileContent;
        if (scraper == null) {
            throw new IllegalArgumentException("scraper parameter must not be null");
        }
        this.scraper = scraper;
        if (profile == null) {
            throw new IllegalArgumentException("profile parameter must not be null");
        }
        this.profile = profile;
        if (crawlStacker == null) {
            throw new IllegalArgumentException("crawlStacker parameter must not be null");
        }
        this.crawlStacker = crawlStacker;
        if (initiatorHash == null) {
            throw new IllegalArgumentException("initiatorHash parameter must not be null");
        }
        this.initiatorHash = initiatorHash;
    }

    /**
     * Run the content scraping on the file and push each anchor link to the
     * crawlStacker as soon as it is detected.
     */
    @Override
    public void run() {
        /*
         * This is the listener which makes it possible to push links to the
         * crawl stacker without waiting for the end of the content scraping
         */
        CrawlStarterFromSraper anchorListener = new CrawlStarterFromSraper(this.crawlStacker, this.initiatorHash,
                this.profile, true);
        this.scraper.registerHtmlFilterEventListener(anchorListener);

        final Writer writer = new TransformerWriter(null, null, this.scraper, null, false);
        FileInputStream inStream = null;

        try {
            if (this.crawlingFile != null && this.crawlingFile.exists()) {
                inStream = new FileInputStream(this.crawlingFile);
                FileUtils.copy(inStream, writer);
            } else {
                FileUtils.copy(this.crawlingFileContent, writer);
            }
            writer.close();
        } catch (IOException e) {
            log.severe("Error parsing the crawlingFile " + this.crawlingFile.getAbsolutePath(), e);
        } catch (Throwable t) {
            /*
             * Other errors are likely to occur when the crawl is interrupted :
             * still log this at warning level to avoid polluting the regular
             * error log level
             */
            log.warn("Error parsing the crawlingFile " + this.crawlingFile.getAbsolutePath(), t);
        } finally {
            if (inStream != null) {
                try {
                    inStream.close();
                } catch (IOException e) {
                    log.warn("Could not close crawlingFile : " + this.crawlingFile.getAbsolutePath());
                }
            }
        }
    }

}
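
Per the constructor contract above, the raw file content can also be passed as a String instead of a File; a sketch under the same placeholder assumptions as before:

    try {
        // when crawlingFile is null, run() copies crawlingFileContent to the scraping writer instead
        final FileCrawlStarterTask task = new FileCrawlStarterTask(null, content, scraper,
                profile, crawlStacker, peerHash);
        task.start();
    } catch (final IOException e) {
        // no file checks are performed in this case, but the constructor still declares IOException
    }
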
@@ -0,0 +1,34 @@
// ContentScraperListener.java
// ---------------------------
// Copyright 2016 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package net.yacy.document.parser.html;

/**
 * Listener interface for ContentScraper events
 */
public interface ContentScraperListener extends ScraperListener {

    /**
     * Triggered by {@link ContentScraper#addAnchor(net.yacy.cora.document.id.AnchorURL)} implementations
     * @param anchorURL the anchor's normalized URL
     */
    public void anchorAdded(String anchorURL);

}
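
For illustration (not part of this commit), a minimal implementation of this interface that only collects discovered anchors; the AnchorCollector name is hypothetical, and the tag callbacks inherited from ScraperListener are left as no-ops:

    import java.util.ArrayList;
    import java.util.List;
    import java.util.Properties;

    // hypothetical listener implementation, for illustration only
    public class AnchorCollector implements ContentScraperListener {

        /** Anchor URLs discovered so far */
        private final List<String> anchors = new ArrayList<>();

        @Override
        public void scrapeTag0(String tagname, Properties tagopts) {
            // nothing to do on this event
        }

        @Override
        public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
            // nothing to do on this event
        }

        @Override
        public void anchorAdded(String anchorURL) {
            this.anchors.add(anchorURL);
        }

        public List<String> getAnchors() {
            return this.anchors;
        }
    }
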