From 47af33a04ce832d18a0d460dc1aff32b184ef36c Mon Sep 17 00:00:00 2001
From: luccioman
Date: Fri, 21 Oct 2016 13:03:31 +0200
Subject: [PATCH] Advanced Crawl from local file : better processing of large
 files.

Applied strategy : when there is no restriction on domains or sub-path(s),
stack anchor links as soon as they are discovered by the content scraper
instead of waiting for the complete parsing of the file. This makes it
possible to handle a crawling start file with thousands of links in a
reasonable amount of time.

Performance limitation : even if the crawl starts faster with a large file,
the content of the parsed file is still fully loaded in memory.
---
 htroot/Crawler_p.java                          |  97 +++++++---
 source/net/yacy/crawler/CrawlStacker.java      |  29 ++-
 .../yacy/crawler/CrawlStarterFromSraper.java   | 100 ++++++++++
 .../yacy/crawler/FileCrawlStarterTask.java     | 173 ++++++++++++++++++
 .../document/parser/html/ContentScraper.java   |  74 ++++++--
 .../parser/html/ContentScraperListener.java    |  34 ++++
 .../document/parser/html/ScraperListener.java  |  15 ++
 7 files changed, 479 insertions(+), 43 deletions(-)
 create mode 100644 source/net/yacy/crawler/CrawlStarterFromSraper.java
 create mode 100644 source/net/yacy/crawler/FileCrawlStarterTask.java
 create mode 100644 source/net/yacy/document/parser/html/ContentScraperListener.java

diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index c964596e8..1e8409da4 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -21,6 +21,7 @@
 
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.Writer;
 import java.net.MalformedURLException;
@@ -47,6 +48,7 @@ import net.yacy.cora.util.JSONException;
 import net.yacy.cora.util.JSONObject;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.crawler.CrawlSwitchboard;
+import net.yacy.crawler.FileCrawlStarterTask;
 import net.yacy.crawler.data.Cache;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.data.NoticedURL.StackType;
@@ -483,22 +485,16 @@ public class Crawler_p {
         if ("file".equals(crawlingMode) && post.containsKey("crawlingFile") && crawlingFile != null) {
             final String crawlingFileContent = post.get("crawlingFile$file", "");
             try {
-                // check if the crawl filter works correctly
-                final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper(), timezoneOffset);
-                final Writer writer = new TransformerWriter(null, null, scraper, null, false);
-                if (crawlingFile != null && crawlingFile.exists()) {
-                    FileUtils.copy(new FileInputStream(crawlingFile), writer);
-                } else {
-                    FileUtils.copy(crawlingFileContent, writer);
-                }
-                writer.close();
-
-                // get links and generate filter
-                hyperlinks_from_file = scraper.getAnchors();
                 if (newcrawlingdepth > 0) {
                     if (fullDomain) {
+                        /* Crawl is restricted to start domains or sub-paths : we have to get all the start links now.
+                         * Otherwise we can get them asynchronously later, which allows large start crawlingFiles to be handled more efficiently */
+                        hyperlinks_from_file = crawlingFileStart(crawlingFile, timezoneOffset, crawlingFileContent);
                         newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks_from_file);
                     } else if (subPath) {
+                        /* Crawl is restricted to start domains or sub-paths : we have to get all the start links now.
+                         * Otherwise we can get them asynchronously later, which allows large start crawlingFiles to be handled more efficiently */
+                        hyperlinks_from_file = crawlingFileStart(crawlingFile, timezoneOffset, crawlingFileContent);
                         newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks_from_file);
                     }
                 }
@@ -627,17 +623,24 @@ public class Crawler_p {
                     ConcurrentLog.logException(e);
                 }
             } else if ("file".equals(crawlingMode)) {
-                if (post.containsKey("crawlingFile") && crawlingFile != null && hyperlinks_from_file != null) {
-                    try {
-                        if (newcrawlingdepth > 0) {
-                            if (fullDomain) {
-                                newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks_from_file);
-                            } else if (subPath) {
-                                newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks_from_file);
+                if (post.containsKey("crawlingFile") && crawlingFile != null) {
+                    try {
+                        if(newcrawlingdepth > 0 && (fullDomain || subPath)) {
+                            /* All links must have already been loaded because they are part of the newcrawlingMustMatch filter */
+                            if(hyperlinks_from_file != null) {
+                                sb.crawler.putActive(handle, profile);
+                                sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks_from_file, profile.timezoneOffset());
                             }
+                        } else {
+                            /* No restriction on domains or subpath : we scrape links now and asynchronously push them to the crawlStacker */
+                            final String crawlingFileContent = post.get("crawlingFile$file", "");
+                            final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000,
+                                    new VocabularyScraper(), profile.timezoneOffset());
+                            FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile,
+                                    sb.crawlStacker, sb.peers.mySeed().hash.getBytes());
+                            sb.crawler.putActive(handle, profile);
+                            crawlStarterTask.start();
                         }
-                        sb.crawler.putActive(handle, profile);
-                        sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks_from_file, profile.timezoneOffset());
                     } catch (final PatternSyntaxException e) {
                         prop.put("info", "4"); // crawlfilter does not match url
                         prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
@@ -756,6 +759,58 @@ public class Crawler_p {
         return prop;
     }
 
+    /**
+     * Scrape crawlingFile or crawlingFileContent and get all anchor links from it.
+     * @param crawlingFile crawl start file (must not be null)
+     * @param timezoneOffset local timezone offset
+     * @param crawlingFileContent content of the crawling file (optional : used only when crawlingFile does not exist)
+     * @return all the anchor links from the crawling file
+     * @throws MalformedURLException
+     * @throws IOException
+     * @throws FileNotFoundException
+     */
+    private static List<AnchorURL> crawlingFileStart(final File crawlingFile, int timezoneOffset,
+            final String crawlingFileContent) throws MalformedURLException, IOException, FileNotFoundException {
+        List<AnchorURL> hyperlinks_from_file;
+        // check if the crawl filter works correctly
+        final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper(), timezoneOffset);
+        final Writer writer = new TransformerWriter(null, null, scraper, null, false);
+        if((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) {
+            /* Let's report a detailed error here to help the user when a wrong file was selected */
+            if(!crawlingFile.exists()) {
+                throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " does not exists");
+            }
+            if(!crawlingFile.isFile()) {
+                throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " exists but is not a regular file");
+            }
+            if(!crawlingFile.canRead()) {
+                throw new IOException("Can not read : " + crawlingFile.getAbsolutePath());
+            }
+        }
+        if (crawlingFile != null) {
+            FileInputStream inStream = null;
+            try {
+                inStream = new FileInputStream(crawlingFile);
+                FileUtils.copy(inStream, writer);
+            } finally {
+                if(inStream != null) {
+                    try {
+                        inStream.close();
+                    } catch(IOException ignoredException) {
+                        ConcurrentLog.info("Crawler_p", "Could not close crawlingFile : " + crawlingFile.getAbsolutePath());
+                    }
+                }
+            }
+        } else {
+            FileUtils.copy(crawlingFileContent, writer);
+        }
+        writer.close();
+
+        // get links and generate filter
+        hyperlinks_from_file = scraper.getAnchors();
+        return hyperlinks_from_file;
+    }
+
     private static Date timeParser(final boolean recrawlIfOlderCheck, final int number, final String unit) {
         if (!recrawlIfOlderCheck) return null;
         if ("year".equals(unit)) return new Date(System.currentTimeMillis() - number * AbstractFormatter.normalyearMillis);
diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java
index e71ff1470..efc205516 100644
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@@ -151,26 +151,47 @@ public final class CrawlStacker {
         if (CrawlStacker.log.isFinest()) CrawlStacker.log.finest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + ((entry.initiator() == null) ? "" : ASCII.String(entry.initiator())) + ", name=" + entry.name() + ", appdate=" + entry.appdate() + ", depth=" + entry.depth());
         this.requestQueue.enQueue(entry);
     }
+
     public void enqueueEntriesAsynchronous(
             final byte[] initiator, final String profileHandle, final List<AnchorURL> hyperlinks, final int timezoneOffset) {
-        new Thread() {
+        new Thread("enqueueEntriesAsynchronous") {
             @Override
             public void run() {
-                Thread.currentThread().setName("enqueueEntriesAsynchronous");
                 enqueueEntries(initiator, profileHandle, hyperlinks, true, timezoneOffset);
             }
         }.start();
     }
-
-    private void enqueueEntries(
+
+    /**
+     * Enqueue crawl start entries
+     * @param initiator Hash of the peer initiating the crawl
+     * @param profileHandle name of the active crawl profile
+     * @param hyperlinks crawl starting point links to stack
+     * @param replace Specify whether old indexed entries should be replaced
+     * @param timezoneOffset local time-zone offset
+     */
+    public void enqueueEntries(
             final byte[] initiator, final String profileHandle, final List<AnchorURL> hyperlinks, final boolean replace, final int timezoneOffset) {
+        /* Let's check if the profile is still active before removing any existing entry */
+        byte[] handle = UTF8.getBytes(profileHandle);
+        final CrawlProfile profile = this.crawler.get(handle);
+        if (profile == null) {
+            String error;
+            if(hyperlinks.size() == 1) {
+                error = "Rejected URL : " + hyperlinks.get(0).toNormalform(false) + ". Reason : LOST STACKER PROFILE HANDLE '" + profileHandle + "'";
+            } else {
+                error = "Rejected " + hyperlinks.size() + " crawl entries. Reason : LOST STACKER PROFILE HANDLE '" + profileHandle + "'";
+            }
+            CrawlStacker.log.info(error); // this is NOT an error but a normal effect when terminating a crawl queue
+            return;
+        }
         if (replace) {
             // delete old entries, if exists to force a re-load of the url (thats wanted here)
             Set<String> hosthashes = new HashSet<String>();
diff --git a/source/net/yacy/crawler/CrawlStarterFromSraper.java b/source/net/yacy/crawler/CrawlStarterFromSraper.java
new file mode 100644
index 000000000..afc8d9ba9
--- /dev/null
+++ b/source/net/yacy/crawler/CrawlStarterFromSraper.java
@@ -0,0 +1,100 @@
+// CrawlStarterFromSraper.java
+// ---------------------------
+// Copyright 2016 by luccioman; https://github.com/luccioman
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package net.yacy.crawler;
+
+import java.net.MalformedURLException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+
+import net.yacy.cora.document.id.AnchorURL;
+import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.crawler.data.CrawlProfile;
+import net.yacy.document.parser.html.ContentScraperListener;
+
+/**
+ * Enqueue an entry to the crawlStacker each time an anchor is discovered by the ContentScraper
+ * @author luccioman
+ *
+ */
+public class CrawlStarterFromSraper implements ContentScraperListener {
+
+    private final static ConcurrentLog log = new ConcurrentLog(CrawlStarterFromSraper.class.getSimpleName());
+
+    /** CrawlStacker instance : will receive anchor links used as crawl starting points */
+    private CrawlStacker crawlStacker;
+    /** Hash of the peer initiating the crawl */
+    private final byte[] initiatorHash;
+    /** Active crawl profile */
+    private CrawlProfile profile;
+    /** Specify whether old indexed entries should be replaced */
+    private final boolean replace;
+
+    /**
+     * Constructor
+     * @param crawlStacker CrawlStacker instance : will receive anchor links used as crawl starting points
+     * @param initiatorHash Hash of the peer initiating the crawl (must not be null)
+     * @param profile active crawl profile (must not be null)
+     * @param replace Specify whether old indexed entries should be replaced
+     * @throws IllegalArgumentException when a required parameter is null
+     */
+    public CrawlStarterFromSraper(final CrawlStacker crawlStacker, final byte[] initiatorHash,
+            final CrawlProfile profile,
+            final boolean replace) {
+        if(crawlStacker == null) {
+            throw new IllegalArgumentException("crawlStacker parameter must not be null");
+        }
+        this.crawlStacker = crawlStacker;
+        if(initiatorHash == null) {
+            throw new IllegalArgumentException("initiatorHash parameter must not be null");
+        }
+        this.initiatorHash = initiatorHash;
+        this.replace = replace;
+        if(profile == null) {
+            throw new IllegalArgumentException("profile parameter must not be null");
+        }
+        this.profile = profile;
+    }
+
+    @Override
+    public void scrapeTag0(String tagname, Properties tagopts) {
+        // Nothing to do on this event
+    }
+
+    @Override
+    public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
+        // Nothing to do on this event
+    }
+
+    @Override
+    public void anchorAdded(String anchorURL) {
+        List<AnchorURL> urls = new ArrayList<>();
+        try {
+            urls.add(new AnchorURL(anchorURL));
+            this.crawlStacker.enqueueEntries(this.initiatorHash, this.profile.handle(), urls, this.replace, this.profile.timezoneOffset());
+        } catch (MalformedURLException e) {
+            log.warn("Malformed URL : " + anchorURL);
+        }
+    }
+
+}
\ No newline at end of file
diff --git a/source/net/yacy/crawler/FileCrawlStarterTask.java b/source/net/yacy/crawler/FileCrawlStarterTask.java
new file mode 100644
index 000000000..c3aabc991
--- /dev/null
+++ b/source/net/yacy/crawler/FileCrawlStarterTask.java
@@ -0,0 +1,173 @@
+// FileCrawlStarterTask.java
+// ---------------------------
+// Copyright 2016 by luccioman; https://github.com/luccioman
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package net.yacy.crawler;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.Writer;
+
+import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.crawler.data.CrawlProfile;
+import net.yacy.document.parser.html.ContentScraper;
+import net.yacy.document.parser.html.TransformerWriter;
+import net.yacy.kelondro.util.FileUtils;
+
+/**
+ * A task used to trigger crawl starts from a file (HTML or any other supported
+ * text file) containing anchor links. It does not wait for full file parsing
+ * before sending anchor links to the crawl stacker and thus can handle files
+ * with many links.
+ *
+ * @author luccioman
+ */
+public class FileCrawlStarterTask extends Thread {
+
+    private final static ConcurrentLog log = new ConcurrentLog(FileCrawlStarterTask.class.getSimpleName());
+
+    /** A text file containing crawl start links */
+    private File crawlingFile;
+    /** Alternative to crawlingFile : holds file content */
+    private String crawlingFileContent;
+    /** Content scraper that will scrape file content */
+    private ContentScraper scraper;
+    /** Active crawl profile */
+    private CrawlProfile profile;
+    /**
+     * CrawlStacker instance : will receive anchor links used as crawl starting
+     * points
+     */
+    private CrawlStacker crawlStacker;
+    /** Hash of the peer initiating the crawl */
+    private final byte[] initiatorHash;
+
+    /**
+     * Constructor
+     *
+     * @param crawlingFile
+     *            a text file containing crawl start links (alternatively,
+     *            crawlingFileContent parameter can be used)
+     * @param crawlingFileContent
+     *            content of a text file containing crawl start links
+     *            (alternatively, crawlingFile parameter can be used)
+     * @param scraper
+     *            ContentScraper instance used to scrape links from the file
+     * @param profile
+     *            active crawl profile (must not be null)
+     * @param crawlStacker
+     *            CrawlStacker instance : will receive anchor links used as
+     *            crawl starting points (must not be null)
+     * @param initiatorHash
+     *            Hash of the peer initiating the crawl
+     * @throws IllegalArgumentException
+     *             when one of the required parameters is null
+     * @throws IOException
+     *             when crawlingFileContent is null and crawlingFile does not
+     *             exist or cannot be read
+     */
+    public FileCrawlStarterTask(final File crawlingFile, final String crawlingFileContent, final ContentScraper scraper,
+            final CrawlProfile profile, final CrawlStacker crawlStacker, final byte[] initiatorHash)
+            throws IllegalArgumentException, FileNotFoundException, IOException {
+        super(FileCrawlStarterTask.class.getSimpleName());
+        if (crawlingFile == null && crawlingFileContent == null) {
+            throw new IllegalArgumentException(
+                    "At least one of crawlingFile or crawlingFileContent parameter must not be null");
+        }
+        if ((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) {
+            /*
+             * Let's check now if the crawlingFile exists and can be read so the
+             * error can be synchronously reported to the caller
+             */
+            if (!crawlingFile.exists()) {
+                throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " does not exists");
+            }
+            if (!crawlingFile.isFile()) {
+                throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " exists but is not a regular file");
+            }
+            if (!crawlingFile.canRead()) {
+                throw new IOException("Can not read : " + crawlingFile.getAbsolutePath());
+            }
+        }
+        this.crawlingFile = crawlingFile;
+        this.crawlingFileContent = crawlingFileContent;
+        if (scraper == null) {
+            throw new IllegalArgumentException("scraper parameter must not be null");
+        }
+        this.scraper = scraper;
+        if (profile == null) {
+            throw new IllegalArgumentException("profile parameter must not be null");
+        }
+        this.profile = profile;
+        if (crawlStacker == null) {
+            throw new IllegalArgumentException("crawlStacker parameter must not be null");
+        }
+        this.crawlStacker = crawlStacker;
+        if (initiatorHash == null) {
+            throw new IllegalArgumentException("initiatorHash parameter must not be null");
+        }
+        this.initiatorHash = initiatorHash;
+    }
+
+    /**
+     * Run the content scraping on the file and push each anchor link to the
+     * crawlStacker as soon as it is detected.
+     */
+    @Override
+    public void run() {
+        /*
+         * This is the listener which makes it possible to push links to the
+         * crawl stacker without waiting for the complete end of content scraping
+         */
+        CrawlStarterFromSraper anchorListener = new CrawlStarterFromSraper(this.crawlStacker, this.initiatorHash,
+                this.profile, true);
+        this.scraper.registerHtmlFilterEventListener(anchorListener);
+
+        final Writer writer = new TransformerWriter(null, null, this.scraper, null, false);
+        FileInputStream inStream = null;
+
+        try {
+            if (this.crawlingFile != null && this.crawlingFile.exists()) {
+                inStream = new FileInputStream(this.crawlingFile);
+                FileUtils.copy(inStream, writer);
+            } else {
+                FileUtils.copy(this.crawlingFileContent, writer);
+            }
+            writer.close();
+        } catch (IOException e) {
+            log.severe("Error parsing the crawlingFile " + this.crawlingFile.getAbsolutePath(), e);
+        } catch (Throwable t) {
+            /* Other errors are likely to occur when the crawl is interrupted : still log this at warning level to avoid polluting the regular error log */
+            log.warn("Error parsing the crawlingFile " + this.crawlingFile.getAbsolutePath(), t);
+        } finally {
+            if (inStream != null) {
+                try {
+                    inStream.close();
+                } catch (IOException e) {
+                    log.warn("Could not close crawlingFile : " + this.crawlingFile.getAbsolutePath());
+                }
+            }
+        }
+    }
+
+}
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index 7d7f8c71c..d970776ca 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -356,7 +356,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                 if (u.endsWith(".")) u = u.substring(0, u.length() - 1); // remove the '.' that was appended above
                 s = p + 6;
                 try {
-                    this.anchors.add(new AnchorURL(u));
+                    this.addAnchor(new AnchorURL(u));
                     continue;
                 } catch (final MalformedURLException e) {}
             }
@@ -505,7 +505,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
             if(src != null) {
                 tag.opts.put("src", src.toNormalform(true));
                 src.setAll(tag.opts);
-                //this.anchors.add(src); // don't add the frame to the anchors because the webgraph should not contain such links (by definition)
+                //this.addAnchor(src); // don't add the frame to the anchors because the webgraph should not contain such links (by definition)
                 this.frames.add(src);
                 this.evaluationScores.match(Element.framepath, src.toNormalform(true));
             }
@@ -539,7 +539,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                 if(url != null) {
                     tag.opts.put("href", url.toNormalform(true));
                     url.setAll(tag.opts);
-                    this.anchors.add(url);
+                    this.addAnchor(url);
                 }
             }
         } else if (tag.name.equalsIgnoreCase("link")) {
@@ -574,7 +574,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                 } else if (rel.equalsIgnoreCase("canonical")) {
                     tag.opts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next());
                     newLink.setAll(tag.opts);
-                    this.anchors.add(newLink);
+                    this.addAnchor(newLink);
                     this.canonical = newLink;
                 } else if (rel.equalsIgnoreCase("publisher")) {
                     this.publisher = newLink;
@@ -590,7 +590,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                 } else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) {
                     tag.opts.put("name", linktitle);
                     newLink.setAll(tag.opts);
-                    this.anchors.add(newLink);
+                    this.addAnchor(newLink);
                 }
             }
         } else if(tag.name.equalsIgnoreCase("embed") || tag.name.equalsIgnoreCase("source")) { //html5 tag
@@ -605,7 +605,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                     final EmbedEntry ie = new EmbedEntry(url, width, height, tag.opts.getProperty("type", EMPTY_STRING), tag.opts.getProperty("pluginspage", EMPTY_STRING));
                     this.embeds.put(url, ie);
                     url.setAll(tag.opts);
-                    // this.anchors.add(url); // don't add the embed to the anchors because the webgraph should not contain such links (by definition)
+                    // this.addAnchor(url); // don't add the embed to the anchors because the webgraph should not contain such links (by definition)
                 }
             }
         } catch (final NumberFormatException e) {}
@@ -615,13 +615,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                 AnchorURL url = absolutePath(tag.opts.getProperty("value", EMPTY_STRING));
                 tag.opts.put("value", url.toNormalform(true));
                 url.setAll(tag.opts);
-                this.anchors.add(url);
+                this.addAnchor(url);
             }
         } else if (tag.name.equalsIgnoreCase("iframe")) {
             final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
             tag.opts.put("src", src.toNormalform(true));
             src.setAll(tag.opts);
-            //this.anchors.add(src); // don't add the iframe to the anchors because the webgraph should not contain such links (by definition)
+            //this.addAnchor(src); // don't add the iframe to the anchors because the webgraph should not contain such links (by definition)
             this.iframes.add(src);
             this.evaluationScores.match(Element.iframepath, src.toNormalform(true));
         } else if (tag.name.equalsIgnoreCase("html")) {
@@ -631,7 +631,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         }
 
         // fire event
-        fireScrapeTag0(tag.name, tag.opts);
+        this.fireScrapeTag0(tag.name, tag.opts);
     }
 
     @Override
@@ -652,7 +652,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                 tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
                 url.setAll(tag.opts);
                 recursiveParse(url, tag.content.getChars());
-                this.anchors.add(url);
+                this.addAnchor(url);
             }
             this.evaluationScores.match(Element.apath, href);
         }
@@ -727,7 +727,16 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         }
 
         // fire event
-        fireScrapeTag1(tag.name, tag.opts, tag.content.getChars());
+        this.fireScrapeTag1(tag.name, tag.opts, tag.content.getChars());
+    }
+
+    /**
+     * Add an anchor to the anchors list, and notify any registered listener
+     * @param anchor anchor to add. Must not be null.
+     */
+    protected void addAnchor(AnchorURL anchor) {
+        this.anchors.add(anchor);
+        this.fireAddAnchor(anchor.toNormalform(false));
     }
 
 
@@ -755,7 +764,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
             }
         }
         for (final AnchorURL entry: scraper.getAnchors()) {
-            this.anchors.add(entry);
+            this.addAnchor(entry);
         }
         String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars())));
         StringBuilder altakk = new StringBuilder();
@@ -1221,24 +1230,40 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         System.out.println("TEXT :" + this.content.toString());
     }
 
+    /**
+     * Register a listener for some scrape events
+     * @param listener ScraperListener implementation
+     */
     @Override
     public void registerHtmlFilterEventListener(final ScraperListener listener) {
         if (listener != null) {
-            this.htmlFilterEventListeners.add(ScraperListener.class, listener);
+            if(listener instanceof ContentScraperListener) {
+                this.htmlFilterEventListeners.add(ContentScraperListener.class, (ContentScraperListener)listener);
+            } else {
+                this.htmlFilterEventListeners.add(ScraperListener.class, listener);
+            }
         }
     }
 
+    /**
+     * Unregister a previously registered listener
+     * @param listener ScraperListener implementation
+     */
     @Override
     public void deregisterHtmlFilterEventListener(final ScraperListener listener) {
         if (listener != null) {
-            this.htmlFilterEventListeners.remove(ScraperListener.class, listener);
+            if(listener instanceof ContentScraperListener) {
+                this.htmlFilterEventListeners.remove(ContentScraperListener.class, (ContentScraperListener)listener);
+            } else {
+                this.htmlFilterEventListeners.remove(ScraperListener.class, listener);
+            }
         }
     }
 
     private void fireScrapeTag0(final String tagname, final Properties tagopts) {
         final Object[] listeners = this.htmlFilterEventListeners.getListenerList();
-        for (int i=0; i
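
For reference, the intended wiring of the classes introduced by this patch mirrors the change to Crawler_p.java above: a ContentScraper is created for the start file, a FileCrawlStarterTask couples it to the CrawlStacker, and each anchor is stacked through the CrawlStarterFromSraper listener while the file is still being parsed. The following is a minimal usage sketch, not part of the patch; the Switchboard and VocabularyScraper import paths and the helper class and method names are assumptions, while the constructor calls are taken from the patch itself.

import java.io.File;
import java.io.IOException;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.crawler.FileCrawlStarterTask;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.search.Switchboard;

public class FileCrawlStartSketch {

    /** Start a crawl from a local file so that anchors are stacked as soon as the
     *  ContentScraper discovers them, instead of after the whole file has been parsed. */
    static void startCrawlFromFile(final Switchboard sb, final CrawlProfile profile, final File startFile)
            throws IOException {
        // Scraper for the start file, using the same 10000000 limit as Crawler_p.java
        final ContentScraper scraper = new ContentScraper(new DigestURL(startFile), 10000000,
                new VocabularyScraper(), profile.timezoneOffset());
        /* The task registers a CrawlStarterFromSraper listener on the scraper, so every
         * anchorAdded() event immediately enqueues the link on the CrawlStacker.
         * crawlingFileContent is passed as null because the file itself is readable. */
        final FileCrawlStarterTask task = new FileCrawlStarterTask(startFile, null, scraper, profile,
                sb.crawlStacker, sb.peers.mySeed().hash.getBytes());
        /* The crawl profile is assumed to have been registered as active beforehand
         * (sb.crawler.putActive(...), as Crawler_p.java does) so stacked links are accepted. */
        task.start();
    }
}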