From 47af33a04ce832d18a0d460dc1aff32b184ef36c Mon Sep 17 00:00:00 2001
From: luccioman
Date: Fri, 21 Oct 2016 13:03:31 +0200
Subject: [PATCH] Advanced Crawl from local file : better processing of large
 files.

Applied strategy : when there is no restriction on domains or sub-path(s),
stack anchor links as soon as they are discovered by the content scraper
instead of waiting for the complete parsing of the file. This makes it
possible to handle a crawling start file with thousands of links in a
reasonable amount of time.

Performance limitation : even if the crawl starts faster with a large file,
the content of the parsed file is still fully loaded in memory.
---
 htroot/Crawler_p.java                          |  97 +++++++---
 source/net/yacy/crawler/CrawlStacker.java      |  29 ++-
 .../yacy/crawler/CrawlStarterFromSraper.java   | 100 ++++++++++
 .../yacy/crawler/FileCrawlStarterTask.java     | 173 ++++++++++++++++++
 .../document/parser/html/ContentScraper.java   |  74 ++++++--
 .../parser/html/ContentScraperListener.java    |  34 ++++
 .../document/parser/html/ScraperListener.java  |  15 ++
 7 files changed, 479 insertions(+), 43 deletions(-)
 create mode 100644 source/net/yacy/crawler/CrawlStarterFromSraper.java
 create mode 100644 source/net/yacy/crawler/FileCrawlStarterTask.java
 create mode 100644 source/net/yacy/document/parser/html/ContentScraperListener.java

diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index c964596e8..1e8409da4 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -21,6 +21,7 @@
 
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.Writer;
 import java.net.MalformedURLException;
@@ -47,6 +48,7 @@ import net.yacy.cora.util.JSONException;
 import net.yacy.cora.util.JSONObject;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.crawler.CrawlSwitchboard;
+import net.yacy.crawler.FileCrawlStarterTask;
 import net.yacy.crawler.data.Cache;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.data.NoticedURL.StackType;
@@ -483,22 +485,16 @@ public class Crawler_p {
         if ("file".equals(crawlingMode) && post.containsKey("crawlingFile") && crawlingFile != null) {
             final String crawlingFileContent = post.get("crawlingFile$file", "");
             try {
-                // check if the crawl filter works correctly
-                final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper(), timezoneOffset);
-                final Writer writer = new TransformerWriter(null, null, scraper, null, false);
-                if (crawlingFile != null && crawlingFile.exists()) {
-                    FileUtils.copy(new FileInputStream(crawlingFile), writer);
-                } else {
-                    FileUtils.copy(crawlingFileContent, writer);
-                }
-                writer.close();
-
-                // get links and generate filter
-                hyperlinks_from_file = scraper.getAnchors();
                 if (newcrawlingdepth > 0) {
                     if (fullDomain) {
+                        /* Crawl is restricted to start domains or sub-paths : we have to get all the start links now.
+                         * Otherwise we can get them asynchronously later, which allows large start crawlingFiles to be handled more efficiently */
+                        hyperlinks_from_file = crawlingFileStart(crawlingFile, timezoneOffset, crawlingFileContent);
                         newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks_from_file);
                     } else if (subPath) {
+                        /* Crawl is restricted to start domains or sub-paths : we have to get all the start links now.
+                         * Otherwise we can get them asynchronously later, which allows large start crawlingFiles to be handled more efficiently */
+                        hyperlinks_from_file = crawlingFileStart(crawlingFile, timezoneOffset, crawlingFileContent);
                         newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks_from_file);
                     }
                 }
@@ -627,17 +623,24 @@ public class Crawler_p {
                     ConcurrentLog.logException(e);
                 }
             } else if ("file".equals(crawlingMode)) {
-                if (post.containsKey("crawlingFile") && crawlingFile != null && hyperlinks_from_file != null) {
-                    try {
-                        if (newcrawlingdepth > 0) {
-                            if (fullDomain) {
-                                newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks_from_file);
-                            } else if (subPath) {
-                                newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks_from_file);
+                if (post.containsKey("crawlingFile") && crawlingFile != null) {
+                    try {
+                        if(newcrawlingdepth > 0 && (fullDomain || subPath)) {
+                            /* All links must have already been loaded because they are part of the newcrawlingMustMatch filter */
+                            if(hyperlinks_from_file != null) {
+                                sb.crawler.putActive(handle, profile);
+                                sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks_from_file, profile.timezoneOffset());
                             }
+                        } else {
+                            /* No restriction on domains or subpath : we scrape links now and asynchronously push them to the crawlStacker */
+                            final String crawlingFileContent = post.get("crawlingFile$file", "");
+                            final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000,
+                                    new VocabularyScraper(), profile.timezoneOffset());
+                            FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile,
+                                    sb.crawlStacker, sb.peers.mySeed().hash.getBytes());
+                            sb.crawler.putActive(handle, profile);
+                            crawlStarterTask.start();
                         }
-                        sb.crawler.putActive(handle, profile);
-                        sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks_from_file, profile.timezoneOffset());
                     } catch (final PatternSyntaxException e) {
                         prop.put("info", "4"); // crawlfilter does not match url
                         prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
@@ -756,6 +759,58 @@ public class Crawler_p {
         return prop;
     }
 
+    /**
+     * Scrape crawlingFile or crawlingFileContent and get all anchor links from it.
+     * @param crawlingFile crawl start file (must not be null)
+     * @param timezoneOffset local timezone offset
+     * @param crawlingFileContent content of the crawling file (optional : used only when crawlingFile does not exist)
+     * @return all the anchor links from the crawling file
+     * @throws MalformedURLException
+     * @throws IOException
+     * @throws FileNotFoundException
+     */
+    private static List<AnchorURL> crawlingFileStart(final File crawlingFile, int timezoneOffset,
+            final String crawlingFileContent) throws MalformedURLException, IOException, FileNotFoundException {
+        List<AnchorURL> hyperlinks_from_file;
+        // check if the crawl filter works correctly
+        final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new VocabularyScraper(), timezoneOffset);
+        final Writer writer = new TransformerWriter(null, null, scraper, null, false);
+        if((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) {
+            /* Let's report a detailed error here to help the user when a wrong file was selected */
+            if(!crawlingFile.exists()) {
+                throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " does not exists");
+            }
+            if(!crawlingFile.isFile()) {
+                throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " exists but is not a regular file");
+            }
+            if(!crawlingFile.canRead()) {
+                throw new IOException("Can not read : " + crawlingFile.getAbsolutePath());
+            }
+        }
+        if (crawlingFile != null) {
+            FileInputStream inStream = null;
+            try {
+                inStream = new FileInputStream(crawlingFile);
+                FileUtils.copy(inStream, writer);
+            } finally {
+                if(inStream != null) {
+                    try {
+                        inStream.close();
+                    } catch(IOException ignoredException) {
+                        ConcurrentLog.info("Crawler_p", "Could not close crawlingFile : " + crawlingFile.getAbsolutePath());
+                    }
+                }
+            }
+        } else {
+            FileUtils.copy(crawlingFileContent, writer);
+        }
+        writer.close();
+
+        // get links and generate filter
+        hyperlinks_from_file = scraper.getAnchors();
+        return hyperlinks_from_file;
+    }
+
     private static Date timeParser(final boolean recrawlIfOlderCheck, final int number, final String unit) {
         if (!recrawlIfOlderCheck) return null;
         if ("year".equals(unit)) return new Date(System.currentTimeMillis() - number * AbstractFormatter.normalyearMillis);
diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java
index e71ff1470..efc205516 100644
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@@ -151,26 +151,47 @@ public final class CrawlStacker {
         if (CrawlStacker.log.isFinest()) CrawlStacker.log.finest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + ((entry.initiator() == null) ? "" : ASCII.String(entry.initiator())) + ", name=" + entry.name() + ", appdate=" + entry.appdate() + ", depth=" + entry.depth());
         this.requestQueue.enQueue(entry);
     }
+
     public void enqueueEntriesAsynchronous(
             final byte[] initiator, final String profileHandle, final List<AnchorURL> hyperlinks, final int timezoneOffset) {
-        new Thread() {
+        new Thread("enqueueEntriesAsynchronous") {
             @Override
             public void run() {
-                Thread.currentThread().setName("enqueueEntriesAsynchronous");
                 enqueueEntries(initiator, profileHandle, hyperlinks, true, timezoneOffset);
             }
         }.start();
     }
-
-    private void enqueueEntries(
+
+    /**
+     * Enqueue crawl start entries
+     * @param initiator Hash of the peer initiating the crawl
+     * @param profileHandle name of the active crawl profile
+     * @param hyperlinks crawl starting point links to stack
+     * @param replace Specify whether old indexed entries should be replaced
+     * @param timezoneOffset local time-zone offset
+     */
+    public void enqueueEntries(
             final byte[] initiator, final String profileHandle, final List<AnchorURL> hyperlinks, final boolean replace, final int timezoneOffset) {
+        /* Let's check if the profile is still active before removing any existing entry */
+        byte[] handle = UTF8.getBytes(profileHandle);
+        final CrawlProfile profile = this.crawler.get(handle);
+        if (profile == null) {
+            String error;
+            if(hyperlinks.size() == 1) {
+                error = "Rejected URL : " + hyperlinks.get(0).toNormalform(false) + ". Reason : LOST STACKER PROFILE HANDLE '" + profileHandle + "'";
+            } else {
+                error = "Rejected " + hyperlinks.size() + " crawl entries. Reason : LOST STACKER PROFILE HANDLE '" + profileHandle + "'";
+            }
+            CrawlStacker.log.info(error); // this is NOT an error but a normal effect when terminating a crawl queue
+            return;
+        }
         if (replace) {
             // delete old entries, if exists to force a re-load of the url (thats wanted here)
             Set<String> hosthashes = new HashSet<String>();
diff --git a/source/net/yacy/crawler/CrawlStarterFromSraper.java b/source/net/yacy/crawler/CrawlStarterFromSraper.java
new file mode 100644
index 000000000..afc8d9ba9
--- /dev/null
+++ b/source/net/yacy/crawler/CrawlStarterFromSraper.java
@@ -0,0 +1,100 @@
+// CrawlStarterFromSraper.java
+// ---------------------------
+// Copyright 2016 by luccioman; https://github.com/luccioman
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package net.yacy.crawler;
+
+import java.net.MalformedURLException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+
+import net.yacy.cora.document.id.AnchorURL;
+import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.crawler.data.CrawlProfile;
+import net.yacy.document.parser.html.ContentScraperListener;
+
+/**
+ * Enqueue an entry to the crawlStacker each time an anchor is discovered by the ContentScraper
+ * @author luccioman
+ *
+ */
+public class CrawlStarterFromSraper implements ContentScraperListener {
+
+    private final static ConcurrentLog log = new ConcurrentLog(CrawlStarterFromSraper.class.getSimpleName());
+
+    /** CrawlStacker instance : will receive anchor links used as crawl starting points */
+    private CrawlStacker crawlStacker;
+    /** Hash of the peer initiating the crawl */
+    private final byte[] initiatorHash;
+    /** Active crawl profile */
+    private CrawlProfile profile;
+    /** Specify whether old indexed entries should be replaced */
+    private final boolean replace;
+
+    /**
+     * Constructor
+     * @param crawlStacker CrawlStacker instance : will receive anchor links used as crawl starting points
+     * @param initiatorHash Hash of the peer initiating the crawl (must not be null)
+     * @param profile active crawl profile (must not be null)
+     * @param replace Specify whether old indexed entries should be replaced
+     * @throws IllegalArgumentException when a required parameter is null
+     */
+    public CrawlStarterFromSraper(final CrawlStacker crawlStacker, final byte[] initiatorHash,
+            final CrawlProfile profile,
+            final boolean replace) {
+        if(crawlStacker == null) {
+            throw new IllegalArgumentException("crawlStacker parameter must not be null");
+        }
+        this.crawlStacker = crawlStacker;
+        if(initiatorHash == null) {
+            throw new IllegalArgumentException("initiatorHash parameter must not be null");
+        }
+        this.initiatorHash = initiatorHash;
+        this.replace = replace;
+        if(profile == null) {
+            throw new IllegalArgumentException("profile parameter must not be null");
+        }
+        this.profile = profile;
+    }
+
+    @Override
+    public void scrapeTag0(String tagname, Properties tagopts) {
+        // Nothing to do on this event
+    }
+
+    @Override
+    public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
+        // Nothing to do on this event
+    }
+
+    @Override
+    public void anchorAdded(String anchorURL) {
+        List<AnchorURL> urls = new ArrayList<>();
+        try {
+            urls.add(new AnchorURL(anchorURL));
+            this.crawlStacker.enqueueEntries(this.initiatorHash, this.profile.handle(), urls, this.replace, this.profile.timezoneOffset());
+        } catch (MalformedURLException e) {
+            log.warn("Malformed URL : " + anchorURL);
+        }
+    }
+
+}
\ No newline at end of file
diff --git a/source/net/yacy/crawler/FileCrawlStarterTask.java b/source/net/yacy/crawler/FileCrawlStarterTask.java
new file mode 100644
index 000000000..c3aabc991
--- /dev/null
+++ b/source/net/yacy/crawler/FileCrawlStarterTask.java
@@ -0,0 +1,173 @@
+// FileCrawlStarterTask.java
+// ---------------------------
+// Copyright 2016 by luccioman; https://github.com/luccioman
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package net.yacy.crawler;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.Writer;
+
+import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.crawler.data.CrawlProfile;
+import net.yacy.document.parser.html.ContentScraper;
+import net.yacy.document.parser.html.TransformerWriter;
+import net.yacy.kelondro.util.FileUtils;
+
+/**
+ * A task used to trigger crawl starts from a file (HTML or any other supported
+ * text file) containing anchor links. It does not wait for full file parsing
+ * before sending anchor links to the crawl stacker and thus can handle files
+ * with many links.
+ *
+ * @author luccioman
+ */
+public class FileCrawlStarterTask extends Thread {
+
+    private final static ConcurrentLog log = new ConcurrentLog(FileCrawlStarterTask.class.getSimpleName());
+
+    /** A text file containing crawl start links */
+    private File crawlingFile;
+    /** Alternative to crawlingFile : holds file content */
+    private String crawlingFileContent;
+    /** Content scraper that will scrape file content */
+    private ContentScraper scraper;
+    /** Active crawl profile */
+    private CrawlProfile profile;
+    /**
+     * CrawlStacker instance : will receive anchor links used as crawl starting
+     * points
+     */
+    private CrawlStacker crawlStacker;
+    /** Hash of the peer initiating the crawl */
+    private final byte[] initiatorHash;
+
+    /**
+     * Constructor
+     *
+     * @param crawlingFile
+     *            a text file containing crawl start links (alternatively,
+     *            crawlingFileContent parameter can be used)
+     * @param crawlingFileContent
+     *            content of a text file containing crawl start links
+     *            (alternatively, crawlingFile parameter can be used)
+     * @param scraper
+     *            ContentScraper instance used to scrape links from the file
+     * @param profile
+     *            active crawl profile (must not be null)
+     * @param crawlStacker
+     *            CrawlStacker instance : will receive anchor links used as
+     *            crawl starting points (must not be null)
+     * @param initiatorHash
+     *            Hash of the peer initiating the crawl
+     * @throws IllegalArgumentException
+     *             when one of the required parameters is null
+     * @throws IOException
+     *             when crawlingFileContent is null and crawlingFile does not
+     *             exist or cannot be read
+     */
+    public FileCrawlStarterTask(final File crawlingFile, final String crawlingFileContent, final ContentScraper scraper,
+            final CrawlProfile profile, final CrawlStacker crawlStacker, final byte[] initiatorHash)
+            throws IllegalArgumentException, FileNotFoundException, IOException {
+        super(FileCrawlStarterTask.class.getSimpleName());
+        if (crawlingFile == null && crawlingFileContent == null) {
+            throw new IllegalArgumentException(
+                    "At least one of crawlingFile or crawlingFileContent parameter must not be null");
+        }
+        if ((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) {
+            /*
+             * Let's check now if the crawlingFile exists and can be read so the
+             * error can be synchronously reported to the caller
+             */
+            if (!crawlingFile.exists()) {
+                throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " does not exists");
+            }
+            if (!crawlingFile.isFile()) {
+                throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " exists but is not a regular file");
+            }
+            if (!crawlingFile.canRead()) {
+                throw new IOException("Can not read : " + crawlingFile.getAbsolutePath());
+            }
+        }
+        this.crawlingFile = crawlingFile;
+        this.crawlingFileContent = crawlingFileContent;
+        if (scraper == null) {
+            throw new IllegalArgumentException("scraper parameter must not be null");
+        }
+        this.scraper = scraper;
+        if (profile == null) {
+            throw new IllegalArgumentException("profile parameter must not be null");
+        }
+        this.profile = profile;
+        if (crawlStacker == null) {
+            throw new IllegalArgumentException("crawlStacker parameter must not be null");
+        }
+        this.crawlStacker = crawlStacker;
+        if (initiatorHash == null) {
+            throw new IllegalArgumentException("initiatorHash parameter must not be null");
+        }
+        this.initiatorHash = initiatorHash;
+    }
+
+    /**
+     * Run the content scraping on the file and push each anchor link to the
+     * crawlStacker as soon as it is detected.
+     */
+    @Override
+    public void run() {
+        /*
+         * This is the listener which makes it possible to push links to the
+         * crawl stacker without waiting for the complete end of content scraping
+         */
+        CrawlStarterFromSraper anchorListener = new CrawlStarterFromSraper(this.crawlStacker, this.initiatorHash,
+                this.profile, true);
+        this.scraper.registerHtmlFilterEventListener(anchorListener);
+
+        final Writer writer = new TransformerWriter(null, null, this.scraper, null, false);
+        FileInputStream inStream = null;
+
+        try {
+            if (this.crawlingFile != null && this.crawlingFile.exists()) {
+                inStream = new FileInputStream(this.crawlingFile);
+                FileUtils.copy(inStream, writer);
+            } else {
+                FileUtils.copy(this.crawlingFileContent, writer);
+            }
+            writer.close();
+        } catch (IOException e) {
+            log.severe("Error parsing the crawlingFile " + this.crawlingFile.getAbsolutePath(), e);
+        } catch (Throwable t) {
+            /* Other errors are likely to occur when the crawl is interrupted : still log this at warning level to avoid polluting the regular error log */
+            log.warn("Error parsing the crawlingFile " + this.crawlingFile.getAbsolutePath(), t);
+        } finally {
+            if (inStream != null) {
+                try {
+                    inStream.close();
+                } catch (IOException e) {
+                    log.warn("Could not close crawlingFile : " + this.crawlingFile.getAbsolutePath());
+                }
+            }
+        }
+    }
+
+}
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index 7d7f8c71c..d970776ca 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -356,7 +356,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                 if (u.endsWith(".")) u = u.substring(0, u.length() - 1); // remove the '.' that was appended above
                 s = p + 6;
                 try {
-                    this.anchors.add(new AnchorURL(u));
+                    this.addAnchor(new AnchorURL(u));
                     continue;
                 } catch (final MalformedURLException e) {}
             }
@@ -505,7 +505,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
             if(src != null) {
                 tag.opts.put("src", src.toNormalform(true));
                 src.setAll(tag.opts);
-                //this.anchors.add(src); // don't add the frame to the anchors because the webgraph should not contain such links (by definition)
+                //this.addAnchor(src); // don't add the frame to the anchors because the webgraph should not contain such links (by definition)
                 this.frames.add(src);
                 this.evaluationScores.match(Element.framepath, src.toNormalform(true));
             }
@@ -539,7 +539,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                 if(url != null) {
                     tag.opts.put("href", url.toNormalform(true));
                     url.setAll(tag.opts);
-                    this.anchors.add(url);
+                    this.addAnchor(url);
                 }
             }
         } else if (tag.name.equalsIgnoreCase("link")) {
@@ -574,7 +574,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                 } else if (rel.equalsIgnoreCase("canonical")) {
                     tag.opts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next());
                     newLink.setAll(tag.opts);
-                    this.anchors.add(newLink);
+                    this.addAnchor(newLink);
                     this.canonical = newLink;
                 } else if (rel.equalsIgnoreCase("publisher")) {
                     this.publisher = newLink;
@@ -590,7 +590,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                 } else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) {
                     tag.opts.put("name", linktitle);
                     newLink.setAll(tag.opts);
-                    this.anchors.add(newLink);
+                    this.addAnchor(newLink);
                 }
             }
         } else if(tag.name.equalsIgnoreCase("embed") || tag.name.equalsIgnoreCase("source")) { //html5 tag
@@ -605,7 +605,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                     final EmbedEntry ie = new EmbedEntry(url, width, height, tag.opts.getProperty("type", EMPTY_STRING), tag.opts.getProperty("pluginspage", EMPTY_STRING));
                     this.embeds.put(url, ie);
                     url.setAll(tag.opts);
-                    // this.anchors.add(url); // don't add the embed to the anchors because the webgraph should not contain such links (by definition)
+                    // this.addAnchor(url); // don't add the embed to the anchors because the webgraph should not contain such links (by definition)
                 }
             }
         } catch (final NumberFormatException e) {}
@@ -615,13 +615,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                 AnchorURL url = absolutePath(tag.opts.getProperty("value", EMPTY_STRING));
                 tag.opts.put("value", url.toNormalform(true));
                 url.setAll(tag.opts);
-                this.anchors.add(url);
+                this.addAnchor(url);
             }
         } else if (tag.name.equalsIgnoreCase("iframe")) {
             final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
             tag.opts.put("src", src.toNormalform(true));
             src.setAll(tag.opts);
-            //this.anchors.add(src); // don't add the iframe to the anchors because the webgraph should not contain such links (by definition)
+            //this.addAnchor(src); // don't add the iframe to the anchors because the webgraph should not contain such links (by definition)
             this.iframes.add(src);
             this.evaluationScores.match(Element.iframepath, src.toNormalform(true));
         } else if (tag.name.equalsIgnoreCase("html")) {
@@ -631,7 +631,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         }
 
         // fire event
-        fireScrapeTag0(tag.name, tag.opts);
+        this.fireScrapeTag0(tag.name, tag.opts);
     }
 
     @Override
@@ -652,7 +652,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                 tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
                 url.setAll(tag.opts);
                 recursiveParse(url, tag.content.getChars());
-                this.anchors.add(url);
+                this.addAnchor(url);
             }
             this.evaluationScores.match(Element.apath, href);
         }
@@ -727,7 +727,16 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         }
 
         // fire event
-        fireScrapeTag1(tag.name, tag.opts, tag.content.getChars());
+        this.fireScrapeTag1(tag.name, tag.opts, tag.content.getChars());
+    }
+
+    /**
+     * Add an anchor to the anchors list, and notify any registered listener
+     * @param anchor anchor to add. Must not be null.
+     */
+    protected void addAnchor(AnchorURL anchor) {
+        this.anchors.add(anchor);
+        this.fireAddAnchor(anchor.toNormalform(false));
     }
 
 
@@ -755,7 +764,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
             }
         }
         for (final AnchorURL entry: scraper.getAnchors()) {
-            this.anchors.add(entry);
+            this.addAnchor(entry);
         }
         String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars())));
         StringBuilder altakk = new StringBuilder();
@@ -1221,24 +1230,40 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         System.out.println("TEXT :" + this.content.toString());
     }
 
+    /**
+     * Register a listener for some scrape events
+     * @param listener ScraperListener implementation
+     */
     @Override
     public void registerHtmlFilterEventListener(final ScraperListener listener) {
         if (listener != null) {
-            this.htmlFilterEventListeners.add(ScraperListener.class, listener);
+            if(listener instanceof ContentScraperListener) {
+                this.htmlFilterEventListeners.add(ContentScraperListener.class, (ContentScraperListener)listener);
+            } else {
+                this.htmlFilterEventListeners.add(ScraperListener.class, listener);
+            }
         }
     }
 
+    /**
+     * Unregister a previously registered listener
+     * @param listener ScraperListener implementation
+     */
     @Override
     public void deregisterHtmlFilterEventListener(final ScraperListener listener) {
         if (listener != null) {
-            this.htmlFilterEventListeners.remove(ScraperListener.class, listener);
+            if(listener instanceof ContentScraperListener) {
+                this.htmlFilterEventListeners.remove(ContentScraperListener.class, (ContentScraperListener)listener);
+            } else {
+                this.htmlFilterEventListeners.remove(ScraperListener.class, listener);
+            }
         }
     }
 
     private void fireScrapeTag0(final String tagname, final Properties tagopts) {
         final Object[] listeners = this.htmlFilterEventListeners.getListenerList();
-        for (int i=0; i
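
For reference, the intended wiring of the classes introduced by this patch mirrors the change to Crawler_p.java above: a ContentScraper is created for the start file, a FileCrawlStarterTask couples it to the CrawlStacker, and each anchor is stacked through the CrawlStarterFromSraper listener while the file is still being parsed. The following is a minimal usage sketch, not part of the patch; the Switchboard and VocabularyScraper import paths and the helper class and method names are assumptions, while the constructor calls are taken from the patch itself.

import java.io.File;
import java.io.IOException;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.crawler.FileCrawlStarterTask;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.search.Switchboard;

public class FileCrawlStartSketch {

    /** Start a crawl from a local file so that anchors are stacked as soon as the
     *  ContentScraper discovers them, instead of after the whole file has been parsed. */
    static void startCrawlFromFile(final Switchboard sb, final CrawlProfile profile, final File startFile)
            throws IOException {
        // Scraper for the start file, using the same 10000000 limit as Crawler_p.java
        final ContentScraper scraper = new ContentScraper(new DigestURL(startFile), 10000000,
                new VocabularyScraper(), profile.timezoneOffset());
        /* The task registers a CrawlStarterFromSraper listener on the scraper, so every
         * anchorAdded() event immediately enqueues the link on the CrawlStacker.
         * crawlingFileContent is passed as null because the file itself is readable. */
        final FileCrawlStarterTask task = new FileCrawlStarterTask(startFile, null, scraper, profile,
                sb.crawlStacker, sb.peers.mySeed().hash.getBytes());
        /* The crawl profile is assumed to have been registered as active beforehand
         * (sb.crawler.putActive(...), as Crawler_p.java does) so stacked links are accepted. */
        task.start();
    }
}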