From ae10ed5613b32257edddaa9a2a794cfef143ce8d Mon Sep 17 00:00:00 2001 From: low012 Date: Fri, 28 Jan 2011 16:24:33 +0000 Subject: [PATCH] *) added a Set that collects the filter elements before the mustmatch-filter is created, to avoid huge lists of duplicate elements in the mustmatch-filter when starting a crawl from a "Link-List of URL" on CrawlStartSite_p.html git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7456 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Crawler_p.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index f44388e47..d1eff945d 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -30,6 +30,7 @@ import java.io.Writer; import java.net.MalformedURLException; import java.util.Date; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Set; @@ -479,8 +480,12 @@ public class Crawler_p { // get links and generate filter final StringBuilder filter = new StringBuilder(); final Map hyperlinks = scraper.getAnchors(); - for (MultiProtocolURI uri: hyperlinks.keySet()) { - filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*"); + final Set filterSet = new HashSet(); + for (final MultiProtocolURI uri: hyperlinks.keySet()) { + filterSet.add(new StringBuilder().append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*").toString()); + } + for (final String element : filterSet) { + filter.append('|').append(element); } newcrawlingMustMatch = filter.length() > 0 ? filter.substring(1) : "";