From ca72ed752655c5a62a253849bc38322641fb6641 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Wed, 15 Jul 2009 21:07:46 +0000
Subject: [PATCH] -removed superfluous crawl cache -refactoring of crawler classes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6221 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/ConfigAppearance_p.java                |  2 +-
 htroot/ConfigLanguage_p.java                  |  2 +-
 htroot/IndexCreateIndexingQueue_p.java        |  3 +-
 htroot/IndexCreateLoaderQueue_p.java          |  4 +-
 htroot/IndexCreateWWWGlobalQueue_p.java       |  6 +-
 htroot/IndexCreateWWWLocalQueue_p.java        | 10 +--
 htroot/IndexCreateWWWRemoteQueue_p.java       |  6 +-
 htroot/Network.java                           |  2 +-
 htroot/QuickCrawlLink_p.java                  |  4 +-
 htroot/Status.java                            |  2 +-
 htroot/ViewFile.java                          |  4 +-
 htroot/WatchCrawler_p.java                    |  8 +--
 htroot/api/queues_p.java                      | 13 ++--
 htroot/api/util/getpageinfo_p.java            |  2 +-
 htroot/rct_p.java                             |  4 +-
 htroot/sharedBlacklist_p.java                 |  2 +-
 htroot/yacy/urls.java                         |  4 +-
 source/de/anomic/crawler/Balancer.java        | 33 +++++-----
 source/de/anomic/crawler/CrawlQueues.java     | 30 +++++----
 source/de/anomic/crawler/CrawlStacker.java    | 13 ++--
 source/de/anomic/crawler/IndexingStack.java   | 65 +++++++++----------
 source/de/anomic/crawler/LoaderMessage.java   | 10 +--
 .../de/anomic/crawler/NoticeURLImporter.java  |  5 +-
 source/de/anomic/crawler/NoticedURL.java      | 25 +++----
 source/de/anomic/crawler/RobotsTxt.java       |  1 +
 source/de/anomic/crawler/ZURL.java            | 11 ++--
 .../crawler/{ => retrieval}/FTPLoader.java    | 18 ++---
 .../crawler/{ => retrieval}/HTTPLoader.java   | 16 ++---
 .../LoaderDispatcher.java}                    | 15 ++---
 .../Request.java}                             | 11 ++--
 .../retrieval/Response.java}                  | 11 ++--
 source/de/anomic/data/SitemapParser.java      |  6 +-
 source/de/anomic/data/bookmarksDB.java        |  4 +-
 .../document/parser/html/ContentScraper.java  |  2 +-
 .../de/anomic/document/parser/odtParser.java  |  2 +-
 .../de/anomic/document/parser/rpmParser.java  |  2 +-
 .../de/anomic/document/parser/vcfParser.java  |  2 +-
 source/de/anomic/http/httpdProxyHandler.java  |  7 +-
 source/de/anomic/kelondro/text/Metadata.java  |  4 +-
 source/de/anomic/kelondro/text/Segment.java   |  6 +-
 .../metadataPrototype/URLMetadataRow.java     |  6 +-
 source/de/anomic/plasma/plasmaHTCache.java    |  4 +-
 .../de/anomic/plasma/plasmaSwitchboard.java   | 30 +++++----
 source/de/anomic/search/SnippetCache.java     |  8 +--
 source/de/anomic/tools/loaderThreads.java     |  2 +-
 source/de/anomic/yacy/yacyClient.java         |  2 +-
 source/de/anomic/yacy/yacyRelease.java        |  2 +-
 source/de/anomic/yacy/yacySeedDB.java         |  2 +-
 source/de/anomic/ymage/ymageOSM.java          |  4 +-
 49 files changed, 223 insertions(+), 214 deletions(-)
 rename source/de/anomic/crawler/{ => retrieval}/FTPLoader.java (92%)
 rename source/de/anomic/crawler/{ => retrieval}/HTTPLoader.java (93%)
 rename source/de/anomic/crawler/{ProtocolLoader.java => retrieval/LoaderDispatcher.java} (91%)
 rename source/de/anomic/crawler/{CrawlEntry.java => retrieval/Request.java} (97%)
 rename source/de/anomic/{http/httpDocument.java => crawler/retrieval/Response.java} (98%)

diff --git a/htroot/ConfigAppearance_p.java b/htroot/ConfigAppearance_p.java
index 8bd9c8a42..7f503bdf9 100644
--- a/htroot/ConfigAppearance_p.java
+++ b/htroot/ConfigAppearance_p.java
@@ -34,7 +34,7 @@
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
-import de.anomic.crawler.HTTPLoader;
+import de.anomic.crawler.retrieval.HTTPLoader;
 import de.anomic.data.listManager;
 import de.anomic.http.httpClient;
 import de.anomic.http.httpHeader;
diff --git a/htroot/ConfigLanguage_p.java b/htroot/ConfigLanguage_p.java
index
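Taken together, the rename summary above amounts to a new de.anomic.crawler.retrieval package that bundles everything needed to fetch one resource: the request, the response, the per-protocol loaders, and the dispatcher that picks between them. A minimal sketch of how the imports shift for a caller; the old-to-new mapping is read off the rename lines, and the import block itself is only illustrative:

    // Old location (before this patch)       New location (this patch)
    // de.anomic.crawler.CrawlEntry       ->  de.anomic.crawler.retrieval.Request
    // de.anomic.crawler.ProtocolLoader   ->  de.anomic.crawler.retrieval.LoaderDispatcher
    // de.anomic.http.httpDocument        ->  de.anomic.crawler.retrieval.Response
    // de.anomic.crawler.HTTPLoader       ->  de.anomic.crawler.retrieval.HTTPLoader
    // de.anomic.crawler.FTPLoader        ->  de.anomic.crawler.retrieval.FTPLoader

    import de.anomic.crawler.retrieval.HTTPLoader;
    import de.anomic.crawler.retrieval.LoaderDispatcher;
    import de.anomic.crawler.retrieval.Request;
    import de.anomic.crawler.retrieval.Response;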
8a2683ec8..a30f791d8 100644 --- a/htroot/ConfigLanguage_p.java +++ b/htroot/ConfigLanguage_p.java @@ -36,7 +36,7 @@ import java.util.HashMap; import java.util.Iterator; import java.util.List; -import de.anomic.crawler.HTTPLoader; +import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.data.listManager; import de.anomic.data.translator; import de.anomic.http.httpClient; diff --git a/htroot/IndexCreateIndexingQueue_p.java b/htroot/IndexCreateIndexingQueue_p.java index e587af77d..b8492d687 100644 --- a/htroot/IndexCreateIndexingQueue_p.java +++ b/htroot/IndexCreateIndexingQueue_p.java @@ -86,7 +86,7 @@ public class IndexCreateIndexingQueue_p { yacySeed initiator; boolean dark; - if ((sb.crawler.indexingStack.size() == 0) && (sb.crawler.indexingStack.getActiveQueueSize() == 0)) { + if ((sb.crawler.indexingStack.size() == 0) && (sb.getActiveQueueSize() == 0)) { prop.put("indexing-queue", "0"); //is empty } else { prop.put("indexing-queue", "1"); // there are entries in the queue or in process @@ -98,7 +98,6 @@ public class IndexCreateIndexingQueue_p { // getting all entries that are currently in process final ArrayList entryList = new ArrayList(); - entryList.addAll(sb.crawler.indexingStack.getActiveQueueEntries()); final int inProcessCount = entryList.size(); // getting all enqueued entries diff --git a/htroot/IndexCreateLoaderQueue_p.java b/htroot/IndexCreateLoaderQueue_p.java index 52f3d536d..2763ead12 100644 --- a/htroot/IndexCreateLoaderQueue_p.java +++ b/htroot/IndexCreateLoaderQueue_p.java @@ -24,7 +24,7 @@ // javac -classpath .:../classes IndexCreate_p.java // if the shell's current path is HTROOT -import de.anomic.crawler.CrawlEntry; +import de.anomic.crawler.retrieval.Request; import de.anomic.http.httpRequestHeader; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; @@ -44,7 +44,7 @@ public class IndexCreateLoaderQueue_p { } else { prop.put("loader-set", "1"); boolean dark = true; - final CrawlEntry[] w = sb.crawlQueues.activeWorkerEntries(); + final Request[] w = sb.crawlQueues.activeWorkerEntries(); yacySeed initiator; int count = 0; for (int i = 0; i < w.length; i++) { diff --git a/htroot/IndexCreateWWWGlobalQueue_p.java b/htroot/IndexCreateWWWGlobalQueue_p.java index 108546738..f8966f473 100644 --- a/htroot/IndexCreateWWWGlobalQueue_p.java +++ b/htroot/IndexCreateWWWGlobalQueue_p.java @@ -29,9 +29,9 @@ import java.util.ArrayList; import java.util.Date; import java.util.Locale; -import de.anomic.crawler.CrawlEntry; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.NoticedURL; +import de.anomic.crawler.retrieval.Request; import de.anomic.http.httpRequestHeader; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; @@ -85,9 +85,9 @@ public class IndexCreateWWWGlobalQueue_p { prop.put("crawler-queue", "0"); } else { prop.put("crawler-queue", "1"); - final ArrayList crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_LIMIT, showLimit); + final ArrayList crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_LIMIT, showLimit); - CrawlEntry urle; + Request urle; boolean dark = true; yacySeed initiator; String profileHandle; diff --git a/htroot/IndexCreateWWWLocalQueue_p.java b/htroot/IndexCreateWWWLocalQueue_p.java index e6e958cd1..a8a8fefd4 100644 --- a/htroot/IndexCreateWWWLocalQueue_p.java +++ b/htroot/IndexCreateWWWLocalQueue_p.java @@ -33,12 +33,12 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; -import 
de.anomic.crawler.CrawlEntry; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.NoticedURL; import de.anomic.http.httpRequestHeader; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.crawler.CrawlSwitchboard; +import de.anomic.crawler.retrieval.Request; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacySeed; @@ -109,8 +109,8 @@ public class IndexCreateWWWLocalQueue_p { } } else { // iterating through the list of URLs - final Iterator iter = sb.crawlQueues.noticeURL.iterator(NoticedURL.STACK_TYPE_CORE); - CrawlEntry entry; + final Iterator iter = sb.crawlQueues.noticeURL.iterator(NoticedURL.STACK_TYPE_CORE); + Request entry; while (iter.hasNext()) { if ((entry = iter.next()) == null) continue; String value = null; @@ -154,9 +154,9 @@ public class IndexCreateWWWLocalQueue_p { prop.put("crawler-queue", "0"); } else { prop.put("crawler-queue", "1"); - final ArrayList crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_CORE, (int) (showLimit * 1.20)); + final ArrayList crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_CORE, (int) (showLimit * 1.20)); - CrawlEntry urle; + Request urle; boolean dark = true; yacySeed initiator; String profileHandle; diff --git a/htroot/IndexCreateWWWRemoteQueue_p.java b/htroot/IndexCreateWWWRemoteQueue_p.java index 77c4dcc6a..d6c41670b 100644 --- a/htroot/IndexCreateWWWRemoteQueue_p.java +++ b/htroot/IndexCreateWWWRemoteQueue_p.java @@ -29,9 +29,9 @@ import java.util.ArrayList; import java.util.Date; import java.util.Locale; -import de.anomic.crawler.CrawlEntry; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.NoticedURL; +import de.anomic.crawler.retrieval.Request; import de.anomic.http.httpRequestHeader; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; @@ -85,9 +85,9 @@ public class IndexCreateWWWRemoteQueue_p { prop.put("crawler-queue", "0"); } else { prop.put("crawler-queue", "1"); - final ArrayList crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_REMOTE, showLimit); + final ArrayList crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_REMOTE, showLimit); - CrawlEntry urle; + Request urle; boolean dark = true; yacySeed initiator; String profileHandle; diff --git a/htroot/Network.java b/htroot/Network.java index 972d5d21b..c747124fb 100644 --- a/htroot/Network.java +++ b/htroot/Network.java @@ -35,7 +35,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; -import de.anomic.crawler.HTTPLoader; +import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.http.httpClient; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.util.DateFormatter; diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 5a17a1e53..a6b2f380f 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -34,8 +34,8 @@ import java.net.MalformedURLException; import java.net.URLDecoder; import java.util.Date; -import de.anomic.crawler.CrawlEntry; import de.anomic.crawler.CrawlProfile; +import de.anomic.crawler.retrieval.Request; import de.anomic.http.httpHeader; import de.anomic.http.httpRequestHeader; import de.anomic.plasma.plasmaSwitchboard; @@ -159,7 +159,7 @@ public class QuickCrawlLink_p { // stack URL String reasonString = null; - reasonString = sb.crawlStacker.stackCrawl(new CrawlEntry( + reasonString = sb.crawlStacker.stackCrawl(new Request( sb.peers.mySeed().hash, crawlingStartURL, 
null, diff --git a/htroot/Status.java b/htroot/Status.java index 7d360261f..4b35899f1 100644 --- a/htroot/Status.java +++ b/htroot/Status.java @@ -286,7 +286,7 @@ public class Status { prop.putNum("connectionsMax", httpd.getMaxSessionCount()); // Queue information - final int indexingJobCount = sb.getThread("80_indexing").getJobCount() + sb.crawler.indexingStack.getActiveQueueSize(); + final int indexingJobCount = sb.getThread("80_indexing").getJobCount() + sb.getActiveQueueSize(); final int indexingMaxCount = (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30); final int indexingPercent = (indexingMaxCount==0)?0:indexingJobCount*100/indexingMaxCount; prop.putNum("indexingQueueSize", indexingJobCount); diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index f12aa9599..22793ad5d 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -34,6 +34,7 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Map; +import de.anomic.crawler.retrieval.Response; import de.anomic.document.Condenser; import de.anomic.document.ParserException; import de.anomic.document.Document; @@ -42,7 +43,6 @@ import de.anomic.document.parser.html.ImageEntry; import de.anomic.http.httpClient; import de.anomic.http.httpRequestHeader; import de.anomic.http.httpResponseHeader; -import de.anomic.http.httpDocument; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.FileUtils; import de.anomic.plasma.plasmaHTCache; @@ -152,7 +152,7 @@ public class ViewFile { // if the resource body was not cached we try to load it from web if (resource == null) { - httpDocument entry = null; + Response entry = null; try { entry = sb.crawlQueues.loadResourceFromWeb(url, true, false); } catch (final Exception e) { diff --git a/htroot/WatchCrawler_p.java b/htroot/WatchCrawler_p.java index 43dafe1ec..08e2d99eb 100644 --- a/htroot/WatchCrawler_p.java +++ b/htroot/WatchCrawler_p.java @@ -36,10 +36,10 @@ import java.util.Set; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; -import de.anomic.crawler.CrawlEntry; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.SitemapImporter; import de.anomic.crawler.ZURL; +import de.anomic.crawler.retrieval.Request; import de.anomic.data.bookmarksDB; import de.anomic.data.listManager; import de.anomic.document.parser.html.ContentScraper; @@ -225,7 +225,7 @@ public class WatchCrawler_p { crawlingQ, indexText, indexMedia, storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw); - final String reasonString = sb.crawlStacker.stackCrawl(new CrawlEntry( + final String reasonString = sb.crawlStacker.stackCrawl(new Request( sb.peers.mySeed().hash, url, null, @@ -279,7 +279,7 @@ public class WatchCrawler_p { prop.putHTML("info_reasonString", reasonString); final ZURL.Entry ee = sb.crawlQueues.errorURL.newEntry( - new CrawlEntry( + new Request( sb.peers.mySeed().hash, crawlingStartURL, "", @@ -364,7 +364,7 @@ public class WatchCrawler_p { if (nexturl == null) continue; // enqueuing the url for crawling - sb.crawlStacker.enqueueEntry(new CrawlEntry( + sb.crawlStacker.enqueueEntry(new Request( sb.peers.mySeed().hash, nexturl, "", diff --git a/htroot/api/queues_p.java b/htroot/api/queues_p.java index a9506b9c7..904316e28 100755 --- a/htroot/api/queues_p.java +++ b/htroot/api/queues_p.java @@ -4,9 +4,9 @@ import java.util.Date; import java.util.Iterator; import java.util.Locale; -import de.anomic.crawler.CrawlEntry; import de.anomic.crawler.IndexingStack; import 
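The Status hunk above shows the other half of the commit message: the number of documents currently in process is no longer asked from the indexing stack's internal cache but from the switchboard. A short before/after sketch, assuming sb is the plasmaSwitchboard instance used by the servlet:

    // before: sb.crawler.indexingStack.getActiveQueueSize()  (cache inside IndexingStack, removed further below)
    // after:  sb.getActiveQueueSize()                        (asked from the switchboard itself)
    final int indexingJobCount = sb.getThread("80_indexing").getJobCount() + sb.getActiveQueueSize();
    final int indexingMaxCount = (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30);
    final int indexingPercent  = (indexingMaxCount == 0) ? 0 : indexingJobCount * 100 / indexingMaxCount;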
de.anomic.crawler.NoticedURL; +import de.anomic.crawler.retrieval.Request; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.util.kelondroException; import de.anomic.plasma.plasmaSwitchboard; @@ -39,11 +39,11 @@ public class queues_p { yacySeed initiator; //indexing queue - prop.putNum("indexingSize", sb.getThread(plasmaSwitchboardConstants.INDEXER).getJobCount() + sb.crawler.indexingStack.getActiveQueueSize()); + prop.putNum("indexingSize", sb.getThread(plasmaSwitchboardConstants.INDEXER).getJobCount() + sb.getActiveQueueSize()); prop.putNum("indexingMax", (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30)); prop.putNum("urlpublictextSize", sb.indexSegment.urlMetadata().size()); prop.putNum("rwipublictextSize", sb.indexSegment.termIndex().sizesMax()); - if ((sb.crawler.indexingStack.size() == 0) && (sb.crawler.indexingStack.getActiveQueueSize() == 0)) { + if ((sb.crawler.indexingStack.size() == 0) && (sb.getActiveQueueSize() == 0)) { prop.put("list", "0"); //is empty } else { IndexingStack.QueueEntry pcentry; @@ -52,7 +52,6 @@ public class queues_p { // getting all entries that are currently in process final ArrayList entryList = new ArrayList(); - entryList.addAll(sb.crawler.indexingStack.getActiveQueueEntries()); final int inProcessCount = entryList.size(); // getting all enqueued entries @@ -97,7 +96,7 @@ public class queues_p { if (sb.crawlQueues.size() == 0) { prop.put("list-loader", "0"); } else { - final CrawlEntry[] w = sb.crawlQueues.activeWorkerEntries(); + final Request[] w = sb.crawlQueues.activeWorkerEntries(); int count = 0; for (int i = 0; i < w.length; i++) { if (w[i] == null) continue; @@ -138,10 +137,10 @@ public class queues_p { } - public static final void addNTable(final plasmaSwitchboard sb, final serverObjects prop, final String tableName, final ArrayList crawlerList) { + public static final void addNTable(final plasmaSwitchboard sb, final serverObjects prop, final String tableName, final ArrayList crawlerList) { int showNum = 0; - CrawlEntry urle; + Request urle; yacySeed initiator; for (int i = 0; i < crawlerList.size(); i++) { urle = crawlerList.get(i); diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java index cf191a7b4..72cc02eaa 100755 --- a/htroot/api/util/getpageinfo_p.java +++ b/htroot/api/util/getpageinfo_p.java @@ -3,7 +3,7 @@ import java.io.IOException; import java.net.MalformedURLException; import java.util.Set; -import de.anomic.crawler.HTTPLoader; +import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.document.parser.html.ContentScraper; import de.anomic.http.httpHeader; import de.anomic.http.httpRequestHeader; diff --git a/htroot/rct_p.java b/htroot/rct_p.java index 81b809e10..9d45f2df9 100644 --- a/htroot/rct_p.java +++ b/htroot/rct_p.java @@ -31,7 +31,7 @@ import java.util.Date; import java.util.Iterator; import de.anomic.content.RSSMessage; -import de.anomic.crawler.CrawlEntry; +import de.anomic.crawler.retrieval.Request; import de.anomic.document.parser.xml.RSSFeed; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.util.DateFormatter; @@ -77,7 +77,7 @@ public class rct_p { if (urlRejectReason == null) { // stack url if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'"); - sb.crawlStacker.enqueueEntry(new CrawlEntry( + sb.crawlStacker.enqueueEntry(new Request( peerhash, url, (referrer == null) ? 
null : referrer.hash(), diff --git a/htroot/sharedBlacklist_p.java b/htroot/sharedBlacklist_p.java index 31bdef145..58c7afd42 100644 --- a/htroot/sharedBlacklist_p.java +++ b/htroot/sharedBlacklist_p.java @@ -38,7 +38,7 @@ import java.util.Arrays; import java.util.HashSet; import java.util.List; -import de.anomic.crawler.HTTPLoader; +import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.data.AbstractBlacklist; import de.anomic.data.listManager; import de.anomic.data.list.ListAccumulator; diff --git a/htroot/yacy/urls.java b/htroot/yacy/urls.java index a4dfb6ab1..105ba0a84 100644 --- a/htroot/yacy/urls.java +++ b/htroot/yacy/urls.java @@ -27,8 +27,8 @@ import java.io.IOException; import java.util.Date; -import de.anomic.crawler.CrawlEntry; import de.anomic.crawler.NoticedURL; +import de.anomic.crawler.retrieval.Request; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.util.DateFormatter; @@ -62,7 +62,7 @@ public class urls { long maxTime = Math.min(20000, Math.max(1000, post.getInt("time", 10000))); long timeout = System.currentTimeMillis() + maxTime; int c = 0; - CrawlEntry entry; + Request entry; yacyURL referrer; while ((maxCount > 0) && (System.currentTimeMillis() < timeout) && diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index 60d7d126d..b6616f6aa 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -33,6 +33,7 @@ import java.util.TreeMap; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedQueue; +import de.anomic.crawler.retrieval.Request; import de.anomic.kelondro.index.Row; import de.anomic.kelondro.index.ObjectIndex; import de.anomic.kelondro.order.CloneableIterator; @@ -68,7 +69,7 @@ public class Balancer { if (!(cachePath.exists())) cachePath.mkdir(); // make the path cacheStacksPath.mkdirs(); File f = new File(cacheStacksPath, stackname + indexSuffix); - urlFileIndex = new Table(f, CrawlEntry.rowdef, (fullram) ? Table.tailCacheUsageAuto : Table.tailCacheDenyUsage, EcoFSBufferSize, 0); + urlFileIndex = new Table(f, Request.rowdef, (fullram) ? Table.tailCacheUsageAuto : Table.tailCacheDenyUsage, EcoFSBufferSize, 0); profileErrors = 0; lastDomainStackFill = 0; Log.logInfo("Balancer", "opened balancer file with " + urlFileIndex.size() + " entries from " + f.toString()); @@ -108,12 +109,12 @@ public class Balancer { } } - public CrawlEntry get(final String urlhash) throws IOException { + public Request get(final String urlhash) throws IOException { assert urlhash != null; if (urlFileIndex == null) return null; // case occurs during shutdown final Row.Entry entry = urlFileIndex.get(urlhash.getBytes()); if (entry == null) return null; - return new CrawlEntry(entry); + return new Request(entry); } public int removeAllByProfileHandle(final String profileHandle, final long timeout) throws IOException { @@ -125,11 +126,11 @@ public class Balancer { final Iterator i = urlFileIndex.rows(); final HashSet urlHashes = new HashSet(); Row.Entry rowEntry; - CrawlEntry crawlEntry; + Request crawlEntry; final long terminate = (timeout > 0) ? 
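In the Balancer the on-disk queue table is now created with Request.rowdef, and get() wraps a stored row back into a Request. A minimal round-trip sketch, assuming balancer is an open Balancer instance and req a Request built elsewhere (both calls can throw IOException):

    // Push a request and read it back by its URL hash.
    void roundTrip(final Balancer balancer, final Request req) throws IOException {
        balancer.push(req);                                   // stores the request row keyed by req.url().hash()
        final Request again = balancer.get(req.url().hash()); // null if the hash is unknown
        if (again != null) {
            System.out.println("queued: " + again.url().toNormalform(true, false));
        }
    }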
System.currentTimeMillis() + timeout : Long.MAX_VALUE; while (i.hasNext() && (System.currentTimeMillis() < terminate)) { rowEntry = i.next(); - crawlEntry = new CrawlEntry(rowEntry); + crawlEntry = new Request(rowEntry); if (crawlEntry.profileHandle().equals(profileHandle)) { urlHashes.add(crawlEntry.url().hash()); } @@ -215,7 +216,7 @@ public class Balancer { return false; } - public void push(final CrawlEntry entry) throws IOException { + public void push(final Request entry) throws IOException { assert entry != null; String hash = entry.url().hash(); synchronized (this) { @@ -289,7 +290,7 @@ public class Balancer { * @return a url in a CrawlEntry object * @throws IOException */ - public CrawlEntry pop(final boolean delay, final CrawlProfile profile) throws IOException { + public Request pop(final boolean delay, final CrawlProfile profile) throws IOException { // returns a crawl entry from the stack and ensures minimum delta times filltop(delay, -600000, false); @@ -304,7 +305,7 @@ public class Balancer { filltop(delay, 0, true); long sleeptime = 0; - CrawlEntry crawlEntry = null; + Request crawlEntry = null; while (this.urlFileIndex.size() > 0) { // first simply take one of the entries in the top list, that should be one without any delay String result = nextFromDelayed(); @@ -323,7 +324,7 @@ public class Balancer { } //assert urlFileIndex.size() + 1 == s : "urlFileIndex.size() = " + urlFileIndex.size() + ", s = " + s + ", result = " + result; - crawlEntry = new CrawlEntry(rowEntry); + crawlEntry = new Request(rowEntry); //Log.logInfo("Balancer", "fetched next url: " + crawlEntry.url().toNormalform(true, false)); // at this point we must check if the crawlEntry has relevancy because the crawl profile still exists @@ -433,15 +434,15 @@ public class Balancer { } } - public ArrayList top(int count) { + public ArrayList top(int count) { count = Math.min(count, top.size()); - ArrayList cel = new ArrayList(); + ArrayList cel = new ArrayList(); if (count == 0) return cel; for (String n: top) { try { Row.Entry rowEntry = urlFileIndex.get(n.getBytes()); if (rowEntry == null) continue; - final CrawlEntry crawlEntry = new CrawlEntry(rowEntry); + final Request crawlEntry = new Request(rowEntry); cel.add(crawlEntry); count--; if (count <= 0) break; @@ -451,11 +452,11 @@ public class Balancer { return cel; } - public Iterator iterator() throws IOException { + public Iterator iterator() throws IOException { return new EntryIterator(); } - private class EntryIterator implements Iterator { + private class EntryIterator implements Iterator { private Iterator rowIterator; @@ -467,10 +468,10 @@ public class Balancer { return (rowIterator == null) ? false : rowIterator.hasNext(); } - public CrawlEntry next() { + public Request next() { final Row.Entry entry = rowIterator.next(); try { - return (entry == null) ? null : new CrawlEntry(entry); + return (entry == null) ? 
null : new Request(entry); } catch (final IOException e) { rowIterator = null; return null; diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index c9e091c38..c941bc8d5 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -37,9 +37,11 @@ import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import de.anomic.content.RSSMessage; +import de.anomic.crawler.retrieval.Request; +import de.anomic.crawler.retrieval.LoaderDispatcher; +import de.anomic.crawler.retrieval.Response; import de.anomic.document.parser.xml.RSSFeed; import de.anomic.http.httpClient; -import de.anomic.http.httpDocument; import de.anomic.kelondro.table.SplitTable; import de.anomic.kelondro.util.DateFormatter; import de.anomic.kelondro.util.FileUtils; @@ -57,7 +59,7 @@ public class CrawlQueues { protected plasmaSwitchboard sb; protected Log log; protected Map workers; // mapping from url hash to Worker thread object - protected ProtocolLoader loader; + protected LoaderDispatcher loader; private final ArrayList remoteCrawlProviderHashes; public NoticedURL noticeURL; @@ -67,7 +69,7 @@ public class CrawlQueues { this.sb = sb; this.log = new Log("CRAWLER"); this.workers = new ConcurrentHashMap(); - this.loader = new ProtocolLoader(sb, log); + this.loader = new LoaderDispatcher(sb, log); this.remoteCrawlProviderHashes = new ArrayList(); // start crawling management @@ -106,7 +108,7 @@ public class CrawlQueues { public yacyURL getURL(final String urlhash) { assert urlhash != null; if (urlhash == null || urlhash.length() == 0) return null; - final CrawlEntry ne = noticeURL.get(urlhash); + final Request ne = noticeURL.get(urlhash); if (ne != null) return ne.url(); ZURL.Entry ee = delegatedURL.getEntry(urlhash); if (ee != null) return ee.url(); @@ -164,9 +166,9 @@ public class CrawlQueues { delegatedURL.close(); } - public CrawlEntry[] activeWorkerEntries() { + public Request[] activeWorkerEntries() { synchronized (workers) { - final CrawlEntry[] e = new CrawlEntry[workers.size()]; + final Request[] e = new Request[workers.size()]; int i = 0; for (final crawlWorker w: workers.values()) e[i++] = w.entry; return e; @@ -203,7 +205,7 @@ public class CrawlQueues { if(isPaused(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL)) return false; // do a local crawl - CrawlEntry urlEntry = null; + Request urlEntry = null; while (urlEntry == null && noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) > 0) { final String stats = "LOCALCRAWL[" + noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_OVERHANG) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE) + "]"; try { @@ -234,7 +236,7 @@ public class CrawlQueues { * @param stats String for log prefixing * @return */ - private void generateCrawl(CrawlEntry urlEntry, final String stats, final String profileHandle) { + private void generateCrawl(Request urlEntry, final String stats, final String profileHandle) { final CrawlProfile.entry profile = sb.crawler.profilesActiveCrawls.getEntry(profileHandle); if (profile != null) { @@ -443,7 +445,7 @@ public class CrawlQueues { if (urlRejectReason == null) { // stack url if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'"); - sb.crawlStacker.enqueueEntry(new CrawlEntry( + sb.crawlStacker.enqueueEntry(new Request( hash, url, (referrer == null) ? 
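activeWorkerEntries() now exposes the Request objects of the running crawl workers; IndexCreateLoaderQueue_p and queues_p above iterate exactly this array. A sketch of that consumer side (names are illustrative, the null check mirrors the servlet code):

    // List the URLs that crawl workers are currently fetching.
    void listActiveWorkers(final plasmaSwitchboard sb) {
        final Request[] workers = sb.crawlQueues.activeWorkerEntries();
        int count = 0;
        for (final Request w : workers) {
            if (w == null) continue; // guard against empty slots, as the servlets do
            System.out.println(count++ + ": " + w.url().toNormalform(true, false));
        }
    }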
null : referrer.hash(), @@ -491,7 +493,7 @@ public class CrawlQueues { final String stats = "REMOTETRIGGEREDCRAWL[" + noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_OVERHANG) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE) + "]"; try { - final CrawlEntry urlEntry = noticeURL.pop(NoticedURL.STACK_TYPE_REMOTE, true, sb.crawler.profilesActiveCrawls); + final Request urlEntry = noticeURL.pop(NoticedURL.STACK_TYPE_REMOTE, true, sb.crawler.profilesActiveCrawls); final String profileHandle = urlEntry.profileHandle(); // System.out.println("DEBUG plasmaSwitchboard.processCrawling: // profileHandle = " + profileHandle + ", urlEntry.url = " + @@ -505,13 +507,13 @@ public class CrawlQueues { } } - public httpDocument loadResourceFromWeb( + public Response loadResourceFromWeb( final yacyURL url, final boolean forText, final boolean global ) throws IOException { - final CrawlEntry centry = new CrawlEntry( + final Request centry = new Request( sb.peers.mySeed().hash, url, "", @@ -539,11 +541,11 @@ public class CrawlQueues { protected final class crawlWorker extends Thread { - protected CrawlEntry entry; + protected Request entry; private final Integer code; private long start; - public crawlWorker(final CrawlEntry entry) { + public crawlWorker(final Request entry) { this.start = System.currentTimeMillis(); this.entry = entry; this.entry.setStatus("worker-initialized", serverProcessorJob.STATUS_INITIATED); diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index 6b50db1ab..cf8486844 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -31,6 +31,7 @@ package de.anomic.crawler; import java.net.UnknownHostException; import java.util.Date; +import de.anomic.crawler.retrieval.Request; import de.anomic.data.Blacklist; import de.anomic.kelondro.text.Segment; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; @@ -45,7 +46,7 @@ public final class CrawlStacker { private Log log = new Log("STACKCRAWL"); - private serverProcessor fastQueue, slowQueue; + private serverProcessor fastQueue, slowQueue; private long dnsHit, dnsMiss; private CrawlQueues nextQueue; private CrawlSwitchboard crawler; @@ -71,8 +72,8 @@ public final class CrawlStacker { this.acceptLocalURLs = acceptLocalURLs; this.acceptGlobalURLs = acceptGlobalURLs; - this.fastQueue = new serverProcessor("CrawlStackerFast", "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)", new String[]{"Balancer"}, this, "job", 10000, null, 2); - this.slowQueue = new serverProcessor("CrawlStackerSlow", "This is like CrawlStackerFast, but does additionaly a DNS lookup. The CrawlStackerFast does not need this because it can use the DNS cache.", new String[]{"Balancer"}, this, "job", 1000, null, 5); + this.fastQueue = new serverProcessor("CrawlStackerFast", "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)", new String[]{"Balancer"}, this, "job", 10000, null, 2); + this.slowQueue = new serverProcessor("CrawlStackerSlow", "This is like CrawlStackerFast, but does additionaly a DNS lookup. 
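loadResourceFromWeb() is the convenience entry point that ViewFile calls further up: it wraps the URL into a Request with the local peer as initiator and returns the fetched Response. A hedged caller sketch with the error handling reduced to returning null:

    // Fetch a single resource through the crawl queues, as ViewFile does.
    Response fetchForDisplay(final plasmaSwitchboard sb, final yacyURL url) {
        try {
            return sb.crawlQueues.loadResourceFromWeb(url, true /* forText */, false /* global */);
        } catch (final Exception e) {
            return null; // loading failed; the caller has to report the error
        }
    }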
The CrawlStackerFast does not need this because it can use the DNS cache.", new String[]{"Balancer"}, this, "job", 1000, null, 5); this.log.logInfo("STACKCRAWL thread initialized."); } @@ -125,7 +126,7 @@ public final class CrawlStacker { } */ - public CrawlEntry job(CrawlEntry entry) { + public Request job(Request entry) { // this is the method that is called by the busy thread from outside if (entry == null) return null; @@ -145,7 +146,7 @@ public final class CrawlStacker { return null; } - public void enqueueEntry(final CrawlEntry entry) { + public void enqueueEntry(final Request entry) { // DEBUG if (log.isFinest()) log.logFinest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + entry.initiator() + ", name=" + entry.name() + ", load=" + entry.loaddate() + ", depth=" + entry.depth()); @@ -167,7 +168,7 @@ public final class CrawlStacker { } } - public String stackCrawl(final CrawlEntry entry) { + public String stackCrawl(final Request entry) { // stacks a crawl item. The position can also be remote // returns null if successful, a reason string if not successful //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'"); diff --git a/source/de/anomic/crawler/IndexingStack.java b/source/de/anomic/crawler/IndexingStack.java index b27aef6e1..efcfc9bd0 100644 --- a/source/de/anomic/crawler/IndexingStack.java +++ b/source/de/anomic/crawler/IndexingStack.java @@ -29,10 +29,8 @@ package de.anomic.crawler; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; -import java.util.Collection; import java.util.Date; import java.util.Iterator; -import java.util.concurrent.ConcurrentHashMap; import de.anomic.document.Parser; import de.anomic.http.httpHeader; @@ -60,7 +58,6 @@ public class IndexingStack { protected final CrawlProfile profiles; protected final RecordStack sbQueueStack; protected final yacySeedDB peers; - protected final ConcurrentHashMap queueInProcess; public IndexingStack( final yacySeedDB peers, @@ -69,7 +66,6 @@ public class IndexingStack { final CrawlProfile profiles) { this.profiles = profiles; this.peers = peers; - this.queueInProcess = new ConcurrentHashMap(); this.sbQueueStack = RecordStack.open(new File(queuesRoot, sbQueueStackName), rowdef); } @@ -199,27 +195,24 @@ public class IndexingStack { } } - public QueueEntry newEntry(final yacyURL url, final String referrer, final Date ifModifiedSince, final boolean requestWithCookie, - final String initiator, final int depth, final String profilehandle, final String anchorName) { - return new QueueEntry(url, referrer, ifModifiedSince, requestWithCookie, initiator, depth, profilehandle, anchorName); - } - - public void store(final QueueEntry entry) { - queueInProcess.put(entry.url().hash(), entry); - } - - public QueueEntry getActiveEntry(final String urlhash) { - // show one entry from the queue - return this.queueInProcess.get(urlhash); - } - - public int getActiveQueueSize() { - return this.queueInProcess.size(); - } - - public Collection getActiveQueueEntries() { - // todo: check dead entries? 
- return this.queueInProcess.values(); + public QueueEntry newEntry( + final yacyURL url, + final String referrer, + final Date ifModifiedSince, + final boolean requestWithCookie, + final String initiator, + final int depth, + final String profilehandle, + final String anchorName) { + return new QueueEntry( + url, + referrer, + ifModifiedSince, + requestWithCookie, + initiator, + depth, + profilehandle, + anchorName); } public static final int QUEUE_STATE_FRESH = 0; @@ -229,6 +222,9 @@ public class IndexingStack { public static final int QUEUE_STATE_INDEXSTORAGE = 4; public static final int QUEUE_STATE_FINISHED = 5; + /** + * A HarvestResponse is a object that refers to a loaded entity. + */ public class QueueEntry { yacyURL url; // plasmaURL.urlStringLength String referrerHash; // plasmaURL.urlHashLength @@ -245,8 +241,15 @@ public class IndexingStack { private httpResponseHeader responseHeader; private yacyURL referrerURL; - public QueueEntry(final yacyURL url, final String referrer, final Date ifModifiedSince, final boolean requestWithCookie, - final String initiator, final int depth, final String profileHandle, final String anchorName) { + public QueueEntry( + final yacyURL url, + final String referrer, + final Date ifModifiedSince, + final boolean requestWithCookie, + final String initiator, + final int depth, + final String profileHandle, + final String anchorName) { this.url = url; this.referrerHash = referrer; this.ifModifiedSince = ifModifiedSince; @@ -310,14 +313,6 @@ public class IndexingStack { this.status = newStatus; } - public void close() { - queueInProcess.remove(this.url.hash()); - } - - protected void finalize() { - this.close(); - } - public yacyURL url() { return url; } diff --git a/source/de/anomic/crawler/LoaderMessage.java b/source/de/anomic/crawler/LoaderMessage.java index b63236ad1..cc312ee5b 100644 --- a/source/de/anomic/crawler/LoaderMessage.java +++ b/source/de/anomic/crawler/LoaderMessage.java @@ -23,7 +23,7 @@ package de.anomic.crawler; -import de.anomic.http.httpDocument; +import de.anomic.crawler.retrieval.Response; import de.anomic.server.serverSemaphore; import de.anomic.yacy.yacyURL; @@ -41,7 +41,7 @@ public final class LoaderMessage { public final boolean keepInMemory; private serverSemaphore resultSync = null; - private httpDocument result; + private Response result; private String errorMessage; // loadParallel(URL url, String referer, String initiator, int depth, plasmaCrawlProfile.entry profile) { @@ -80,7 +80,7 @@ public final class LoaderMessage { return this.errorMessage; } - public void setResult(final httpDocument theResult) { + public void setResult(final Response theResult) { // store the result this.result = theResult; @@ -88,8 +88,8 @@ public final class LoaderMessage { this.resultSync.V(); } - public httpDocument waitForResult() throws InterruptedException { - httpDocument theResult = null; + public Response waitForResult() throws InterruptedException { + Response theResult = null; this.resultSync.P(); /* =====> CRITICAL SECTION <======== */ diff --git a/source/de/anomic/crawler/NoticeURLImporter.java b/source/de/anomic/crawler/NoticeURLImporter.java index 45f09b7bf..ee803227a 100644 --- a/source/de/anomic/crawler/NoticeURLImporter.java +++ b/source/de/anomic/crawler/NoticeURLImporter.java @@ -8,6 +8,7 @@ import java.util.Iterator; import de.anomic.kelondro.util.FileUtils; import de.anomic.crawler.CrawlSwitchboard; +import de.anomic.crawler.retrieval.Request; public class NoticeURLImporter extends AbstractImporter implements Importer { @@ 
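This hunk is the "superfluous crawl cache" of the commit message: IndexingStack loses its queueInProcess map, so store(), getActiveEntry(), getActiveQueueSize(), getActiveQueueEntries() and the close()/finalize() pair disappear, and newEntry() merely builds the value object. A sketch of the remaining call, assuming sb is the switchboard and all arguments are supplied by the caller:

    // The stack no longer remembers entries that are in process; newEntry() just constructs one.
    IndexingStack.QueueEntry toQueueEntry(final plasmaSwitchboard sb, final yacyURL url,
            final String referrerHash, final Date ifModifiedSince, final String initiatorHash,
            final int depth, final String profileHandle, final String anchorName) {
        return sb.crawler.indexingStack.newEntry(
                url, referrerHash, ifModifiedSince, false /* requestWithCookie */,
                initiatorHash, depth, profileHandle, anchorName);
    }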
-129,11 +130,11 @@ public class NoticeURLImporter extends AbstractImporter implements Importer { } // getting an iterator and loop through the URL entries - final Iterator entryIter = (stackTypes[stackType] == -1) ? this.importNurlDB.iterator(stackType) : null; + final Iterator entryIter = (stackTypes[stackType] == -1) ? this.importNurlDB.iterator(stackType) : null; while (true) { String nextHash = null; - CrawlEntry nextEntry = null; + Request nextEntry = null; try { if (stackTypes[stackType] != -1) { diff --git a/source/de/anomic/crawler/NoticedURL.java b/source/de/anomic/crawler/NoticedURL.java index 2fb571868..f91d25973 100755 --- a/source/de/anomic/crawler/NoticedURL.java +++ b/source/de/anomic/crawler/NoticedURL.java @@ -30,6 +30,7 @@ import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; +import de.anomic.crawler.retrieval.Request; import de.anomic.yacy.logging.Log; public class NoticedURL { @@ -134,7 +135,7 @@ public class NoticedURL { remoteStack.has(urlhash); } - public void push(final int stackType, final CrawlEntry entry) { + public void push(final int stackType, final Request entry) { try { switch (stackType) { case STACK_TYPE_CORE: @@ -151,8 +152,8 @@ public class NoticedURL { } catch (final IOException er) {} } - public CrawlEntry get(final String urlhash) { - CrawlEntry entry = null; + public Request get(final String urlhash) { + Request entry = null; try {if ((entry = coreStack.get(urlhash)) != null) return entry;} catch (final IOException e) {} try {if ((entry = limitStack.get(urlhash)) != null) return entry;} catch (final IOException e) {} try {if ((entry = remoteStack.get(urlhash)) != null) return entry;} catch (final IOException e) {} @@ -182,7 +183,7 @@ public class NoticedURL { return removed; } - public ArrayList top(final int stackType, final int count) { + public ArrayList top(final int stackType, final int count) { switch (stackType) { case STACK_TYPE_CORE: return top(coreStack, count); case STACK_TYPE_LIMIT: return top(limitStack, count); @@ -191,7 +192,7 @@ public class NoticedURL { } } - public CrawlEntry pop(final int stackType, final boolean delay, CrawlProfile profile) throws IOException { + public Request pop(final int stackType, final boolean delay, CrawlProfile profile) throws IOException { switch (stackType) { case STACK_TYPE_CORE: return pop(coreStack, delay, profile); case STACK_TYPE_LIMIT: return pop(limitStack, delay, profile); @@ -202,7 +203,7 @@ public class NoticedURL { public void shift(final int fromStack, final int toStack, CrawlProfile profile) { try { - final CrawlEntry entry = pop(fromStack, false, profile); + final Request entry = pop(fromStack, false, profile); if (entry != null) push(toStack, entry); } catch (final IOException e) { return; @@ -219,10 +220,10 @@ public class NoticedURL { } } - private CrawlEntry pop(final Balancer balancer, final boolean delay, CrawlProfile profile) throws IOException { + private Request pop(final Balancer balancer, final boolean delay, CrawlProfile profile) throws IOException { // this is a filo - pop int s; - CrawlEntry entry; + Request entry; int errors = 0; synchronized (balancer) { while ((s = balancer.size()) > 0) { @@ -241,15 +242,15 @@ public class NoticedURL { return null; } - private ArrayList top(final Balancer balancer, int count) { + private ArrayList top(final Balancer balancer, int count) { // this is a filo - top if (count > balancer.size()) count = balancer.size(); - ArrayList list; + ArrayList list; list = balancer.top(count); return list; } - public Iterator 
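NoticedURL keeps its three stacks (STACK_TYPE_CORE, STACK_TYPE_LIMIT, STACK_TYPE_REMOTE) but now hands Request objects through push, pop, top and the iterator. A small sketch of the local-crawl consumer loop, modelled on the CrawlQueues code above:

    // Take the next request from the core (local crawl) stack, or null if it is empty.
    Request nextLocalRequest(final plasmaSwitchboard sb, final NoticedURL noticeURL) {
        Request next = null;
        while (next == null && noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) > 0) {
            try {
                next = noticeURL.pop(NoticedURL.STACK_TYPE_CORE, true, sb.crawler.profilesActiveCrawls);
            } catch (final IOException e) {
                break; // stack could not be read; try again in the next cycle
            }
        }
        return next;
    }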
iterator(final int stackType) { + public Iterator iterator(final int stackType) { // returns an iterator of plasmaCrawlBalancerEntry Objects try {switch (stackType) { case STACK_TYPE_CORE: return coreStack.iterator(); @@ -257,7 +258,7 @@ public class NoticedURL { case STACK_TYPE_REMOTE: return remoteStack.iterator(); default: return null; }} catch (final IOException e) { - return new HashSet().iterator(); + return new HashSet().iterator(); } } diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java index e090ed61e..0a8690ce3 100644 --- a/source/de/anomic/crawler/RobotsTxt.java +++ b/source/de/anomic/crawler/RobotsTxt.java @@ -40,6 +40,7 @@ import java.util.LinkedList; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.http.httpClient; import de.anomic.http.httpHeader; import de.anomic.http.httpResponse; diff --git a/source/de/anomic/crawler/ZURL.java b/source/de/anomic/crawler/ZURL.java index 4f63dca1e..399c87d46 100755 --- a/source/de/anomic/crawler/ZURL.java +++ b/source/de/anomic/crawler/ZURL.java @@ -32,6 +32,7 @@ import java.util.Date; import java.util.Iterator; import java.util.LinkedList; +import de.anomic.crawler.retrieval.Request; import de.anomic.kelondro.index.Row; import de.anomic.kelondro.index.RowSet; import de.anomic.kelondro.index.ObjectIndex; @@ -53,7 +54,7 @@ public class ZURL { "Cardinal workdate-8 {b256}, " + // the time when the url was last time tried to load "Cardinal workcount-4 {b256}, " + // number of load retries "String anycause-132, " + // string describing load failure - "byte[] entry-" + CrawlEntry.rowdef.objectsize, // extra space + "byte[] entry-" + Request.rowdef.objectsize, // extra space Base64Order.enhancedCoder ); @@ -96,7 +97,7 @@ public class ZURL { } public synchronized Entry newEntry( - final CrawlEntry bentry, + final Request bentry, final String executor, final Date workdate, final int workcount, @@ -160,7 +161,7 @@ public class ZURL { public class Entry { - CrawlEntry bentry; // the balancer entry + Request bentry; // the balancer entry private final String executor; // the crawling executor private final Date workdate; // the time when the url was last time tried to load private final int workcount; // number of tryings @@ -168,7 +169,7 @@ public class ZURL { private boolean stored; public Entry( - final CrawlEntry bentry, + final Request bentry, final String executor, final Date workdate, final int workcount, @@ -190,7 +191,7 @@ public class ZURL { this.workdate = new Date(entry.getColLong(2)); this.workcount = (int) entry.getColLong(3); this.anycause = entry.getColString(4, "UTF-8"); - this.bentry = new CrawlEntry(CrawlEntry.rowdef.newEntry(entry.getColBytes(5))); + this.bentry = new Request(Request.rowdef.newEntry(entry.getColBytes(5))); assert ((new String(entry.getColBytes(0))).equals(bentry.url().hash())); this.stored = true; return; diff --git a/source/de/anomic/crawler/FTPLoader.java b/source/de/anomic/crawler/retrieval/FTPLoader.java similarity index 92% rename from source/de/anomic/crawler/FTPLoader.java rename to source/de/anomic/crawler/retrieval/FTPLoader.java index e0eec059e..8e4f81ca2 100644 --- a/source/de/anomic/crawler/FTPLoader.java +++ b/source/de/anomic/crawler/retrieval/FTPLoader.java @@ -25,18 +25,18 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.crawler; +package 
de.anomic.crawler.retrieval; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.PrintStream; import java.util.Date; +import de.anomic.crawler.Latency; import de.anomic.document.Parser; import de.anomic.http.httpHeader; import de.anomic.http.httpRequestHeader; import de.anomic.http.httpResponseHeader; -import de.anomic.http.httpDocument; import de.anomic.kelondro.util.DateFormatter; import de.anomic.net.ftpc; import de.anomic.plasma.plasmaHTCache; @@ -56,14 +56,14 @@ public class FTPLoader { maxFileSize = (int) sb.getConfigLong("crawler.ftp.maxFileSize", -1l); } - protected httpDocument createCacheEntry(final CrawlEntry entry, final String mimeType, final Date fileDate) { + protected Response createCacheEntry(final Request entry, final String mimeType, final Date fileDate) { if (entry == null) return null; httpRequestHeader requestHeader = new httpRequestHeader(); if (entry.referrerhash() != null) requestHeader.put(httpRequestHeader.REFERER, sb.getURL(entry.referrerhash()).toNormalform(true, false)); httpResponseHeader responseHeader = new httpResponseHeader(); responseHeader.put(httpHeader.LAST_MODIFIED, DateFormatter.formatRFC1123(fileDate)); responseHeader.put(httpHeader.CONTENT_TYPE, mimeType); - httpDocument metadata = new httpDocument( + Response metadata = new Response( entry.depth(), entry.url(), entry.name(), "OK", requestHeader, responseHeader, entry.initiator(), sb.crawler.profilesActiveCrawls.getEntry(entry.profileHandle())); @@ -77,14 +77,14 @@ public class FTPLoader { * @param entry * @return */ - public httpDocument load(final CrawlEntry entry) throws IOException { + public Response load(final Request entry) throws IOException { long start = System.currentTimeMillis(); final yacyURL entryUrl = entry.url(); final String fullPath = getPath(entryUrl); // the return value - httpDocument htCache = null; + Response htCache = null; // determine filename and path String file, path; @@ -215,7 +215,7 @@ public class FTPLoader { * @return * @throws Exception */ - private httpDocument getFile(final ftpc ftpClient, final CrawlEntry entry) throws Exception { + private Response getFile(final ftpc ftpClient, final Request entry) throws Exception { // determine the mimetype of the resource final yacyURL entryUrl = entry.url(); final String mimeType = Parser.mimeOf(entryUrl); @@ -223,7 +223,7 @@ public class FTPLoader { // if the mimetype and file extension is supported we start to download // the file - httpDocument htCache = null; + Response htCache = null; String supportError = Parser.supports(entryUrl, mimeType); if (supportError != null) { // reject file @@ -271,7 +271,7 @@ public class FTPLoader { * @param cacheFile * @return */ - private byte[] generateDirlist(final ftpc ftpClient, final CrawlEntry entry, final String path) { + private byte[] generateDirlist(final ftpc ftpClient, final Request entry, final String path) { // getting the dirlist final yacyURL entryUrl = entry.url(); diff --git a/source/de/anomic/crawler/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java similarity index 93% rename from source/de/anomic/crawler/HTTPLoader.java rename to source/de/anomic/crawler/retrieval/HTTPLoader.java index 94f17361b..e8d53abfa 100644 --- a/source/de/anomic/crawler/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -23,11 +23,12 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.crawler; +package 
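Both loaders now answer a Request with a Response assembled from the protocol headers; the FTP variant above shows the constructor order. A condensed sketch, assuming the header objects have already been filled by the protocol code:

    // Build a Response for a fetched Request, following FTPLoader.createCacheEntry().
    Response toResponse(final plasmaSwitchboard sb, final Request request,
            final httpRequestHeader requestHeader, final httpResponseHeader responseHeader) {
        return new Response(
                request.depth(), request.url(), request.name(), "OK",
                requestHeader, responseHeader,
                request.initiator(),
                sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()));
    }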
de.anomic.crawler.retrieval; import java.io.IOException; import java.util.Date; +import de.anomic.crawler.Latency; import de.anomic.data.Blacklist; import de.anomic.document.Parser; import de.anomic.http.httpClient; @@ -35,7 +36,6 @@ import de.anomic.http.httpHeader; import de.anomic.http.httpResponse; import de.anomic.http.httpRequestHeader; import de.anomic.http.httpResponseHeader; -import de.anomic.http.httpDocument; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.yacy.yacyURL; @@ -83,8 +83,8 @@ public final class HTTPLoader { * @param responseStatus Status-Code SPACE Reason-Phrase * @return */ - protected httpDocument createCacheEntry(final CrawlEntry entry, final Date requestDate, final httpRequestHeader requestHeader, final httpResponseHeader responseHeader, final String responseStatus) { - httpDocument metadata = new httpDocument( + protected Response createCacheEntry(final Request entry, final Date requestDate, final httpRequestHeader requestHeader, final httpResponseHeader responseHeader, final String responseStatus) { + Response metadata = new Response( entry.depth(), entry.url(), entry.name(), @@ -98,14 +98,14 @@ public final class HTTPLoader { return metadata; } - public httpDocument load(final CrawlEntry entry) throws IOException { + public Response load(final Request entry) throws IOException { long start = System.currentTimeMillis(); - httpDocument doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT); + Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT); Latency.update(entry.url().hash().substring(6), entry.url().getHost(), System.currentTimeMillis() - start); return doc; } - private httpDocument load(final CrawlEntry entry, final int retryCount) throws IOException { + private Response load(final Request entry, final int retryCount) throws IOException { if (retryCount < 0) { sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "redirection counter exceeded").store(); @@ -134,7 +134,7 @@ public final class HTTPLoader { } // take a file from the net - httpDocument htCache = null; + Response htCache = null; final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE); //try { // create a request header diff --git a/source/de/anomic/crawler/ProtocolLoader.java b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java similarity index 91% rename from source/de/anomic/crawler/ProtocolLoader.java rename to source/de/anomic/crawler/retrieval/LoaderDispatcher.java index 6b9f8c87a..338e9005b 100644 --- a/source/de/anomic/crawler/ProtocolLoader.java +++ b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java @@ -1,4 +1,4 @@ -// plasmaProtocolLoader.java +// LoaderDispatcher.java // (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. 
M., Germany // first published 24.10.2007 on http://yacy.net // @@ -24,7 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.crawler; +package de.anomic.crawler.retrieval; import java.io.IOException; import java.util.Arrays; @@ -33,13 +33,12 @@ import java.util.Iterator; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; -import de.anomic.http.httpDocument; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverCore; import de.anomic.server.serverProcessorJob; import de.anomic.yacy.logging.Log; -public final class ProtocolLoader { +public final class LoaderDispatcher { private static final long minDelay = 250; // milliseconds; 4 accesses per second private static final ConcurrentHashMap accessTime = new ConcurrentHashMap(); // to protect targets from DDoS @@ -50,7 +49,7 @@ public final class ProtocolLoader { private final HTTPLoader httpLoader; private final FTPLoader ftpLoader; - public ProtocolLoader(final plasmaSwitchboard sb, final Log log) { + public LoaderDispatcher(final plasmaSwitchboard sb, final Log log) { this.sb = sb; this.log = log; this.supportedProtocols = new HashSet(Arrays.asList(new String[]{"http","https","ftp"})); @@ -70,7 +69,7 @@ public final class ProtocolLoader { return (HashSet) this.supportedProtocols.clone(); } - public httpDocument load(final CrawlEntry entry) throws IOException { + public Response load(final Request entry) throws IOException { // getting the protocol of the next URL final String protocol = entry.url().getProtocol(); final String host = entry.url().getHost(); @@ -111,10 +110,10 @@ public final class ProtocolLoader { } } - public String process(final CrawlEntry entry) { + public String process(final Request entry) { // load a resource, store it to htcache and push queue entry to switchboard queue // returns null if everything went fine, a fail reason string if a problem occurred - httpDocument h; + Response h; try { entry.setStatus("loading", serverProcessorJob.STATUS_RUNNING); h = load(entry); diff --git a/source/de/anomic/crawler/CrawlEntry.java b/source/de/anomic/crawler/retrieval/Request.java similarity index 97% rename from source/de/anomic/crawler/CrawlEntry.java rename to source/de/anomic/crawler/retrieval/Request.java index 5ff1d91b3..49f04c259 100755 --- a/source/de/anomic/crawler/CrawlEntry.java +++ b/source/de/anomic/crawler/retrieval/Request.java @@ -24,7 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.crawler; +package de.anomic.crawler.retrieval; import java.io.IOException; import java.io.UnsupportedEncodingException; @@ -38,7 +38,7 @@ import de.anomic.server.serverProcessorJob; import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacyURL; -public class CrawlEntry extends serverProcessorJob { +public class Request extends serverProcessorJob { // row definition for balancer-related NURL-entries public final static Row rowdef = new Row( @@ -80,6 +80,9 @@ public class CrawlEntry extends serverProcessorJob { /** + * A HarvestRequest Entry is a object that is created to provide + * all information to load a specific resource. 
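The former ProtocolLoader is now the LoaderDispatcher: it inspects the protocol of the request's URL (http, https or ftp), enforces the per-host minimum access delay, and delegates to the matching loader. process() returns null on success and a failure reason string otherwise, while load() is the lower-level variant that returns the Response directly and throws IOException. A short caller sketch:

    // Dispatch one queued request and branch on the outcome.
    void loadOne(final LoaderDispatcher loader, final Request request) {
        final String failReason = loader.process(request);
        if (failReason == null) {
            // the response was stored to the HTCache and pushed to the switchboard queue
        } else {
            // failReason explains why the load was rejected or failed
        }
    }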
+ * * @param initiator the hash of the initiator peer * @param url the {@link URL} to crawl * @param referrer the hash of the referrer URL @@ -90,7 +93,7 @@ public class CrawlEntry extends serverProcessorJob { * @param anchors number of anchors of the parent * @param forkfactor sum of anchors of all ancestors */ - public CrawlEntry( + public Request( final String initiator, final yacyURL url, final String referrerhash, @@ -126,7 +129,7 @@ public class CrawlEntry extends serverProcessorJob { this.status = serverProcessorJob.STATUS_INITIATED; } - public CrawlEntry(final Row.Entry entry) throws IOException { + public Request(final Row.Entry entry) throws IOException { assert (entry != null); insertEntry(entry); } diff --git a/source/de/anomic/http/httpDocument.java b/source/de/anomic/crawler/retrieval/Response.java similarity index 98% rename from source/de/anomic/http/httpDocument.java rename to source/de/anomic/crawler/retrieval/Response.java index 676cd59f2..aab62bd1e 100755 --- a/source/de/anomic/http/httpDocument.java +++ b/source/de/anomic/crawler/retrieval/Response.java @@ -24,16 +24,19 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.http; +package de.anomic.crawler.retrieval; import java.util.Date; import de.anomic.crawler.CrawlProfile; +import de.anomic.http.httpHeader; +import de.anomic.http.httpRequestHeader; +import de.anomic.http.httpResponseHeader; import de.anomic.kelondro.util.DateFormatter; import de.anomic.plasma.plasmaHTCache; import de.anomic.yacy.yacyURL; -public class httpDocument { +public class Response { // doctypes: public static final char DT_PDFPS = 'p'; @@ -51,7 +54,7 @@ public class httpDocument { // the class objects private final int depth; // the depth of pre-fetching private final String responseStatus; - private byte[] cacheArray; // or the cache as byte-array + private byte[] cacheArray; // private final yacyURL url; private final String name; // the name of the link, read as anchor from an -tag private final CrawlProfile.entry profile; @@ -130,7 +133,7 @@ public class httpDocument { return doctype; } - public httpDocument( + public Response( final int depth, final yacyURL url, final String name, diff --git a/source/de/anomic/data/SitemapParser.java b/source/de/anomic/data/SitemapParser.java index 6b5ea8e88..75e74ded0 100644 --- a/source/de/anomic/data/SitemapParser.java +++ b/source/de/anomic/data/SitemapParser.java @@ -38,9 +38,9 @@ import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import de.anomic.crawler.CrawlEntry; import de.anomic.crawler.CrawlProfile; -import de.anomic.crawler.HTTPLoader; +import de.anomic.crawler.retrieval.HTTPLoader; +import de.anomic.crawler.retrieval.Request; import de.anomic.http.httpClient; import de.anomic.http.httpHeader; import de.anomic.http.httpResponse; @@ -272,7 +272,7 @@ public class SitemapParser extends DefaultHandler { } // URL needs to crawled - this.sb.crawlStacker.enqueueEntry(new CrawlEntry( + this.sb.crawlStacker.enqueueEntry(new Request( this.sb.peers.mySeed().hash, url, null, // this.siteMapURL.toString(), diff --git a/source/de/anomic/data/bookmarksDB.java b/source/de/anomic/data/bookmarksDB.java index 690dbd096..59e99eef6 100644 --- a/source/de/anomic/data/bookmarksDB.java +++ b/source/de/anomic/data/bookmarksDB.java @@ -61,8 +61,8 @@ import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; -import 
de.anomic.crawler.CrawlEntry; import de.anomic.crawler.CrawlProfile; +import de.anomic.crawler.retrieval.Request; import de.anomic.document.Word; import de.anomic.document.parser.html.ContentScraper; import de.anomic.document.parser.html.TransformerWriter; @@ -262,7 +262,7 @@ public class bookmarksDB { crawlingQ, indexText, indexMedia, storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw); - sb.crawlStacker.enqueueEntry(new CrawlEntry( + sb.crawlStacker.enqueueEntry(new Request( sb.peers.mySeed().hash, crawlingStartURL, null, diff --git a/source/de/anomic/document/parser/html/ContentScraper.java b/source/de/anomic/document/parser/html/ContentScraper.java index 45644f9e0..0e5bc3866 100644 --- a/source/de/anomic/document/parser/html/ContentScraper.java +++ b/source/de/anomic/document/parser/html/ContentScraper.java @@ -44,7 +44,7 @@ import java.util.Properties; import javax.swing.event.EventListenerList; -import de.anomic.crawler.HTTPLoader; +import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.document.parser.htmlParser; import de.anomic.http.httpClient; import de.anomic.http.httpHeader; diff --git a/source/de/anomic/document/parser/odtParser.java b/source/de/anomic/document/parser/odtParser.java index ec7c4336a..9bf1329d2 100644 --- a/source/de/anomic/document/parser/odtParser.java +++ b/source/de/anomic/document/parser/odtParser.java @@ -44,7 +44,7 @@ import com.catcode.odf.ODFMetaFileAnalyzer; import com.catcode.odf.OpenDocumentMetadata; import com.catcode.odf.OpenDocumentTextInputStream; -import de.anomic.crawler.HTTPLoader; +import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.document.AbstractParser; import de.anomic.document.Idiom; import de.anomic.document.ParserException; diff --git a/source/de/anomic/document/parser/rpmParser.java b/source/de/anomic/document/parser/rpmParser.java index 8acec6e59..f9550f1b9 100644 --- a/source/de/anomic/document/parser/rpmParser.java +++ b/source/de/anomic/document/parser/rpmParser.java @@ -37,7 +37,7 @@ import java.util.Set; import com.jguild.jrpm.io.RPMFile; import com.jguild.jrpm.io.datatype.DataTypeIf; -import de.anomic.crawler.HTTPLoader; +import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.document.AbstractParser; import de.anomic.document.Idiom; import de.anomic.document.ParserException; diff --git a/source/de/anomic/document/parser/vcfParser.java b/source/de/anomic/document/parser/vcfParser.java index 853bb3e57..6be8e9c53 100644 --- a/source/de/anomic/document/parser/vcfParser.java +++ b/source/de/anomic/document/parser/vcfParser.java @@ -38,7 +38,7 @@ import java.util.Iterator; import java.util.LinkedList; import java.util.Set; -import de.anomic.crawler.HTTPLoader; +import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.document.AbstractParser; import de.anomic.document.Idiom; import de.anomic.document.ParserException; diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index 33c793693..6a2b32641 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -71,7 +71,8 @@ import java.util.logging.LogManager; import java.util.logging.Logger; import java.util.zip.GZIPOutputStream; -import de.anomic.crawler.HTTPLoader; +import de.anomic.crawler.retrieval.HTTPLoader; +import de.anomic.crawler.retrieval.Response; import de.anomic.data.Blacklist; import de.anomic.document.Parser; import de.anomic.document.parser.html.ContentTransformer; @@ -377,7 +378,7 @@ public final class httpdProxyHandler { if 
(theLogger.isFinest()) theLogger.logFinest(reqID + " page not in cache: fulfill request from web"); fulfillRequestFromWeb(conProp, url, requestHeader, cachedResponseHeader, countedRespond); } else { - final httpDocument cacheEntry = new httpDocument( + final Response cacheEntry = new Response( 0, // crawling depth url, // url "", // name of the url is unknown @@ -491,7 +492,7 @@ public final class httpdProxyHandler { } // reserver cache entry - final httpDocument cacheEntry = new httpDocument( + final Response cacheEntry = new Response( 0, url, "", diff --git a/source/de/anomic/kelondro/text/Metadata.java b/source/de/anomic/kelondro/text/Metadata.java index 4eccd83b1..c7ac0d222 100644 --- a/source/de/anomic/kelondro/text/Metadata.java +++ b/source/de/anomic/kelondro/text/Metadata.java @@ -28,7 +28,7 @@ package de.anomic.kelondro.text; import java.util.Date; -import de.anomic.crawler.CrawlEntry; +import de.anomic.crawler.retrieval.Request; import de.anomic.kelondro.index.Row; import de.anomic.kelondro.order.Bitfield; import de.anomic.kelondro.text.Reference; @@ -82,7 +82,7 @@ public interface Metadata { public String toString(final String snippet); - public CrawlEntry toBalancerEntry(final String initiatorHash); + public Request toBalancerEntry(final String initiatorHash); public String toString(); diff --git a/source/de/anomic/kelondro/text/Segment.java b/source/de/anomic/kelondro/text/Segment.java index 9b204bb43..47d22c7cb 100644 --- a/source/de/anomic/kelondro/text/Segment.java +++ b/source/de/anomic/kelondro/text/Segment.java @@ -34,12 +34,12 @@ import java.util.Iterator; import java.util.Map; import java.util.TreeSet; +import de.anomic.crawler.retrieval.Response; import de.anomic.data.Blacklist; import de.anomic.document.Condenser; import de.anomic.document.Word; import de.anomic.document.Document; import de.anomic.document.parser.html.ContentScraper; -import de.anomic.http.httpDocument; import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.order.ByteOrder; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; @@ -269,7 +269,7 @@ public final class Segment { new byte[0], // md5 (int) sourcesize, // size condenser.RESULT_NUMB_WORDS, // word count - httpDocument.docType(document.dc_format()), // doctype + Response.docType(document.dc_format()), // doctype condenser.RESULT_FLAGS, // flags language, // language document.inboundLinks(), // inbound links @@ -292,7 +292,7 @@ public final class Segment { document, // document content condenser, // document condenser language, // document language - httpDocument.docType(document.dc_format()), // document type + Response.docType(document.dc_format()), // document type document.inboundLinks(), // inbound links document.outboundLinks() // outbound links ); diff --git a/source/de/anomic/kelondro/text/metadataPrototype/URLMetadataRow.java b/source/de/anomic/kelondro/text/metadataPrototype/URLMetadataRow.java index 7c5dcbe03..8b959ebbc 100644 --- a/source/de/anomic/kelondro/text/metadataPrototype/URLMetadataRow.java +++ b/source/de/anomic/kelondro/text/metadataPrototype/URLMetadataRow.java @@ -33,7 +33,7 @@ import java.util.ArrayList; import java.util.Date; import java.util.Properties; -import de.anomic.crawler.CrawlEntry; +import de.anomic.crawler.retrieval.Request; import de.anomic.kelondro.index.Row; import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.order.Bitfield; @@ -468,8 +468,8 @@ public class URLMetadataRow implements Metadata { //return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + 
"}"; } - public CrawlEntry toBalancerEntry(final String initiatorHash) { - return new CrawlEntry( + public Request toBalancerEntry(final String initiatorHash) { + return new Request( initiatorHash, metadata().url(), referrerHash(), diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 8da80776c..343f7cbdb 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -41,9 +41,9 @@ import java.io.InputStream; import java.util.HashMap; import java.util.Map; +import de.anomic.crawler.retrieval.Response; import de.anomic.document.Classification; import de.anomic.http.httpResponseHeader; -import de.anomic.http.httpDocument; import de.anomic.kelondro.blob.ArrayStack; import de.anomic.kelondro.blob.Compressor; import de.anomic.kelondro.blob.Heap; @@ -204,7 +204,7 @@ public final class plasmaHTCache { public static void storeMetadata( final httpResponseHeader responseHeader, - httpDocument metadata + Response metadata ) { if (responseHeader != null) try { // store the response header into the header database diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 9d3231216..f6cd96351 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -115,16 +115,13 @@ import java.util.regex.Pattern; import de.anomic.content.DCEntry; import de.anomic.content.RSSMessage; import de.anomic.content.file.SurrogateReader; -import de.anomic.crawler.CrawlEntry; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.CrawlQueues; import de.anomic.crawler.CrawlStacker; import de.anomic.crawler.CrawlSwitchboard; -import de.anomic.crawler.HTTPLoader; import de.anomic.crawler.ImporterManager; import de.anomic.crawler.IndexingStack; import de.anomic.crawler.NoticedURL; -import de.anomic.crawler.ProtocolLoader; import de.anomic.crawler.ResourceObserver; import de.anomic.crawler.ResultImages; import de.anomic.crawler.ResultURLs; @@ -132,6 +129,10 @@ import de.anomic.crawler.RobotsTxt; import de.anomic.crawler.ZURL; import de.anomic.crawler.CrawlProfile.entry; import de.anomic.crawler.IndexingStack.QueueEntry; +import de.anomic.crawler.retrieval.HTTPLoader; +import de.anomic.crawler.retrieval.Request; +import de.anomic.crawler.retrieval.LoaderDispatcher; +import de.anomic.crawler.retrieval.Response; import de.anomic.data.Blacklist; import de.anomic.data.URLLicense; import de.anomic.data.blogBoard; @@ -155,7 +156,6 @@ import de.anomic.http.httpRemoteProxyConfig; import de.anomic.http.httpRequestHeader; import de.anomic.http.httpResponseHeader; import de.anomic.http.httpd; -import de.anomic.http.httpDocument; import de.anomic.http.httpdRobotsTxtConfig; import de.anomic.kelondro.order.Digest; import de.anomic.kelondro.order.NaturalOrder; @@ -669,7 +669,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch * Then it iterates through all existing {@link CrawlProfile crawl profiles} and removes * all profiles which are not hardcoded. @@ -1088,7 +1095,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch