diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java
index d0cb0dbd3..849c78f9f 100644
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -32,55 +32,6 @@
    - storage: one plasmaStore object with the url-based database
    - configuration: initialized by properties once, then by external functions
    - job queues: for parsing, condensing, indexing
-   - black/blue/whitelists: controls input and output to the index
-
-   this class is also the core of the http crawling.
-   There are some items that need to be respected when crawling the web:
-   1) respect robots.txt
-   2) do not access one domain too frequently, wait between accesses
-   3) remember crawled URL's and do not access again too early
-   4) priorization of specific links should be possible (hot-lists)
-   5) attributes for crawling (depth, filters, hot/black-lists, priority)
-   6) different crawling jobs with different attributes ('Orders') simultanoulsy
-
-   We implement some specific tasks and use different database to archieve these goals:
-   - a database 'crawlerDisallow.db' contains all url's that shall not be crawled
-   - a database 'crawlerDomain.db' holds all domains and access times, where we loaded the disallow tables
-     this table contains the following entities:
-   - four databases for scheduled access: crawlerScheduledHotText.db, crawlerScheduledColdText.db,
-     crawlerScheduledHotMedia.db and crawlerScheduledColdMedia.db
-   - two stacks for new URLS: newText.stack and newMedia.stack
-   - two databases for URL double-check: knownText.db and knownMedia.db
-   - one database with crawling orders: crawlerOrders.db
-
-   The Information flow of a single URL that is crawled is as follows:
-   - a html file is loaded from a specific URL within the module httpdProxyServlet as
-     a process of the proxy.
-   - the file is passed to httpdProxyCache. Here it's processing is delayed until the proxy is idle.
-   - The cache entry is passed on to the plasmaSwitchboard. There the URL is stored into plasmaLURL where
-     the URL is stored under a specific hash. The URL's from the content are stripped off, stored in plasmaLURL
-     with a 'wrong' date (the date of the URL's are not known at this time, only after fetching) and stacked with
-     plasmaCrawlerTextStack. The content is read and splitted into rated words in plasmaCondenser.
-     The splitted words are then integrated into the index with plasmaSearch.
-   - In plasmaSearch the words are indexed by reversing the relation between URL and words: one URL points
-     to many words, the words within the document at the URL. After reversing, one word points
-     to many URL's, all the URL's where the word occurrs. One single word->URL-hash relation is stored in
-     plasmaIndexEntry. A set of plasmaIndexEntries is a reverse word index.
-     This reverse word index is stored temporarly in plasmaIndexCache.
-   - In plasmaIndexCache the single plasmaIndexEntry'ies are collected and stored into a plasmaIndex - entry
-     These plasmaIndex - Objects are the true reverse words indexes.
-   - in plasmaIndex the plasmaIndexEntry - objects are stored in a kelondroTree; an indexed file in the file system.
-
-   The information flow of a search request is as follows:
-   - in httpdFileServlet the user enters a search query, which is passed to plasmaSwitchboard
-   - in plasmaSwitchboard, the query is passed to plasmaSearch.
-   - in plasmaSearch, the plasmaSearch.result object is generated by simultanous enumeration of
-     URL hases in the reverse word indexes plasmaIndex
-   - (future: the plasmaSearch.result - object is used to identify more key words for a new search)
-
-*/
 
 package de.anomic.search;
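
Reviewer note on the header deleted above: rules 2) and 3) — per-domain access delays and a double-check against already-crawled URLs — describe bookkeeping that the crawler must do per domain. A minimal sketch of that throttle, for reference while reading the crawler code; all names here are illustrative, this is not the crawlerDomain.db implementation:

    import java.util.HashMap;
    import java.util.Map;

    // Illustrative per-domain access throttle, corresponding to rules 2) and 3)
    // of the deleted header. A sketch only, not actual YaCy code.
    public class DomainThrottleSketch {

        private static final long MIN_DELAY_MILLIS = 500; // assumed politeness delay
        private final Map<String, Long> lastAccess = new HashMap<>();

        // returns true if the domain may be fetched now, and records the access;
        // synchronized so concurrent crawler threads see a consistent map
        public synchronized boolean tryAccess(final String domain) {
            final long now = System.currentTimeMillis();
            final Long last = this.lastAccess.get(domain);
            if (last != null && now - last < MIN_DELAY_MILLIS) {
                return false; // accessed too recently; caller should re-stack the URL
            }
            this.lastAccess.put(domain, now);
            return true;
        }
    }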
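The deleted text also documents the central indexing step: inverting the URL->words relation so that one word hash points to all URL hashes where the word occurs. A self-contained sketch of that inversion, with hypothetical names (the classes named in the old comment were plasmaIndexEntry, plasmaIndex and plasmaIndexCache):

    import java.util.Collections;
    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.Map;
    import java.util.Set;

    // Sketch of the reverse word index described in the deleted header: the
    // forward relation (one URL -> many words) is inverted into (one word ->
    // many URL hashes). Names are hypothetical, not the plasmaIndex API.
    public class ReverseWordIndexSketch {

        // word hash -> set of URL hashes where the word occurs
        private final Map<String, Set<String>> index = new HashMap<>();

        // indexing a document reverses the relation: each word of the
        // document gains a pointer back to the document's URL hash
        public void addDocument(final String urlHash, final Iterable<String> wordHashes) {
            for (final String wordHash : wordHashes) {
                this.index.computeIfAbsent(wordHash, k -> new HashSet<>()).add(urlHash);
            }
        }

        // a search enumerates all URL hashes stored under a word hash
        public Set<String> urlsForWord(final String wordHash) {
            return this.index.getOrDefault(wordHash, Collections.emptySet());
        }
    }
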
@@ -370,20 +321,15 @@ public final class Switchboard extends serverSwitch {
                 fileSizeMax,
                 this.useTailCache,
                 this.exceed134217727);
+        // set the default segment names
+        setDefaultSegments();
+
+        // create a crawler
         crawler = new CrawlSwitchboard(
                 networkName,
                 log,
                 this.queuesRoot);
-        // set the default segment names
-        indexSegments.setSegment(Segments.Process.RECEIPTS, getConfig(SwitchboardConstants.SEGMENT_RECEIPTS, "default"));
-        indexSegments.setSegment(Segments.Process.QUERIES, getConfig(SwitchboardConstants.SEGMENT_QUERIES, "default"));
-        indexSegments.setSegment(Segments.Process.DHTIN, getConfig(SwitchboardConstants.SEGMENT_DHTIN, "default"));
-        indexSegments.setSegment(Segments.Process.DHTOUT, getConfig(SwitchboardConstants.SEGMENT_DHTOUT, "default"));
-        indexSegments.setSegment(Segments.Process.PROXY, getConfig(SwitchboardConstants.SEGMENT_PROXY, "default"));
-        indexSegments.setSegment(Segments.Process.LOCALCRAWLING, getConfig(SwitchboardConstants.SEGMENT_LOCALCRAWLING, "default"));
-        indexSegments.setSegment(Segments.Process.REMOTECRAWLING, getConfig(SwitchboardConstants.SEGMENT_REMOTECRAWLING, "default"));
-        indexSegments.setSegment(Segments.Process.PUBLIC, getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"));
 
         // init crawl results monitor cache
         crawlResults = new ResultURLs(100);
@@ -661,6 +607,17 @@ public final class Switchboard extends serverSwitch {
         log.logConfig("Finished Switchboard Initialization");
     }
 
+    private void setDefaultSegments() {
+        indexSegments.setSegment(Segments.Process.RECEIPTS, getConfig(SwitchboardConstants.SEGMENT_RECEIPTS, "default"));
+        indexSegments.setSegment(Segments.Process.QUERIES, getConfig(SwitchboardConstants.SEGMENT_QUERIES, "default"));
+        indexSegments.setSegment(Segments.Process.DHTIN, getConfig(SwitchboardConstants.SEGMENT_DHTIN, "default"));
+        indexSegments.setSegment(Segments.Process.DHTOUT, getConfig(SwitchboardConstants.SEGMENT_DHTOUT, "default"));
+        indexSegments.setSegment(Segments.Process.PROXY, getConfig(SwitchboardConstants.SEGMENT_PROXY, "default"));
+        indexSegments.setSegment(Segments.Process.LOCALCRAWLING, getConfig(SwitchboardConstants.SEGMENT_LOCALCRAWLING, "default"));
+        indexSegments.setSegment(Segments.Process.REMOTECRAWLING, getConfig(SwitchboardConstants.SEGMENT_REMOTECRAWLING, "default"));
+        indexSegments.setSegment(Segments.Process.PUBLIC, getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"));
+    }
+
     public int getIndexingProcessorsQueueSize() {
         return
             this.indexingDocumentProcessor.queueSize() +
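
The extraction of setDefaultSegments() removes the duplication between startup and the network switch below. Should the list of processes grow further, a data-driven variant could wire them in a loop — a sketch only, assuming Segments.Process is an enum; this is a suggestion, not code from this change set:

    // requires: import java.util.EnumMap; import java.util.Map;
    // Hypothetical data-driven variant of setDefaultSegments(); the Process
    // constants and config keys are those used in the patch, the loop is not.
    private void setDefaultSegments() {
        final Map<Segments.Process, String> keys = new EnumMap<>(Segments.Process.class);
        keys.put(Segments.Process.RECEIPTS,       SwitchboardConstants.SEGMENT_RECEIPTS);
        keys.put(Segments.Process.QUERIES,        SwitchboardConstants.SEGMENT_QUERIES);
        keys.put(Segments.Process.DHTIN,          SwitchboardConstants.SEGMENT_DHTIN);
        keys.put(Segments.Process.DHTOUT,         SwitchboardConstants.SEGMENT_DHTOUT);
        keys.put(Segments.Process.PROXY,          SwitchboardConstants.SEGMENT_PROXY);
        keys.put(Segments.Process.LOCALCRAWLING,  SwitchboardConstants.SEGMENT_LOCALCRAWLING);
        keys.put(Segments.Process.REMOTECRAWLING, SwitchboardConstants.SEGMENT_REMOTECRAWLING);
        keys.put(Segments.Process.PUBLIC,         SwitchboardConstants.SEGMENT_PUBLIC);
        for (final Map.Entry<Segments.Process, String> entry : keys.entrySet()) {
            indexSegments.setSegment(entry.getKey(), getConfig(entry.getValue(), "default"));
        }
    }
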
@@ -799,6 +756,9 @@ public final class Switchboard extends serverSwitch {
         // switch the networks
         synchronized (this) {
             // shut down
+            this.crawler.close();
+            this.peers.close();
+            this.dhtDispatcher.close();
             synchronized (this.indexSegments) {
                 this.indexSegments.close();
             }
@@ -843,12 +803,21 @@ public final class Switchboard extends serverSwitch {
                     fileSizeMax,
                     this.useTailCache,
                     this.exceed134217727);
 
+            // set the default segment names
+            setDefaultSegments();
-            // startup
+            // create a crawler
             crawler = new CrawlSwitchboard(
                     networkName,
                     log,
                     this.queuesRoot);
+
+            // init a DHT transmission dispatcher
+            dhtDispatcher = new Dispatcher(
+                    indexSegments.segment(Segments.Process.LOCALCRAWLING),
+                    peers,
+                    true,
+                    30000);
 
             // create new web structure
             this.webStructure = new WebStructureGraph(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", CRDistribution.CR_OWN), new File(queuesRoot, "webStructure.map"));
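
Taken together, the last two hunks give the network switch a symmetric lifecycle: the consumers of the index (crawler, peers, DHT dispatcher) are closed before the index segments themselves, and everything is rebuilt in reverse order, with the Dispatcher recreated against the fresh LOCALCRAWLING segment. A condensed view; field and constructor names are taken from the hunks above, the surrounding method shape and the meaning of the two trailing constructor arguments are assumed:

    // Condensed sketch of the switchNetwork() ordering after this patch;
    // not a literal excerpt of the resulting method.
    synchronized (this) {
        // tear down: stop writers into the old index before closing it
        this.crawler.close();
        this.peers.close();
        this.dhtDispatcher.close();
        synchronized (this.indexSegments) {
            this.indexSegments.close();
        }

        // ... load the configuration of the new network, rebuild indexSegments ...

        // rebuild in reverse order: segments first, then their consumers
        setDefaultSegments();
        crawler = new CrawlSwitchboard(networkName, log, this.queuesRoot);
        dhtDispatcher = new Dispatcher(
                indexSegments.segment(Segments.Process.LOCALCRAWLING), // source segment
                peers,   // peer list, as in the hunk above
                true,    // flag as passed in the hunk (meaning assumed)
                30000);  // numeric argument as in the hunk (assumed to be a ms timeout)
    }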