some fixes for the network switch

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6591 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 15 years ago
parent 7d400b17d0
commit fd7b348973

@@ -32,55 +32,6 @@
- storage: one plasmaStore object with the url-based database
- configuration: initialized by properties once, then by external functions
- job queues: for parsing, condensing, indexing
- black/blue/whitelists: controls input and output to the index
This class is also the core of HTTP crawling.
There are some items that need to be respected when crawling the web:
1) respect robots.txt
2) do not access one domain too frequently; wait between accesses (see the scheduling sketch after this list)
3) remember crawled URLs and do not access them again too early
4) prioritization of specific links should be possible (hot-lists)
5) attributes for crawling (depth, filters, hot/black-lists, priority)
6) different crawling jobs with different attributes ('Orders') simultaneously
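As an illustration only (not YaCy code): a minimal per-domain access scheduler for item 2), assuming a single fixed minimum delay per host. All names here (DomainAccessScheduler, waitTime, recordAccess) are hypothetical.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

// Hypothetical sketch: tracks the last access time per host and
// computes how long the crawler must still wait before the next fetch.
final class DomainAccessScheduler {
    private final long minDelayMillis; // minimum pause between two hits on one host
    private final Map<String, Long> lastAccess = new ConcurrentHashMap<>();

    DomainAccessScheduler(final long minDelayMillis) {
        this.minDelayMillis = minDelayMillis;
    }

    // returns 0 if the host may be fetched now, otherwise the remaining wait in ms
    long waitTime(final String host) {
        final Long last = lastAccess.get(host);
        if (last == null) return 0L;
        return Math.max(0L, last + minDelayMillis - System.currentTimeMillis());
    }

    // to be called after every fetch from the given host
    void recordAccess(final String host) {
        lastAccess.put(host, System.currentTimeMillis());
    }
}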
We implement some specific tasks and use different databases to achieve these goals:
- a database 'crawlerDisallow.db' contains all URLs that shall not be crawled
- a database 'crawlerDomain.db' holds all domains and access times, where we loaded the disallow tables;
this table contains the following entities:
<flag: robots exist/not exist, last access of robots.txt, last access of domain (for access scheduling)>
- four databases for scheduled access: crawlerScheduledHotText.db, crawlerScheduledColdText.db,
crawlerScheduledHotMedia.db and crawlerScheduledColdMedia.db
- two stacks for new URLs: newText.stack and newMedia.stack
- two databases for URL double-check: knownText.db and knownMedia.db (sketched in simplified form below)
- one database with crawling orders: crawlerOrders.db
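The double-check idea behind knownText.db/knownMedia.db, reduced to an in-memory sketch; YaCy itself persists these sets in its database layer, and the class name here is hypothetical.

import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

// Hypothetical sketch: a URL hash is accepted for crawling only the
// first time it is seen; every later occurrence is rejected.
final class UrlDoubleCheck {
    private final Set<String> knownHashes = ConcurrentHashMap.newKeySet();

    boolean acceptForCrawl(final String urlHash) {
        return knownHashes.add(urlHash); // add() returns false if the hash was already known
    }
}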
The Information flow of a single URL that is crawled is as follows:
- an HTML file is loaded from a specific URL within the module httpdProxyServlet as
a process of the proxy.
- the file is passed to httpdProxyCache. Here its processing is delayed until the proxy is idle.
- The cache entry is passed on to the plasmaSwitchboard. There the URL is stored into plasmaLURL
under a specific hash. The URLs from the content are stripped off, stored in plasmaLURL
with a 'wrong' date (the dates of the URLs are not known at this time, only after fetching) and stacked with
plasmaCrawlerTextStack. The content is read and split into rated words in plasmaCondenser.
The split words are then integrated into the index with plasmaSearch.
- In plasmaSearch the words are indexed by reversing the relation between URL and words: one URL points
to many words, the words within the document at the URL. After reversing, one word points
to many URLs, all the URLs where the word occurs (see the simplified sketch below). One single word->URL-hash relation is stored in
plasmaIndexEntry. A set of plasmaIndexEntries is a reverse word index.
This reverse word index is stored temporarily in plasmaIndexCache.
- In plasmaIndexCache the single plasmaIndexEntry objects are collected and stored into a plasmaIndex entry.
These plasmaIndex objects are the true reverse word indexes.
- in plasmaIndex the plasmaIndexEntry objects are stored in a kelondroTree; an indexed file in the file system.
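A simplified sketch of the reversing step described above, with plain Java collections in place of plasmaIndexEntry/kelondroTree (all names hypothetical):

import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

// Hypothetical sketch: builds word -> URL-hash sets from documents.
final class ReverseWordIndex {
    private final Map<String, Set<String>> index = new HashMap<>();

    // document side: one URL hash points to many words
    void addDocument(final String urlHash, final Collection<String> words) {
        for (final String word : words) {
            index.computeIfAbsent(word, w -> new HashSet<>()).add(urlHash);
        }
    }

    // index side: one word points to many URL hashes
    Set<String> urlHashes(final String word) {
        return index.getOrDefault(word, Collections.emptySet());
    }
}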
The information flow of a search request is as follows:
- in httpdFileServlet the user enters a search query, which is passed to plasmaSwitchboard
- in plasmaSwitchboard, the query is passed to plasmaSearch.
- in plasmaSearch, the plasmaSearch.result object is generated by simultaneous enumeration of
URL hashes in the reverse word indexes plasmaIndex (pictured by the intersection sketch below)
- (future: the plasmaSearch.result object is used to identify more keywords for a new search)
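The simultaneous enumeration can be pictured as intersecting the URL-hash sets of all query words; a sketch building on the hypothetical ReverseWordIndex above:

import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Hypothetical sketch: the result set contains only URL hashes that
// appear in the posting set of every query word.
final class SearchSketch {
    static Set<String> search(final ReverseWordIndex index, final List<String> queryWords) {
        Set<String> result = null;
        for (final String word : queryWords) {
            final Set<String> hits = index.urlHashes(word);
            if (result == null) result = new HashSet<>(hits);
            else result.retainAll(hits);       // keep only URLs containing every word
            if (result.isEmpty()) break;       // early exit: no common URLs
        }
        return result == null ? Collections.emptySet() : result;
    }
}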
*/
package de.anomic.search;
@@ -370,20 +321,15 @@ public final class Switchboard extends serverSwitch {
fileSizeMax,
this.useTailCache,
this.exceed134217727);
// set the default segment names
setDefaultSegments();
// create a crawler
crawler = new CrawlSwitchboard(
networkName,
log,
this.queuesRoot);
// set the default segment names
indexSegments.setSegment(Segments.Process.RECEIPTS, getConfig(SwitchboardConstants.SEGMENT_RECEIPTS, "default"));
indexSegments.setSegment(Segments.Process.QUERIES, getConfig(SwitchboardConstants.SEGMENT_QUERIES, "default"));
indexSegments.setSegment(Segments.Process.DHTIN, getConfig(SwitchboardConstants.SEGMENT_DHTIN, "default"));
indexSegments.setSegment(Segments.Process.DHTOUT, getConfig(SwitchboardConstants.SEGMENT_DHTOUT, "default"));
indexSegments.setSegment(Segments.Process.PROXY, getConfig(SwitchboardConstants.SEGMENT_PROXY, "default"));
indexSegments.setSegment(Segments.Process.LOCALCRAWLING, getConfig(SwitchboardConstants.SEGMENT_LOCALCRAWLING, "default"));
indexSegments.setSegment(Segments.Process.REMOTECRAWLING, getConfig(SwitchboardConstants.SEGMENT_REMOTECRAWLING, "default"));
indexSegments.setSegment(Segments.Process.PUBLIC, getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"));
// init crawl results monitor cache
crawlResults = new ResultURLs(100);
@@ -661,6 +607,17 @@ public final class Switchboard extends serverSwitch {
log.logConfig("Finished Switchboard Initialization");
}
private void setDefaultSegments() {
indexSegments.setSegment(Segments.Process.RECEIPTS, getConfig(SwitchboardConstants.SEGMENT_RECEIPTS, "default"));
indexSegments.setSegment(Segments.Process.QUERIES, getConfig(SwitchboardConstants.SEGMENT_QUERIES, "default"));
indexSegments.setSegment(Segments.Process.DHTIN, getConfig(SwitchboardConstants.SEGMENT_DHTIN, "default"));
indexSegments.setSegment(Segments.Process.DHTOUT, getConfig(SwitchboardConstants.SEGMENT_DHTOUT, "default"));
indexSegments.setSegment(Segments.Process.PROXY, getConfig(SwitchboardConstants.SEGMENT_PROXY, "default"));
indexSegments.setSegment(Segments.Process.LOCALCRAWLING, getConfig(SwitchboardConstants.SEGMENT_LOCALCRAWLING, "default"));
indexSegments.setSegment(Segments.Process.REMOTECRAWLING, getConfig(SwitchboardConstants.SEGMENT_REMOTECRAWLING, "default"));
indexSegments.setSegment(Segments.Process.PUBLIC, getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"));
}
public int getIndexingProcessorsQueueSize() {
return
this.indexingDocumentProcessor.queueSize() +
@@ -799,6 +756,9 @@ public final class Switchboard extends serverSwitch {
// switch the networks
synchronized (this) {
// shut down
this.crawler.close();
this.peers.close();
this.dhtDispatcher.close();
synchronized (this.indexSegments) {
this.indexSegments.close();
}
@@ -843,12 +803,21 @@
fileSizeMax,
this.useTailCache,
this.exceed134217727);
// set the default segment names
setDefaultSegments();
// startup
// create a crawler
crawler = new CrawlSwitchboard(
networkName,
log,
this.queuesRoot);
// init a DHT transmission dispatcher
dhtDispatcher = new Dispatcher(
indexSegments.segment(Segments.Process.LOCALCRAWLING),
peers,
true,
30000);
// create new web structure
this.webStructure = new WebStructureGraph(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", CRDistribution.CR_OWN), new File(queuesRoot, "webStructure.map"));
