some fixes for the network switch

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6591 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 15 years ago
parent 7d400b17d0
commit fd7b348973

@@ -32,55 +32,6 @@
- storage: one plasmaStore object with the url-based database
- configuration: initialized by properties once, then by external functions
- job queues: for parsing, condensing, indexing
- black/blue/whitelists: controls input and output to the index
This class is also the core of HTTP crawling.
There are some items that need to be respected when crawling the web:
1) respect robots.txt
2) do not access one domain too frequently; wait between accesses (see the scheduling sketch after this list)
3) remember crawled URLs and do not access them again too early
4) prioritization of specific links should be possible (hot-lists)
5) attributes for crawling (depth, filters, hot/black-lists, priority)
6) different crawling jobs with different attributes ('Orders') simultaneously
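As an illustration only (not YaCy code): a minimal per-domain access scheduler for item 2), assuming a single fixed minimum delay per host. All names here (DomainAccessScheduler, waitTime, recordAccess) are hypothetical.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

// Hypothetical sketch: tracks the last access time per host and
// computes how long the crawler must still wait before the next fetch.
final class DomainAccessScheduler {
    private final long minDelayMillis; // minimum pause between two hits on one host
    private final Map<String, Long> lastAccess = new ConcurrentHashMap<>();

    DomainAccessScheduler(final long minDelayMillis) {
        this.minDelayMillis = minDelayMillis;
    }

    // returns 0 if the host may be fetched now, otherwise the remaining wait in ms
    long waitTime(final String host) {
        final Long last = lastAccess.get(host);
        if (last == null) return 0L;
        return Math.max(0L, last + minDelayMillis - System.currentTimeMillis());
    }

    // to be called after every fetch from the given host
    void recordAccess(final String host) {
        lastAccess.put(host, System.currentTimeMillis());
    }
}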
We implement some specific tasks and use different databases to achieve these goals:
- a database 'crawlerDisallow.db' contains all URLs that shall not be crawled
- a database 'crawlerDomain.db' holds all domains and access times, where we loaded the disallow tables;
this table contains the following entities:
<flag: robots exist/not exist, last access of robots.txt, last access of domain (for access scheduling)>
- four databases for scheduled access: crawlerScheduledHotText.db, crawlerScheduledColdText.db,
crawlerScheduledHotMedia.db and crawlerScheduledColdMedia.db
- two stacks for new URLs: newText.stack and newMedia.stack
- two databases for URL double-check: knownText.db and knownMedia.db (sketched in simplified form below)
- one database with crawling orders: crawlerOrders.db
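The double-check idea behind knownText.db/knownMedia.db, reduced to an in-memory sketch; YaCy itself persists these sets in its database layer, and the class name here is hypothetical.

import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

// Hypothetical sketch: a URL hash is accepted for crawling only the
// first time it is seen; every later occurrence is rejected.
final class UrlDoubleCheck {
    private final Set<String> knownHashes = ConcurrentHashMap.newKeySet();

    boolean acceptForCrawl(final String urlHash) {
        return knownHashes.add(urlHash); // add() returns false if the hash was already known
    }
}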
The Information flow of a single URL that is crawled is as follows:
- an HTML file is loaded from a specific URL within the module httpdProxyServlet as
a process of the proxy.
- the file is passed to httpdProxyCache. Here its processing is delayed until the proxy is idle.
- The cache entry is passed on to the plasmaSwitchboard. There the URL is stored into plasmaLURL
under a specific hash. The URLs from the content are stripped off, stored in plasmaLURL
with a 'wrong' date (the dates of the URLs are not known at this time, only after fetching) and stacked with
plasmaCrawlerTextStack. The content is read and split into rated words in plasmaCondenser.
The split words are then integrated into the index with plasmaSearch.
- In plasmaSearch the words are indexed by reversing the relation between URL and words: one URL points
to many words, the words within the document at the URL. After reversing, one word points
to many URLs, all the URLs where the word occurs (see the simplified sketch below). One single word->URL-hash relation is stored in
plasmaIndexEntry. A set of plasmaIndexEntries is a reverse word index.
This reverse word index is stored temporarily in plasmaIndexCache.
- In plasmaIndexCache the single plasmaIndexEntry objects are collected and stored into a plasmaIndex entry.
These plasmaIndex objects are the true reverse word indexes.
- in plasmaIndex the plasmaIndexEntry objects are stored in a kelondroTree; an indexed file in the file system.
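A simplified sketch of the reversing step described above, with plain Java collections in place of plasmaIndexEntry/kelondroTree (all names hypothetical):

import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

// Hypothetical sketch: builds word -> URL-hash sets from documents.
final class ReverseWordIndex {
    private final Map<String, Set<String>> index = new HashMap<>();

    // document side: one URL hash points to many words
    void addDocument(final String urlHash, final Collection<String> words) {
        for (final String word : words) {
            index.computeIfAbsent(word, w -> new HashSet<>()).add(urlHash);
        }
    }

    // index side: one word points to many URL hashes
    Set<String> urlHashes(final String word) {
        return index.getOrDefault(word, Collections.emptySet());
    }
}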
The information flow of a search request is as follows:
- in httpdFileServlet the user enters a search query, which is passed to plasmaSwitchboard
- in plasmaSwitchboard, the query is passed to plasmaSearch.
- in plasmaSearch, the plasmaSearch.result object is generated by simultaneous enumeration of
URL hashes in the reverse word indexes plasmaIndex (pictured by the intersection sketch below)
- (future: the plasmaSearch.result object is used to identify more keywords for a new search)
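The simultaneous enumeration can be pictured as intersecting the URL-hash sets of all query words; a sketch building on the hypothetical ReverseWordIndex above:

import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Hypothetical sketch: the result set contains only URL hashes that
// appear in the posting set of every query word.
final class SearchSketch {
    static Set<String> search(final ReverseWordIndex index, final List<String> queryWords) {
        Set<String> result = null;
        for (final String word : queryWords) {
            final Set<String> hits = index.urlHashes(word);
            if (result == null) result = new HashSet<>(hits);
            else result.retainAll(hits);       // keep only URLs containing every word
            if (result.isEmpty()) break;       // early exit: no common URLs
        }
        return result == null ? Collections.emptySet() : result;
    }
}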
*/
package de.anomic.search;
@@ -370,20 +321,15 @@ public final class Switchboard extends serverSwitch {
fileSizeMax,
this.useTailCache,
this.exceed134217727);
// set the default segment names
setDefaultSegments();
// create a crawler
crawler = new CrawlSwitchboard(
networkName,
log,
this.queuesRoot);
// set the default segment names
indexSegments.setSegment(Segments.Process.RECEIPTS, getConfig(SwitchboardConstants.SEGMENT_RECEIPTS, "default"));
indexSegments.setSegment(Segments.Process.QUERIES, getConfig(SwitchboardConstants.SEGMENT_QUERIES, "default"));
indexSegments.setSegment(Segments.Process.DHTIN, getConfig(SwitchboardConstants.SEGMENT_DHTIN, "default"));
indexSegments.setSegment(Segments.Process.DHTOUT, getConfig(SwitchboardConstants.SEGMENT_DHTOUT, "default"));
indexSegments.setSegment(Segments.Process.PROXY, getConfig(SwitchboardConstants.SEGMENT_PROXY, "default"));
indexSegments.setSegment(Segments.Process.LOCALCRAWLING, getConfig(SwitchboardConstants.SEGMENT_LOCALCRAWLING, "default"));
indexSegments.setSegment(Segments.Process.REMOTECRAWLING, getConfig(SwitchboardConstants.SEGMENT_REMOTECRAWLING, "default"));
indexSegments.setSegment(Segments.Process.PUBLIC, getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"));
// init crawl results monitor cache
crawlResults = new ResultURLs(100);
@@ -661,6 +607,17 @@ public final class Switchboard extends serverSwitch {
log.logConfig("Finished Switchboard Initialization");
}
private void setDefaultSegments() {
indexSegments.setSegment(Segments.Process.RECEIPTS, getConfig(SwitchboardConstants.SEGMENT_RECEIPTS, "default"));
indexSegments.setSegment(Segments.Process.QUERIES, getConfig(SwitchboardConstants.SEGMENT_QUERIES, "default"));
indexSegments.setSegment(Segments.Process.DHTIN, getConfig(SwitchboardConstants.SEGMENT_DHTIN, "default"));
indexSegments.setSegment(Segments.Process.DHTOUT, getConfig(SwitchboardConstants.SEGMENT_DHTOUT, "default"));
indexSegments.setSegment(Segments.Process.PROXY, getConfig(SwitchboardConstants.SEGMENT_PROXY, "default"));
indexSegments.setSegment(Segments.Process.LOCALCRAWLING, getConfig(SwitchboardConstants.SEGMENT_LOCALCRAWLING, "default"));
indexSegments.setSegment(Segments.Process.REMOTECRAWLING, getConfig(SwitchboardConstants.SEGMENT_REMOTECRAWLING, "default"));
indexSegments.setSegment(Segments.Process.PUBLIC, getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"));
}
public int getIndexingProcessorsQueueSize() {
return
this.indexingDocumentProcessor.queueSize() +
@@ -799,6 +756,9 @@ public final class Switchboard extends serverSwitch {
// switch the networks
synchronized (this) {
// shut down
this.crawler.close();
this.peers.close();
this.dhtDispatcher.close();
synchronized (this.indexSegments) {
this.indexSegments.close();
}
@@ -843,12 +803,21 @@
fileSizeMax,
this.useTailCache,
this.exceed134217727);
// set the default segment names
setDefaultSegments();
// startup
// create a crawler
crawler = new CrawlSwitchboard(
networkName,
log,
this.queuesRoot);
// init a DHT transmission dispatcher
dhtDispatcher = new Dispatcher(
indexSegments.segment(Segments.Process.LOCALCRAWLING),
peers,
true,
30000);
// create new web structure
this.webStructure = new WebStructureGraph(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", CRDistribution.CR_OWN), new File(queuesRoot, "webStructure.map"));
