@@ -32,55 +32,6 @@
- storage: one plasmaStore object with the url-based database
- configuration: initialized by properties once, then by external functions
- job queues: for parsing, condensing, indexing
- black/blue/whitelists: control input and output to the index

This class is also the core of the HTTP crawling.
There are some items that need to be respected when crawling the web (see the sketch after this list):
1) respect robots.txt
2) do not access one domain too frequently, wait between accesses
3) remember crawled URLs and do not access them again too early
4) prioritization of specific links should be possible (hot-lists)
5) attributes for crawling (depth, filters, hot/black-lists, priority)
6) different crawling jobs with different attributes ('Orders') running simultaneously
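
As an illustration only, here is a minimal sketch of how rules 1) to 3) could be checked before a
URL is fetched. The class and field names are hypothetical, not part of YaCy; the real checks are
backed by the crawler databases listed below.

  // hypothetical sketch, not YaCy code: gate that applies rules 1) to 3)
  import java.net.URI;
  import java.util.Map;
  import java.util.Set;

  final class CrawlGateSketch {
      private final Set<String> disallowed;             // stands in for crawlerDisallow.db
      private final Map<String, Long> lastDomainAccess; // stands in for crawlerDomain.db
      private final Set<String> knownUrls;              // stands in for knownText.db / knownMedia.db
      private final long minDelayMillis;

      CrawlGateSketch(Set<String> disallowed, Map<String, Long> lastDomainAccess,
                      Set<String> knownUrls, long minDelayMillis) {
          this.disallowed = disallowed;
          this.lastDomainAccess = lastDomainAccess;
          this.knownUrls = knownUrls;
          this.minDelayMillis = minDelayMillis;
      }

      boolean mayCrawl(URI url, long now) {
          if (disallowed.contains(url.toString())) return false; // 1) robots.txt disallow
          if (knownUrls.contains(url.toString())) return false;  // 3) crawled too recently
          Long last = lastDomainAccess.get(url.getHost());
          return last == null || now - last >= minDelayMillis;   // 2) per-domain wait time
      }
  }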
We implement some specific tasks and use different databases to achieve these goals:
- a database 'crawlerDisallow.db' contains all URLs that shall not be crawled
- a database 'crawlerDomain.db' holds all domains and access times for which we have loaded the disallow tables;
  this table contains the following entities (modelled in the sketch after this list):
  <flag: robots exist/not exist, last access of robots.txt, last access of domain (for access scheduling)>
- four databases for scheduled access: crawlerScheduledHotText.db, crawlerScheduledColdText.db,
  crawlerScheduledHotMedia.db and crawlerScheduledColdMedia.db
- two stacks for new URLs: newText.stack and newMedia.stack
- two databases for URL double-check: knownText.db and knownMedia.db
- one database with crawling orders: crawlerOrders.db
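
A rough model of a single 'crawlerDomain.db' entry as described above, for illustration only; the
field names are assumptions, the real entries are rows in kelondro tables:

  // hypothetical sketch, not YaCy code: one crawlerDomain.db entry
  final class DomainEntrySketch {
      final boolean robotsExist;    // flag: robots.txt exists / does not exist
      final long lastRobotsAccess;  // last access of robots.txt
      final long lastDomainAccess;  // last access of the domain, used for access scheduling

      DomainEntrySketch(boolean robotsExist, long lastRobotsAccess, long lastDomainAccess) {
          this.robotsExist = robotsExist;
          this.lastRobotsAccess = lastRobotsAccess;
          this.lastDomainAccess = lastDomainAccess;
      }

      // earliest point in time the domain may be accessed again, given a minimum delay
      long nextAllowedAccess(long minDelayMillis) {
          return lastDomainAccess + minDelayMillis;
      }
  }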
The information flow of a single URL that is crawled is as follows:
- An HTML file is loaded from a specific URL within the module httpdProxyServlet as
  a process of the proxy.
- The file is passed to httpdProxyCache. Here its processing is delayed until the proxy is idle.
- The cache entry is passed on to the plasmaSwitchboard. There the URL is stored into plasmaLURL
  under a specific hash. The URLs found in the content are extracted, stored in plasmaLURL
  with a 'wrong' date (the dates of these URLs are not known at this time, only after fetching) and stacked with
  plasmaCrawlerTextStack. The content is read and split into rated words in plasmaCondenser.
  The split words are then integrated into the index with plasmaSearch.
- In plasmaSearch the words are indexed by reversing the relation between URL and words: one URL points
  to many words, the words within the document at the URL. After reversing, one word points
  to many URLs, all the URLs where the word occurs (see the sketch below). One single word->URL-hash relation is stored in
  plasmaIndexEntry. A set of plasmaIndexEntries is a reverse word index.
  This reverse word index is stored temporarily in plasmaIndexCache.
- In plasmaIndexCache the single plasmaIndexEntry objects are collected and stored into a plasmaIndex entry.
  These plasmaIndex objects are the true reverse word indexes.
- In plasmaIndex the plasmaIndexEntry objects are stored in a kelondroTree, an indexed file in the file system.
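
The reversal described above can be sketched with plain collections; this is an illustration only,
the real implementation uses plasmaIndexEntry, plasmaIndexCache and kelondroTree as listed:

  // hypothetical sketch, not YaCy code: reversing URL -> words into word -> URL hashes
  import java.util.Collections;
  import java.util.HashMap;
  import java.util.HashSet;
  import java.util.Map;
  import java.util.Set;

  final class ReverseIndexSketch {
      // one word points to all URL hashes where the word occurs
      private final Map<String, Set<String>> index = new HashMap<String, Set<String>>();

      void addDocument(String urlHash, Iterable<String> words) {
          for (String word : words) {
              Set<String> urls = index.get(word);
              if (urls == null) { urls = new HashSet<String>(); index.put(word, urls); }
              urls.add(urlHash);
          }
      }

      Set<String> urlHashes(String word) {
          Set<String> urls = index.get(word);
          return urls == null ? Collections.<String>emptySet() : urls;
      }
  }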
The information flow of a search request is as follows:
- in httpdFileServlet the user enters a search query, which is passed to plasmaSwitchboard
- in plasmaSwitchboard, the query is passed to plasmaSearch.
- in plasmaSearch, the plasmaSearch.result object is generated by simultaneous enumeration of
  URL hashes in the reverse word indexes plasmaIndex (see the sketch below)
- (future: the plasmaSearch.result object is used to identify more key words for a new search)
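
Illustration only: the simultaneous enumeration can be approximated by intersecting the URL-hash
sets of all query words; the real plasmaSearch enumerates the kelondro-backed indexes instead:

  // hypothetical sketch, not YaCy code: result = URL hashes that contain every query word
  import java.util.HashSet;
  import java.util.List;
  import java.util.Map;
  import java.util.Set;

  final class SearchSketch {
      static Set<String> search(Map<String, Set<String>> wordToUrlHashes, List<String> queryWords) {
          Set<String> result = null;
          for (String word : queryWords) {
              Set<String> hits = wordToUrlHashes.get(word);
              if (hits == null) return new HashSet<String>();  // one unknown word -> empty result
              if (result == null) result = new HashSet<String>(hits);
              else result.retainAll(hits);                     // keep only common URL hashes
          }
          return result == null ? new HashSet<String>() : result;
      }
  }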
*/
package de.anomic.search;
@@ -370,20 +321,15 @@ public final class Switchboard extends serverSwitch {
fileSizeMax,
this.useTailCache,
this.exceed134217727);
// set the default segment names
setDefaultSegments();

// create a crawler
crawler = new CrawlSwitchboard(
networkName,
log,
this.queuesRoot);

// set the default segment names
indexSegments.setSegment(Segments.Process.RECEIPTS, getConfig(SwitchboardConstants.SEGMENT_RECEIPTS, "default"));
indexSegments.setSegment(Segments.Process.QUERIES, getConfig(SwitchboardConstants.SEGMENT_QUERIES, "default"));
indexSegments.setSegment(Segments.Process.DHTIN, getConfig(SwitchboardConstants.SEGMENT_DHTIN, "default"));
indexSegments.setSegment(Segments.Process.DHTOUT, getConfig(SwitchboardConstants.SEGMENT_DHTOUT, "default"));
indexSegments.setSegment(Segments.Process.PROXY, getConfig(SwitchboardConstants.SEGMENT_PROXY, "default"));
indexSegments.setSegment(Segments.Process.LOCALCRAWLING, getConfig(SwitchboardConstants.SEGMENT_LOCALCRAWLING, "default"));
indexSegments.setSegment(Segments.Process.REMOTECRAWLING, getConfig(SwitchboardConstants.SEGMENT_REMOTECRAWLING, "default"));
indexSegments.setSegment(Segments.Process.PUBLIC, getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"));

// init crawl results monitor cache
crawlResults = new ResultURLs(100);
@@ -661,6 +607,17 @@ public final class Switchboard extends serverSwitch {
log.logConfig("Finished Switchboard Initialization");
}

private void setDefaultSegments() {
indexSegments.setSegment(Segments.Process.RECEIPTS, getConfig(SwitchboardConstants.SEGMENT_RECEIPTS, "default"));
indexSegments.setSegment(Segments.Process.QUERIES, getConfig(SwitchboardConstants.SEGMENT_QUERIES, "default"));
indexSegments.setSegment(Segments.Process.DHTIN, getConfig(SwitchboardConstants.SEGMENT_DHTIN, "default"));
indexSegments.setSegment(Segments.Process.DHTOUT, getConfig(SwitchboardConstants.SEGMENT_DHTOUT, "default"));
indexSegments.setSegment(Segments.Process.PROXY, getConfig(SwitchboardConstants.SEGMENT_PROXY, "default"));
indexSegments.setSegment(Segments.Process.LOCALCRAWLING, getConfig(SwitchboardConstants.SEGMENT_LOCALCRAWLING, "default"));
indexSegments.setSegment(Segments.Process.REMOTECRAWLING, getConfig(SwitchboardConstants.SEGMENT_REMOTECRAWLING, "default"));
indexSegments.setSegment(Segments.Process.PUBLIC, getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"));
}

public int getIndexingProcessorsQueueSize() {
return
this.indexingDocumentProcessor.queueSize() +
@@ -799,6 +756,9 @@ public final class Switchboard extends serverSwitch {
// switch the networks
synchronized (this) {
// shut down
this.crawler.close();
this.peers.close();
this.dhtDispatcher.close();
synchronized (this.indexSegments) {
this.indexSegments.close();
}
@@ -843,12 +803,21 @@ public final class Switchboard extends serverSwitch {
fileSizeMax,
this.useTailCache,
this.exceed134217727);
// set the default segment names
setDefaultSegments();

// startup
// create a crawler
crawler = new CrawlSwitchboard(
networkName,
log,
this.queuesRoot);

// init a DHT transmission dispatcher
dhtDispatcher = new Dispatcher(
indexSegments.segment(Segments.Process.LOCALCRAWLING),
peers,
true,
30000);

// create new web structure
this.webStructure = new WebStructureGraph(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", CRDistribution.CR_OWN), new File(queuesRoot, "webStructure.map"));