refactoring of parsing-condensing-indexing process:

- separated parts
- removed storagePeer function
next step will be parallelization of processes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4600 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent d3b06913ec
commit 8d6a13bc07

@ -767,9 +767,6 @@ CRDist1Method = 9
CRDist1Percent = 30 CRDist1Percent = 30
CRDist1Target = kaskelix.de:8080,yacy.dyndns.org:8000,suma-lab.de:8080 CRDist1Target = kaskelix.de:8080,yacy.dyndns.org:8000,suma-lab.de:8080
# Hash of the peer, you would like to store to the data your installation collected.
storagePeerHash =
# Search sequence settings # Search sequence settings
# collection: # collection:
# time = time to get a RWI out of RAM cache, assortments and WORDS files # time = time to get a RWI out of RAM cache, assortments and WORDS files

@ -117,14 +117,11 @@ import de.anomic.data.messageBoard;
import de.anomic.data.userDB; import de.anomic.data.userDB;
import de.anomic.data.wikiBoard; import de.anomic.data.wikiBoard;
import de.anomic.data.wiki.wikiParser; import de.anomic.data.wiki.wikiParser;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.http.httpRemoteProxyConfig; import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.http.httpd; import de.anomic.http.httpd;
import de.anomic.http.httpdRobotsTxtConfig; import de.anomic.http.httpdRobotsTxtConfig;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIRowEntry;
import de.anomic.index.indexURLEntry; import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroCache; import de.anomic.kelondro.kelondroCache;
@ -137,7 +134,6 @@ import de.anomic.plasma.crawler.plasmaCrawlQueues;
import de.anomic.plasma.crawler.plasmaProtocolLoader; import de.anomic.plasma.crawler.plasmaProtocolLoader;
import de.anomic.plasma.dbImport.dbImportManager; import de.anomic.plasma.dbImport.dbImportManager;
import de.anomic.plasma.parser.ParserException; import de.anomic.plasma.parser.ParserException;
import de.anomic.plasma.plasmaCondenser.wordStatProp;
import de.anomic.plasma.urlPattern.defaultURLPattern; import de.anomic.plasma.urlPattern.defaultURLPattern;
import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverAbstractSwitch; import de.anomic.server.serverAbstractSwitch;
@ -175,9 +171,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
// 5) local prefetch/crawling (initiator is own seedHash) // 5) local prefetch/crawling (initiator is own seedHash)
// 6) local fetching for global crawling (other known or unknown initiator) // 6) local fetching for global crawling (other known or unknown initiator)
public static final int PROCESSCASE_0_UNKNOWN = 0; public static final int PROCESSCASE_0_UNKNOWN = 0;
public static final int PROCESSCASE_1_GLOBAL_CRAWLING = 1;
public static final int PROCESSCASE_2_SEARCH_QUERY_RESULT = 2;
public static final int PROCESSCASE_3_INDEX_TRANSFER_RESULT = 3;
public static final int PROCESSCASE_4_PROXY_LOAD = 4; public static final int PROCESSCASE_4_PROXY_LOAD = 4;
public static final int PROCESSCASE_5_LOCAL_CRAWLING = 5; public static final int PROCESSCASE_5_LOCAL_CRAWLING = 5;
public static final int PROCESSCASE_6_GLOBAL_CRAWLING = 6; public static final int PROCESSCASE_6_GLOBAL_CRAWLING = 6;
@ -653,14 +646,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads"; public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads";
public static final String OWN_SEED_FILE = "yacyOwnSeedFile"; public static final String OWN_SEED_FILE = "yacyOwnSeedFile";
/**
* <p><code>public static final String <strong>STORAGE_PEER_HASH</strong> = "storagePeerHash"</code></p>
* <p>Name of the setting holding the Peer-Hash where indexes shall be transferred after indexing a webpage. If this setting
* is empty, the Storage Peer function is disabled</p>
*/
public static final String STORAGE_PEER_HASH = "storagePeerHash";
public static final String YACY_MODE_DEBUG = "yacyDebugMode"; public static final String YACY_MODE_DEBUG = "yacyDebugMode";
public static final String WORDCACHE_INIT_COUNT = "wordCacheInitCount"; public static final String WORDCACHE_INIT_COUNT = "wordCacheInitCount";
/** /**
* <p><code>public static final String <strong>WORDCACHE_MAX_COUNT</strong> = "wordCacheMaxCount"</code></p> * <p><code>public static final String <strong>WORDCACHE_MAX_COUNT</strong> = "wordCacheMaxCount"</code></p>
@ -1825,15 +1811,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
} }
// generate a dht chunk // generate a dht chunk
if ( if ((dhtShallTransfer() == null) && (
(dhtShallTransfer() == null) &&
(
(this.dhtTransferChunk == null) || (this.dhtTransferChunk == null) ||
(this.dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_UNDEFINED) || (this.dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_UNDEFINED) ||
// (this.dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_COMPLETE) || // (this.dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_COMPLETE) ||
(this.dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_FAILED) (this.dhtTransferChunk.getStatus() == plasmaDHTChunk.chunkStatus_FAILED)
) )) {
) {
// generate new chunk // generate new chunk
int minChunkSize = (int) getConfigLong(INDEX_DIST_CHUNK_SIZE_MIN, 30); int minChunkSize = (int) getConfigLong(INDEX_DIST_CHUNK_SIZE_MIN, 30);
dhtTransferChunk = new plasmaDHTChunk(this.log, wordIndex, minChunkSize, dhtTransferIndexCount, 5000); dhtTransferChunk = new plasmaDHTChunk(this.log, wordIndex, minChunkSize, dhtTransferIndexCount, 5000);
@ -1867,7 +1850,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
} }
// parse and index the resource // parse and index the resource
processResourceStack(nextentry); plasmaParserDocument document = parseDocument(nextentry);
if (document != null) {
plasmaCondenser condensement = condenseDocument(nextentry, document);
if (condensement != null) {
indexDocument(nextentry, document, condensement);
}
}
return true; return true;
} catch (InterruptedException e) { } catch (InterruptedException e) {
log.logInfo("DEQUEUE: Shutdown detected."); log.logInfo("DEQUEUE: Shutdown detected.");
@ -2078,7 +2067,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
} }
} }
private plasmaParserDocument parseResource(plasmaSwitchboardQueue.Entry entry, String initiatorHash) throws InterruptedException, ParserException { private plasmaParserDocument parseResource(plasmaSwitchboardQueue.Entry entry) throws InterruptedException, ParserException {
// the mimetype of this entry // the mimetype of this entry
String mimeType = entry.getMimeType(); String mimeType = entry.getMimeType();
@ -2097,71 +2086,43 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
return doc; return doc;
} }
private void processResourceStack(plasmaSwitchboardQueue.Entry entry) throws InterruptedException { private plasmaParserDocument parseDocument(plasmaSwitchboardQueue.Entry entry) throws InterruptedException {
plasmaParserDocument document = null; plasmaParserDocument document = null;
try { try {
// work off one stack entry with a fresh resource
long stackStartTime = 0, stackEndTime = 0, long stackStartTime = 0, stackEndTime = 0,
parsingStartTime = 0, parsingEndTime = 0, parsingStartTime = 0, parsingEndTime = 0;
indexingStartTime = 0, indexingEndTime = 0, int processCase = entry.processCase();
storageStartTime = 0, storageEndTime = 0;
// we must distinguish the following cases: resource-load was initiated by
// 1) global crawling: the index is extern, not here (not possible here)
// 2) result of search queries, some indexes are here (not possible here)
// 3) result of index transfer, some of them are here (not possible here)
// 4) proxy-load (initiator is "------------")
// 5) local prefetch/crawling (initiator is own seedHash)
// 6) local fetching for global crawling (other known or unknwon initiator)
int processCase = PROCESSCASE_0_UNKNOWN;
yacySeed initiatorPeer = null;
String initiatorPeerHash = (entry.proxy()) ? yacyURL.dummyHash : entry.initiator();
if (initiatorPeerHash.equals(yacyURL.dummyHash)) {
// proxy-load
processCase = PROCESSCASE_4_PROXY_LOAD;
} else if (initiatorPeerHash.equals(yacyCore.seedDB.mySeed().hash)) {
// normal crawling
processCase = PROCESSCASE_5_LOCAL_CRAWLING;
} else {
// this was done for remote peer (a global crawl)
initiatorPeer = yacyCore.seedDB.getConnected(initiatorPeerHash);
processCase = PROCESSCASE_6_GLOBAL_CRAWLING;
}
log.logFine("processResourceStack processCase=" + processCase + log.logFine("processResourceStack processCase=" + processCase +
", depth=" + entry.depth() + ", depth=" + entry.depth() +
", maxDepth=" + ((entry.profile() == null) ? "null" : Integer.toString(entry.profile().generalDepth())) + ", maxDepth=" + ((entry.profile() == null) ? "null" : Integer.toString(entry.profile().generalDepth())) +
", filter=" + ((entry.profile() == null) ? "null" : entry.profile().generalFilter()) + ", filter=" + ((entry.profile() == null) ? "null" : entry.profile().generalFilter()) +
", initiatorHash=" + initiatorPeerHash + ", initiatorHash=" + entry.initiator() +
//", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) + //", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) +
", url=" + entry.url()); // DEBUG ", url=" + entry.url()); // DEBUG
/* ========================================================================= // PARSE CONTENT
* PARSE CONTENT
* ========================================================================= */
parsingStartTime = System.currentTimeMillis(); parsingStartTime = System.currentTimeMillis();
try { try {
document = this.parseResource(entry, initiatorPeerHash); document = this.parseResource(entry);
if (document == null) return; if (document == null) return null;
} catch (ParserException e) { } catch (ParserException e) {
this.log.logInfo("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage()); this.log.logInfo("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage());
addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorPeerHash, entry.anchorName(), e.getErrorCode(), new kelondroBitfield()); addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.anchorName(), e.getErrorCode(), new kelondroBitfield());
if (document != null) { if (document != null) {
document.close(); document.close();
document = null; document = null;
} }
return; return null;
} }
parsingEndTime = System.currentTimeMillis(); parsingEndTime = System.currentTimeMillis();
// getting the document date // get the document date
Date docDate = entry.getModificationDate(); Date docDate = entry.getModificationDate();
/* ========================================================================= // put anchors on crawl stack
* put anchors on crawl stack
* ========================================================================= */
stackStartTime = System.currentTimeMillis(); stackStartTime = System.currentTimeMillis();
if ( if (
((processCase == PROCESSCASE_4_PROXY_LOAD) || (processCase == PROCESSCASE_5_LOCAL_CRAWLING)) && ((processCase == PROCESSCASE_4_PROXY_LOAD) || (processCase == PROCESSCASE_5_LOCAL_CRAWLING)) &&
@ -2179,18 +2140,47 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
nextEntry = i.next(); nextEntry = i.next();
nextUrl = nextEntry.getKey(); nextUrl = nextEntry.getKey();
// enqueue the hyperlink into the pre-notice-url db // enqueue the hyperlink into the pre-notice-url db
crawlStacker.enqueueEntry(nextUrl, entry.urlHash(), initiatorPeerHash, nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile()); crawlStacker.enqueueEntry(nextUrl, entry.urlHash(), entry.initiator(), nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile());
} }
log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) + if (log.isInfo()) log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) +
", NEW CRAWL STACK SIZE IS " + crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE)); ", NEW CRAWL STACK SIZE IS " + crawlQueues.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) +
", STACKING TIME = " + (stackEndTime-stackStartTime) +
", PARSING TIME = " + (parsingEndTime-parsingStartTime));
} }
stackEndTime = System.currentTimeMillis(); stackEndTime = System.currentTimeMillis();
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException)e;
this.log.logSevere("Unexpected exception while parsing/indexing URL ",e);
} catch (Error e) {
this.log.logSevere("Unexpected exception while parsing/indexing URL ",e);
} finally {
checkInterruption();
/* ========================================================================= // The following code must be into the finally block, otherwise it will not be executed
* CREATE INDEX // on errors!
* ========================================================================= */
// removing current entry from in process list
synchronized (this.indexingTasksInProcess) {
this.indexingTasksInProcess.remove(entry.urlHash());
}
// explicit delete/free resources
if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) {
plasmaHTCache.filesInUse.remove(entry.cacheFile());
//plasmaHTCache.deleteURLfromCache(entry.url());
}
entry = null;
if (document != null) try { document.close(); } catch (Exception e) {}
}
return document;
}
private plasmaCondenser condenseDocument(plasmaSwitchboardQueue.Entry entry, plasmaParserDocument document) throws InterruptedException {
// CREATE INDEX
String dc_title = document.dc_title(); String dc_title = document.dc_title();
yacyURL referrerURL = entry.referrerURL(); yacyURL referrerURL = entry.referrerURL();
int processCase = entry.processCase();
String noIndexReason = plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR; String noIndexReason = plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR;
if (processCase == PROCESSCASE_4_PROXY_LOAD) { if (processCase == PROCESSCASE_4_PROXY_LOAD) {
@ -2201,18 +2191,48 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
noIndexReason = entry.shallIndexCacheForCrawler(); noIndexReason = entry.shallIndexCacheForCrawler();
} }
if (noIndexReason == null) { if (noIndexReason != null) {
// strip out words // check for interruption
indexingStartTime = System.currentTimeMillis(); checkInterruption();
log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason);
addURLtoErrorDB(entry.url(), (referrerURL == null) ? null : referrerURL.hash(), entry.initiator(), dc_title, noIndexReason, new kelondroBitfield());
/*
if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash));
yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, "");
}
*/
document.close();
document = null;
return null;
}
// strip out words
checkInterruption(); checkInterruption();
log.logFine("Condensing for '" + entry.url().toNormalform(false, true) + "'"); log.logFine("Condensing for '" + entry.url().toNormalform(false, true) + "'");
plasmaCondenser condenser = new plasmaCondenser(document, entry.profile().indexText(), entry.profile().indexMedia()); plasmaCondenser condenser;
try {
condenser = new plasmaCondenser(document, entry.profile().indexText(), entry.profile().indexMedia());
} catch (UnsupportedEncodingException e) {
return null;
}
return condenser;
}
private void indexDocument(plasmaSwitchboardQueue.Entry entry, plasmaParserDocument document, plasmaCondenser condenser) throws InterruptedException {
long indexingStartTime = 0, indexingEndTime = 0,
storageStartTime = 0, storageEndTime = 0;
// CREATE INDEX
String dc_title = document.dc_title();
yacyURL referrerURL = entry.referrerURL();
Date docDate = entry.getModificationDate();
int processCase = entry.processCase();
// generate citation reference // generate citation reference
Integer[] ioLinks = webStructure.generateCitationReference(entry.url(), entry.urlHash(), docDate, document, condenser); // [outlinksSame, outlinksOther] Integer[] ioLinks = webStructure.generateCitationReference(entry.url(), entry.urlHash(), docDate, document, condenser); // [outlinksSame, outlinksOther]
try {
// check for interruption // check for interruption
checkInterruption(); checkInterruption();
@ -2241,13 +2261,19 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
document.getVideolinks().size(), // lvideo document.getVideolinks().size(), // lvideo
document.getApplinks().size() // lapp document.getApplinks().size() // lapp
); );
/* ========================================================================
* STORE URL TO LOADED-URL-DB // STORE URL TO LOADED-URL-DB
* ======================================================================== */ try {
wordIndex.loadedURL.store(newEntry); wordIndex.loadedURL.store(newEntry);
} catch (IOException e) {
log.logFine("Not Indexed Resource '" + entry.url().toNormalform(false, true) + "': process case=" + processCase);
addURLtoErrorDB(entry.url(), referrerURL.hash(), entry.initiator(), dc_title, "error storing url: " + e.getMessage(), new kelondroBitfield());
return;
}
wordIndex.loadedURL.stack( wordIndex.loadedURL.stack(
newEntry, // loaded url db entry newEntry, // loaded url db entry
initiatorPeerHash, // initiator peer hash entry.initiator(), // initiator peer hash
yacyCore.seedDB.mySeed().hash, // executor peer hash yacyCore.seedDB.mySeed().hash, // executor peer hash
processCase // process case processCase // process case
); );
@ -2255,18 +2281,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
// check for interruption // check for interruption
checkInterruption(); checkInterruption();
/* ======================================================================== // STORE WORD INDEX
* STORE WORD INDEX if ((!entry.profile().indexText()) && (!entry.profile().indexMedia())) {
* ======================================================================== */ log.logFine("Not Indexed Resource '" + entry.url().toNormalform(false, true) + "': process case=" + processCase);
if ( addURLtoErrorDB(entry.url(), referrerURL.hash(), entry.initiator(), dc_title, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new kelondroBitfield());
( return;
(processCase == PROCESSCASE_4_PROXY_LOAD) || }
(processCase == PROCESSCASE_5_LOCAL_CRAWLING) ||
(processCase == PROCESSCASE_6_GLOBAL_CRAWLING)
) &&
((entry.profile().indexText()) || (entry.profile().indexMedia()))
) {
String urlHash = newEntry.hash();
// remove stopwords // remove stopwords
log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + entry.url()); log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + entry.url());
@ -2274,18 +2294,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
storageStartTime = System.currentTimeMillis(); storageStartTime = System.currentTimeMillis();
int words = 0; int words = 0;
String storagePeerHash;
yacySeed seed;
if ( // STORE PAGE INDEX INTO WORD INDEX DB
((storagePeerHash = getConfig(STORAGE_PEER_HASH, null)) == null) ||
(storagePeerHash.trim().length() == 0) ||
((seed = yacyCore.seedDB.getConnected(storagePeerHash)) == null)
){
/* ========================================================================
* STORE PAGE INDEX INTO WORD INDEX DB
* ======================================================================== */
words = wordIndex.addPageIndex( words = wordIndex.addPageIndex(
entry.url(), // document url entry.url(), // document url
docDate, // document mod date docDate, // document mod date
@ -2297,86 +2307,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
ioLinks[0].intValue(), // outlinkSame ioLinks[0].intValue(), // outlinkSame
ioLinks[1].intValue() // outlinkOthers ioLinks[1].intValue() // outlinkOthers
); );
} else {
/* ========================================================================
* SEND PAGE INDEX TO STORAGE PEER
* ======================================================================== */
HashMap<String, indexURLEntry> urlCache = new HashMap<String, indexURLEntry>(1);
urlCache.put(newEntry.hash(), newEntry);
ArrayList<indexContainer> tmpContainers = new ArrayList<indexContainer>(condenser.words().size());
String language = yacyURL.language(entry.url());
char doctype = plasmaHTCache.docType(document.dc_format());
indexURLEntry.Components comp = newEntry.comp();
int urlLength = comp.url().toNormalform(true, true).length();
int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform(true, true)).length;
// iterate over all words
Iterator<Map.Entry<String, wordStatProp>> i = condenser.words().entrySet().iterator();
Map.Entry<String, wordStatProp> wentry;
plasmaCondenser.wordStatProp wordStat;
while (i.hasNext()) {
wentry = i.next();
String word = wentry.getKey();
wordStat = wentry.getValue();
String wordHash = plasmaCondenser.word2hash(word);
indexRWIRowEntry wordIdxEntry = new indexRWIRowEntry(
urlHash,
urlLength, urlComps,
wordStat.count,
document.dc_title().length(),
condenser.words().size(),
condenser.sentences().size(),
wordStat.posInText,
wordStat.posInPhrase,
wordStat.numOfPhrase,
docDate.getTime(),
System.currentTimeMillis(),
language,
doctype,
ioLinks[0].intValue(),
ioLinks[1].intValue(),
condenser.RESULT_FLAGS
);
indexContainer wordIdxContainer = plasmaWordIndex.emptyContainer(wordHash, 1);
wordIdxContainer.add(wordIdxEntry);
tmpContainers.add(wordIdxContainer);
}
//System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries");
words = condenser.words().size();
// transfering the index to the storage peer
indexContainer[] indexData = (indexContainer[]) tmpContainers.toArray(new indexContainer[tmpContainers.size()]);
HashMap<String, Object> resultObj = yacyClient.transferIndex(
seed, // target seed
indexData, // word index data
urlCache, // urls
true, // gzip body
120000 // transfer timeout
);
// check for interruption
checkInterruption();
// if the transfer failed we try to store the index locally
String error = (String) resultObj.get("result");
if (error != null) {
words = wordIndex.addPageIndex(
entry.url(),
docDate,
(int) entry.size(),
document,
condenser,
yacyURL.language(entry.url()),
plasmaHTCache.docType(document.dc_format()),
ioLinks[0].intValue(),
ioLinks[1].intValue()
);
}
tmpContainers = null;
} //end: SEND PAGE INDEX TO STORAGE PEER
storageEndTime = System.currentTimeMillis(); storageEndTime = System.currentTimeMillis();
@ -2392,9 +2322,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
"\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " + "\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " +
"Size: " + document.getTextLength() + " bytes | " + "Size: " + document.getTextLength() + " bytes | " +
"Anchors: " + ((document.getAnchors() == null) ? 0 : document.getAnchors().size()) + "Anchors: " + ((document.getAnchors() == null) ? 0 : document.getAnchors().size()) +
"\n\tStackingTime: " + (stackEndTime-stackStartTime) + " ms | " + "\n\tIndexingTime: " + (indexingEndTime-indexingStartTime) + " ms | " +
"ParsingTime: " + (parsingEndTime-parsingStartTime) + " ms | " +
"IndexingTime: " + (indexingEndTime-indexingStartTime) + " ms | " +
"StorageTime: " + (storageEndTime-storageStartTime) + " ms"); "StorageTime: " + (storageEndTime-storageStartTime) + " ms");
} }
@ -2403,6 +2331,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
// check for interruption // check for interruption
checkInterruption(); checkInterruption();
yacySeed initiatorPeer = entry.initiatorPeer();
// if this was performed for a remote crawl request, notify requester // if this was performed for a remote crawl request, notify requester
if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) { if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
@ -2410,70 +2339,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash)); if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash));
yacyClient.crawlReceipt(initiatorPeer, "crawl", "fill", "indexed", newEntry, ""); yacyClient.crawlReceipt(initiatorPeer, "crawl", "fill", "indexed", newEntry, "");
} }
} else {
log.logFine("Not Indexed Resource '" + entry.url().toNormalform(false, true) + "': process case=" + processCase);
addURLtoErrorDB(entry.url(), referrerURL.hash(), initiatorPeerHash, dc_title, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new kelondroBitfield());
}
} catch (Exception ee) {
if (ee instanceof InterruptedException) throw (InterruptedException)ee;
// check for interruption
checkInterruption();
log.logSevere("Could not index URL " + entry.url() + ": " + ee.getMessage(), ee);
if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash));
yacyClient.crawlReceipt(initiatorPeer, "crawl", "exception", ee.getMessage(), null, "");
}
addURLtoErrorDB(entry.url(), (referrerURL == null) ? null : referrerURL.hash(), initiatorPeerHash, dc_title, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new kelondroBitfield());
}
} else {
// check for interruption
checkInterruption();
log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason);
addURLtoErrorDB(entry.url(), (referrerURL == null) ? null : referrerURL.hash(), initiatorPeerHash, dc_title, noIndexReason, new kelondroBitfield());
if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash));
yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, "");
}
}
document.close();
document = null;
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException)e;
this.log.logSevere("Unexpected exception while parsing/indexing URL ",e);
} catch (Error e) {
this.log.logSevere("Unexpected exception while parsing/indexing URL ",e);
} finally {
checkInterruption();
// The following code must be into the finally block, otherwise it will not be executed
// on errors!
// removing current entry from in process list
synchronized (this.indexingTasksInProcess) {
this.indexingTasksInProcess.remove(entry.urlHash());
}
// removing current entry from notice URL queue
/*
boolean removed = noticeURL.remove(entry.urlHash()); // worked-off
if (!removed) {
log.logFinest("Unable to remove indexed URL " + entry.url() + " from Crawler Queue. This could be because of an URL redirect.");
}
*/
// explicit delete/free resources
if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) {
plasmaHTCache.filesInUse.remove(entry.cacheFile());
//plasmaHTCache.deleteURLfromCache(entry.url());
}
entry = null;
if (document != null) try { document.close(); } catch (Exception e) { /* ignore this */ }
}
} }
private static SimpleDateFormat DateFormatter = new SimpleDateFormat("EEE, dd MMM yyyy"); private static SimpleDateFormat DateFormatter = new SimpleDateFormat("EEE, dd MMM yyyy");

@ -57,6 +57,8 @@ import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroStack; import de.anomic.kelondro.kelondroStack;
import de.anomic.plasma.cache.IResourceInfo; import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL; import de.anomic.yacy.yacyURL;
@ -287,6 +289,17 @@ public class plasmaSwitchboardQueue {
return initiator; return initiator;
} }
/**
 * Looks up the seed of the remote peer that initiated the load of this entry.
 *
 * @return <code>null</code> if there is no initiator, if the initiator is the dummy hash
 *         (proxy load), or if the initiator is this peer's own seed hash (local crawl);
 *         otherwise whatever <code>yacyCore.seedDB.getConnected</code> yields for the
 *         initiator hash (presumably <code>null</code> when that peer is not currently
 *         connected -- callers should check for <code>null</code>)
 */
public yacySeed initiatorPeer() {
    // proxy loads carry either no initiator at all or the dummy hash
    final boolean withoutRealInitiator = (initiator == null) || initiator.equals(yacyURL.dummyHash);

    // only a foreign initiator (a global crawl done for a remote peer) has a peer to look up;
    // the own-hash comparison is evaluated only when a real initiator is present
    if (withoutRealInitiator || initiator.equals(yacyCore.seedDB.mySeed().hash)) return null;

    return yacyCore.seedDB.getConnected(initiator);
}
public int depth() { public int depth() {
return depth; return depth;
} }
@ -342,6 +355,28 @@ public class plasmaSwitchboardQueue {
return anchorName; return anchorName;
} }
/**
 * Classifies how the load of this entry was initiated.
 * <p>Of the process cases defined in <code>plasmaSwitchboard</code>, only three can occur
 * for a queue entry and are distinguished here by the initiator hash:</p>
 * <ul>
 * <li><code>PROCESSCASE_4_PROXY_LOAD</code>: no initiator, or the initiator is the dummy hash</li>
 * <li><code>PROCESSCASE_5_LOCAL_CRAWLING</code>: the initiator is this peer's own seed hash</li>
 * <li><code>PROCESSCASE_6_GLOBAL_CRAWLING</code>: any other (known or unknown) initiator,
 *     i.e. a local fetch done for a remote peer's global crawl</li>
 * </ul>
 *
 * @return one of the three <code>plasmaSwitchboard.PROCESSCASE_*</code> constants listed above
 */
public int processCase() {
    // proxy-load
    if ((initiator == null) || initiator.equals(yacyURL.dummyHash)) {
        return plasmaSwitchboard.PROCESSCASE_4_PROXY_LOAD;
    }

    // normal crawling; initiator is known to be non-null at this point
    if (initiator.equals(yacyCore.seedDB.mySeed().hash)) {
        return plasmaSwitchboard.PROCESSCASE_5_LOCAL_CRAWLING;
    }

    // this was done for a remote peer (a global crawl)
    return plasmaSwitchboard.PROCESSCASE_6_GLOBAL_CRAWLING;
}
/** /**
* decide upon header information if a specific file should be indexed * decide upon header information if a specific file should be indexed
* this method returns null if the answer is 'YES'! * this method returns null if the answer is 'YES'!

Loading…
Cancel
Save