diff --git a/source/net/yacy/crawler/retrieval/Response.java b/source/net/yacy/crawler/retrieval/Response.java index d3fb279a5..2b7edc334 100644 --- a/source/net/yacy/crawler/retrieval/Response.java +++ b/source/net/yacy/crawler/retrieval/Response.java @@ -828,7 +828,8 @@ public class Response { // 3) result of index transfer, some of them are here (not possible here) // 4) proxy-load (initiator is "------------") // 5) local prefetch/crawling (initiator is own seedHash) - // 6) local fetching for global crawling (other known or unknwon initiator) + // 6) local fetching for global crawling (other known or unknown initiator) + // 7) local surrogates processing (cannot be detected here: the crawl profile is required) EventOrigin processCase = EventOrigin.UNKNOWN; // FIXME the equals seems to be incorrect: String.equals(boolean) if (initiator() == null || initiator().length == 0 || ASCII.String(initiator()).equals("------------")) { diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 5bed37940..00a3ee3fd 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2114,14 +2114,23 @@ public final class Switchboard extends serverSwitch { // enrich the surrogate final String id = (String) surrogate.getFieldValue(CollectionSchema.id.getSolrFieldName()); final String text = (String) surrogate.getFieldValue(CollectionSchema.text_t.getSolrFieldName()); + final DigestURL rootURL = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes(id)); if (text != null && text.length() > 0 && id != null ) { - final DigestURL root = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes(id)); // run the tokenizer on the text to get vocabularies and synonyms - final Tokenizer tokenizer = new Tokenizer(root, text, LibraryProvider.dymLib, true, scraper); + final Tokenizer tokenizer = new Tokenizer(rootURL, 
text, LibraryProvider.dymLib, true, scraper); final Map> facets = Document.computeGenericFacets(tokenizer.tags()); // overwrite the given vocabularies and synonyms with new computed ones Switchboard.this.index.fulltext().getDefaultConfiguration().enrich(surrogate, tokenizer.synonyms(), facets); } + + /* Update the ResultURLs stack for monitoring */ + final byte[] myPeerHash = ASCII.getBytes(peers.mySeed().hash); + ResultURLs.stack( + ASCII.String(rootURL.hash()), + rootURL.getHost(), + myPeerHash, + myPeerHash, + EventOrigin.SURROGATES); } catch (MalformedURLException e) { ConcurrentLog.logException(e); } @@ -3034,6 +3043,12 @@ public final class Switchboard extends serverSwitch { final DigestURL url = document.dc_source(); final DigestURL referrerURL = queueEntry.referrerURL(); EventOrigin processCase = queueEntry.processCase(this.peers.mySeed().hash); + + /* This entry may have been locally created by the MediaWiki dump reader: + * we can distinguish the case here from a regular local crawl with the crawl profile used */ + if(this.crawler != null && queueEntry.profile() == this.crawler.defaultSurrogateProfile) { + processCase = EventOrigin.SURROGATES; + } CrawlProfile profile = queueEntry.profile(); if (condenser == null || (document.indexingDenied() && profile.obeyHtmlRobotsNoindex())) {