From 8d6a13bc07941a1f53cece8d9811f29cb70a0679 Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 24 Mar 2008 22:51:26 +0000 Subject: [PATCH] refactoring of parsing-condensing-indexing process: - separated parts - removed storagePeer function next step will be parallelization of processes git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4600 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- defaults/yacy.init | 3 - .../anomic/plasma/plasmaParserDocument.java | 22 +- .../de/anomic/plasma/plasmaSwitchboard.java | 531 +++++++----------- .../anomic/plasma/plasmaSwitchboardQueue.java | 35 ++ 4 files changed, 244 insertions(+), 347 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index 659bfcaa4..33aff65be 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -767,9 +767,6 @@ CRDist1Method = 9 CRDist1Percent = 30 CRDist1Target = kaskelix.de:8080,yacy.dyndns.org:8000,suma-lab.de:8080 -# Hash of the peer, you would like to store to the data your installation collected. -storagePeerHash = - # Search sequence settings # collection: # time = time to get a RWI out of RAM cache, assortments and WORDS files diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index e9c87581b..a3932eab3 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -67,17 +67,17 @@ import de.anomic.plasma.parser.Parser; public class plasmaParserDocument { - private yacyURL source; // the source url - private String mimeType; // mimeType as taken from http header - private String charset; // the charset of the document - private List keywords; // most resources provide a keyword field - private StringBuffer title; // a document title, taken from title or h1 tag; shall appear as headline of search result - private StringBuffer creator; // author or copyright - private List sections; // if present: more titles/headlines appearing in the document - private StringBuffer description; // an abstract, if present: short content description - private Object text; // the clear text, all that is visible - private Map anchors; // all links embedded as clickeable entities (anchor tags) - private HashMap images; // all visible pictures in document + private yacyURL source; // the source url + private String mimeType; // mimeType as taken from http header + private String charset; // the charset of the document + private List keywords; // most resources provide a keyword field + private StringBuffer title; // a document title, taken from title or h1 tag; shall appear as headline of search result + private StringBuffer creator; // author or copyright + private List sections; // if present: more titles/headlines appearing in the document + private StringBuffer description; // an abstract, if present: short content description + private Object text; // the clear text, all that is visible + private Map anchors; // all links embedded as clickeable entities (anchor tags) + private HashMap images; // all visible pictures in document // the anchors and images - Maps are URL-to-EntityDescription mappings. // The EntityDescription appear either as visible text in anchors or as alternative // text in image tags. diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 7bbcd8a75..fb682e2c1 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -117,14 +117,11 @@ import de.anomic.data.messageBoard; import de.anomic.data.userDB; import de.anomic.data.wikiBoard; import de.anomic.data.wiki.wikiParser; -import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpHeader; import de.anomic.http.httpRemoteProxyConfig; import de.anomic.http.httpc; import de.anomic.http.httpd; import de.anomic.http.httpdRobotsTxtConfig; -import de.anomic.index.indexContainer; -import de.anomic.index.indexRWIRowEntry; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroCache; @@ -137,7 +134,6 @@ import de.anomic.plasma.crawler.plasmaCrawlQueues; import de.anomic.plasma.crawler.plasmaProtocolLoader; import de.anomic.plasma.dbImport.dbImportManager; import de.anomic.plasma.parser.ParserException; -import de.anomic.plasma.plasmaCondenser.wordStatProp; import de.anomic.plasma.urlPattern.defaultURLPattern; import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.server.serverAbstractSwitch; @@ -175,9 +171,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitchName of the setting how many active crawler-threads may maximal be running on the same time

*/ public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads"; - + public static final String OWN_SEED_FILE = "yacyOwnSeedFile"; - /** - *

public static final String STORAGE_PEER_HASH = "storagePeerHash"

- *

Name of the setting holding the Peer-Hash where indexes shall be transferred after indexing a webpage. If this setting - * is empty, the Storage Peer function is disabled

- */ - public static final String STORAGE_PEER_HASH = "storagePeerHash"; public static final String YACY_MODE_DEBUG = "yacyDebugMode"; - public static final String WORDCACHE_INIT_COUNT = "wordCacheInitCount"; /** *

public static final String WORDCACHE_MAX_COUNT = "wordCacheMaxCount"

@@ -1825,15 +1811,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch urlCache = new HashMap(1); - urlCache.put(newEntry.hash(), newEntry); - - ArrayList tmpContainers = new ArrayList(condenser.words().size()); - - String language = yacyURL.language(entry.url()); - char doctype = plasmaHTCache.docType(document.dc_format()); - indexURLEntry.Components comp = newEntry.comp(); - int urlLength = comp.url().toNormalform(true, true).length(); - int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform(true, true)).length; - - // iterate over all words - Iterator> i = condenser.words().entrySet().iterator(); - Map.Entry wentry; - plasmaCondenser.wordStatProp wordStat; - while (i.hasNext()) { - wentry = i.next(); - String word = wentry.getKey(); - wordStat = wentry.getValue(); - String wordHash = plasmaCondenser.word2hash(word); - indexRWIRowEntry wordIdxEntry = new indexRWIRowEntry( - urlHash, - urlLength, urlComps, - wordStat.count, - document.dc_title().length(), - condenser.words().size(), - condenser.sentences().size(), - wordStat.posInText, - wordStat.posInPhrase, - wordStat.numOfPhrase, - docDate.getTime(), - System.currentTimeMillis(), - language, - doctype, - ioLinks[0].intValue(), - ioLinks[1].intValue(), - condenser.RESULT_FLAGS - ); - indexContainer wordIdxContainer = plasmaWordIndex.emptyContainer(wordHash, 1); - wordIdxContainer.add(wordIdxEntry); - tmpContainers.add(wordIdxContainer); - } - //System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries"); - words = condenser.words().size(); - - // transfering the index to the storage peer - indexContainer[] indexData = (indexContainer[]) tmpContainers.toArray(new indexContainer[tmpContainers.size()]); - HashMap resultObj = yacyClient.transferIndex( - seed, // target seed - indexData, // word index data - urlCache, // urls - true, // gzip body - 120000 // transfer timeout - ); - - // check for interruption - checkInterruption(); - - // if the transfer failed we try to store the index locally - String error = (String) resultObj.get("result"); - if (error != null) { - words = wordIndex.addPageIndex( - entry.url(), - docDate, - (int) entry.size(), - document, - condenser, - yacyURL.language(entry.url()), - plasmaHTCache.docType(document.dc_format()), - ioLinks[0].intValue(), - ioLinks[1].intValue() - ); - } - - tmpContainers = null; - } //end: SEND PAGE INDEX TO STORAGE PEER - - storageEndTime = System.currentTimeMillis(); - - //increment number of indexed urls - indexedPages++; - - if (log.isInfo()) { - // TODO: UTF-8 docDescription seems not to be displayed correctly because - // of string concatenation - log.logInfo("*Indexed " + words + " words in URL " + entry.url() + - " [" + entry.urlHash() + "]" + - "\n\tDescription: " + dc_title + - "\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " + - "Size: " + document.getTextLength() + " bytes | " + - "Anchors: " + ((document.getAnchors() == null) ? 0 : document.getAnchors().size()) + - "\n\tStackingTime: " + (stackEndTime-stackStartTime) + " ms | " + - "ParsingTime: " + (parsingEndTime-parsingStartTime) + " ms | " + - "IndexingTime: " + (indexingEndTime-indexingStartTime) + " ms | " + - "StorageTime: " + (storageEndTime-storageStartTime) + " ms"); - } - - // update profiling info - plasmaProfiling.updateIndexedPage(entry); - - // check for interruption - checkInterruption(); - - // if this was performed for a remote crawl request, notify requester - if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) { - log.logInfo("Sending crawl receipt for '" + entry.url().toNormalform(false, true) + "' to " + initiatorPeer.getName()); - if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash)); - yacyClient.crawlReceipt(initiatorPeer, "crawl", "fill", "indexed", newEntry, ""); - } - } else { - log.logFine("Not Indexed Resource '" + entry.url().toNormalform(false, true) + "': process case=" + processCase); - addURLtoErrorDB(entry.url(), referrerURL.hash(), initiatorPeerHash, dc_title, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new kelondroBitfield()); - } - } catch (Exception ee) { - if (ee instanceof InterruptedException) throw (InterruptedException)ee; - - // check for interruption - checkInterruption(); - - log.logSevere("Could not index URL " + entry.url() + ": " + ee.getMessage(), ee); - if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) { - if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash)); - yacyClient.crawlReceipt(initiatorPeer, "crawl", "exception", ee.getMessage(), null, ""); - } - addURLtoErrorDB(entry.url(), (referrerURL == null) ? null : referrerURL.hash(), initiatorPeerHash, dc_title, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new kelondroBitfield()); - } - - } else { - // check for interruption - checkInterruption(); - - log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason); - addURLtoErrorDB(entry.url(), (referrerURL == null) ? null : referrerURL.hash(), initiatorPeerHash, dc_title, noIndexReason, new kelondroBitfield()); - if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) { - if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash)); - yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, ""); - } - } - document.close(); - document = null; } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException)e; this.log.logSevere("Unexpected exception while parsing/indexing URL ",e); @@ -2456,14 +2163,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch