From 8d6a13bc07941a1f53cece8d9811f29cb70a0679 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Mon, 24 Mar 2008 22:51:26 +0000
Subject: [PATCH] refactoring of the parsing-condensing-indexing process:
separated the parts and removed the storagePeer function; the next step will
be the parallelization of these processes
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4600 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
defaults/yacy.init | 3 -
.../anomic/plasma/plasmaParserDocument.java | 22 +-
.../de/anomic/plasma/plasmaSwitchboard.java | 531 +++++++-----------
.../anomic/plasma/plasmaSwitchboardQueue.java | 35 ++
4 files changed, 244 insertions(+), 347 deletions(-)
diff --git a/defaults/yacy.init b/defaults/yacy.init
index 659bfcaa4..33aff65be 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -767,9 +767,6 @@ CRDist1Method = 9
CRDist1Percent = 30
CRDist1Target = kaskelix.de:8080,yacy.dyndns.org:8000,suma-lab.de:8080
-# Hash of the peer, you would like to store to the data your installation collected.
-storagePeerHash =
-
# Search sequence settings
# collection:
# time = time to get a RWI out of RAM cache, assortments and WORDS files
diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java
index e9c87581b..a3932eab3 100644
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@@ -67,17 +67,17 @@ import de.anomic.plasma.parser.Parser;
public class plasmaParserDocument {
- private yacyURL source; // the source url
- private String mimeType; // mimeType as taken from http header
- private String charset; // the charset of the document
- private List keywords; // most resources provide a keyword field
- private StringBuffer title; // a document title, taken from title or h1 tag; shall appear as headline of search result
- private StringBuffer creator; // author or copyright
- private List sections; // if present: more titles/headlines appearing in the document
- private StringBuffer description; // an abstract, if present: short content description
- private Object text; // the clear text, all that is visible
- private Map anchors; // all links embedded as clickeable entities (anchor tags)
- private HashMap images; // all visible pictures in document
+ private yacyURL source; // the source url
+ private String mimeType; // mimeType as taken from http header
+ private String charset; // the charset of the document
+ private List keywords; // most resources provide a keyword field
+ private StringBuffer title; // a document title, taken from title or h1 tag; shall appear as headline of search result
+ private StringBuffer creator; // author or copyright
+ private List sections; // if present: more titles/headlines appearing in the document
+ private StringBuffer description; // an abstract, if present: short content description
+ private Object text; // the clear text, all that is visible
+ private Map anchors; // all links embedded as clickable entities (anchor tags)
+ private HashMap images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings.
// The EntityDescription appear either as visible text in anchors or as alternative
// text in image tags.
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 7bbcd8a75..fb682e2c1 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -117,14 +117,11 @@ import de.anomic.data.messageBoard;
import de.anomic.data.userDB;
import de.anomic.data.wikiBoard;
import de.anomic.data.wiki.wikiParser;
-import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpc;
import de.anomic.http.httpd;
import de.anomic.http.httpdRobotsTxtConfig;
-import de.anomic.index.indexContainer;
-import de.anomic.index.indexRWIRowEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroCache;
@@ -137,7 +134,6 @@ import de.anomic.plasma.crawler.plasmaCrawlQueues;
import de.anomic.plasma.crawler.plasmaProtocolLoader;
import de.anomic.plasma.dbImport.dbImportManager;
import de.anomic.plasma.parser.ParserException;
-import de.anomic.plasma.plasmaCondenser.wordStatProp;
import de.anomic.plasma.urlPattern.defaultURLPattern;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverAbstractSwitch;
@@ -175,9 +171,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitchName of the setting how many active crawler-threads may maximal be running on the same time
*/
public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads";
-
+
public static final String OWN_SEED_FILE = "yacyOwnSeedFile";
- /**
- * public static final String STORAGE_PEER_HASH = "storagePeerHash"
- * Name of the setting holding the Peer-Hash where indexes shall be transferred after indexing a webpage. If this setting
- * is empty, the Storage Peer function is disabled
- */
- public static final String STORAGE_PEER_HASH = "storagePeerHash";
public static final String YACY_MODE_DEBUG = "yacyDebugMode";
-
public static final String WORDCACHE_INIT_COUNT = "wordCacheInitCount";
/**
* public static final String WORDCACHE_MAX_COUNT = "wordCacheMaxCount"
@@ -1825,15 +1811,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch urlCache = new HashMap(1);
- urlCache.put(newEntry.hash(), newEntry);
-
- ArrayList tmpContainers = new ArrayList(condenser.words().size());
-
- String language = yacyURL.language(entry.url());
- char doctype = plasmaHTCache.docType(document.dc_format());
- indexURLEntry.Components comp = newEntry.comp();
- int urlLength = comp.url().toNormalform(true, true).length();
- int urlComps = htmlFilterContentScraper.urlComps(comp.url().toNormalform(true, true)).length;
-
- // iterate over all words
- Iterator> i = condenser.words().entrySet().iterator();
- Map.Entry wentry;
- plasmaCondenser.wordStatProp wordStat;
- while (i.hasNext()) {
- wentry = i.next();
- String word = wentry.getKey();
- wordStat = wentry.getValue();
- String wordHash = plasmaCondenser.word2hash(word);
- indexRWIRowEntry wordIdxEntry = new indexRWIRowEntry(
- urlHash,
- urlLength, urlComps,
- wordStat.count,
- document.dc_title().length(),
- condenser.words().size(),
- condenser.sentences().size(),
- wordStat.posInText,
- wordStat.posInPhrase,
- wordStat.numOfPhrase,
- docDate.getTime(),
- System.currentTimeMillis(),
- language,
- doctype,
- ioLinks[0].intValue(),
- ioLinks[1].intValue(),
- condenser.RESULT_FLAGS
- );
- indexContainer wordIdxContainer = plasmaWordIndex.emptyContainer(wordHash, 1);
- wordIdxContainer.add(wordIdxEntry);
- tmpContainers.add(wordIdxContainer);
- }
- //System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries");
- words = condenser.words().size();
-
- // transfering the index to the storage peer
- indexContainer[] indexData = (indexContainer[]) tmpContainers.toArray(new indexContainer[tmpContainers.size()]);
- HashMap resultObj = yacyClient.transferIndex(
- seed, // target seed
- indexData, // word index data
- urlCache, // urls
- true, // gzip body
- 120000 // transfer timeout
- );
-
- // check for interruption
- checkInterruption();
-
- // if the transfer failed we try to store the index locally
- String error = (String) resultObj.get("result");
- if (error != null) {
- words = wordIndex.addPageIndex(
- entry.url(),
- docDate,
- (int) entry.size(),
- document,
- condenser,
- yacyURL.language(entry.url()),
- plasmaHTCache.docType(document.dc_format()),
- ioLinks[0].intValue(),
- ioLinks[1].intValue()
- );
- }
-
- tmpContainers = null;
- } //end: SEND PAGE INDEX TO STORAGE PEER
-
- storageEndTime = System.currentTimeMillis();
-
- //increment number of indexed urls
- indexedPages++;
-
- if (log.isInfo()) {
- // TODO: UTF-8 docDescription seems not to be displayed correctly because
- // of string concatenation
- log.logInfo("*Indexed " + words + " words in URL " + entry.url() +
- " [" + entry.urlHash() + "]" +
- "\n\tDescription: " + dc_title +
- "\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " +
- "Size: " + document.getTextLength() + " bytes | " +
- "Anchors: " + ((document.getAnchors() == null) ? 0 : document.getAnchors().size()) +
- "\n\tStackingTime: " + (stackEndTime-stackStartTime) + " ms | " +
- "ParsingTime: " + (parsingEndTime-parsingStartTime) + " ms | " +
- "IndexingTime: " + (indexingEndTime-indexingStartTime) + " ms | " +
- "StorageTime: " + (storageEndTime-storageStartTime) + " ms");
- }
-
- // update profiling info
- plasmaProfiling.updateIndexedPage(entry);
-
- // check for interruption
- checkInterruption();
-
- // if this was performed for a remote crawl request, notify requester
- if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
- log.logInfo("Sending crawl receipt for '" + entry.url().toNormalform(false, true) + "' to " + initiatorPeer.getName());
- if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash));
- yacyClient.crawlReceipt(initiatorPeer, "crawl", "fill", "indexed", newEntry, "");
- }
- } else {
- log.logFine("Not Indexed Resource '" + entry.url().toNormalform(false, true) + "': process case=" + processCase);
- addURLtoErrorDB(entry.url(), referrerURL.hash(), initiatorPeerHash, dc_title, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new kelondroBitfield());
- }
- } catch (Exception ee) {
- if (ee instanceof InterruptedException) throw (InterruptedException)ee;
-
- // check for interruption
- checkInterruption();
-
- log.logSevere("Could not index URL " + entry.url() + ": " + ee.getMessage(), ee);
- if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
- if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash));
- yacyClient.crawlReceipt(initiatorPeer, "crawl", "exception", ee.getMessage(), null, "");
- }
- addURLtoErrorDB(entry.url(), (referrerURL == null) ? null : referrerURL.hash(), initiatorPeerHash, dc_title, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new kelondroBitfield());
- }
-
- } else {
- // check for interruption
- checkInterruption();
-
- log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason);
- addURLtoErrorDB(entry.url(), (referrerURL == null) ? null : referrerURL.hash(), initiatorPeerHash, dc_title, noIndexReason, new kelondroBitfield());
- if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
- if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash));
- yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, "");
- }
- }
- document.close();
- document = null;
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException)e;
this.log.logSevere("Unexpected exception while parsing/indexing URL ",e);
@@ -2456,14 +2163,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch