From 9b7f37fc37ab2520b04753dd920bb99c712c9f4d Mon Sep 17 00:00:00 2001 From: theli Date: Tue, 27 Sep 2005 07:10:24 +0000 Subject: [PATCH] *) Minor changes - more debugging output: storageTime for indexed document is logged now - saving memory in plasmaParserDocument.java, plasmaWordIndexEntryContainer.java (not a big deal) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@798 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../anomic/plasma/plasmaParserDocument.java | 4 +-- .../de/anomic/plasma/plasmaSwitchboard.java | 32 +++++++++++++------ .../plasma/plasmaWordIndexEntryContainer.java | 14 +++++--- 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index 75cb6981a..dab1accf3 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -81,8 +81,8 @@ public class plasmaParserDocument { this.sections = (sections==null)?new String[0]:sections; this.abstrct = (abstrct==null)?"":abstrct; this.text = (text==null)?new byte[0]:text; - this.anchors = (anchors==null)?new HashMap():anchors; - this.images = (images==null)?new HashMap():images; + this.anchors = (anchors==null)?new HashMap(0):anchors; + this.images = (images==null)?new HashMap(0):images; this.hyperlinks = null; this.medialinks = null; this.emaillinks = null; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 569b0f590..9106db6f0 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -118,6 +118,8 @@ import java.util.Iterator; import java.util.Map; import java.util.Set; import java.util.TreeSet; +import java.util.logging.Level; + import de.anomic.data.messageBoard; import de.anomic.data.robotsParser; import de.anomic.data.wikiBoard; @@ -470,7 +472,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } } - private static String ppRamString(int bytes) { + private static String ppRamString(long bytes) { if (bytes < 1024) return bytes + " KByte"; bytes = bytes / 1024; if (bytes < 1024) return bytes + " MByte"; @@ -942,7 +944,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser private void processResourceStack(plasmaSwitchboardQueue.Entry entry) { // work off one stack entry with a fresh resource try { - long stackStartTime = 0, stackEndTime = 0, parsingStartTime = 0, parsingEndTime = 0, indexingStartTime = 0, indexingEndTime; + long stackStartTime = 0, stackEndTime = 0, + parsingStartTime = 0, parsingEndTime = 0, + indexingStartTime = 0, indexingEndTime = 0, + storageStartTime = 0, storageEndTime = 0; // we must distinguish the following cases: resource-load was initiated by // 1) global crawling: the index is extern, not here (not possible here) @@ -1085,18 +1090,25 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser (entry.profile().localIndexing())) { // remove stopwords log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + entry.url()); + indexingEndTime = System.currentTimeMillis(); // do indexing //log.logDebug("Create Index for '" + entry.normalizedURLString() + "'"); + storageStartTime = System.currentTimeMillis(); int words = searchManager.addPageIndex(entry.url(), urlHash, loadDate, condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType())); - indexingEndTime = System.currentTimeMillis(); - log.logInfo("*Indexed " + words + " words in URL " + entry.url() + - "\n\tDescription: " + descr + "\n\t" + - "MimeType: " + document.getMimeType() + " | " + - "Size: " + document.text.length + " bytes | " + - "StackingTime: " + (stackEndTime-stackStartTime) + " ms | " + - "ParsingTime: " + (parsingEndTime-parsingStartTime) + " ms | " + - "IndexingTime: " + (indexingEndTime-indexingStartTime) + " ms"); + storageEndTime = System.currentTimeMillis(); + + if (log.isLoggable(Level.INFO)) { + log.logInfo("*Indexed " + words + " words in URL " + entry.url() + + "\n\tDescription: " + descr + + "\n\tMimeType: " + document.getMimeType() + " | " + + "Size: " + document.text.length + " bytes | " + + "Anchors: " + ((document.anchors==null)?0:document.anchors.size()) + + "\n\tStackingTime: " + (stackEndTime-stackStartTime) + " ms | " + + "ParsingTime: " + (parsingEndTime-parsingStartTime) + " ms | " + + "IndexingTime: " + (indexingEndTime-indexingStartTime) + " ms | " + + "StorageTime: " + (storageEndTime-storageStartTime) + " ms"); + } // if this was performed for a remote crawl request, notify requester if ((processCase == 6) && (initiator != null)) { diff --git a/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java b/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java index 1c15d35b8..fa54ce1d5 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java @@ -57,16 +57,20 @@ import java.util.Iterator; import de.anomic.server.serverCodings; -public class plasmaWordIndexEntryContainer implements Comparable { +public final class plasmaWordIndexEntryContainer implements Comparable { - private String wordHash; - private HashMap container; // urlHash/plasmaWordIndexEntry - Mapping + private final String wordHash; + private final HashMap container; // urlHash/plasmaWordIndexEntry - Mapping private long updateTime; public plasmaWordIndexEntryContainer(String wordHash) { + this(wordHash,16); + } + + public plasmaWordIndexEntryContainer(String wordHash, int initContainerSize) { this.wordHash = wordHash; this.updateTime = 0; - container = new HashMap(); // a urlhash/plasmaWordIndexEntry - relation + container = new HashMap(initContainerSize); // a urlhash/plasmaWordIndexEntry - relation } public int size() { @@ -123,7 +127,7 @@ public class plasmaWordIndexEntryContainer implements Comparable { } public static plasmaWordIndexEntryContainer instantContainer(String wordHash, long creationTime, plasmaWordIndexEntry entry) { - plasmaWordIndexEntryContainer c = new plasmaWordIndexEntryContainer(wordHash); + plasmaWordIndexEntryContainer c = new plasmaWordIndexEntryContainer(wordHash,1); c.add(entry); c.updateTime = creationTime; return c;