From 68d5ff2ef122deec0e99cbd76ae207028003bee0 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 22 Sep 2005 23:43:45 +0000 Subject: [PATCH] added stringbuffer in condenser git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@782 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CacheAdmin_p.java | 2 +- source/de/anomic/plasma/plasmaCondenser.java | 52 +++++++++++-------- .../de/anomic/plasma/plasmaSwitchboard.java | 8 ++- 3 files changed, 36 insertions(+), 26 deletions(-) diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java index 4c1745edd..91c87362d 100644 --- a/htroot/CacheAdmin_p.java +++ b/htroot/CacheAdmin_p.java @@ -165,7 +165,7 @@ public class CacheAdmin_p { } catch (Exception e) { info.append("- This file is not cached -"); info.append(e.toString()); - // e.printStackTrace(); + e.printStackTrace(); } } diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index 3cb24fcaa..b89cccd54 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -138,7 +138,7 @@ public final class plasmaCondenser { words = new TreeMap(kelondroMSetTools.fastStringComparator); sentences = new HashMap(); HashSet currsentwords = new HashSet(); - String sentence = ""; + StringBuffer sentence = new StringBuffer(100); String word = ""; String k; int wordlen; @@ -163,7 +163,7 @@ public final class plasmaCondenser { if (sentence.length() > 0) { // we store the punctuation symbol as first element of the sentence vector allsentencecounter++; - sentence = word + sentence; + sentence.insert(0, word); // append at beginning if (sentences.containsKey(sentence)) { // sentence already exists sp = (statProp) sentences.get(sentence); @@ -184,7 +184,7 @@ public final class plasmaCondenser { words.put(k,sp); } } - sentence = ""; + sentence = new StringBuffer(100); currsentwords.clear(); } else { // store word @@ -202,13 +202,13 @@ public final class plasmaCondenser { } words.put(word, sp); // we now have the unique handle of the word, put it into the sentence: - sentence = sentence + intString(wordHandle, numlength); // thread hang error here + sentence.append(intString(wordHandle, numlength)); } } // finnish last sentence if (sentence.length() > 0) { allsentencecounter++; - sentence = "." + sentence; + sentence.insert(0, "."); // append at beginning if (sentences.containsKey(sentence)) { sp = (statProp) sentences.get(sentence); sp.inc(); @@ -226,20 +226,24 @@ public final class plasmaCondenser { Object[] orderedSentences = new Object[sentenceHandleCount]; String[] s; int wc; + Object o; it = sentences.keySet().iterator(); - while (it.hasNext()) { - sentence = (String) it.next(); - wc = (sentence.length() - 1) / numlength; - s = new String[wc + 2]; - sp = (statProp) sentences.get(sentence); - s[0] = intString(sp.count, numlength); // number of occurrences of this sentence - s[1] = sentence.substring(0,1); // the termination symbol of this sentence - for (int i = 0; i < wc; i++) { - k = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1); - s[i + 2] = k; - } - orderedSentences[sp.handle] = s; - } + while (it.hasNext()) { + o = it.next(); + if (o != null) { + sentence = (StringBuffer) o; + wc = (sentence.length() - 1) / numlength; + s = new String[wc + 2]; + sp = (statProp) sentences.get(sentence); + s[0] = intString(sp.count, numlength); // number of occurrences of this sentence + s[1] = sentence.substring(0,1); // the termination symbol of this sentence + for (int i = 0; i < wc; i++) { + k = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1); + s[i + 2] = k; + } + orderedSentences[sp.handle] = s; + } + } Map.Entry entry; // we search for similar words and reorganize the corresponding sentences @@ -280,10 +284,12 @@ public final class plasmaCondenser { // depending on the orderedSentences structure, we rebuild the sentence HashMap to // eliminate double occuring sentences sentences = new HashMap(); + int le; for (int i = 0; i < orderedSentences.length; i++) { - sentence = ""; - for (int j = 1; j < ((String[]) orderedSentences[i]).length; j++) sentence = sentence + ((String[]) orderedSentences[i])[j]; - if (sentences.containsKey(sentence)) { + le = ((String[]) orderedSentences[i]).length; + sentence = new StringBuffer(le * 10); + for (int j = 1; j < le; j++) sentence.append(((String[]) orderedSentences[i])[j]); + if (sentences.containsKey(sentence)) { // add sentence counter to counter of found sentence sp = (statProp) sentences.get(sentence); sp.count = sp.count + Integer.parseInt(((String[]) orderedSentences[i])[0]); @@ -374,12 +380,12 @@ public final class plasmaCondenser { Iterator it; statProp sp; String[] s; - String sentence; + StringBuffer sentence; Object[] orderedSentences = new Object[sentences.size()]; for (int i = 0; i < sentences.size(); i++) orderedSentences[i] = null; // this array must be initialized it = sentences.keySet().iterator(); while (it.hasNext()) { - sentence = (String) it.next(); + sentence = (StringBuffer) it.next(); wc = (sentence.length() - 1) / numlength; s = new String[wc + 2]; sp = (statProp) sentences.get(sentence); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 8f005ca7c..023360a28 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -292,7 +292,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String cache = getConfig("proxyCache", "DATA/HTCACHE"); cache = cache.replace('\\', '/'); if (cache.endsWith("/")) { cache = cache.substring(0, cache.length() - 1); } - File htCachePath = new File(cache); + File htCachePath = new File(rootPath, cache); long maxCacheSize = 1024 * 1024 * Long.parseLong(getConfig("proxyCacheSize", "2")); // this is megabyte this.cacheManager = new plasmaHTCache(htCachePath, maxCacheSize, ramHTTP); @@ -936,7 +936,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser private void processResourceStack(plasmaSwitchboardQueue.Entry entry) { // work off one stack entry with a fresh resource try { - long parsingStartTime = 0, parsingEndTime = 0, indexingStartTime = 0, indexingEndTime; + long stackStartTime = 0, stackEndTime = 0, parsingStartTime = 0, parsingEndTime = 0, indexingStartTime = 0, indexingEndTime; // we must distinguish the following cases: resource-load was initiated by // 1) global crawling: the index is extern, not here (not possible here) @@ -999,6 +999,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (loadDate == null) loadDate = new Date(); // put anchors on crawl stack + stackStartTime = System.currentTimeMillis(); if (((processCase == 4) || (processCase == 5)) && ((entry.profile() == null) || (entry.depth() < entry.profile().generalDepth()))) { Map hl = document.getHyperlinks(); @@ -1020,8 +1021,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } } log.logInfo("CRAWL: ADDED " + c + " LINKS FROM " + entry.normalizedURLString() + + ", " + (hl.size() - c) + " LINKS DOUBLE" + ", NEW CRAWL STACK SIZE IS " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE)); } + stackEndTime = System.currentTimeMillis(); // create index String descr = document.getMainLongTitle(); @@ -1085,6 +1088,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser "\n\tDescription: " + descr + "\n\t" + "MimeType: " + document.getMimeType() + " | " + "Size: " + document.text.length + " bytes | " + + "StackingTime: " + (stackEndTime-stackStartTime) + " ms | " + "ParsingTime: " + (parsingEndTime-parsingStartTime) + " ms | " + "IndexingTime: " + (indexingEndTime-indexingStartTime) + " ms");