From b9a2a2d28799e8891ccef72741a780da11de4703 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 24 Apr 2008 15:09:06 +0000 Subject: [PATCH] more search performance hacks git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4735 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../htmlFilter/htmlFilterAbstractScraper.java | 18 +++++++++++------- .../htmlFilter/htmlFilterContentScraper.java | 18 +++++------------- .../anomic/kelondro/kelondroRowCollection.java | 15 ++++++++++----- source/de/anomic/plasma/plasmaCondenser.java | 8 +++----- .../plasma/plasmaSearchRankingProcess.java | 10 +++++----- .../de/anomic/plasma/plasmaSnippetCache.java | 14 ++++++++------ source/de/anomic/yacy/yacyURL.java | 8 +++++++- 7 files changed, 49 insertions(+), 42 deletions(-) diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java index 642c290e6..8aa8842bd 100644 --- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java @@ -27,8 +27,8 @@ package de.anomic.htmlFilter; import java.util.HashMap; +import java.util.HashSet; import java.util.Properties; -import java.util.TreeSet; import de.anomic.server.serverCharBuffer; @@ -38,8 +38,8 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { public static final char rb = '>'; public static final char sl = '/'; - private TreeSet tags0; - private TreeSet tags1; + private HashSet tags0; + private HashSet tags1; // define a translation table for html character codings private static HashMap trans = new HashMap(300); @@ -289,18 +289,22 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { trans.put("›", ""); //angewinkeltes einzelnes Anf.zeichen rechts } - - public htmlFilterAbstractScraper(TreeSet tags0, TreeSet tags1) { + /** + * create a scraper. the tag sets must contain tags in lowercase! + * @param tags0 + * @param tags1 + */ + public htmlFilterAbstractScraper(HashSet tags0, HashSet tags1) { this.tags0 = tags0; this.tags1 = tags1; } public boolean isTag0(String tag) { - return (tags0 != null) && (tags0.contains(tag)); + return (tags0 != null) && (tags0.contains(tag.toLowerCase())); } public boolean isTag1(String tag) { - return (tags1 != null) && (tags1.contains(tag)); + return (tags1 != null) && (tags1.contains(tag.toLowerCase())); } //the 'missing' method that shall be implemented: diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index e07ebb812..3e48b5f3e 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -51,15 +51,13 @@ import java.io.IOException; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.net.MalformedURLException; -import java.text.Collator; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.List; -import java.util.Locale; import java.util.Map; import java.util.Properties; -import java.util.TreeSet; import javax.swing.event.EventListenerList; @@ -71,17 +69,11 @@ import de.anomic.yacy.yacyURL; public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper { // statics: for initialization of the HTMLFilterAbstractScraper - private static TreeSet linkTags0; - private static TreeSet linkTags1; + private static HashSet linkTags0; + private static HashSet linkTags1; - private static final Collator insensitiveCollator = Collator.getInstance(Locale.US); static { - insensitiveCollator.setStrength(Collator.SECONDARY); - insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION); - } - - static { - linkTags0 = new TreeSet(insensitiveCollator); + linkTags0 = new HashSet(); linkTags0.add("img"); linkTags0.add("base"); linkTags0.add("frame"); @@ -91,7 +83,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen linkTags0.add("embed"); //added by [MN] linkTags0.add("param"); //added by [MN] - linkTags1 = new TreeSet(insensitiveCollator); + linkTags1 = new HashSet(); linkTags1.add("a"); linkTags1.add("h1"); linkTags1.add("h2"); diff --git a/source/de/anomic/kelondro/kelondroRowCollection.java b/source/de/anomic/kelondro/kelondroRowCollection.java index 71246df93..1f7587ed6 100644 --- a/source/de/anomic/kelondro/kelondroRowCollection.java +++ b/source/de/anomic/kelondro/kelondroRowCollection.java @@ -234,7 +234,7 @@ public class kelondroRowCollection { // grow instead of shrink, simply ignore the growfactor if (serverMemory.available() + 1000 < needed) return; // if the swap buffer is not available, we must give up. - // This is not critical. Othervise we provoke a serious + // This is not critical. Otherwise we provoke a serious // problem with OOM byte[] newChunkcache = new byte[needed]; System.arraycopy(chunkcache, 0, newChunkcache, 0, Math.min( @@ -264,15 +264,20 @@ public class kelondroRowCollection { return b; } - public synchronized final kelondroRow.Entry get(int index, boolean clone) { + public final kelondroRow.Entry get(int index, boolean clone) { assert (index >= 0) : "get: access with index " + index + " is below zero"; assert (index < chunkcount) : "get: access with index " + index + " is above chunkcount " + chunkcount + "; sortBound = " + sortBound; assert (index * rowdef.objectsize < chunkcache.length); if ((chunkcache == null) || (rowdef == null)) return null; // case may appear during shutdown - if (index >= chunkcount) return null; - if ((index + 1) * rowdef.objectsize > chunkcache.length) return null; // the whole chunk does not fit into the chunkcache + kelondroRow.Entry entry; + int addr = index * rowdef.objectsize; + synchronized (this) { + if (index >= chunkcount) return null; + if (addr + rowdef.objectsize > chunkcache.length) return null; // the whole chunk does not fit into the chunkcache + entry = rowdef.newEntry(chunkcache, addr, clone); + } this.lastTimeRead = System.currentTimeMillis(); - return rowdef.newEntry(chunkcache, index * rowdef.objectsize, clone); + return entry; } public synchronized final void set(int index, kelondroRow.Entry a) { diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index 4f87c2fb2..9c9a86b79 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -611,11 +611,9 @@ public final class plasmaCondenser { } - public static StringBuffer trim(StringBuffer sb) { - synchronized (sb) { - while ((sb.length() > 0) && (sb.charAt(0) <= ' ')) sb = sb.deleteCharAt(0); - while ((sb.length() > 0) && (sb.charAt(sb.length() - 1) <= ' ')) sb = sb.deleteCharAt(sb.length() - 1); - } + private static StringBuffer trim(StringBuffer sb) { + while ((sb.length() > 0) && (sb.charAt(0) <= ' ')) sb = sb.deleteCharAt(0); + while ((sb.length() > 0) && (sb.charAt(sb.length() - 1) <= ' ')) sb = sb.deleteCharAt(sb.length() - 1); return sb; } diff --git a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java index 0bb9f8e19..ce9230d23 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java @@ -246,12 +246,13 @@ public final class plasmaSearchRankingProcess { // - root-domain guessing to prefer the root domain over other urls if search word appears in domain name - private synchronized kelondroSortStack.stackElement bestRWI(boolean skipDoubleDom) { - // returns from the current RWI list the best entry and removed this entry from the list + private kelondroSortStack.stackElement bestRWI(boolean skipDoubleDom) { + // returns from the current RWI list the best entry and removes this entry from the list kelondroSortStack m; kelondroSortStack.stackElement rwi; while (stack.size() > 0) { rwi = stack.pop(); + if (rwi == null) continue; // in case that a synchronization problem occurred just go lazy over it if (!skipDoubleDom) return rwi; // check doubledom String domhash = rwi.element.urlHash().substring(6); @@ -272,6 +273,7 @@ public final class plasmaSearchRankingProcess { kelondroSortStack.stackElement o; while (i.hasNext()) { m = i.next(); + if (m == null) continue; if (m.size() == 0) continue; if (bestEntry == null) { bestEntry = m.top(); @@ -293,7 +295,6 @@ public final class plasmaSearchRankingProcess { public indexURLReference bestURL(boolean skipDoubleDom) { // returns from the current RWI list the best URL entry and removed this entry from the list while ((stack.size() > 0) || (size() > 0)) { - synchronized (this) { if (((stack.size() == 0) && (size() == 0))) break; kelondroSortStack.stackElement obrwi = bestRWI(skipDoubleDom); indexURLReference u = wordIndex.getURL(obrwi.element.urlHash(), obrwi.element, obrwi.weight.longValue()); @@ -303,12 +304,11 @@ public final class plasmaSearchRankingProcess { return u; } misses.add(obrwi.element.urlHash()); - } } return null; } - public synchronized int size() { + public int size() { //assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size(); int c = stack.size(); Iterator> i = this.doubleDomCache.values().iterator(); diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 7db01e9ce..57e28584b 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -366,16 +366,16 @@ public class plasmaSnippetCache { Set remainingHashes = (tsr == null) ? queryhashes : (Set) tsr[1]; // compute snippet from media - String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes); - String videoline = computeMediaSnippet(document.getVideolinks(), queryhashes); - String appline = computeMediaSnippet(document.getApplinks(), queryhashes); + //String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes); + //String videoline = computeMediaSnippet(document.getVideolinks(), queryhashes); + //String appline = computeMediaSnippet(document.getApplinks(), queryhashes); //String hrefline = computeMediaSnippet(document.getAnchors(), queryhashes); //String imageline = computeMediaSnippet(document.getAudiolinks(), queryhashes); line = ""; - if (audioline != null) line += (line.length() == 0) ? audioline : "
" + audioline; - if (videoline != null) line += (line.length() == 0) ? videoline : "
" + videoline; - if (appline != null) line += (line.length() == 0) ? appline : "
" + appline; + //if (audioline != null) line += (line.length() == 0) ? audioline : "
" + audioline; + //if (videoline != null) line += (line.length() == 0) ? videoline : "
" + videoline; + //if (appline != null) line += (line.length() == 0) ? appline : "
" + appline; //if (hrefline != null) line += (line.length() == 0) ? hrefline : "
" + hrefline; if (textline != null) line += (line.length() == 0) ? textline : "
" + textline; @@ -494,6 +494,7 @@ public class plasmaSnippetCache { return snippetsCache.get(key); } + /* private static String computeMediaSnippet(Map media, Set queryhashes) { Iterator> i = media.entrySet().iterator(); Map.Entry entry; @@ -519,6 +520,7 @@ public class plasmaSnippetCache { if (result.length() == 0) return null; return result.substring(6); } + */ @SuppressWarnings("unchecked") private static Object[] /*{String - the snippet, Set - remaining hashes}*/ diff --git a/source/de/anomic/yacy/yacyURL.java b/source/de/anomic/yacy/yacyURL.java index 47ba8aac3..0d4f2c62a 100644 --- a/source/de/anomic/yacy/yacyURL.java +++ b/source/de/anomic/yacy/yacyURL.java @@ -904,8 +904,14 @@ public class yacyURL { ((this.port == other.port ))); } + /** + * hash code computation for yacyURL: please don't mix this up with the YaCy-Hash + * this hash here is only used by hashing data structures, like a HashMap + * We do not use tha yacy hash here, because this needs the computation of a DNS + * which is very time-intensive + */ public int hashCode() { - return this.hash().hashCode(); + return this.toNormalform(true, false).hashCode(); } public int compareTo(Object h) {