more search performance hacks

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4735 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent ff755fb858
commit b9a2a2d287

@ -27,8 +27,8 @@
package de.anomic.htmlFilter;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Properties;
import java.util.TreeSet;
import de.anomic.server.serverCharBuffer;
@ -38,8 +38,8 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
public static final char rb = '>';
public static final char sl = '/';
private TreeSet<String> tags0;
private TreeSet<String> tags1;
private HashSet<String> tags0;
private HashSet<String> tags1;
// define a translation table for html character codings
private static HashMap<String, String> trans = new HashMap<String, String>(300);
@ -289,18 +289,22 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
trans.put("&rsaquo;", ""); //angewinkeltes einzelnes Anf.zeichen rechts
}
public htmlFilterAbstractScraper(TreeSet<String> tags0, TreeSet<String> tags1) {
/**
 * Creates a scraper that answers isTag0/isTag1 queries from the given tag sets.
 * Both sets must contain their tag names in lowercase, because isTag0 and
 * isTag1 lowercase the queried tag before the lookup.
 *
 * @param tags0 tag names answered by isTag0; if null, isTag0 always returns false
 * @param tags1 tag names answered by isTag1; if null, isTag1 always returns false
 */
public htmlFilterAbstractScraper(HashSet<String> tags0, HashSet<String> tags1) {
// the sets are stored by reference, not copied
this.tags0 = tags0;
this.tags1 = tags1;
}
/**
 * Checks whether the given tag name is contained in the tags0 set.
 * The set is expected to hold lowercase tag names, so the queried tag is
 * lowercased before the lookup.
 *
 * @param tag the tag name to look up, in any letter case; may be null
 * @return true if tags0 is set and contains the lowercased tag;
 *         false if tags0 is null, tag is null, or the tag is not contained
 */
public boolean isTag0(String tag) {
    if ((tags0 == null) || (tag == null)) return false;
    // use a fixed locale: the default-locale toLowerCase() maps 'I' to a
    // dotless i under e.g. a Turkish locale, so "IMG" would never match "img"
    return tags0.contains(tag.toLowerCase(java.util.Locale.ENGLISH));
}
/**
 * Checks whether the given tag name is contained in the tags1 set.
 * The set is expected to hold lowercase tag names, so the queried tag is
 * lowercased before the lookup.
 *
 * @param tag the tag name to look up, in any letter case; may be null
 * @return true if tags1 is set and contains the lowercased tag;
 *         false if tags1 is null, tag is null, or the tag is not contained
 */
public boolean isTag1(String tag) {
    if ((tags1 == null) || (tag == null)) return false;
    // use a fixed locale: the default-locale toLowerCase() maps 'I' to a
    // dotless i under e.g. a Turkish locale, so "IMG" would never match "img"
    return tags1.contains(tag.toLowerCase(java.util.Locale.ENGLISH));
}
//the 'missing' method that shall be implemented:

@ -51,15 +51,13 @@ import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.text.Collator;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.TreeSet;
import javax.swing.event.EventListenerList;
@ -71,17 +69,11 @@ import de.anomic.yacy.yacyURL;
public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {
// statics: for initialization of the HTMLFilterAbstractScraper
private static TreeSet<String> linkTags0;
private static TreeSet<String> linkTags1;
private static HashSet<String> linkTags0;
private static HashSet<String> linkTags1;
private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);
static {
insensitiveCollator.setStrength(Collator.SECONDARY);
insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
}
static {
linkTags0 = new TreeSet<String>(insensitiveCollator);
linkTags0 = new HashSet<String>();
linkTags0.add("img");
linkTags0.add("base");
linkTags0.add("frame");
@ -91,7 +83,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
linkTags0.add("embed"); //added by [MN]
linkTags0.add("param"); //added by [MN]
linkTags1 = new TreeSet<String>(insensitiveCollator);
linkTags1 = new HashSet<String>();
linkTags1.add("a");
linkTags1.add("h1");
linkTags1.add("h2");

@ -234,7 +234,7 @@ public class kelondroRowCollection {
// grow instead of shrink, simply ignore the growfactor
if (serverMemory.available() + 1000 < needed)
return; // if the swap buffer is not available, we must give up.
// This is not critical. Othervise we provoke a serious
// This is not critical. Otherwise we provoke a serious
// problem with OOM
byte[] newChunkcache = new byte[needed];
System.arraycopy(chunkcache, 0, newChunkcache, 0, Math.min(
@ -264,15 +264,20 @@ public class kelondroRowCollection {
return b;
}
/**
 * Returns the row stored at the given position of the chunk cache.
 * Returns null if the chunk cache or the row definition is not set, if the
 * index is not below chunkcount, or if the row would extend past the end of
 * the chunk cache.
 *
 * @param index row number; multiplied by rowdef.objectsize to get the byte offset
 * @param clone passed through unchanged to rowdef.newEntry
 * @return the entry produced by rowdef.newEntry, or null as described above
 */
public synchronized final kelondroRow.Entry get(int index, boolean clone) {
public final kelondroRow.Entry get(int index, boolean clone) {
assert (index >= 0) : "get: access with index " + index + " is below zero";
assert (index < chunkcount) : "get: access with index " + index + " is above chunkcount " + chunkcount + "; sortBound = " + sortBound;
// NOTE(review): this assert reads rowdef and chunkcache before the null check
// on the next line, so with assertions enabled the shutdown case would throw
// instead of returning null - confirm whether that is intended
assert (index * rowdef.objectsize < chunkcache.length);
if ((chunkcache == null) || (rowdef == null)) return null; // case may appear during shutdown
if (index >= chunkcount) return null;
if ((index + 1) * rowdef.objectsize > chunkcache.length) return null; // the whole chunk does not fit into the chunkcache
kelondroRow.Entry entry;
// byte offset of the row inside chunkcache
int addr = index * rowdef.objectsize;
// the bounds checks and the entry creation run under the object lock
synchronized (this) {
if (index >= chunkcount) return null;
if (addr + rowdef.objectsize > chunkcache.length) return null; // the whole chunk does not fit into the chunkcache
entry = rowdef.newEntry(chunkcache, addr, clone);
}
// the read timestamp is written outside the synchronized (this) block above
this.lastTimeRead = System.currentTimeMillis();
return rowdef.newEntry(chunkcache, index * rowdef.objectsize, clone);
return entry;
}
public synchronized final void set(int index, kelondroRow.Entry a) {

@ -611,11 +611,9 @@ public final class plasmaCondenser {
}
public static StringBuffer trim(StringBuffer sb) {
synchronized (sb) {
while ((sb.length() > 0) && (sb.charAt(0) <= ' ')) sb = sb.deleteCharAt(0);
while ((sb.length() > 0) && (sb.charAt(sb.length() - 1) <= ' ')) sb = sb.deleteCharAt(sb.length() - 1);
}
private static StringBuffer trim(StringBuffer sb) {
while ((sb.length() > 0) && (sb.charAt(0) <= ' ')) sb = sb.deleteCharAt(0);
while ((sb.length() > 0) && (sb.charAt(sb.length() - 1) <= ' ')) sb = sb.deleteCharAt(sb.length() - 1);
return sb;
}

@ -246,12 +246,13 @@ public final class plasmaSearchRankingProcess {
// - root-domain guessing to prefer the root domain over other urls if search word appears in domain name
private synchronized kelondroSortStack<indexRWIVarEntry>.stackElement bestRWI(boolean skipDoubleDom) {
// returns from the current RWI list the best entry and removed this entry from the list
private kelondroSortStack<indexRWIVarEntry>.stackElement bestRWI(boolean skipDoubleDom) {
// returns from the current RWI list the best entry and removes this entry from the list
kelondroSortStack<indexRWIVarEntry> m;
kelondroSortStack<indexRWIVarEntry>.stackElement rwi;
while (stack.size() > 0) {
rwi = stack.pop();
if (rwi == null) continue; // in case that a synchronization problem occurred just go lazy over it
if (!skipDoubleDom) return rwi;
// check doubledom
String domhash = rwi.element.urlHash().substring(6);
@ -272,6 +273,7 @@ public final class plasmaSearchRankingProcess {
kelondroSortStack<indexRWIVarEntry>.stackElement o;
while (i.hasNext()) {
m = i.next();
if (m == null) continue;
if (m.size() == 0) continue;
if (bestEntry == null) {
bestEntry = m.top();
@ -293,7 +295,6 @@ public final class plasmaSearchRankingProcess {
public indexURLReference bestURL(boolean skipDoubleDom) {
// returns from the current RWI list the best URL entry and removes this entry from the list
while ((stack.size() > 0) || (size() > 0)) {
synchronized (this) {
if (((stack.size() == 0) && (size() == 0))) break;
kelondroSortStack<indexRWIVarEntry>.stackElement obrwi = bestRWI(skipDoubleDom);
indexURLReference u = wordIndex.getURL(obrwi.element.urlHash(), obrwi.element, obrwi.weight.longValue());
@ -303,12 +304,11 @@ public final class plasmaSearchRankingProcess {
return u;
}
misses.add(obrwi.element.urlHash());
}
}
return null;
}
public synchronized int size() {
public int size() {
//assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size();
int c = stack.size();
Iterator<kelondroSortStack<indexRWIVarEntry>> i = this.doubleDomCache.values().iterator();

@ -366,16 +366,16 @@ public class plasmaSnippetCache {
Set<String> remainingHashes = (tsr == null) ? queryhashes : (Set<String>) tsr[1];
// compute snippet from media
String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
String videoline = computeMediaSnippet(document.getVideolinks(), queryhashes);
String appline = computeMediaSnippet(document.getApplinks(), queryhashes);
//String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
//String videoline = computeMediaSnippet(document.getVideolinks(), queryhashes);
//String appline = computeMediaSnippet(document.getApplinks(), queryhashes);
//String hrefline = computeMediaSnippet(document.getAnchors(), queryhashes);
//String imageline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
line = "";
if (audioline != null) line += (line.length() == 0) ? audioline : "<br />" + audioline;
if (videoline != null) line += (line.length() == 0) ? videoline : "<br />" + videoline;
if (appline != null) line += (line.length() == 0) ? appline : "<br />" + appline;
//if (audioline != null) line += (line.length() == 0) ? audioline : "<br />" + audioline;
//if (videoline != null) line += (line.length() == 0) ? videoline : "<br />" + videoline;
//if (appline != null) line += (line.length() == 0) ? appline : "<br />" + appline;
//if (hrefline != null) line += (line.length() == 0) ? hrefline : "<br />" + hrefline;
if (textline != null) line += (line.length() == 0) ? textline : "<br />" + textline;
@ -494,6 +494,7 @@ public class plasmaSnippetCache {
return snippetsCache.get(key);
}
/*
private static String computeMediaSnippet(Map<yacyURL, String> media, Set<String> queryhashes) {
Iterator<Map.Entry<yacyURL, String>> i = media.entrySet().iterator();
Map.Entry<yacyURL, String> entry;
@ -519,6 +520,7 @@ public class plasmaSnippetCache {
if (result.length() == 0) return null;
return result.substring(6);
}
*/
@SuppressWarnings("unchecked")
private static Object[] /*{String - the snippet, Set - remaining hashes}*/

@ -904,8 +904,14 @@ public class yacyURL {
((this.port == other.port )));
}
/**
 * Hash code computation for yacyURL; do not confuse this with the YaCy hash.
 * This hash is only used by hashing data structures such as a HashMap.
 * The YaCy hash is not used here because computing it needs a DNS
 * resolution, which is very time-intensive.
 */
public int hashCode() {
return this.hash().hashCode();
return this.toNormalform(true, false).hashCode();
}
public int compareTo(Object h) {

Loading…
Cancel
Save