From ff755fb8589c2db6d8c722698f69a0e7409e545e Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 24 Apr 2008 13:31:55 +0000 Subject: [PATCH] small corrections and enhancements after search timing profiling search should be a little bit faster now git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4734 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../htmlFilter/htmlFilterContentScraper.java | 2 +- .../htmlFilter/htmlFilterInputStream.java | 4 ++-- source/de/anomic/http/httpHeader.java | 4 ++-- .../de/anomic/index/indexRWIEntryOrder.java | 8 +++---- .../de/anomic/kelondro/kelondroEcoTable.java | 8 +++---- .../kelondro/kelondroRowCollection.java | 16 ++++++------- source/de/anomic/kelondro/kelondroRowSet.java | 8 +++---- .../de/anomic/plasma/plasmaSearchEvent.java | 11 ++++----- .../plasma/plasmaSearchRankingProcess.java | 23 +++++++++++-------- source/de/anomic/plasma/plasmaWordIndex.java | 2 +- 10 files changed, 44 insertions(+), 42 deletions(-) diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index ff4c6931e..e07ebb812 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -70,7 +70,7 @@ import de.anomic.yacy.yacyURL; public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper { - // statics: for initialisation of the HTMLFilterAbstractScraper + // statics: for initialization of the HTMLFilterAbstractScraper private static TreeSet linkTags0; private static TreeSet linkTags1; diff --git a/source/de/anomic/htmlFilter/htmlFilterInputStream.java b/source/de/anomic/htmlFilter/htmlFilterInputStream.java index f7df006e5..dd1f06784 100644 --- a/source/de/anomic/htmlFilter/htmlFilterInputStream.java +++ b/source/de/anomic/htmlFilter/htmlFilterInputStream.java @@ -44,7 +44,7 @@ public class htmlFilterInputStream extends InputStream implements htmlFilterEven private static final int MODE_PRESCAN_FINISHED = 1; private int mode = 1; - private long preBufferSize = 143336; + private long preBufferSize = 2048; private long preRead = 0; private BufferedInputStream bufferedIn; @@ -81,7 +81,7 @@ public class htmlFilterInputStream extends InputStream implements htmlFilterEven String value = tagopts.getProperty("http-equiv"); if (value.equalsIgnoreCase("Content-Type")) { String contentType = tagopts.getProperty("content",""); - this.detectedCharset = httpHeader.extractCharsetFromMimetyeHeader(contentType); + this.detectedCharset = httpHeader.extractCharsetFromMimetypeHeader(contentType); if (this.detectedCharset != null && this.detectedCharset.length() > 0) { this.charsetChanged = true; } else if (tagopts.containsKey("charset")) { diff --git a/source/de/anomic/http/httpHeader.java b/source/de/anomic/http/httpHeader.java index 47f058c28..8c7484988 100644 --- a/source/de/anomic/http/httpHeader.java +++ b/source/de/anomic/http/httpHeader.java @@ -400,10 +400,10 @@ public final class httpHeader extends TreeMap implements Map 0) { - System.out.println("?" + new String(get(i+1).getColBytes(0))); + System.out.println("?" + new String(get(i + 1, false).getColBytes(0))); return false; } } @@ -915,7 +915,7 @@ public class kelondroRowCollection { System.out.println("create c : " + (t1 - t0) + " nanoseconds, " + d(testsize, (t1 - t0)) + " entries/nanoseconds"); kelondroRowCollection d = new kelondroRowCollection(r, testsize); for (int i = 0; i < testsize; i++) { - d.add(c.get(i).getColBytes(0)); + d.add(c.get(i, false).getColBytes(0)); } long t2 = System.nanoTime(); System.out.println("copy c -> d: " + (t2 - t1) + " nanoseconds, " + d(testsize, (t2 - t1)) + " entries/nanoseconds"); diff --git a/source/de/anomic/kelondro/kelondroRowSet.java b/source/de/anomic/kelondro/kelondroRowSet.java index 600428af0..5655eb4ca 100644 --- a/source/de/anomic/kelondro/kelondroRowSet.java +++ b/source/de/anomic/kelondro/kelondroRowSet.java @@ -119,7 +119,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd private kelondroRow.Entry get(byte[] key, int astart, int alength) { long handle = profile.startRead(); int index = find(key, astart, alength); - kelondroRow.Entry entry = (index >= 0) ? get(index) : null; + kelondroRow.Entry entry = (index >= 0) ? get(index, true) : null; profile.stopRead(handle); return entry; } @@ -148,7 +148,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd if (index < 0) { super.addUnique(entry); } else { - oldentry = get(index); + oldentry = get(index, true); set(index, entry); } profile.stopWrite(handle); @@ -159,7 +159,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd int index = find(a, start, length); if (index < 0) return null; //System.out.println("remove: chunk found at index position (before remove) " + index + ", inset=" + serverLog.arrayList(super.chunkcache, super.rowdef.objectsize() * index, length + 10) + ", searchkey=" + serverLog.arrayList(a, start, length)); - kelondroRow.Entry entry = super.get(index); + kelondroRow.Entry entry = super.get(index, true); super.removeRow(index, keepOrder); //System.out.println("remove: chunk found at index position (after remove) " + index + ", inset=" + serverLog.arrayList(super.chunkcache, super.rowdef.objectsize() * index, length) + ", searchkey=" + serverLog.arrayList(a, start, length)); int findagainindex = find(a, start, length); @@ -381,7 +381,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd } public kelondroRow.Entry next() { - kelondroRow.Entry entry = get(p); + kelondroRow.Entry entry = get(p, true); if (up) p++; else p--; return entry; } diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 272e8ab18..0c085faab 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -488,19 +488,16 @@ public final class plasmaSearchEvent { event.eventTime = System.currentTimeMillis(); // start worker threads to fetch urls and snippets event.workerThreads = new resultWorker[workerThreadCount]; + resultWorker worker; for (int i = 0; i < workerThreadCount; i++) { - event.workerThreads[i] = event.deployWorker(i, 10000); + worker = event.new resultWorker(i, 10000); + worker.start(); + event.workerThreads[i] = worker; } } return event; } - - private resultWorker deployWorker(int id, long lifetime) { - resultWorker worker = new resultWorker(id, lifetime); - worker.start(); - return worker; - } private class resultWorker extends Thread { diff --git a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java index bad0fcb43..0bb9f8e19 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java @@ -54,6 +54,7 @@ import de.anomic.yacy.yacyURL; public final class plasmaSearchRankingProcess { public static kelondroBinSearch[] ybrTables = null; // block-rank tables + public static final int maxYBR = 3; // the lower this value, the faster the search private static boolean useYBR = true; private kelondroSortStack stack; @@ -289,17 +290,20 @@ public final class plasmaSearchRankingProcess { return bestEntry; } - public synchronized indexURLReference bestURL(boolean skipDoubleDom) { + public indexURLReference bestURL(boolean skipDoubleDom) { // returns from the current RWI list the best URL entry and removed this entry from the list while ((stack.size() > 0) || (size() > 0)) { - kelondroSortStack.stackElement obrwi = bestRWI(skipDoubleDom); - indexURLReference u = wordIndex.getURL(obrwi.element.urlHash(), obrwi.element, obrwi.weight.longValue()); - if (u != null) { - indexURLReference.Components comp = u.comp(); - if (comp.url() != null) this.handover.put(u.hash(), comp.url().toNormalform(true, false)); // remember that we handed over this url - return u; + synchronized (this) { + if (((stack.size() == 0) && (size() == 0))) break; + kelondroSortStack.stackElement obrwi = bestRWI(skipDoubleDom); + indexURLReference u = wordIndex.getURL(obrwi.element.urlHash(), obrwi.element, obrwi.weight.longValue()); + if (u != null) { + indexURLReference.Components comp = u.comp(); + if (comp.url() != null) this.handover.put(u.hash(), comp.url().toNormalform(true, false)); // remember that we handed over this url + return u; + } + misses.add(obrwi.element.urlHash()); } - misses.add(obrwi.element.urlHash()); } return null; } @@ -432,7 +436,8 @@ public final class plasmaSearchRankingProcess { if (ybrTables == null) return 15; if (!(useYBR)) return 15; final String domHash = urlHash.substring(6); - for (int i = 0; i < ybrTables.length; i++) { + int m = Math.min(maxYBR, ybrTables.length); + for (int i = 0; i < m; i++) { if ((ybrTables[i] != null) && (ybrTables[i].contains(domHash.getBytes()))) { //System.out.println("YBR FOUND: " + urlHash + " (" + i + ")"); return i; diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index bd6ef4bff..9436158a4 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -424,7 +424,7 @@ public final class plasmaWordIndex implements indexRI { indexRWIRowEntry e, elm = null; long lm = 0; for (int j = 0; j < set.size(); j++) { - e = new indexRWIRowEntry(set.get(j)); + e = new indexRWIRowEntry(set.get(j, true)); if ((elm == null) || (e.lastModified() > lm)) { elm = e; lm = e.lastModified();