Small corrections and enhancements after profiling search timing

Search should now be a little bit faster.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4734 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent 0702dd2507
commit ff755fb858

@ -70,7 +70,7 @@ import de.anomic.yacy.yacyURL;
public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper { public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {
// statics: for initialisation of the HTMLFilterAbstractScraper // statics: for initialization of the HTMLFilterAbstractScraper
private static TreeSet<String> linkTags0; private static TreeSet<String> linkTags0;
private static TreeSet<String> linkTags1; private static TreeSet<String> linkTags1;

@ -44,7 +44,7 @@ public class htmlFilterInputStream extends InputStream implements htmlFilterEven
private static final int MODE_PRESCAN_FINISHED = 1; private static final int MODE_PRESCAN_FINISHED = 1;
private int mode = 1; private int mode = 1;
private long preBufferSize = 143336; private long preBufferSize = 2048;
private long preRead = 0; private long preRead = 0;
private BufferedInputStream bufferedIn; private BufferedInputStream bufferedIn;
@ -81,7 +81,7 @@ public class htmlFilterInputStream extends InputStream implements htmlFilterEven
String value = tagopts.getProperty("http-equiv"); String value = tagopts.getProperty("http-equiv");
if (value.equalsIgnoreCase("Content-Type")) { if (value.equalsIgnoreCase("Content-Type")) {
String contentType = tagopts.getProperty("content",""); String contentType = tagopts.getProperty("content","");
this.detectedCharset = httpHeader.extractCharsetFromMimetyeHeader(contentType); this.detectedCharset = httpHeader.extractCharsetFromMimetypeHeader(contentType);
if (this.detectedCharset != null && this.detectedCharset.length() > 0) { if (this.detectedCharset != null && this.detectedCharset.length() > 0) {
this.charsetChanged = true; this.charsetChanged = true;
} else if (tagopts.containsKey("charset")) { } else if (tagopts.containsKey("charset")) {

@ -400,10 +400,10 @@ public final class httpHeader extends TreeMap<String, String> implements Map<Str
public String getCharacterEncoding() { public String getCharacterEncoding() {
String mimeType = mime(); String mimeType = mime();
return extractCharsetFromMimetyeHeader(mimeType); return extractCharsetFromMimetypeHeader(mimeType);
} }
public static String extractCharsetFromMimetyeHeader(String mimeType) { public static String extractCharsetFromMimetypeHeader(String mimeType) {
if (mimeType == null) return null; if (mimeType == null) return null;
String[] parts = mimeType.split(";"); String[] parts = mimeType.split(";");

@ -82,7 +82,7 @@ public class indexRWIEntryOrder {
entry = di.next(); entry = di.next();
this.doms.addScore(entry.getKey(), ((Integer) entry.getValue()).intValue()); this.doms.addScore(entry.getKey(), ((Integer) entry.getValue()).intValue());
} }
result = mmf0.decodedEntries; result = mmf0.decodedContainer();
result.addAll(mmf1.decodedContainer()); result.addAll(mmf1.decodedContainer());
//long s1= System.currentTimeMillis(), sc = Math.max(1, s1 - s0); //long s1= System.currentTimeMillis(), sc = Math.max(1, s1 - s0);
//System.out.println("***DEBUG*** indexRWIEntry.Order (2-THREADED): " + sc + " milliseconds for " + container.size() + " entries, " + (container.size() / sc) + " entries/millisecond"); //System.out.println("***DEBUG*** indexRWIEntry.Order (2-THREADED): " + sc + " milliseconds for " + container.size() + " entries, " + (container.size() / sc) + " entries/millisecond");
@ -118,7 +118,7 @@ public class indexRWIEntryOrder {
//System.out.println("tf(" + t.urlHash + ") = " + Math.floor(1000 * t.termFrequency()) + ", min = " + Math.floor(1000 * min.termFrequency()) + ", max = " + Math.floor(1000 * max.termFrequency()) + ", tf-normed = " + tf); //System.out.println("tf(" + t.urlHash + ") = " + Math.floor(1000 * t.termFrequency()) + ", min = " + Math.floor(1000 * min.termFrequency()) + ", max = " + Math.floor(1000 * max.termFrequency()) + ", tf-normed = " + tf);
long r = long r =
((256 - yacyURL.domLengthNormalized(t.urlHash())) << ranking.coeff_domlength) ((256 - yacyURL.domLengthNormalized(t.urlHash())) << ranking.coeff_domlength)
+ ((256 - (plasmaSearchRankingProcess.ybr(t.urlHash()) << 4)) << ranking.coeff_ybr) + ((ranking.coeff_ybr > 12) ? ((256 - (plasmaSearchRankingProcess.ybr(t.urlHash()) << 4)) << ranking.coeff_ybr) : 0)
+ ((max.urlcomps() == min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - min.urlcomps() ) << 8) / (max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps) + ((max.urlcomps() == min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - min.urlcomps() ) << 8) / (max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps)
+ ((max.urllength() == min.urllength() ) ? 0 : (256 - (((t.urllength() - min.urllength() ) << 8) / (max.urllength() - min.urllength()) )) << ranking.coeff_urllength) + ((max.urllength() == min.urllength() ) ? 0 : (256 - (((t.urllength() - min.urllength() ) << 8) / (max.urllength() - min.urllength()) )) << ranking.coeff_urllength)
+ ((max.posintext() == min.posintext() ) ? 0 : (256 - (((t.posintext() - min.posintext() ) << 8) / (max.posintext() - min.posintext()) )) << ranking.coeff_posintext) + ((max.posintext() == min.posintext() ) ? 0 : (256 - (((t.posintext() - min.posintext() ) << 8) / (max.posintext() - min.posintext()) )) << ranking.coeff_posintext)
@ -133,7 +133,7 @@ public class indexRWIEntryOrder {
+ ((max.lother() == min.lother()) ? 0 : (((t.lother() - min.lother() ) << 8) / (max.lother() - min.lother()) ) << ranking.coeff_lother) + ((max.lother() == min.lother()) ? 0 : (((t.lother() - min.lother() ) << 8) / (max.lother() - min.lother()) ) << ranking.coeff_lother)
+ ((max.hitcount() == min.hitcount()) ? 0 : (((t.hitcount() - min.hitcount() ) << 8) / (max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount) + ((max.hitcount() == min.hitcount()) ? 0 : (((t.hitcount() - min.hitcount() ) << 8) / (max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount)
+ tf + tf
+ (authority(t.urlHash()) << ranking.coeff_authority) + ((ranking.coeff_authority > 12) ? (authority(t.urlHash()) << ranking.coeff_authority) : 0)
+ (((flags.get(indexRWIEntry.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0)) + (((flags.get(indexRWIEntry.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0)) + (((flags.get(indexRWIEntry.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator : 0)) + (((flags.get(indexRWIEntry.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator : 0))
@ -179,7 +179,7 @@ public class indexRWIEntryOrder {
String dom; String dom;
Integer count; Integer count;
while (p < this.end) { while (p < this.end) {
iEntry = new indexRWIVarEntry(new indexRWIRowEntry(container.get(p++))); iEntry = new indexRWIVarEntry(new indexRWIRowEntry(container.get(p++, false)));
this.decodedEntries.add(iEntry); this.decodedEntries.add(iEntry);
// find min/max // find min/max
if (this.entryMin == null) this.entryMin = iEntry.clone(); else this.entryMin.min(iEntry); if (this.entryMin == null) this.entryMin = iEntry.clone(); else this.entryMin.min(iEntry);

@ -336,7 +336,7 @@ public class kelondroEcoTable implements kelondroIndex {
file.get(i, b, 0); file.get(i, b, 0);
} else { } else {
// construct the row using the copy in RAM // construct the row using the copy in RAM
kelondroRow.Entry v = table.get(i); kelondroRow.Entry v = table.get(i, false);
assert v != null; assert v != null;
if (v == null) return null; if (v == null) return null;
assert key.length == rowdef.primaryKeyLength; assert key.length == rowdef.primaryKeyLength;
@ -382,7 +382,7 @@ public class kelondroEcoTable implements kelondroIndex {
file.put(i, row.bytes(), 0); file.put(i, row.bytes(), 0);
} else { } else {
// read old value // read old value
kelondroRow.Entry v = table.get(i); kelondroRow.Entry v = table.get(i, false);
assert v != null; assert v != null;
System.arraycopy(row.getPrimaryKeyBytes(), 0, b, 0, rowdef.primaryKeyLength); System.arraycopy(row.getPrimaryKeyBytes(), 0, b, 0, rowdef.primaryKeyLength);
System.arraycopy(v.bytes(), 0, b, rowdef.primaryKeyLength, rowdef.objectsize - rowdef.primaryKeyLength); System.arraycopy(v.bytes(), 0, b, rowdef.primaryKeyLength, rowdef.objectsize - rowdef.primaryKeyLength);
@ -474,7 +474,7 @@ public class kelondroEcoTable implements kelondroIndex {
assert (file.size() == index.size() + fail); assert (file.size() == index.size() + fail);
} else { } else {
// get result value from the table copy, so we don't need to read it from the file // get result value from the table copy, so we don't need to read it from the file
kelondroRow.Entry v = table.get(i); kelondroRow.Entry v = table.get(i, false);
System.arraycopy(key, 0, b, 0, key.length); System.arraycopy(key, 0, b, 0, key.length);
System.arraycopy(v.bytes(), 0, b, rowdef.primaryKeyLength, taildef.objectsize); System.arraycopy(v.bytes(), 0, b, rowdef.primaryKeyLength, taildef.objectsize);
@ -610,7 +610,7 @@ public class kelondroEcoTable implements kelondroIndex {
} }
} else { } else {
// compose from table and key // compose from table and key
kelondroRow.Entry v = table.get(this.c); kelondroRow.Entry v = table.get(this.c, false);
assert v != null; assert v != null;
if (v == null) return null; if (v == null) return null;
System.arraycopy(k, 0, b, 0, rowdef.primaryKeyLength); System.arraycopy(k, 0, b, 0, rowdef.primaryKeyLength);

@ -264,7 +264,7 @@ public class kelondroRowCollection {
return b; return b;
} }
public synchronized final kelondroRow.Entry get(int index) { public synchronized final kelondroRow.Entry get(int index, boolean clone) {
assert (index >= 0) : "get: access with index " + index + " is below zero"; assert (index >= 0) : "get: access with index " + index + " is below zero";
assert (index < chunkcount) : "get: access with index " + index + " is above chunkcount " + chunkcount + "; sortBound = " + sortBound; assert (index < chunkcount) : "get: access with index " + index + " is above chunkcount " + chunkcount + "; sortBound = " + sortBound;
assert (index * rowdef.objectsize < chunkcache.length); assert (index * rowdef.objectsize < chunkcache.length);
@ -272,7 +272,7 @@ public class kelondroRowCollection {
if (index >= chunkcount) return null; if (index >= chunkcount) return null;
if ((index + 1) * rowdef.objectsize > chunkcache.length) return null; // the whole chunk does not fit into the chunkcache if ((index + 1) * rowdef.objectsize > chunkcache.length) return null; // the whole chunk does not fit into the chunkcache
this.lastTimeRead = System.currentTimeMillis(); this.lastTimeRead = System.currentTimeMillis();
return rowdef.newEntry(chunkcache, index * rowdef.objectsize, true); return rowdef.newEntry(chunkcache, index * rowdef.objectsize, clone);
} }
public synchronized final void set(int index, kelondroRow.Entry a) { public synchronized final void set(int index, kelondroRow.Entry a) {
@ -395,7 +395,7 @@ public class kelondroRowCollection {
public synchronized kelondroRow.Entry removeOne() { public synchronized kelondroRow.Entry removeOne() {
// removes the last entry from the collection // removes the last entry from the collection
if (chunkcount == 0) return null; if (chunkcount == 0) return null;
kelondroRow.Entry r = get(chunkcount - 1); kelondroRow.Entry r = get(chunkcount - 1, true);
if (chunkcount == sortBound) sortBound--; if (chunkcount == sortBound) sortBound--;
chunkcount--; chunkcount--;
this.lastTimeWrote = System.currentTimeMillis(); this.lastTimeWrote = System.currentTimeMillis();
@ -471,7 +471,7 @@ public class kelondroRowCollection {
} }
public kelondroRow.Entry next() { public kelondroRow.Entry next() {
return get(p++); return get(p++, true);
} }
public void remove() { public void remove() {
@ -747,13 +747,13 @@ public class kelondroRowCollection {
try { try {
while (i >= 0) { while (i >= 0) {
if (compare(i, i + 1) == 0) { if (compare(i, i + 1) == 0) {
collection.addUnique(get(i + 1)); collection.addUnique(get(i + 1, false));
removeRow(i + 1, false); removeRow(i + 1, false);
d++; d++;
if (i + 1 < chunkcount - 1) u = false; if (i + 1 < chunkcount - 1) u = false;
} else if (collection.size() > 0) { } else if (collection.size() > 0) {
// finish collection of double occurrences // finish collection of double occurrences
collection.addUnique(get(i + 1)); collection.addUnique(get(i + 1, false));
removeRow(i + 1, false); removeRow(i + 1, false);
d++; d++;
if (i + 1 < chunkcount - 1) u = false; if (i + 1 < chunkcount - 1) u = false;
@ -778,7 +778,7 @@ public class kelondroRowCollection {
for (int i = 0; i < chunkcount - 1; i++) { for (int i = 0; i < chunkcount - 1; i++) {
//System.out.println("*" + new String(get(i).getColBytes(0))); //System.out.println("*" + new String(get(i).getColBytes(0)));
if (compare(i, i + 1) > 0) { if (compare(i, i + 1) > 0) {
System.out.println("?" + new String(get(i+1).getColBytes(0))); System.out.println("?" + new String(get(i + 1, false).getColBytes(0)));
return false; return false;
} }
} }
@ -915,7 +915,7 @@ public class kelondroRowCollection {
System.out.println("create c : " + (t1 - t0) + " nanoseconds, " + d(testsize, (t1 - t0)) + " entries/nanoseconds"); System.out.println("create c : " + (t1 - t0) + " nanoseconds, " + d(testsize, (t1 - t0)) + " entries/nanoseconds");
kelondroRowCollection d = new kelondroRowCollection(r, testsize); kelondroRowCollection d = new kelondroRowCollection(r, testsize);
for (int i = 0; i < testsize; i++) { for (int i = 0; i < testsize; i++) {
d.add(c.get(i).getColBytes(0)); d.add(c.get(i, false).getColBytes(0));
} }
long t2 = System.nanoTime(); long t2 = System.nanoTime();
System.out.println("copy c -> d: " + (t2 - t1) + " nanoseconds, " + d(testsize, (t2 - t1)) + " entries/nanoseconds"); System.out.println("copy c -> d: " + (t2 - t1) + " nanoseconds, " + d(testsize, (t2 - t1)) + " entries/nanoseconds");

@ -119,7 +119,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
private kelondroRow.Entry get(byte[] key, int astart, int alength) { private kelondroRow.Entry get(byte[] key, int astart, int alength) {
long handle = profile.startRead(); long handle = profile.startRead();
int index = find(key, astart, alength); int index = find(key, astart, alength);
kelondroRow.Entry entry = (index >= 0) ? get(index) : null; kelondroRow.Entry entry = (index >= 0) ? get(index, true) : null;
profile.stopRead(handle); profile.stopRead(handle);
return entry; return entry;
} }
@ -148,7 +148,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
if (index < 0) { if (index < 0) {
super.addUnique(entry); super.addUnique(entry);
} else { } else {
oldentry = get(index); oldentry = get(index, true);
set(index, entry); set(index, entry);
} }
profile.stopWrite(handle); profile.stopWrite(handle);
@ -159,7 +159,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
int index = find(a, start, length); int index = find(a, start, length);
if (index < 0) return null; if (index < 0) return null;
//System.out.println("remove: chunk found at index position (before remove) " + index + ", inset=" + serverLog.arrayList(super.chunkcache, super.rowdef.objectsize() * index, length + 10) + ", searchkey=" + serverLog.arrayList(a, start, length)); //System.out.println("remove: chunk found at index position (before remove) " + index + ", inset=" + serverLog.arrayList(super.chunkcache, super.rowdef.objectsize() * index, length + 10) + ", searchkey=" + serverLog.arrayList(a, start, length));
kelondroRow.Entry entry = super.get(index); kelondroRow.Entry entry = super.get(index, true);
super.removeRow(index, keepOrder); super.removeRow(index, keepOrder);
//System.out.println("remove: chunk found at index position (after remove) " + index + ", inset=" + serverLog.arrayList(super.chunkcache, super.rowdef.objectsize() * index, length) + ", searchkey=" + serverLog.arrayList(a, start, length)); //System.out.println("remove: chunk found at index position (after remove) " + index + ", inset=" + serverLog.arrayList(super.chunkcache, super.rowdef.objectsize() * index, length) + ", searchkey=" + serverLog.arrayList(a, start, length));
int findagainindex = find(a, start, length); int findagainindex = find(a, start, length);
@ -381,7 +381,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
} }
public kelondroRow.Entry next() { public kelondroRow.Entry next() {
kelondroRow.Entry entry = get(p); kelondroRow.Entry entry = get(p, true);
if (up) p++; else p--; if (up) p++; else p--;
return entry; return entry;
} }

@ -488,19 +488,16 @@ public final class plasmaSearchEvent {
event.eventTime = System.currentTimeMillis(); event.eventTime = System.currentTimeMillis();
// start worker threads to fetch urls and snippets // start worker threads to fetch urls and snippets
event.workerThreads = new resultWorker[workerThreadCount]; event.workerThreads = new resultWorker[workerThreadCount];
resultWorker worker;
for (int i = 0; i < workerThreadCount; i++) { for (int i = 0; i < workerThreadCount; i++) {
event.workerThreads[i] = event.deployWorker(i, 10000); worker = event.new resultWorker(i, 10000);
worker.start();
event.workerThreads[i] = worker;
} }
} }
return event; return event;
} }
private resultWorker deployWorker(int id, long lifetime) {
resultWorker worker = new resultWorker(id, lifetime);
worker.start();
return worker;
}
private class resultWorker extends Thread { private class resultWorker extends Thread {

@ -54,6 +54,7 @@ import de.anomic.yacy.yacyURL;
public final class plasmaSearchRankingProcess { public final class plasmaSearchRankingProcess {
public static kelondroBinSearch[] ybrTables = null; // block-rank tables public static kelondroBinSearch[] ybrTables = null; // block-rank tables
public static final int maxYBR = 3; // the lower this value, the faster the search
private static boolean useYBR = true; private static boolean useYBR = true;
private kelondroSortStack<indexRWIVarEntry> stack; private kelondroSortStack<indexRWIVarEntry> stack;
@ -289,17 +290,20 @@ public final class plasmaSearchRankingProcess {
return bestEntry; return bestEntry;
} }
public synchronized indexURLReference bestURL(boolean skipDoubleDom) { public indexURLReference bestURL(boolean skipDoubleDom) {
// returns the best URL entry from the current RWI list and removes this entry from the list // returns the best URL entry from the current RWI list and removes this entry from the list
while ((stack.size() > 0) || (size() > 0)) { while ((stack.size() > 0) || (size() > 0)) {
kelondroSortStack<indexRWIVarEntry>.stackElement obrwi = bestRWI(skipDoubleDom); synchronized (this) {
indexURLReference u = wordIndex.getURL(obrwi.element.urlHash(), obrwi.element, obrwi.weight.longValue()); if (((stack.size() == 0) && (size() == 0))) break;
if (u != null) { kelondroSortStack<indexRWIVarEntry>.stackElement obrwi = bestRWI(skipDoubleDom);
indexURLReference.Components comp = u.comp(); indexURLReference u = wordIndex.getURL(obrwi.element.urlHash(), obrwi.element, obrwi.weight.longValue());
if (comp.url() != null) this.handover.put(u.hash(), comp.url().toNormalform(true, false)); // remember that we handed over this url if (u != null) {
return u; indexURLReference.Components comp = u.comp();
if (comp.url() != null) this.handover.put(u.hash(), comp.url().toNormalform(true, false)); // remember that we handed over this url
return u;
}
misses.add(obrwi.element.urlHash());
} }
misses.add(obrwi.element.urlHash());
} }
return null; return null;
} }
@ -432,7 +436,8 @@ public final class plasmaSearchRankingProcess {
if (ybrTables == null) return 15; if (ybrTables == null) return 15;
if (!(useYBR)) return 15; if (!(useYBR)) return 15;
final String domHash = urlHash.substring(6); final String domHash = urlHash.substring(6);
for (int i = 0; i < ybrTables.length; i++) { int m = Math.min(maxYBR, ybrTables.length);
for (int i = 0; i < m; i++) {
if ((ybrTables[i] != null) && (ybrTables[i].contains(domHash.getBytes()))) { if ((ybrTables[i] != null) && (ybrTables[i].contains(domHash.getBytes()))) {
//System.out.println("YBR FOUND: " + urlHash + " (" + i + ")"); //System.out.println("YBR FOUND: " + urlHash + " (" + i + ")");
return i; return i;

@ -424,7 +424,7 @@ public final class plasmaWordIndex implements indexRI {
indexRWIRowEntry e, elm = null; indexRWIRowEntry e, elm = null;
long lm = 0; long lm = 0;
for (int j = 0; j < set.size(); j++) { for (int j = 0; j < set.size(); j++) {
e = new indexRWIRowEntry(set.get(j)); e = new indexRWIRowEntry(set.get(j, true));
if ((elm == null) || (e.lastModified() > lm)) { if ((elm == null) || (e.lastModified() > lm)) {
elm = e; elm = e;
lm = e.lastModified(); lm = e.lastModified();

Loading…
Cancel
Save