small corrections and enhancements after search timing profiling

search should be a little bit faster now

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4734 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent 0702dd2507
commit ff755fb858

@ -70,7 +70,7 @@ import de.anomic.yacy.yacyURL;
public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {
// statics: for initialisation of the HTMLFilterAbstractScraper
// statics: for initialization of the HTMLFilterAbstractScraper
private static TreeSet<String> linkTags0;
private static TreeSet<String> linkTags1;

@ -44,7 +44,7 @@ public class htmlFilterInputStream extends InputStream implements htmlFilterEven
private static final int MODE_PRESCAN_FINISHED = 1;
private int mode = 1;
private long preBufferSize = 143336;
private long preBufferSize = 2048;
private long preRead = 0;
private BufferedInputStream bufferedIn;
@ -81,7 +81,7 @@ public class htmlFilterInputStream extends InputStream implements htmlFilterEven
String value = tagopts.getProperty("http-equiv");
if (value.equalsIgnoreCase("Content-Type")) {
String contentType = tagopts.getProperty("content","");
this.detectedCharset = httpHeader.extractCharsetFromMimetyeHeader(contentType);
this.detectedCharset = httpHeader.extractCharsetFromMimetypeHeader(contentType);
if (this.detectedCharset != null && this.detectedCharset.length() > 0) {
this.charsetChanged = true;
} else if (tagopts.containsKey("charset")) {

@ -400,10 +400,10 @@ public final class httpHeader extends TreeMap<String, String> implements Map<Str
public String getCharacterEncoding() {
String mimeType = mime();
return extractCharsetFromMimetyeHeader(mimeType);
return extractCharsetFromMimetypeHeader(mimeType);
}
public static String extractCharsetFromMimetyeHeader(String mimeType) {
public static String extractCharsetFromMimetypeHeader(String mimeType) {
if (mimeType == null) return null;
String[] parts = mimeType.split(";");

@ -82,7 +82,7 @@ public class indexRWIEntryOrder {
entry = di.next();
this.doms.addScore(entry.getKey(), ((Integer) entry.getValue()).intValue());
}
result = mmf0.decodedEntries;
result = mmf0.decodedContainer();
result.addAll(mmf1.decodedContainer());
//long s1= System.currentTimeMillis(), sc = Math.max(1, s1 - s0);
//System.out.println("***DEBUG*** indexRWIEntry.Order (2-THREADED): " + sc + " milliseconds for " + container.size() + " entries, " + (container.size() / sc) + " entries/millisecond");
@ -118,7 +118,7 @@ public class indexRWIEntryOrder {
//System.out.println("tf(" + t.urlHash + ") = " + Math.floor(1000 * t.termFrequency()) + ", min = " + Math.floor(1000 * min.termFrequency()) + ", max = " + Math.floor(1000 * max.termFrequency()) + ", tf-normed = " + tf);
long r =
((256 - yacyURL.domLengthNormalized(t.urlHash())) << ranking.coeff_domlength)
+ ((256 - (plasmaSearchRankingProcess.ybr(t.urlHash()) << 4)) << ranking.coeff_ybr)
+ ((ranking.coeff_ybr > 12) ? ((256 - (plasmaSearchRankingProcess.ybr(t.urlHash()) << 4)) << ranking.coeff_ybr) : 0)
+ ((max.urlcomps() == min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - min.urlcomps() ) << 8) / (max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps)
+ ((max.urllength() == min.urllength() ) ? 0 : (256 - (((t.urllength() - min.urllength() ) << 8) / (max.urllength() - min.urllength()) )) << ranking.coeff_urllength)
+ ((max.posintext() == min.posintext() ) ? 0 : (256 - (((t.posintext() - min.posintext() ) << 8) / (max.posintext() - min.posintext()) )) << ranking.coeff_posintext)
@ -133,7 +133,7 @@ public class indexRWIEntryOrder {
+ ((max.lother() == min.lother()) ? 0 : (((t.lother() - min.lother() ) << 8) / (max.lother() - min.lother()) ) << ranking.coeff_lother)
+ ((max.hitcount() == min.hitcount()) ? 0 : (((t.hitcount() - min.hitcount() ) << 8) / (max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount)
+ tf
+ (authority(t.urlHash()) << ranking.coeff_authority)
+ ((ranking.coeff_authority > 12) ? (authority(t.urlHash()) << ranking.coeff_authority) : 0)
+ (((flags.get(indexRWIEntry.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0))
+ (((flags.get(indexRWIEntry.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator : 0))
@ -179,7 +179,7 @@ public class indexRWIEntryOrder {
String dom;
Integer count;
while (p < this.end) {
iEntry = new indexRWIVarEntry(new indexRWIRowEntry(container.get(p++)));
iEntry = new indexRWIVarEntry(new indexRWIRowEntry(container.get(p++, false)));
this.decodedEntries.add(iEntry);
// find min/max
if (this.entryMin == null) this.entryMin = iEntry.clone(); else this.entryMin.min(iEntry);

@ -336,7 +336,7 @@ public class kelondroEcoTable implements kelondroIndex {
file.get(i, b, 0);
} else {
// construct the row using the copy in RAM
kelondroRow.Entry v = table.get(i);
kelondroRow.Entry v = table.get(i, false);
assert v != null;
if (v == null) return null;
assert key.length == rowdef.primaryKeyLength;
@ -382,7 +382,7 @@ public class kelondroEcoTable implements kelondroIndex {
file.put(i, row.bytes(), 0);
} else {
// read old value
kelondroRow.Entry v = table.get(i);
kelondroRow.Entry v = table.get(i, false);
assert v != null;
System.arraycopy(row.getPrimaryKeyBytes(), 0, b, 0, rowdef.primaryKeyLength);
System.arraycopy(v.bytes(), 0, b, rowdef.primaryKeyLength, rowdef.objectsize - rowdef.primaryKeyLength);
@ -474,7 +474,7 @@ public class kelondroEcoTable implements kelondroIndex {
assert (file.size() == index.size() + fail);
} else {
// get result value from the table copy, so we don't need to read it from the file
kelondroRow.Entry v = table.get(i);
kelondroRow.Entry v = table.get(i, false);
System.arraycopy(key, 0, b, 0, key.length);
System.arraycopy(v.bytes(), 0, b, rowdef.primaryKeyLength, taildef.objectsize);
@ -610,7 +610,7 @@ public class kelondroEcoTable implements kelondroIndex {
}
} else {
// compose from table and key
kelondroRow.Entry v = table.get(this.c);
kelondroRow.Entry v = table.get(this.c, false);
assert v != null;
if (v == null) return null;
System.arraycopy(k, 0, b, 0, rowdef.primaryKeyLength);

@ -264,7 +264,7 @@ public class kelondroRowCollection {
return b;
}
public synchronized final kelondroRow.Entry get(int index) {
public synchronized final kelondroRow.Entry get(int index, boolean clone) {
assert (index >= 0) : "get: access with index " + index + " is below zero";
assert (index < chunkcount) : "get: access with index " + index + " is above chunkcount " + chunkcount + "; sortBound = " + sortBound;
assert (index * rowdef.objectsize < chunkcache.length);
@ -272,7 +272,7 @@ public class kelondroRowCollection {
if (index >= chunkcount) return null;
if ((index + 1) * rowdef.objectsize > chunkcache.length) return null; // the whole chunk does not fit into the chunkcache
this.lastTimeRead = System.currentTimeMillis();
return rowdef.newEntry(chunkcache, index * rowdef.objectsize, true);
return rowdef.newEntry(chunkcache, index * rowdef.objectsize, clone);
}
public synchronized final void set(int index, kelondroRow.Entry a) {
@ -395,7 +395,7 @@ public class kelondroRowCollection {
public synchronized kelondroRow.Entry removeOne() {
// removes the last entry from the collection
if (chunkcount == 0) return null;
kelondroRow.Entry r = get(chunkcount - 1);
kelondroRow.Entry r = get(chunkcount - 1, true);
if (chunkcount == sortBound) sortBound--;
chunkcount--;
this.lastTimeWrote = System.currentTimeMillis();
@ -471,7 +471,7 @@ public class kelondroRowCollection {
}
public kelondroRow.Entry next() {
return get(p++);
return get(p++, true);
}
public void remove() {
@ -747,13 +747,13 @@ public class kelondroRowCollection {
try {
while (i >= 0) {
if (compare(i, i + 1) == 0) {
collection.addUnique(get(i + 1));
collection.addUnique(get(i + 1, false));
removeRow(i + 1, false);
d++;
if (i + 1 < chunkcount - 1) u = false;
} else if (collection.size() > 0) {
// finish collection of double occurrences
collection.addUnique(get(i + 1));
collection.addUnique(get(i + 1, false));
removeRow(i + 1, false);
d++;
if (i + 1 < chunkcount - 1) u = false;
@ -778,7 +778,7 @@ public class kelondroRowCollection {
for (int i = 0; i < chunkcount - 1; i++) {
//System.out.println("*" + new String(get(i).getColBytes(0)));
if (compare(i, i + 1) > 0) {
System.out.println("?" + new String(get(i+1).getColBytes(0)));
System.out.println("?" + new String(get(i + 1, false).getColBytes(0)));
return false;
}
}
@ -915,7 +915,7 @@ public class kelondroRowCollection {
System.out.println("create c : " + (t1 - t0) + " nanoseconds, " + d(testsize, (t1 - t0)) + " entries/nanoseconds");
kelondroRowCollection d = new kelondroRowCollection(r, testsize);
for (int i = 0; i < testsize; i++) {
d.add(c.get(i).getColBytes(0));
d.add(c.get(i, false).getColBytes(0));
}
long t2 = System.nanoTime();
System.out.println("copy c -> d: " + (t2 - t1) + " nanoseconds, " + d(testsize, (t2 - t1)) + " entries/nanoseconds");

@ -119,7 +119,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
private kelondroRow.Entry get(byte[] key, int astart, int alength) {
long handle = profile.startRead();
int index = find(key, astart, alength);
kelondroRow.Entry entry = (index >= 0) ? get(index) : null;
kelondroRow.Entry entry = (index >= 0) ? get(index, true) : null;
profile.stopRead(handle);
return entry;
}
@ -148,7 +148,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
if (index < 0) {
super.addUnique(entry);
} else {
oldentry = get(index);
oldentry = get(index, true);
set(index, entry);
}
profile.stopWrite(handle);
@ -159,7 +159,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
int index = find(a, start, length);
if (index < 0) return null;
//System.out.println("remove: chunk found at index position (before remove) " + index + ", inset=" + serverLog.arrayList(super.chunkcache, super.rowdef.objectsize() * index, length + 10) + ", searchkey=" + serverLog.arrayList(a, start, length));
kelondroRow.Entry entry = super.get(index);
kelondroRow.Entry entry = super.get(index, true);
super.removeRow(index, keepOrder);
//System.out.println("remove: chunk found at index position (after remove) " + index + ", inset=" + serverLog.arrayList(super.chunkcache, super.rowdef.objectsize() * index, length) + ", searchkey=" + serverLog.arrayList(a, start, length));
int findagainindex = find(a, start, length);
@ -381,7 +381,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd
}
public kelondroRow.Entry next() {
kelondroRow.Entry entry = get(p);
kelondroRow.Entry entry = get(p, true);
if (up) p++; else p--;
return entry;
}

@ -488,19 +488,16 @@ public final class plasmaSearchEvent {
event.eventTime = System.currentTimeMillis();
// start worker threads to fetch urls and snippets
event.workerThreads = new resultWorker[workerThreadCount];
resultWorker worker;
for (int i = 0; i < workerThreadCount; i++) {
event.workerThreads[i] = event.deployWorker(i, 10000);
worker = event.new resultWorker(i, 10000);
worker.start();
event.workerThreads[i] = worker;
}
}
return event;
}
private resultWorker deployWorker(int id, long lifetime) {
resultWorker worker = new resultWorker(id, lifetime);
worker.start();
return worker;
}
private class resultWorker extends Thread {

@ -54,6 +54,7 @@ import de.anomic.yacy.yacyURL;
public final class plasmaSearchRankingProcess {
public static kelondroBinSearch[] ybrTables = null; // block-rank tables
public static final int maxYBR = 3; // the lower this value, the faster the search
private static boolean useYBR = true;
private kelondroSortStack<indexRWIVarEntry> stack;
@ -289,17 +290,20 @@ public final class plasmaSearchRankingProcess {
return bestEntry;
}
public synchronized indexURLReference bestURL(boolean skipDoubleDom) {
public indexURLReference bestURL(boolean skipDoubleDom) {
// returns the best URL entry from the current RWI list and removes this entry from the list
while ((stack.size() > 0) || (size() > 0)) {
kelondroSortStack<indexRWIVarEntry>.stackElement obrwi = bestRWI(skipDoubleDom);
indexURLReference u = wordIndex.getURL(obrwi.element.urlHash(), obrwi.element, obrwi.weight.longValue());
if (u != null) {
indexURLReference.Components comp = u.comp();
if (comp.url() != null) this.handover.put(u.hash(), comp.url().toNormalform(true, false)); // remember that we handed over this url
return u;
synchronized (this) {
if (((stack.size() == 0) && (size() == 0))) break;
kelondroSortStack<indexRWIVarEntry>.stackElement obrwi = bestRWI(skipDoubleDom);
indexURLReference u = wordIndex.getURL(obrwi.element.urlHash(), obrwi.element, obrwi.weight.longValue());
if (u != null) {
indexURLReference.Components comp = u.comp();
if (comp.url() != null) this.handover.put(u.hash(), comp.url().toNormalform(true, false)); // remember that we handed over this url
return u;
}
misses.add(obrwi.element.urlHash());
}
misses.add(obrwi.element.urlHash());
}
return null;
}
@ -432,7 +436,8 @@ public final class plasmaSearchRankingProcess {
if (ybrTables == null) return 15;
if (!(useYBR)) return 15;
final String domHash = urlHash.substring(6);
for (int i = 0; i < ybrTables.length; i++) {
int m = Math.min(maxYBR, ybrTables.length);
for (int i = 0; i < m; i++) {
if ((ybrTables[i] != null) && (ybrTables[i].contains(domHash.getBytes()))) {
//System.out.println("YBR FOUND: " + urlHash + " (" + i + ")");
return i;

@ -424,7 +424,7 @@ public final class plasmaWordIndex implements indexRI {
indexRWIRowEntry e, elm = null;
long lm = 0;
for (int j = 0; j < set.size(); j++) {
e = new indexRWIRowEntry(set.get(j));
e = new indexRWIRowEntry(set.get(j, true));
if ((elm == null) || (e.lastModified() > lm)) {
elm = e;
lm = e.lastModified();

Loading…
Cancel
Save