Speed enhancements for the crawler and URL retrieval (also affects search speed)

- concurrency for LURL-fetching: this is done using a concurrent lookup into the separate URL databases. Concurrency is possible because there is no IO during lookup. The more LURL-tables are present, the better the speedup. More CPUs will increase speed further.
- because a large number of LURL-lookups are made during crawling (for the double-check), the LURL-lookup speed enhancement also improves crawling speed
- search speed also benefits from the LURL-lookup enhancement
- changed some flushing parameters in word index caching which should make better use of large word index caches and should speed up indexing
- removed the flush chunk size parameter, because it was only useful for the IO path enhancement feature, which was removed some weeks ago to prevent blocking and deadlocks during search requests

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4628 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent 3ce3a4a3a1
commit 764a40e37d

@ -3,7 +3,7 @@ javacSource=1.5
javacTarget=1.5 javacTarget=1.5
# Release Configuration # Release Configuration
releaseVersion=0.576 releaseVersion=0.577
stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
embReleaseFile=yacy_emb_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz embReleaseFile=yacy_emb_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz

@ -610,7 +610,6 @@ javastart_priority__pro=0
# flushed to disc; this may last some minutes. # flushed to disc; this may last some minutes.
wordCacheMaxCount = 20000 wordCacheMaxCount = 20000
wordCacheInitCount = 20000 wordCacheInitCount = 20000
wordFlushSize = 500
wordCacheMaxCount__pro = 20000 wordCacheMaxCount__pro = 20000
wordCacheInitCount__pro = 20000 wordCacheInitCount__pro = 20000
wordFlushSize__pro = 500 wordFlushSize__pro = 500

@ -138,15 +138,6 @@
This is is the init size of space for words in cache. This is is the init size of space for words in cache.
</td> </td>
</tr> </tr>
<tr valign="top" class="TableCellDark">
<td>word flush size:</td>
<td colspan="2">
<input name="wordFlushSize" type="text" size="20" maxlength="100" value="#[wordFlushSize]#" />
</td>
<td>
The word flush size is applied when an indexing loop is executed, and the cache size is exceeded.
</td>
</tr>
<tr valign="top" class="TableCellLight"> <tr valign="top" class="TableCellLight">
<td colspan="4"> <td colspan="4">
<input type="submit" name="cacheSizeSubmit" value="Enter New Cache Size" /> <input type="submit" name="cacheSizeSubmit" value="Enter New Cache Size" />

@ -189,10 +189,6 @@ public class PerformanceQueues_p {
int wordCacheInitCount = post.getInt(plasmaSwitchboard.WORDCACHE_INIT_COUNT, 30000); int wordCacheInitCount = post.getInt(plasmaSwitchboard.WORDCACHE_INIT_COUNT, 30000);
switchboard.setConfig(plasmaSwitchboard.WORDCACHE_INIT_COUNT, Integer.toString(wordCacheInitCount)); switchboard.setConfig(plasmaSwitchboard.WORDCACHE_INIT_COUNT, Integer.toString(wordCacheInitCount));
int flushsize = post.getInt("wordFlushSize", 2000);
switchboard.setConfig("wordFlushSize", Integer.toString(flushsize));
switchboard.wordIndex.setWordFlushSize(flushsize);
} }
if ((post != null) && (post.containsKey("poolConfig"))) { if ((post != null) && (post.containsKey("poolConfig"))) {
@ -249,7 +245,6 @@ public class PerformanceQueues_p {
prop.putNum("maxWaitingWordFlush", switchboard.getConfigLong("maxWaitingWordFlush", 180)); prop.putNum("maxWaitingWordFlush", switchboard.getConfigLong("maxWaitingWordFlush", 180));
prop.put("wordCacheMaxCount", switchboard.getConfigLong(plasmaSwitchboard.WORDCACHE_MAX_COUNT, 20000)); prop.put("wordCacheMaxCount", switchboard.getConfigLong(plasmaSwitchboard.WORDCACHE_MAX_COUNT, 20000));
prop.put("wordCacheInitCount", switchboard.getConfigLong(plasmaSwitchboard.WORDCACHE_INIT_COUNT, 30000)); prop.put("wordCacheInitCount", switchboard.getConfigLong(plasmaSwitchboard.WORDCACHE_INIT_COUNT, 30000));
prop.put("wordFlushSize", switchboard.getConfigLong("wordFlushSize", 2000));
prop.put("crawlPauseProxy", switchboard.getConfigLong(plasmaSwitchboard.PROXY_ONLINE_CAUTION_DELAY, 30000)); prop.put("crawlPauseProxy", switchboard.getConfigLong(plasmaSwitchboard.PROXY_ONLINE_CAUTION_DELAY, 30000));
prop.put("crawlPauseLocalsearch", switchboard.getConfigLong(plasmaSwitchboard.LOCALSEACH_ONLINE_CAUTION_DELAY, 30000)); prop.put("crawlPauseLocalsearch", switchboard.getConfigLong(plasmaSwitchboard.LOCALSEACH_ONLINE_CAUTION_DELAY, 30000));
prop.put("crawlPauseRemotesearch", switchboard.getConfigLong(plasmaSwitchboard.REMOTESEARCH_ONLINE_CAUTION_DELAY, 30000)); prop.put("crawlPauseRemotesearch", switchboard.getConfigLong(plasmaSwitchboard.REMOTESEARCH_ONLINE_CAUTION_DELAY, 30000));

@ -36,8 +36,18 @@ import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import de.anomic.server.serverMemory; import de.anomic.server.serverMemory;
import de.anomic.server.serverProcessor;
public class kelondroSplitTable implements kelondroIndex { public class kelondroSplitTable implements kelondroIndex {
@ -47,6 +57,10 @@ public class kelondroSplitTable implements kelondroIndex {
private static final long minimumRAM4Eco = 80 * 1024 * 1024; private static final long minimumRAM4Eco = 80 * 1024 * 1024;
private static final int EcoFSBufferSize = 20; private static final int EcoFSBufferSize = 20;
private static final kelondroIndex dummyIndex = new kelondroRAMIndex(new kelondroRow(new kelondroColumn[]{new kelondroColumn("key", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 2, "key")}, kelondroNaturalOrder.naturalOrder, 0), 0);
// the thread pool for the keeperOf executor service
private ExecutorService executor;
private HashMap<String, kelondroIndex> tables; // a map from a date string to a kelondroIndex object private HashMap<String, kelondroIndex> tables; // a map from a date string to a kelondroIndex object
private kelondroRow rowdef; private kelondroRow rowdef;
@ -64,6 +78,9 @@ public class kelondroSplitTable implements kelondroIndex {
public void init(boolean resetOnFail) { public void init(boolean resetOnFail) {
// init the thread pool for the keeperOf executor service
this.executor = new ThreadPoolExecutor(serverProcessor.useCPU + 1, serverProcessor.useCPU + 1, 10, TimeUnit.SECONDS, new ArrayBlockingQueue<Runnable>(serverProcessor.useCPU + 1));
// initialized tables map // initialized tables map
this.tables = new HashMap<String, kelondroIndex>(); this.tables = new HashMap<String, kelondroIndex>();
if (!(path.exists())) path.mkdirs(); if (!(path.exists())) path.mkdirs();
@ -174,12 +191,9 @@ public class kelondroSplitTable implements kelondroIndex {
} }
public int writeBufferSize() { public int writeBufferSize() {
Iterator<kelondroIndex> i = tables.values().iterator();
int s = 0; int s = 0;
kelondroIndex ki; for (final kelondroIndex index : tables.values()) {
while (i.hasNext()) { if (index instanceof kelondroCache) s += ((kelondroCache) index).writeBufferSize();
ki = ((kelondroIndex) i.next());
if (ki instanceof kelondroCache) s += ((kelondroCache) ki).writeBufferSize();
} }
return s; return s;
} }
@ -189,19 +203,13 @@ public class kelondroSplitTable implements kelondroIndex {
} }
public boolean has(byte[] key) throws IOException { public boolean has(byte[] key) throws IOException {
Iterator<kelondroIndex> i = tables.values().iterator(); return keeperOf(key) != null;
kelondroIndex table;
while (i.hasNext()) {
table = (kelondroIndex) i.next();
if (table.has(key)) return true;
}
return false;
} }
public synchronized kelondroRow.Entry get(byte[] key) throws IOException { public synchronized kelondroRow.Entry get(byte[] key) throws IOException {
Object[] keeper = keeperOf(key); kelondroIndex keeper = keeperOf(key);
if (keeper == null) return null; if (keeper == null) return null;
return (kelondroRow.Entry) keeper[1]; return keeper.get(key);
} }
public synchronized void putMultiple(List<kelondroRow.Entry> rows) throws IOException { public synchronized void putMultiple(List<kelondroRow.Entry> rows) throws IOException {
@ -214,8 +222,8 @@ public class kelondroSplitTable implements kelondroIndex {
public synchronized kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException { public synchronized kelondroRow.Entry put(kelondroRow.Entry row, Date entryDate) throws IOException {
assert row.objectsize() <= this.rowdef.objectsize; assert row.objectsize() <= this.rowdef.objectsize;
Object[] keeper = keeperOf(row.getColBytes(0)); kelondroIndex keeper = keeperOf(row.getColBytes(0));
if (keeper != null) return ((kelondroIndex) keeper[0]).put(row); if (keeper != null) return keeper.put(row);
if ((entryDate == null) || (entryDate.after(new Date()))) entryDate = new Date(); // fix date if ((entryDate == null) || (entryDate.after(new Date()))) entryDate = new Date(); // fix date
String suffix = dateSuffix(entryDate); String suffix = dateSuffix(entryDate);
if (suffix == null) return null; if (suffix == null) return null;
@ -247,17 +255,62 @@ public class kelondroSplitTable implements kelondroIndex {
return null; return null;
} }
public synchronized Object[] keeperOf(byte[] key) throws IOException { public synchronized kelondroIndex keeperOf(final byte[] key) throws IOException {
Iterator<kelondroIndex> i = tables.values().iterator(); // because the index is stored only in one table,
kelondroIndex table; // and the index is completely in RAM, a concurrency will create
kelondroRow.Entry entry; // not concurrent File accesses
while (i.hasNext()) { //long start = System.currentTimeMillis();
table = (kelondroIndex) i.next();
entry = table.get(key); // start a concurrent query to database tables
if (entry != null) return new Object[]{table, entry}; CompletionService<kelondroIndex> cs = new ExecutorCompletionService<kelondroIndex>(executor);
int s = tables.size();
for (final kelondroIndex table : tables.values()) {
cs.submit(new Callable<kelondroIndex>() {
public kelondroIndex call() {
try {
if (table.has(key)) return table; else return dummyIndex;
} catch (IOException e) {
return dummyIndex;
}
} }
});
}
// read the result
try {
for (int i = 0; i < s; i++) {
Future<kelondroIndex> f = cs.take();
kelondroIndex index = f.get();
if (index != dummyIndex) {
//System.out.println("*DEBUG SplitTable success.time = " + (System.currentTimeMillis() - start) + " ms");
return index;
}
}
//System.out.println("*DEBUG SplitTable fail.time = " + (System.currentTimeMillis() - start) + " ms");
return null; return null;
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
} catch (ExecutionException e) {
throw new RuntimeException(e.getCause());
}
//System.out.println("*DEBUG SplitTable fail.time = " + (System.currentTimeMillis() - start) + " ms");
return null;
}
/*
public synchronized kelondroIndex keeperOf(byte[] key) throws IOException {
// TODO: apply concurrency here!
// because the index is stored only in one table,
// and the index is completely in RAM, a concurrency would create not concurrent File accesses
long start = System.currentTimeMillis();
for (final kelondroIndex table : tables.values()) {
if (table.has(key)) {
System.out.println("*DEBUG SplitTable success.time = " + (System.currentTimeMillis() - start) + " ms");
return table;
}
} }
System.out.println("*DEBUG SplitTable fail.time = " + (System.currentTimeMillis() - start) + " ms");
return null;
}*/
public synchronized void addUnique(kelondroRow.Entry row) throws IOException { public synchronized void addUnique(kelondroRow.Entry row) throws IOException {
addUnique(row, null); addUnique(row, null);
@ -303,15 +356,9 @@ public class kelondroSplitTable implements kelondroIndex {
} }
public synchronized kelondroRow.Entry remove(byte[] key, boolean keepOrder) throws IOException { public synchronized kelondroRow.Entry remove(byte[] key, boolean keepOrder) throws IOException {
Iterator<kelondroIndex> i = tables.values().iterator(); kelondroIndex table = keeperOf(key);
kelondroIndex table; if (table == null) return null;
kelondroRow.Entry entry; return table.remove(key, keepOrder);
while (i.hasNext()) {
table = i.next();
entry = table.remove(key, keepOrder);
if (entry != null) return entry;
}
return null;
} }
public synchronized kelondroRow.Entry removeOne() throws IOException { public synchronized kelondroRow.Entry removeOne() throws IOException {
@ -372,11 +419,17 @@ public class kelondroSplitTable implements kelondroIndex {
public synchronized void close() { public synchronized void close() {
if (tables == null) return; if (tables == null) return;
this.executor.shutdown();
try {
this.executor.awaitTermination(3, TimeUnit.SECONDS);
} catch (InterruptedException e) {
}
this.executor = null;
Iterator<kelondroIndex> i = tables.values().iterator(); Iterator<kelondroIndex> i = tables.values().iterator();
while (i.hasNext()) { while (i.hasNext()) {
i.next().close(); i.next().close();
} }
tables = null; this.tables = null;
} }
public static void main(String[] args) { public static void main(String[] args) {

@ -62,7 +62,7 @@ public class plasmaCrawlNURL {
public static final int STACK_TYPE_MOVIE = 12; // put on movie stack public static final int STACK_TYPE_MOVIE = 12; // put on movie stack
public static final int STACK_TYPE_MUSIC = 13; // put on music stack public static final int STACK_TYPE_MUSIC = 13; // put on music stack
private static final long minimumLocalDelta = 10; // the minimum time difference between access of the same local domain private static final long minimumLocalDelta = 0; // the minimum time difference between access of the same local domain
private static final long minimumGlobalDelta = 333; // the minimum time difference between access of the same global domain private static final long minimumGlobalDelta = 333; // the minimum time difference between access of the same global domain
private static final long maximumDomAge = 60000; // the maximum age of a domain until it is used for another crawl attempt private static final long maximumDomAge = 60000; // the maximum age of a domain until it is used for another crawl attempt

@ -902,7 +902,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
String networkUnitDefinition = getConfig("network.unit.definition", "defaults/yacy.network.freeworld.unit"); String networkUnitDefinition = getConfig("network.unit.definition", "defaults/yacy.network.freeworld.unit");
String networkGroupDefinition = getConfig("network.group.definition", "yacy.network.group"); String networkGroupDefinition = getConfig("network.group.definition", "yacy.network.group");
// patch old values // patch old values
if (networkUnitDefinition.equals("yacy.network.unit")) networkUnitDefinition = "defaults/yacy.network.freeworld.unit"; if (networkUnitDefinition.equals("yacy.network.unit")) {
networkUnitDefinition = "defaults/yacy.network.freeworld.unit";
setConfig("network.unit.definition", networkUnitDefinition);
}
// include additional network definition properties into our settings // include additional network definition properties into our settings
// note that these properties cannot be set in the application because they are // note that these properties cannot be set in the application because they are
@ -1124,7 +1127,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
(int) getConfigLong(WORDCACHE_MAX_COUNT, 20000)); (int) getConfigLong(WORDCACHE_MAX_COUNT, 20000));
setConfig(WORDCACHE_MAX_COUNT, Integer.toString(wordCacheMaxCount)); setConfig(WORDCACHE_MAX_COUNT, Integer.toString(wordCacheMaxCount));
wordIndex.setMaxWordCount(wordCacheMaxCount); wordIndex.setMaxWordCount(wordCacheMaxCount);
wordIndex.setWordFlushSize((int) getConfigLong("wordFlushSize", 10000));
// set a maximum amount of memory for the caches // set a maximum amount of memory for the caches
// long memprereq = Math.max(getConfigLong(INDEXER_MEMPREREQ, 0), wordIndex.minMem()); // long memprereq = Math.max(getConfigLong(INDEXER_MEMPREREQ, 0), wordIndex.minMem());

@ -67,14 +67,13 @@ public final class plasmaWordIndex implements indexRI {
// environment constants // environment constants
public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
public static final int wCacheMaxChunk = 400; // maximum number of references for each urlhash public static final int wCacheMaxChunk = 500; // maximum number of references for each urlhash
public static final int lowcachedivisor = 320; public static final int lowcachedivisor = 1000;
public static final int maxCollectionPartition = 7; // should be 7 public static final int maxCollectionPartition = 7; // should be 7
private final kelondroByteOrder indexOrder = kelondroBase64Order.enhancedCoder; private final kelondroByteOrder indexOrder = kelondroBase64Order.enhancedCoder;
private final indexRAMRI dhtOutCache, dhtInCache; private final indexRAMRI dhtOutCache, dhtInCache;
private final indexCollectionRI collections; // new database structure to replace AssortmentCluster and FileCluster private final indexCollectionRI collections; // new database structure to replace AssortmentCluster and FileCluster
private int flushsize;
private serverLog log; private serverLog log;
final indexRepositoryReference referenceURL; final indexRepositoryReference referenceURL;
@ -111,8 +110,6 @@ public final class plasmaWordIndex implements indexRI {
// create LURL-db // create LURL-db
referenceURL = new indexRepositoryReference(indexSecondaryRoot, networkName); referenceURL = new indexRepositoryReference(indexSecondaryRoot, networkName);
// performance settings
this.flushsize = 2000;
} }
public void putURL(indexURLReference entry) throws IOException { public void putURL(indexURLReference entry) throws IOException {
@ -200,7 +197,6 @@ public final class plasmaWordIndex implements indexRI {
Iterator<indexContainer> it = cache.wordContainers(null, false); Iterator<indexContainer> it = cache.wordContainers(null, false);
while (it.hasNext()) cacheBytes += it.next().size() * entryBytes; while (it.hasNext()) cacheBytes += it.next().size() * entryBytes;
} }
return cacheBytes; return cacheBytes;
} }
@ -209,21 +205,15 @@ public final class plasmaWordIndex implements indexRI {
dhtInCache.setMaxWordCount(maxWords); dhtInCache.setMaxWordCount(maxWords);
} }
public void setWordFlushSize(int flushsize) {
this.flushsize = flushsize;
}
public void dhtFlushControl(indexRAMRI theCache) { public void dhtFlushControl(indexRAMRI theCache) {
// check for forced flush // check for forced flush
int count = -1; while (theCache.maxURLinCache() > wCacheMaxChunk ) {
synchronized (theCache) { flushCache(theCache, Math.min(10, theCache.size()));
if ((theCache.maxURLinCache() > wCacheMaxChunk ) ||
(theCache.size() > theCache.getMaxWordCount()) ||
(serverMemory.available() < collections.minMem())) {
count = theCache.size() + flushsize - theCache.getMaxWordCount();
} }
if ((theCache.size() > theCache.getMaxWordCount()) ||
(serverMemory.available() < collections.minMem())) {
flushCache(theCache, Math.min(theCache.size() - theCache.getMaxWordCount() + 1, theCache.size()));
} }
if (count >= 0) flushCache(theCache, (count > 0) ? count : 1);
} }
public long getUpdateTime(String wordHash) { public long getUpdateTime(String wordHash) {
@ -271,8 +261,8 @@ public final class plasmaWordIndex implements indexRI {
} }
public int flushCacheSome() { public int flushCacheSome() {
int fo = flushCache(dhtOutCache, (dhtOutCache.size() > 3 * flushsize) ? flushsize : Math.min(flushsize, Math.max(1, dhtOutCache.size() / lowcachedivisor))); int fo = flushCache(dhtOutCache, Math.max(1, dhtOutCache.size() / lowcachedivisor));
int fi = flushCache(dhtInCache, (dhtInCache.size() > 3 * flushsize) ? flushsize : Math.min(flushsize, Math.max(1, dhtInCache.size() / lowcachedivisor))); int fi = flushCache(dhtInCache, Math.max(1, dhtInCache.size() / lowcachedivisor));
return fo + fi; return fo + fi;
} }

@ -699,7 +699,7 @@ public final class serverCore extends serverAbstractBusyThread implements server
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
} }
busySessions.remove(this); if (busySessions != null) busySessions.remove(this);
} }
} }

Loading…
Cancel
Save