- modified and enhanced the crawl balancer: better list export, repair of a damaged crawl queue at start-up, re-sorting at start-up to improve the domain order

- added option to set minimum crawl delta for domains in balancer
- added default values to crawl deltas in yacy.init
- added configuration for these deltas in performance queues
- enhanced the performance setting computation (more time for the indexing queue, for a faster flush)
- remote crawling is now enabled during local crawling if the indexer has space and time for more links
- added database stub for new distributed file system
- refactoring of the time computation to get an abstraction level that will be used by a TTL rule in the new distributed file system

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4966 6c8d7289-2bf4-0310-a012-ef5d649a1542
parent 1afc36d9a8
commit 474659a71f

@@ -206,6 +206,15 @@ proxyCacheSize__pro = 1024
# storage place for new releases
releases = DATA/RELEASE
# time limits for the crawler:
# these times (in milliseconds) are the shortest allowed intervals between two accesses of the crawler to the same domain
# the crawler may read files faster than that, but never from the same domain at shorter intervals than these
# a delta of 500 milliseconds means that no more than two files per second are fetched from the same server
# a hard-coded limit prevents the effective times from being set shorter than these default times
# the time limits are distinguished for local and global crawls: there is no limit for an intranet crawl
minimumLocalDelta = 0
minimumGlobalDelta = 500
# the following mime-types are the whitelist for indexing
#
# parseableMimeTypes.HTML: specifies mime-types that can be indexed with built-in html parser

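For illustration, here is a minimal sketch of how such a per-domain minimum access delta can be enforced (a hypothetical helper class, not part of this commit; YaCy's actual bookkeeping lives in the Balancer):

// DomainDeltaSketch.java - hypothetical illustration of the minimum-delta rule
import java.util.HashMap;
import java.util.Map;

public class DomainDeltaSketch {
    private final Map<String, Long> lastAccess = new HashMap<String, Long>();
    private final long minimumDelta; // milliseconds, e.g. 500 for global crawls

    public DomainDeltaSketch(long minimumDelta) {
        this.minimumDelta = minimumDelta;
    }

    // milliseconds to wait before the domain may be accessed again
    public synchronized long waitingTime(String domain) {
        Long last = lastAccess.get(domain);
        if (last == null) return 0; // never accessed before
        long elapsed = System.currentTimeMillis() - last.longValue();
        return Math.max(0, minimumDelta - elapsed);
    }

    public synchronized void registerAccess(String domain) {
        lastAccess.put(domain, Long.valueOf(System.currentTimeMillis()));
    }
}
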
@@ -59,7 +59,6 @@ import de.anomic.server.serverDomains;
import de.anomic.server.serverInstantBusyThread;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.servletProperties;
import de.anomic.yacy.yacyAccessible;
import de.anomic.yacy.yacySeed;

@@ -44,6 +44,7 @@
// if the shell's current path is HTROOT
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;
@@ -103,7 +104,7 @@ public class IndexCreateWWWGlobalQueue_p {
prop.put("crawler-queue", "0");
} else {
prop.put("crawler-queue", "1");
CrawlEntry[] crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_LIMIT, showLimit);
ArrayList<CrawlEntry> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_LIMIT, showLimit);
CrawlEntry urle;
boolean dark = true;
@@ -111,8 +112,8 @@ public class IndexCreateWWWGlobalQueue_p {
String profileHandle;
CrawlProfile.entry profileEntry;
int i, showNum = 0;
for (i = 0; (i < crawlerList.length) && (showNum < showLimit); i++) {
urle = crawlerList[i];
for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
urle = crawlerList.get(i);
if ((urle != null)&&(urle.url()!=null)) {
initiator = sb.webIndex.seedDB.getConnected(urle.initiator());
profileHandle = urle.profileHandle();

@@ -44,6 +44,7 @@
// if the shell's current path is HTROOT
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
@@ -171,7 +172,7 @@ public class IndexCreateWWWLocalQueue_p {
prop.put("crawler-queue", "0");
} else {
prop.put("crawler-queue", "1");
CrawlEntry[] crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_CORE, (int) (showLimit * 1.20));
ArrayList<CrawlEntry> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_CORE, (int) (showLimit * 1.20));
CrawlEntry urle;
boolean dark = true;
@@ -179,8 +180,8 @@ public class IndexCreateWWWLocalQueue_p {
String profileHandle;
CrawlProfile.entry profileEntry;
int i;
for (i = 0; (i < crawlerList.length) && (showNum < showLimit); i++) {
urle = crawlerList[i];
for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
urle = crawlerList.get(i);
if ((urle != null)&&(urle.url()!=null)) {
initiator = sb.webIndex.seedDB.getConnected(urle.initiator());
profileHandle = urle.profileHandle();

@@ -44,6 +44,7 @@
// if the shell's current path is HTROOT
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;
@@ -103,7 +104,7 @@ public class IndexCreateWWWRemoteQueue_p {
prop.put("crawler-queue", "0");
} else {
prop.put("crawler-queue", "1");
CrawlEntry[] crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_REMOTE, showLimit);
ArrayList<CrawlEntry> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_REMOTE, showLimit);
CrawlEntry urle;
boolean dark = true;
@@ -111,8 +112,8 @@ public class IndexCreateWWWRemoteQueue_p {
String profileHandle;
CrawlProfile.entry profileEntry;
int i, showNum = 0;
for (i = 0; (i < crawlerList.length) && (showNum < showLimit); i++) {
urle = crawlerList[i];
for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
urle = crawlerList.get(i);
if (urle != null && urle.url() != null) {
initiator = sb.webIndex.seedDB.getConnected(urle.initiator());
profileHandle = urle.profileHandle();

@@ -153,27 +153,49 @@
</form>
<p>
<strong>Thread pool settings:</strong>
<strong>Balancer Settings:</strong>
</p>
<p>
This is the minimum time delta between two accesses to the same domain during a crawl. The crawl balancer tries to avoid
accessing a domain too often, but if balancing is impossible (i.e. if only links from a single domain are left), these
minimum delta times are still enforced.
</p>
<form action="PerformanceQueues_p.html" method="post" enctype="multipart/form-data">
<table border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">
<td>Crawler Domain</td>
<td>Minimum Access Time Delta</td>
</tr>
<tr class="TableCellDark">
<td>local (intranet) crawls</td>
<td align="right"><input id="minimumLocalDelta" name="minimumLocalDelta" type="text" size="20" maxlength="100" value="#[minimumLocalDelta]#" /></td>
</tr>
<tr class="TableCellDark">
<td>global (internet) crawls</td>
<td align="right"><input id="minimumGlobalDelta" name="minimumGlobalDelta" type="text" size="20" maxlength="100" value="#[minimumGlobalDelta]#" /></td>
</tr>
<tr class="TableCellLight">
<td align="left" colspan="2"><input type="submit" name="minimumDeltaSubmit" value="Enter New Parameters" />
Changes take effect immediately</td>
</tr>
</table>
</form>
<p>
<strong>Thread Pool Settings:</strong>
</p>
<form action="PerformanceQueues_p.html" method="post" enctype="multipart/form-data">
<table border="0" cellpadding="2" cellspacing="1">
<colgroup>
<col width="150" />
<col width="80" span="5" />
<col />
</colgroup>
<tr class="TableHeader" valign="bottom">
<td>Thread Pool</td>
<td>maximum Active</td>
<td>current Active</td>
<td>Full Description</td>
</tr>
#{pool}#
<tr class="TableCellDark">
<td align="left">#[name]#</td>
<td align="right"><input name="#[name]#_maxActive" type="text" size="8" maxlength="8" value="#[maxActive]#" /></td>
<td align="right">#[numActive]#</td>
<td align="left"></td>
</tr>
#{/pool}#
<tr class="TableCellLight">
@@ -184,6 +206,40 @@
</table>
</form>
<p>
<strong>Online Caution Settings:</strong><br />
This is the time that the crawler idles when the proxy is accessed, or when a local or remote search is performed.
The delay is extended by this amount each time the proxy is accessed afterwards.
This is intended to improve the performance of the affected process (proxy or search).
(The current delta is #[crawlPauseProxyCurrent]#/#[crawlPauseLocalsearchCurrent]#/#[crawlPauseRemotesearchCurrent]#
seconds since the last proxy/local-search/remote-search access.)
</p>
<form action="PerformanceQueues_p.html" method="post" enctype="multipart/form-data">
<table border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">
<td>Online Caution Case</td>
<td>indexer delay (milliseconds) after case occurrence</td>
</tr>
<tr class="TableCellDark">
<td>Proxy:</td>
<td align="right"><input id="crawlPauseProxy" name="crawlPauseProxy" type="text" size="20" maxlength="100" value="#[crawlPauseProxy]#" /></td>
</tr>
<tr class="TableCellDark">
<td>Local Search:</td>
<td align="right"><input id="crawlPauseLocalsearch" name="crawlPauseLocalsearch" type="text" size="20" maxlength="100" value="#[crawlPauseLocalsearch]#" /></td>
</tr>
<tr class="TableCellDark">
<td>Remote Search:</td>
<td align="right"><input id="crawlPauseRemotesearch" name="crawlPauseRemotesearch" type="text" size="20" maxlength="100" value="#[crawlPauseRemotesearch]#" /></td>
</tr>
<tr class="TableCellLight">
<td align="left" colspan="2"><input type="submit" name="onlineCautionSubmit" value="Enter New Parameters" />
Changes take effect immediately</td>
</tr>
</table>
</form>
<form action="PerformanceQueues_p.html" method="post" enctype="multipart/form-data">
<fieldset><legend>YaCy Priority Settings</legend>
<dl>
@@ -199,28 +255,6 @@
</fieldset>
</form>
<form action="PerformanceQueues_p.html" method="post" enctype="multipart/form-data">
<fieldset><legend>Proxy Performance Settings</legend>
<p>
This is the time that the crawler idles when the proxy is accessed, or a local or remote search is done.
The delay is extended by this time each time the proxy is accessed afterwards.
This shall improve performance of the affected process (proxy or search).
(current delta is #[crawlPauseProxyCurrent]#/#[crawlPauseLocalsearchCurrent]#/#[crawlPauseRemotesearchCurrent]#
seconds since last proxy/local-search/remote-search access.)
</p>
<dl>
<dt><label for="crawlPauseProxy">Online Caution Delay for Proxy (milliseconds)</label>:</dt>
<dd><input id="crawlPauseProxy" name="crawlPauseProxy" type="text" size="20" maxlength="100" value="#[crawlPauseProxy]#" /></dd>
<dt><label for="crawlPauseLocalsearch">Online Caution Delay for Local Search (milliseconds)</label>:</dt>
<dd><input id="crawlPauseLocalsearch" name="crawlPauseLocalsearch" type="text" size="20" maxlength="100" value="#[crawlPauseLocalsearch]#" /></dd>
<dt><label for="crawlPauseRemotesearch">Online Caution Delay for Remote Search (milliseconds)</label>:</dt>
<dd><input id="crawlPauseRemotesearch" name="crawlPauseRemotesearch" type="text" size="20" maxlength="100" value="#[crawlPauseRemotesearch]#" /></dd>
<dt><input type="submit" name="onlineCautionSubmit" value="Enter New Parameters" /></dt>
<dd>Changes take effect immediately</dd>
</dl>
</fieldset>
</form>
#%env/templates/footer.template%#
</body>
</html>

@@ -69,7 +69,6 @@ public class PerformanceQueues_p {
performanceProfiles.put("defaults/performance_dht.profile", "prefer DHT");
}
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch<?> sb) {
// return variable that accumulates replacements
plasmaSwitchboard switchboard = (plasmaSwitchboard) sb;
@@ -253,6 +252,19 @@ public class PerformanceQueues_p {
switchboard.setConfig(plasmaSwitchboard.REMOTESEARCH_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseRemotesearch", 30000)));
}
if ((post != null) && (post.containsKey("minimumDeltaSubmit"))) {
long minimumLocalDelta = post.getLong("minimumLocalDelta", switchboard.crawlQueues.noticeURL.getMinimumLocalDelta());
long minimumGlobalDelta = post.getLong("minimumGlobalDelta", switchboard.crawlQueues.noticeURL.getMinimumGlobalDelta());
switchboard.setConfig("minimumLocalDelta", minimumLocalDelta);
switchboard.setConfig("minimumGlobalDelta", minimumGlobalDelta);
switchboard.crawlQueues.noticeURL.setMinimumLocalDelta(minimumLocalDelta);
switchboard.crawlQueues.noticeURL.setMinimumGlobalDelta(minimumGlobalDelta);
}
// delta settings
prop.put("minimumLocalDelta", switchboard.crawlQueues.noticeURL.getMinimumLocalDelta());
prop.put("minimumGlobalDelta", switchboard.crawlQueues.noticeURL.getMinimumGlobalDelta());
// table cache settings
prop.putNum("urlCacheSize", switchboard.webIndex.getURLwriteCacheSize());
prop.putNum("wordCacheWSize", switchboard.webIndex.dhtOutCacheSize());

@@ -181,13 +181,13 @@ public class queues_p {
}
public static final void addNTable(plasmaSwitchboard sb, serverObjects prop, String tableName, CrawlEntry[] crawlerList) {
public static final void addNTable(plasmaSwitchboard sb, serverObjects prop, String tableName, ArrayList<CrawlEntry> crawlerList) {
int showNum = 0;
CrawlEntry urle;
yacySeed initiator;
for (int i = 0; i < crawlerList.length; i++) {
urle = crawlerList[i];
for (int i = 0; i < crawlerList.size(); i++) {
urle = crawlerList.get(i);
if ((urle != null) && (urle.url() != null)) {
initiator = sb.webIndex.seedDB.getConnected(urle.initiator());
prop.put(tableName + "_" + showNum + "_profile", urle.profileHandle());

@@ -116,6 +116,21 @@ public class Balancer {
// create a stack for newly entered entries
if (!(cachePath.exists())) cachePath.mkdir(); // make the path
openFileIndex();
if (urlFileStack.size() != urlFileIndex.size() || (urlFileIndex.size() < 10000 && urlFileIndex.size() > 0)) {
// fix the file stack
serverLog.logInfo("Balancer", "re-creating the " + stackname + " balancer stack, size = " + urlFileIndex.size() + ((urlFileStack.size() == urlFileIndex.size()) ? "" : " (the old stack size was wrong)" ));
urlFileStack = kelondroStack.reset(urlFileStack);
try {
Iterator<byte[]> i = urlFileIndex.keys(true, null);
byte[] hash;
while (i.hasNext()) {
hash = i.next();
pushHash(new String(hash));
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
public synchronized void close() {
@@ -134,7 +149,7 @@ public class Balancer {
public void finalize() {
if (urlFileStack != null) {
serverLog.logWarning("plasmaCrawlBalancer", "crawl stack " + stackname + " closed by finalizer");
serverLog.logWarning("Balancer", "crawl stack " + stackname + " closed by finalizer");
close();
}
}
@@ -321,24 +336,29 @@ public class Balancer {
return;
}
// add to index
urlFileIndex.put(entry.toRow());
// add the hash to a queue
pushHash(entry.url().hash());
}
private void pushHash(String hash) throws IOException {
// extend domain stack
String dom = entry.url().hash().substring(6);
String dom = hash.substring(6);
LinkedList<String> domainList = domainStacks.get(dom);
if (domainList == null) {
// create new list
domainList = new LinkedList<String>();
synchronized (domainStacks) {
domainList.add(entry.url().hash());
domainList.add(hash);
domainStacks.put(dom, domainList);
}
} else {
// extend existent domain list
domainList.addLast(entry.url().hash());
domainList.addLast(hash);
}
// add to index
urlFileIndex.put(entry.toRow());
// check size of domainStacks and flush
if ((domainStacks.size() > 100) || (sizeDomainStacks() > 1000)) {
flushOnceDomStacks(1, urlRAMStack.size() < 100); // when the ram stack is small, flush it there
@@ -507,16 +527,16 @@ public class Balancer {
if (lastAccess == null) return Long.MAX_VALUE; // never accessed
return System.currentTimeMillis() - lastAccess.time();
}
public synchronized CrawlEntry top(int dist) throws IOException {
public synchronized ArrayList<CrawlEntry> top(int count) throws IOException {
// if we need to flush anything, then flush the domain stack first,
// to avoid that new urls get hidden by old entries from the file stack
if (urlRAMStack == null) return null;
while ((domainStacksNotEmpty()) && (urlRAMStack.size() <= dist)) {
while ((domainStacksNotEmpty()) && (urlRAMStack.size() <= count)) {
// flush only that much as we need to display
flushOnceDomStacks(0, true);
}
while ((urlFileStack != null) && (urlRAMStack.size() <= dist) && (urlFileStack.size() > 0)) {
while ((urlFileStack != null) && (urlRAMStack.size() <= count) && (urlFileStack.size() > 0)) {
// flush some entries from disc to ram stack
try {
kelondroRow.Entry t = urlFileStack.pop();
@@ -526,16 +546,18 @@ public class Balancer {
break;
}
}
if (dist >= urlRAMStack.size()) return null;
String urlhash = urlRAMStack.get(dist);
kelondroRow.Entry entry = urlFileIndex.get(urlhash.getBytes());
if (entry == null) {
if (kelondroAbstractRecords.debugmode) serverLog.logWarning("PLASMA BALANCER", "no entry in index for urlhash " + urlhash);
return null;
count = Math.min(count, urlRAMStack.size());
ArrayList<CrawlEntry> list = new ArrayList<CrawlEntry>();
for (int i = 0; i < count; i++) {
String urlhash = urlRAMStack.get(i);
kelondroRow.Entry entry = urlFileIndex.get(urlhash.getBytes());
if (entry == null) break;
list.add(new CrawlEntry(entry));
}
return new CrawlEntry(entry);
return list;
}
public synchronized Iterator<CrawlEntry> iterator() throws IOException {
return new EntryIterator();
}

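The start-up repair above re-creates the FIFO stack from the keys of the url index whenever the two structures disagree. A self-contained sketch of that idea, with generic Java collections standing in for kelondroStack and urlFileIndex (an assumed simplification, not the kelondro API):

// StackRepairSketch.java - simplified model of the balancer's start-up stack repair
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.LinkedHashMap;
import java.util.Map;

public class StackRepairSketch {
    // if the stack lost entries (e.g. after a crash), re-enqueue every key the index still knows
    public static <K, V> Deque<K> rebuildStack(Deque<K> stack, Map<K, V> index) {
        if (stack.size() == index.size()) return stack; // consistent, nothing to do
        return new ArrayDeque<K>(index.keySet());
    }

    public static void main(String[] args) {
        Map<String, String> index = new LinkedHashMap<String, String>();
        index.put("hashA", "urlA");
        index.put("hashB", "urlB");
        Deque<String> damaged = new ArrayDeque<String>(); // empty stack, but the index holds two entries
        System.out.println(rebuildStack(damaged, index)); // prints [hashA, hashB]
    }
}
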
@@ -261,7 +261,7 @@ public class CrawlQueues {
return false;
}
if (sb.webIndex.queuePreStack.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30)) {
if (sb.webIndex.queuePreStack.size() >= (int) sb.getConfigLong(plasmaSwitchboard.INDEXER_SLOTS, 30) / 2) {
if (this.log.isFine()) log.logFine("remoteCrawlLoaderJob: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.webIndex.queuePreStack.size() + ")");
return false;
}
@@ -276,12 +276,21 @@ public class CrawlQueues {
return false;
}
if (remoteTriggeredCrawlJobSize() > 0) {
if (this.log.isFine()) log.logFine("remoteCrawlLoaderJob: the remote-triggered crawl job queue is filled, omitting processing");
return false;
}
/*
if (coreCrawlJobSize() > 0) {
if (this.log.isFine()) log.logFine("remoteCrawlLoaderJob: a local crawl is running, omitting processing");
return false;
}
*/
// check if we have an entry in the provider list, otherwise fill the list
yacySeed seed;
if ((remoteCrawlProviderHashes.size() == 0) &&
(coreCrawlJobSize() == 0) &&
(remoteTriggeredCrawlJobSize() == 0) &&
(sb.queueSize() < 10)) {
if (remoteCrawlProviderHashes.size() == 0) {
if (sb.webIndex.seedDB != null && sb.webIndex.seedDB.sizeConnected() > 0) {
Iterator<yacySeed> e = sb.webIndex.peerActions.dhtAction.getProvidesRemoteCrawlURLs();
while (e.hasNext()) {

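Taken together, the two changes above mean: remote crawl URLs are loaded as soon as the indexing queue is less than half full, and a running local crawl no longer blocks remote crawling. A condensed sketch of the new admission logic (a hypothetical standalone method; the real code reads these values from the switchboard):

// RemoteCrawlAdmissionSketch.java - condensed model of the relaxed admission check
public class RemoteCrawlAdmissionSketch {
    static boolean acceptRemoteCrawlWork(int indexerQueueSize, int indexerSlots, int remoteTriggeredJobs) {
        if (indexerQueueSize >= indexerSlots / 2) return false; // keep half of the indexer slots free
        if (remoteTriggeredJobs > 0) return false;              // previous remote batch not yet processed
        return true; // a running local crawl no longer prevents remote crawling
    }

    public static void main(String[] args) {
        System.out.println(acceptRemoteCrawlWork(10, 30, 0)); // true: enough indexer capacity
        System.out.println(acceptRemoteCrawlWork(15, 30, 0)); // false: queue at least half full
    }
}
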
@@ -194,6 +194,7 @@ public class IndexingStack {
}
public Collection<QueueEntry> getActiveQueueEntries() {
// todo: check dead entries?
return this.queueInProcess.values();
}

@@ -62,9 +62,9 @@ public class NoticedURL {
public static final int STACK_TYPE_MOVIE = 12; // put on movie stack
public static final int STACK_TYPE_MUSIC = 13; // put on music stack
private static final long minimumLocalDelta = 0; // the minimum time difference between access of the same local domain
private static final long minimumGlobalDelta = 333; // the minimum time difference between access of the same global domain
private static final long maximumDomAge = 60000; // the maximum age of a domain until it is used for another crawl attempt
private static final long minimumLocalDeltaInit = 0; // the minimum time difference between access of the same local domain
private static final long minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
private static final long maximumDomAge = 60000; // the maximum age of a domain until it is used for another crawl attempt
private Balancer coreStack; // links found by crawling to depth-1
private Balancer limitStack; // links found by crawling at target depth
@@ -73,14 +73,34 @@ public class NoticedURL {
//private kelondroStack imageStack; // links pointing to image resources
//private kelondroStack movieStack; // links pointing to movie resources
//private kelondroStack musicStack; // links pointing to music resources
private long minimumLocalDelta;
private long minimumGlobalDelta;
public NoticedURL(File cachePath) {
coreStack = new Balancer(cachePath, "urlNoticeCoreStack", false);
limitStack = new Balancer(cachePath, "urlNoticeLimitStack", false);
this.coreStack = new Balancer(cachePath, "urlNoticeCoreStack", false);
this.limitStack = new Balancer(cachePath, "urlNoticeLimitStack", false);
//overhangStack = new plasmaCrawlBalancer(overhangStackFile);
remoteStack = new Balancer(cachePath, "urlNoticeRemoteStack", false);
this.remoteStack = new Balancer(cachePath, "urlNoticeRemoteStack", false);
this.minimumLocalDelta = minimumLocalDeltaInit;
this.minimumGlobalDelta = minimumGlobalDeltaInit;
}
public long getMinimumLocalDelta() {
return this.minimumLocalDelta;
}
public long getMinimumGlobalDelta() {
return this.minimumGlobalDelta;
}
public void setMinimumLocalDelta(long newDelta) {
this.minimumLocalDelta = Math.max(minimumLocalDeltaInit, newDelta);
}
public void setMinimumGlobalDelta(long newDelta) {
this.minimumGlobalDelta = Math.max(minimumGlobalDeltaInit, newDelta);
}
public void clear() {
coreStack.clear();
limitStack.clear();
@@ -185,7 +205,7 @@ public class NoticedURL {
return removed;
}
public CrawlEntry[] top(int stackType, int count) {
public ArrayList<CrawlEntry> top(int stackType, int count) {
switch (stackType) {
case STACK_TYPE_CORE: return top(coreStack, count);
case STACK_TYPE_LIMIT: return top(limitStack, count);
@@ -240,20 +260,16 @@ public class NoticedURL {
throw new IOException("balancer stack is empty");
}
private CrawlEntry[] top(Balancer balancer, int count) {
private ArrayList<CrawlEntry> top(Balancer balancer, int count) {
// this is a filo - top
if (count > balancer.size()) count = balancer.size();
ArrayList<CrawlEntry> list = new ArrayList<CrawlEntry>(count);
for (int i = 0; i < count; i++) {
try {
CrawlEntry entry = balancer.top(i);
if (entry == null) break;
list.add(entry);
} catch (IOException e) {
break;
}
ArrayList<CrawlEntry> list;
try {
list = balancer.top(count);
} catch (IOException e) {
list = new ArrayList<CrawlEntry>(0);
}
return list.toArray(new CrawlEntry[list.size()]);
return list;
}
public Iterator<CrawlEntry> iterator(int stackType) {

@@ -29,9 +29,9 @@ package de.anomic.index;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroColumn;
import de.anomic.kelondro.kelondroMicroDate;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRow.Entry;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.yacy.yacySeedDB;
public final class indexRWIRowEntry implements indexRWIEntry {
@@ -112,8 +112,8 @@ public final class indexRWIRowEntry implements indexRWIEntry {
assert (urlHash.length() == 12) : "urlhash = " + urlHash;
if ((language == null) || (language.length() != urlEntryRow.width(col_language))) language = "uk";
this.entry = urlEntryRow.newEntry();
int mddlm = plasmaWordIndex.microDateDays(lastmodified);
int mddct = plasmaWordIndex.microDateDays(updatetime);
int mddlm = kelondroMicroDate.microDateDays(lastmodified);
int mddct = kelondroMicroDate.microDateDays(updatetime);
this.entry.setCol(col_urlhash, urlHash, null);
this.entry.setCol(col_lastModified, mddlm);
this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation
@@ -158,11 +158,6 @@ public final class indexRWIRowEntry implements indexRWIEntry {
this.entry = rentry;
}
public static int days(long time) {
// calculates the number of days since 1.1.1970 and returns this as 4-byte array
return (int) (time / 86400000);
}
public indexRWIRowEntry clone() {
byte[] b = new byte[urlEntryRow.objectsize];
System.arraycopy(entry.bytes(), 0, b, 0, urlEntryRow.objectsize);
@@ -186,11 +181,11 @@ public final class indexRWIRowEntry implements indexRWIEntry {
}
public long lastModified() {
return plasmaWordIndex.reverseMicroDateDays((int) this.entry.getColLong(col_lastModified));
return kelondroMicroDate.reverseMicroDateDays((int) this.entry.getColLong(col_lastModified));
}
public long freshUntil() {
return plasmaWordIndex.reverseMicroDateDays((int) this.entry.getColLong(col_freshUntil));
return kelondroMicroDate.reverseMicroDateDays((int) this.entry.getColLong(col_freshUntil));
}
public int hitcount() {

@@ -27,7 +27,7 @@
package de.anomic.index;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.kelondro.kelondroMicroDate;
public class indexRWIVarEntry implements indexRWIEntry {
@@ -62,8 +62,8 @@ public class indexRWIVarEntry implements indexRWIEntry {
double termfrequency
) {
if ((language == null) || (language.length() != 2)) language = "uk";
int mddlm = plasmaWordIndex.microDateDays(lastmodified);
int mddct = plasmaWordIndex.microDateDays(updatetime);
int mddlm = kelondroMicroDate.microDateDays(lastmodified);
int mddct = kelondroMicroDate.microDateDays(updatetime);
this.flags = flags;
this.freshUntil = Math.max(0, mddlm + (mddct - mddlm) * 2);
this.lastModified = lastmodified;

@@ -0,0 +1,64 @@
// kelondroMicroDate.java
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 3.7.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro;
import java.util.Date;
public class kelondroMicroDate {
private static final long hour = 3600000L; // milliseconds of an hour
private static final long day = 86400000L; // milliseconds of a day
public static int microDateDays(Date modified) {
return microDateDays(modified.getTime());
}
public static int microDateDays(long modified) {
// this calculates a virtual age from a given date
// the purpose is to have an age in days of a given modified date
// from a fixed standpoint in the past
// one day has 60*60*24 seconds = 86400 seconds
// we take mod 64**3 = 262144, this is the mask of the storage
return (int) ((modified / day) % 262144L);
}
public static String microDateHoursStr(long time) {
return kelondroBase64Order.enhancedCoder.encodeLong(microDateHoursInt(time), 3);
}
public static int microDateHoursInt(long time) {
return (int) ((time / hour) % 262144L);
}
public static int microDateHoursAge(String mdhs) {
return microDateHoursInt(System.currentTimeMillis()) - (int) kelondroBase64Order.enhancedCoder.decodeLong(mdhs);
}
public static long reverseMicroDateDays(long microDateDays) {
return Math.min(System.currentTimeMillis(), microDateDays * day);
}
}

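The micro-date scheme stores day and hour counts modulo 262144 = 64^3, so an age fits into three base-64 characters. A quick round-trip check, assuming the class above is compiled and on the classpath:

// MicroDateDemo.java - round-trip through the day-granularity micro date
import de.anomic.kelondro.kelondroMicroDate;

public class MicroDateDemo {
    public static void main(String[] args) {
        long now = System.currentTimeMillis();
        int days = kelondroMicroDate.microDateDays(now);          // days since epoch, mod 262144
        long back = kelondroMicroDate.reverseMicroDateDays(days); // midnight of that day, capped at 'now'
        System.out.println(days + " -> " + back + " (" + (now - back) + " ms lost within the day)");
    }
}
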
@@ -0,0 +1,142 @@
package de.anomic.kelondro;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
public class kelondroRelations {
private File baseDir;
private HashMap<String, kelondroIndex> relations;
public kelondroRelations(File location) {
this.baseDir = location;
this.relations = new HashMap<String, kelondroIndex>(); // initialize the cache; without this, every relation lookup throws a NullPointerException
}
private static kelondroRow rowdef(String filename) {
int p = filename.lastIndexOf('.');
if (p >= 0) filename = filename.substring(0, p);
p = filename.lastIndexOf('-');
assert p >= 0;
int payloadsize = Integer.parseInt(filename.substring(p + 1));
filename = filename.substring(0, p);
p = filename.lastIndexOf('-');
assert p >= 0;
int keysize = Integer.parseInt(filename.substring(p + 1));
return rowdef(keysize, payloadsize);
}
private static kelondroRow rowdef(int keysize, int payloadsize) {
return new kelondroRow(
"byte[] key-" + keysize + ", " +
"long time-8, " + // fixed: keysize was wrongly concatenated to the time and ttl column widths
"int ttl-4, " +
"byte[] node-" + payloadsize,
kelondroNaturalOrder.naturalOrder, 0);
}
private static String filename(String tablename, int keysize, int payloadsize) {
return tablename + "-" + keysize + "-" + payloadsize + ".eco";
}
public void declareRelation(String name, int keysize, int payloadsize) {
// try to get the relation from the relation-cache
kelondroIndex relation = relations.get(name);
if (relation != null) return;
// try to find the relation as stored on file
String[] list = baseDir.list();
String targetfilename = filename(name, keysize, payloadsize);
for (int i = 0; i < list.length; i++) {
if (list[i].startsWith(name)) {
if (!list[i].equals(targetfilename)) continue;
kelondroRow row = rowdef(list[i]);
if (row.primaryKeyLength != keysize || row.column(3).cellwidth != payloadsize) continue; // a wrong table (the payload 'node' is column 3)
kelondroIndex table = new kelondroEcoTable(new File(baseDir, list[i]), row, kelondroEcoTable.tailCacheUsageAuto, 1024*1024, 0);
relations.put(name, table);
return;
}
}
// the relation does not exist, create it
kelondroRow row = rowdef(keysize, payloadsize);
kelondroIndex table = new kelondroEcoTable(new File(baseDir, targetfilename), row, kelondroEcoTable.tailCacheUsageAuto, 1024*1024, 0);
relations.put(name, table);
}
public kelondroIndex getRelation(String name) {
// try to get the relation from the relation-cache
kelondroIndex relation = relations.get(name);
if (relation != null) return relation;
// try to find the relation as stored on file
String[] list = baseDir.list();
for (int i = 0; i < list.length; i++) {
if (list[i].startsWith(name)) {
kelondroRow row = rowdef(list[i]);
kelondroIndex table = new kelondroEcoTable(new File(baseDir, list[i]), row, kelondroEcoTable.tailCacheUsageAuto, 1024*1024, 0);
relations.put(name, table);
return table;
}
}
// the relation does not exist
return null;
}
public String putRelation(String name, String key, String value) throws IOException {
byte[] r = putRelation(name, key.getBytes(), value.getBytes());
if (r == null) return null;
return new String(r);
}
public byte[] putRelation(String name, byte[] key, byte[] value) throws IOException {
kelondroIndex table = getRelation(name);
if (table == null) return null;
kelondroRow.Entry entry = table.row().newEntry();
entry.setCol(0, key);
entry.setCol(1, System.currentTimeMillis());
entry.setCol(2, 1000000);
entry.setCol(3, value);
kelondroRow.Entry oldentry = table.put(entry);
if (oldentry == null) return null;
return oldentry.getColBytes(3);
}
public String getRelation(String name, String key) throws IOException {
byte[] r = getRelation(name, key.getBytes());
if (r == null) return null;
return new String(r);
}
public byte[] getRelation(String name, byte[] key) throws IOException {
kelondroIndex table = getRelation(name);
if (table == null) return null;
kelondroRow.Entry entry = table.get(key);
if (entry == null) return null;
return entry.getColBytes(3);
}
public boolean hasRelation(String name, byte[] key) throws IOException {
kelondroIndex table = getRelation(name);
if (table == null) return false;
return table.has(key);
}
public byte[] removeRelation(String name, byte[] key) throws IOException {
kelondroIndex table = getRelation(name);
if (table == null) return null;
kelondroRow.Entry entry = table.remove(key, false);
if (entry == null) return null;
return entry.getColBytes(3);
}
public static void main(String args[]) {
kelondroRelations r = new kelondroRelations(new File("/Users/admin/"));
try {
String table1 = "test1";
r.declareRelation(table1, 12, 30);
r.putRelation(table1, "abcdefg", "eineintrag");
r.putRelation(table1, "abcdefg", "eineintrag");
} catch (IOException e) {
e.printStackTrace();
}
}
}

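The commit message announces a TTL rule for the new distributed file system; putRelation already writes a time column and a (currently fixed, 1000000 ms) ttl column, which would support an expiry check along these lines (a sketch, not part of this commit):

// RelationTTLSketch.java - hypothetical expiry check on the time/ttl columns
public class RelationTTLSketch {
    // an entry is stale once its write time plus its ttl lies in the past
    static boolean isExpired(long timeWritten, long ttlMillis) {
        return System.currentTimeMillis() > timeWritten + ttlMillis;
    }

    public static void main(String[] args) {
        long written = System.currentTimeMillis() - 2000000L; // written about 33 minutes ago
        System.out.println(isExpired(written, 1000000L));     // true: the default ttl has passed
    }
}
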
@@ -55,6 +55,7 @@ import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroCollectionIndex;
import de.anomic.kelondro.kelondroFlexTable;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroMicroDate;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRowSet;
import de.anomic.server.serverDate;
@@ -131,8 +132,8 @@ public class plasmaRankingCRProcess {
} else {
// initialize counters and dates
acc_entry = acc.newEntry(key, new_entry.getAttrs(), new_entry.getSeqSet());
FUDate = plasmaWordIndex.microDateHoursInt(System.currentTimeMillis()); // first update date
FDDate = plasmaWordIndex.microDateHoursInt(System.currentTimeMillis()); // very difficult to compute; this is only a quick-hack
FUDate = kelondroMicroDate.microDateHoursInt(System.currentTimeMillis()); // first update date
FDDate = kelondroMicroDate.microDateHoursInt(System.currentTimeMillis()); // very difficult to compute; this is only a quick-hack
LUDate = (int) new_entry.getAttr("VDate", 0);
UCount = 0;
PCount = (new_flags.get(1)) ? 1 : 0;
@@ -210,8 +211,8 @@ public class plasmaRankingCRProcess {
acc_entry.setCol(i, new_entry.getAttr(acc.row().column(i).nickname, 0));
}
seq.put(key.getBytes(), new_entry.getSeqCollection());
FUDate = plasmaWordIndex.microDateHoursInt(System.currentTimeMillis()); // first update date
FDDate = plasmaWordIndex.microDateHoursInt(System.currentTimeMillis()); // very difficult to compute; this is only a quick-hack
FUDate = kelondroMicroDate.microDateHoursInt(System.currentTimeMillis()); // first update date
FDDate = kelondroMicroDate.microDateHoursInt(System.currentTimeMillis()); // very difficult to compute; this is only a quick-hack
LUDate = (int) new_entry.getAttr("VDate", 0);
UCount = 0;
PCount = (new_flags.get(1)) ? 1 : 0;

@@ -1019,6 +1019,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
// start a loader
log.logConfig("Starting Crawl Loader");
this.crawlQueues = new CrawlQueues(this, plasmaPath);
this.crawlQueues.noticeURL.setMinimumLocalDelta(this.getConfigLong("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()));
this.crawlQueues.noticeURL.setMinimumGlobalDelta(this.getConfigLong("minimumGlobalDelta", this.crawlQueues.noticeURL.getMinimumGlobalDelta()));
/*
* Creating sync objects and loading status for the crawl jobs
@@ -2388,19 +2390,19 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
thread = getThread(CRAWLJOB_LOCAL_CRAWL);
if (thread != null) {
setConfig(CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP , thread.setBusySleep(newBusySleep));
thread.setIdleSleep(1000);
thread.setIdleSleep(2000);
}
thread = getThread(PROXY_CACHE_ENQUEUE);
if (thread != null) {
setConfig(PROXY_CACHE_ENQUEUE_BUSYSLEEP , thread.setBusySleep(0));
thread.setIdleSleep(1000);
thread.setIdleSleep(2000);
}
thread = getThread(INDEXER);
if (thread != null) {
setConfig(INDEXER_BUSYSLEEP , thread.setBusySleep(newBusySleep / 2));
thread.setIdleSleep(1000);
setConfig(INDEXER_BUSYSLEEP , thread.setBusySleep(newBusySleep / 8));
thread.setIdleSleep(2000);
}
}

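The indexer's busy sleep drops from newBusySleep / 2 to newBusySleep / 8, so the indexing queue is flushed four times as often, while the idle sleep of all three threads doubles to 2000 ms. A quick illustration with an assumed profile value:

// BusySleepDemo.java - effect of the new busy-sleep distribution (assumed input value)
public class BusySleepDemo {
    public static void main(String[] args) {
        long newBusySleep = 1600; // hypothetical value taken from a performance profile
        System.out.println("local crawl busy sleep: " + newBusySleep + " ms");
        System.out.println("indexer busy sleep, old: " + (newBusySleep / 2) + " ms"); // 800 ms
        System.out.println("indexer busy sleep, new: " + (newBusySleep / 8) + " ms"); // 200 ms
    }
}
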
@@ -38,6 +38,7 @@ import java.util.TreeMap;
import java.util.TreeSet;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroMicroDate;
import de.anomic.server.serverDate;
import de.anomic.server.serverFileUtils;
import de.anomic.server.logging.serverLog;
@@ -119,8 +120,8 @@ public class plasmaWebStructure {
// append this reference to buffer
// generate header info
String head = url.hash() + "=" +
plasmaWordIndex.microDateHoursStr(docDate.getTime()) + // latest update timestamp of the URL
plasmaWordIndex.microDateHoursStr(System.currentTimeMillis()) + // last visit timestamp of the URL
kelondroMicroDate.microDateHoursStr(docDate.getTime()) + // latest update timestamp of the URL
kelondroMicroDate.microDateHoursStr(System.currentTimeMillis()) + // last visit timestamp of the URL
kelondroBase64Order.enhancedCoder.encodeLongSmart(LCount, 2) + // count of links to local resources
kelondroBase64Order.enhancedCoder.encodeLongSmart(GCount, 2) + // count of links to global resources
kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getImages().size(), 2) + // count of Images in document

@@ -510,37 +510,6 @@ public final class plasmaWordIndex implements indexRI {
return containerList.size();
}
private static final int hour = 3600000;
private static final int day = 86400000;
public static int microDateDays(Date modified) {
return microDateDays(modified.getTime());
}
public static int microDateDays(long modified) {
// this calculates a virtual age from a given date
// the purpose is to have an age in days of a given modified date
// from a fixed standpoint in the past
// one day has 60*60*24 seconds = 86400 seconds
// we take mod 64**3 = 262144, this is the mask of the storage
return (int) ((modified / day) % 262144);
}
public static String microDateHoursStr(long time) {
return kelondroBase64Order.enhancedCoder.encodeLong(microDateHoursInt(time), 3);
}
public static int microDateHoursInt(long time) {
return (int) ((time / hour) % 262144);
}
public static int microDateHoursAge(String mdhs) {
return microDateHoursInt(System.currentTimeMillis()) - (int) kelondroBase64Order.enhancedCoder.decodeLong(mdhs);
}
public static long reverseMicroDateDays(int microDateDays) {
return ((long) microDateDays) * ((long) day);
}
public int addPageIndex(yacyURL url, Date urlModified, int size, plasmaParserDocument document, plasmaCondenser condenser, String language, char doctype, int outlinksSame, int outlinksOther) {
// this is called by the switchboard to put in a new page into the index

@@ -233,9 +233,11 @@ public final class yacyClient {
* @return response body
* @throws IOException
*/
/*
private static byte[] wput(final String url, String vhost, final List<Part> post, boolean gzipBody) throws IOException {
return wput(url, vhost, post, 10000, gzipBody);
}
*/
/**
* send data to the server named by vhost
*
