slightly changed crawling policy

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6723 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent de01fe0e6d
commit b88f5fbb4b

@@ -138,10 +138,12 @@ public class CrawlQueues {
/**
 * Reports which store already knows the URL with the given hash.
 *
 * @param hash the URL hash to look up
 * @return "delegated" or "errors" when found in the respective store,
 *         or null when the hash is unknown to both
 */
public String urlExists(final String hash) {
if (delegatedURL.exists(hash)) return "delegated";
if (errorURL.exists(hash)) return "errors";
// NOTE(review): the crawler-stack and worker checks below were disabled by
// this commit ("slightly changed crawling policy") — presumably to avoid
// the cost of scanning the notice stack and workers on every lookup;
// confirm callers no longer rely on the "crawler"/"worker" answers.
/*
if (noticeURL.existsInStack(hash)) return "crawler";
for (final crawlWorker worker: workers.values()) {
if (worker.request.url().hash().equals(hash)) return "worker";
}
if (noticeURL.existsInStack(hash)) return "crawler";
*/
return null;
}
@@ -154,9 +156,9 @@ public class CrawlQueues {
public DigestURI getURL(final String urlhash) {
assert urlhash != null;
if (urlhash == null || urlhash.length() == 0) return null;
ZURL.Entry ee = delegatedURL.getEntry(urlhash);
ZURL.Entry ee = delegatedURL.get(urlhash);
if (ee != null) return ee.url();
ee = errorURL.getEntry(urlhash);
ee = errorURL.get(urlhash);
if (ee != null) return ee.url();
for (final crawlWorker w: workers.values()) {
if (w.request.url().hash().equals(urlhash)) return w.request.url();

@@ -260,23 +260,36 @@ public final class CrawlStacker {
// check if the url is double registered
final String dbocc = nextQueue.urlExists(entry.url().hash()); // returns the name of the queue if entry exists
URIMetadataRow oldEntry = null;
if (dbocc != null || (oldEntry = indexSegment.urlMetadata().load(entry.url().hash(), null, 0)) != null) {
final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime());
// do double-check
if ((dbocc != null) && (!recrawl)) {
URIMetadataRow oldEntry = indexSegment.urlMetadata().load(entry.url().hash(), null, 0);
if (oldEntry == null) {
if (dbocc != null) {
// do double-check
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "double in: " + dbocc;
if (dbocc.equals("errors")) {
ZURL.Entry errorEntry = nextQueue.errorURL.get(entry.url().hash());
return "double in: errors (" + errorEntry.anycause() + ")";
} else {
return "double in: " + dbocc;
}
}
if ((oldEntry != null) && (!recrawl)) {
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in 'LURL'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "double in: LURL";
}
// show potential re-crawl
if (recrawl && oldEntry != null) {
if (this.log.isFine()) this.log.logFine("RE-CRAWL of URL '" + entry.url().toString() + "': this url was crawled " +
} else {
final boolean recrawl = profile.recrawlIfOlder() > oldEntry.loaddate().getTime();
if (recrawl) {
if (this.log.isFine())
this.log.logFine("RE-CRAWL of URL '" + entry.url().toString() + "': this url was crawled " +
((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago.");
} else {
if (dbocc == null) {
return "double in: LURL-DB";
} else {
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
if (dbocc.equals("errors")) {
ZURL.Entry errorEntry = nextQueue.errorURL.get(entry.url().hash());
return "double in: errors (" + errorEntry.anycause() + ")";
} else {
return "double in: " + dbocc;
}
}
}
}

@@ -127,9 +127,10 @@ public class ZURL implements Iterable<ZURL.Entry> {
final int workcount,
String anycause) {
assert executor != null;
if (exists(bentry.url().hash())) return; // don't insert double causes
if (anycause == null) anycause = "unknown";
Entry entry = new Entry(bentry, executor, workdate, workcount, anycause);
entry.store();
put(entry);
stack.add(entry.hash());
while (stack.size() > maxStackSize) stack.poll();
}
@@ -157,7 +158,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
}
// Iterator step: resolve the next stacked hash to its full ZURL entry.
public ZURL.Entry next() {
return getEntry(hi.next());
// NOTE(review): the line below is unreachable — this is an old/new pair
// left over from a stripped diff (getEntry was renamed to get); only one
// of the two return statements belongs in the real source.
return get(hi.next());
}
public void remove() {
@@ -166,7 +167,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
}
public ZURL.Entry getEntry(final String urlhash) {
public ZURL.Entry get(final String urlhash) {
try {
if (urlIndex == null) return null;
//System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " get " + urlhash);
@@ -179,6 +180,29 @@ public class ZURL implements Iterable<ZURL.Entry> {
}
}
/**
 * Serializes the given entry into the backing url index. Private on
 * purpose: callers must go through push(), which constructs the Entry,
 * maintains the bounded stack of hashes, and prevents double insertion.
 *
 * @param entry the entry to persist; ignored when already stored or when
 *              it carries no balancer entry (bentry == null)
 */
private void put(Entry entry) {
// stores the values from the object variables into the database
if (entry.stored) return;
if (entry.bentry == null) return;
// row layout: 0=url hash, 1=executor, 2=work date, 3=work count,
// 4=failure cause, 5=serialized balancer entry
final Row.Entry newrow = rowdef.newEntry();
newrow.setCol(0, entry.bentry.url().hash().getBytes());
newrow.setCol(1, entry.executor.getBytes());
newrow.setCol(2, entry.workdate.getTime());
newrow.setCol(3, entry.workcount);
newrow.setCol(4, entry.anycause.getBytes());
newrow.setCol(5, entry.bentry.toRow().bytes());
try {
// urlIndex may be null when the store was closed/not opened; skip the
// write but still mark the entry stored so it is not retried forever
if (urlIndex != null) urlIndex.put(newrow);
entry.stored = true;
} catch (final Exception e) {
Log.logException(e);
}
}
/**
 * Tests whether an entry with the given URL hash is present in this store.
 *
 * @param urlHash the URL hash to look up
 * @return true when the hash is in the index; false otherwise, including
 *         when the backing index is not available
 */
public boolean exists(final String urlHash) {
    // guard against a missing index — get() already checks
    // "if (urlIndex == null) return null;", but exists() would NPE here
    return urlIndex != null && urlIndex.has(urlHash.getBytes());
}
@@ -228,26 +252,6 @@ public class ZURL implements Iterable<ZURL.Entry> {
this.stored = true;
return;
}
/**
 * Persists this entry's fields into the backing url index. Idempotent:
 * does nothing when the entry was already stored or when it carries no
 * balancer entry to serialize.
 */
protected void store() {
    if (this.stored || this.bentry == null) return;
    // row layout: 0=url hash, 1=executor, 2=work date, 3=work count,
    // 4=failure cause, 5=serialized balancer entry
    final Row.Entry row = rowdef.newEntry();
    row.setCol(0, this.bentry.url().hash().getBytes());
    row.setCol(1, this.executor.getBytes());
    row.setCol(2, this.workdate.getTime());
    row.setCol(3, this.workcount);
    row.setCol(4, this.anycause.getBytes());
    row.setCol(5, this.bentry.toRow().bytes());
    try {
        // the index may be absent (store closed); mark stored regardless
        // so the write is not retried endlessly
        if (urlIndex != null) urlIndex.put(row);
        this.stored = true;
    } catch (final Exception e) {
        Log.logException(e);
    }
}
public DigestURI url() {
return this.bentry.url();

Loading…
Cancel
Save