forced deletion of ZURL entries for a specific host, for each host that appears in the crawl url list
pull/1/head
Michael Peter Christen 12 years ago
parent e137ff4171
commit dbef8ccfcb

@@ -392,7 +392,7 @@ public class Crawler_p {
             for (DigestURI u: rootURLs) {
                 hosthashes.add(ASCII.getBytes(u.hosthash()));
             }
-            sb.crawlQueues.errorURL.removeHost(hosthashes, true);
+            sb.crawlQueues.errorURL.removeHosts(hosthashes, false);
             for (byte[] hosthash: hosthashes) {
                 try {
                     String deletequery = CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]";

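Two things change in this call: removeHost becomes removeHosts (it takes a list of hosts), and the second argument flips from true to false, which presumably selects synchronous instead of concurrent execution (see the ZURL hunk below), so the error entries are gone before the Solr cleanup runs. For illustration, with a hypothetical host hash "abcdef", the deletequery concatenation above would produce:

    // a minimal sketch, assuming the hypothetical host hash "abcdef";
    // host_id_s and failreason_s are the Solr field names resolved from
    // CollectionSchema, and the range [* TO *] matches any value, i.e.
    // every document of this host that carries a fail reason
    String deletequery = "host_id_s:\"abcdef\" AND failreason_s:[* TO *]";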
@@ -133,4 +133,15 @@ public class ASCII implements Comparator<String> {
         }
         return b;
     }
+
+    public final static byte[] getBytes(final String s, final int beginIndex, final int endIndex) {
+        assert s != null;
+        //assert s.length() < 3 || s.charAt(2) != '@';
+        int count = endIndex - beginIndex;
+        final byte[] b = new byte[count];
+        for (int i = 0; i < count; i++) {
+            b[i] = (byte) s.charAt(i + beginIndex);
+        }
+        return b;
+    }
 }

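The new ASCII.getBytes overload copies a character range directly into a byte array, avoiding the intermediate String that s.substring(beginIndex, endIndex) would allocate. A minimal usage sketch with a hypothetical 12-character URL hash, matching how host hashes are taken from URL hashes elsewhere in this commit:

    // extract the host part (characters 6..11) of a 12-character URL hash
    String urlhash = "abcdefAAAAAA";                  // hypothetical sample value
    byte[] hosthash = ASCII.getBytes(urlhash, 6, 12); // -> the bytes of "AAAAAA"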
@ -27,7 +27,9 @@ package net.yacy.crawler;
import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
@@ -184,6 +186,9 @@ public final class CrawlStacker {
         final byte[] urlhash = url.hash();
         if (replace) {
             this.indexSegment.fulltext().remove(urlhash);
+            byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6);
+            List<byte[]> hosthashes = new ArrayList<byte[]>(); hosthashes.add(hosthash);
+            this.nextQueue.errorURL.removeHosts(hosthashes, false);
             this.nextQueue.removeURL(urlhash);
             String u = url.toNormalform(true);
             if (u.endsWith("/")) {

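The two added copy lines rest on a layout assumption that also appears in the Switchboard hunk below: in a 12-byte YaCy URL hash, bytes 6..11 encode the host. Under that assumption, the arraycopy should yield the same six bytes as the string-based route used in Crawler_p above; a sketch, not part of the commit:

    // assumed layout: urlhash[0..5] = document part, urlhash[6..11] = host hash
    byte[] viaCopy = new byte[6];
    System.arraycopy(url.hash(), 6, viaCopy, 0, 6);    // as in this hunk
    byte[] viaString = ASCII.getBytes(url.hosthash()); // as in Crawler_p above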
@@ -161,6 +161,7 @@ public class CrawlQueues {
     }

     public void removeURL(final byte[] hash) {
         assert hash != null && hash.length == 12;
+        this.noticeURL.removeByURLHash(hash);
         this.delegatedURL.remove(hash);
         this.errorURL.remove(hash);

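With the added noticeURL line, removeURL now purges a hash from all three crawler-side structures in one call, so callers like the CrawlStacker hunk above do not leave a stale entry on the local crawl stacks. Usage, with a hypothetical hash value:

    // forget a URL everywhere in the crawler queues
    byte[] urlhash = ASCII.getBytes("abcdefAAAAAA"); // hypothetical 12-char hash
    crawlQueues.removeURL(urlhash);                  // noticeURL, delegatedURL and errorURL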
@@ -136,13 +136,18 @@ public class ZURL implements Iterable<ZURL.Entry> {
         if (hash == null) return false;
         //System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " remove " + hash);
         try {
+            Iterator<byte[]> i = ZURL.this.stack.iterator();
+            while (i.hasNext()) {
+                byte[] b = i.next();
+                if (NaturalOrder.naturalOrder.equal(hash, b)) i.remove();
+            }
             return this.urlIndex.delete(hash);
         } catch (final IOException e) {
             return false;
         }
     }

-    public void removeHost(final Iterable<byte[]> hosthashes, final boolean concurrent) {
+    public void removeHosts(final Iterable<byte[]> hosthashes, final boolean concurrent) {
         if (hosthashes == null) return;
         Thread t = new Thread() {
             public void run() {

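Two changes here: remove now also walks the in-memory stack and drops the hash there, so deleted entries cannot resurface through iteration; and removeHost is renamed to removeHosts to match its Iterable argument. The hunk is cut off inside the new thread; an assumed continuation, judging from the concurrent parameter (not visible in this excerpt):

    // assumption: the boolean decides between background and inline execution,
    // which is why Crawler_p above passes false to purge before its Solr deletes
    if (concurrent) t.start(); else t.run();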
@@ -34,7 +34,6 @@ import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.protocol.http.HTTPClient;
 import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.crawler.data.Cache;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.data.Latency;
 import net.yacy.crawler.data.ZURL.FailCategory;

@@ -2903,6 +2903,9 @@ public final class Switchboard extends serverSwitch {
         }

         // remove the document from the error-db
+        byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6);
+        List<byte[]> hosthashes = new ArrayList<byte[]>(); hosthashes.add(hosthash);
+        this.crawlQueues.errorURL.removeHosts(hosthashes, false);
         this.crawlQueues.removeURL(urlhash);

         // get a scraper to get the title

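This Switchboard change mirrors the CrawlStacker hunk line for line: extract the host hash from the URL hash, wrap it in a list, and purge the host from the error-db synchronously. A hypothetical helper (not in the commit) that both call sites could share:

    // hypothetical refactoring sketch: drop all error-db entries
    // for the host of the given 12-byte URL hash
    static void removeHostOf(final ZURL errorURL, final byte[] urlhash) {
        final byte[] hosthash = new byte[6];
        System.arraycopy(urlhash, 6, hosthash, 0, 6);
        final List<byte[]> hosthashes = new ArrayList<byte[]>(1);
        hosthashes.add(hosthash);
        errorURL.removeHosts(hosthashes, false); // synchronous purge
    }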