Better and more consistent deletion of error URLs

pull/1/head
Michael Peter Christen 12 years ago
parent 2602be8d1e
commit e40671ddb7

@ -24,7 +24,6 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
@ -56,7 +55,6 @@ import net.yacy.peers.NewsPool;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@ -387,16 +385,9 @@ public class Crawler_p {
try {sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);} catch (final SpaceExceededException e1) {}
// delete all error urls for that domain
List<byte[]> hosthashes = new ArrayList<byte[]>();
for (DigestURL u: rootURLs) {
hosthashes.add(ASCII.getBytes(u.hosthash()));
}
sb.crawlQueues.errorURL.removeHosts(hosthashes);
for (byte[] hosthash: hosthashes) {
try {
String deletequery = CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]";
sb.index.fulltext().getDefaultConnector().deleteByQuery(deletequery);
} catch (final IOException e) {ConcurrentLog.logException(e);}
sb.index.fulltext().remove(u.hash());
sb.crawlQueues.errorURL.removeHost(ASCII.getBytes(u.hosthash()));
}
sb.index.fulltext().commit(true);

@ -32,7 +32,6 @@
import java.net.MalformedURLException;
import java.util.Date;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
@ -128,7 +127,6 @@ public class QuickCrawlLink_p {
final byte[] urlhash = crawlingStartURL.hash();
indexSegment.fulltext().remove(urlhash);
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
sb.crawlQueues.errorURL.remove(ASCII.String(urlhash));
// create crawling profile
CrawlProfile pe = null;

@ -27,7 +27,6 @@ package net.yacy.crawler;
import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
@ -185,9 +184,7 @@ public final class CrawlStacker {
if (replace) {
this.indexSegment.fulltext().remove(urlhash);
byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6);
List<byte[]> hosthashes = new ArrayList<byte[]>(); hosthashes.add(hosthash);
this.nextQueue.errorURL.removeHosts(hosthashes);
this.nextQueue.removeURL(urlhash);
this.nextQueue.errorURL.removeHost(hosthash);
String u = url.toNormalform(true);
if (u.endsWith("/")) {
u = u + "index.html";
@ -198,7 +195,6 @@ public final class CrawlStacker {
final byte[] uh = new DigestURL(u).hash();
this.indexSegment.fulltext().remove(uh);
this.nextQueue.noticeURL.removeByURLHash(uh);
this.nextQueue.errorURL.remove(ASCII.String(uh));
} catch (final MalformedURLException e1) {}
}
@ -246,7 +242,6 @@ public final class CrawlStacker {
if (replace) {
CrawlStacker.this.indexSegment.fulltext().remove(urlhash);
cq.noticeURL.removeByURLHash(urlhash);
cq.errorURL.remove(ASCII.String(urlhash));
}
// put entry on crawl stack

@ -119,11 +119,6 @@ public class CrawlQueues {
this.workers.clear();
this.remoteCrawlProviderHashes.clear();
this.noticeURL.clear();
try {
this.errorURL.clear();
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
this.delegatedURL.clear();
}
@ -154,7 +149,6 @@ public class CrawlQueues {
assert hash != null && hash.length == 12;
this.noticeURL.removeByURLHash(hash);
this.delegatedURL.remove(hash);
this.errorURL.remove(ASCII.String(hash));
}
public DigestURL getURL(final byte[] urlhash) {

@ -2878,9 +2878,8 @@ public final class Switchboard extends serverSwitch {
// remove the document from the error-db
byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6);
List<byte[]> hosthashes = new ArrayList<byte[]>(); hosthashes.add(hosthash);
this.crawlQueues.errorURL.removeHosts(hosthashes);
this.crawlQueues.removeURL(urlhash);
this.crawlQueues.errorURL.removeHost(hosthash);
this.index.fulltext().remove(urlhash);
// get a scraper to get the title
Document scraper;

@ -79,28 +79,14 @@ public class ErrorCache {
this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
}
public void remove(final String hash) {
if (hash == null) return;
this.stack.remove(hash);
public void removeHost(final byte[] hosthash) {
if (hosthash == null) return;
try {
this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + hash + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
} catch (final IOException e) {
return;
}
}
public void removeHosts(final Iterable<byte[]> hosthashes) {
if (hosthashes == null) return;
try {
for (byte[] hosthash : hosthashes) {
this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
}
this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
Iterator<String> i = ErrorCache.this.stack.keySet().iterator();
while (i.hasNext()) {
String b = i.next();
for (byte[] hosthash : hosthashes) {
if (NaturalOrder.naturalOrder.equal(hosthash, 0, ASCII.getBytes(b), 6, 6)) i.remove();
}
if (NaturalOrder.naturalOrder.equal(hosthash, 0, ASCII.getBytes(b), 6, 6)) i.remove();
}
} catch (final IOException e) {
}

Loading…
Cancel
Save