fixes to deletion methods (removed unnecessary concurrency and added removal of crawl queue entries)
pull/1/head
Michael Peter Christen 12 years ago
parent f2c9b0b5f2
commit e26bdd4a52

@@ -178,8 +178,7 @@ public class Crawler_p {
                 // add the prefix http:// if necessary
                 int pos = crawlingStart.indexOf("://",0);
                 if (pos == -1) {
-                    if (crawlingStart.startsWith("www")) crawlingStart = "http://" + crawlingStart;
-                    if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart;
+                    if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart; else crawlingStart = "http://" + crawlingStart;
                 }
                 try {
                     DigestURI crawlingStartURL = new DigestURI(crawlingStart);
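The old pair of if-statements added a default protocol only for hosts beginning with "www" or "ftp"; a scheme-less start such as "example.com" fell through both branches and reached new DigestURI() without any protocol. The rewritten line keeps the ftp special case and defaults everything else to http://. A minimal sketch of the new rule as a standalone helper (hypothetical name, not part of the commit; the same fix recurs in IndexDeletion_p below):

    // Hypothetical helper mirroring the rewritten prefix logic.
    public static String withDefaultScheme(final String start) {
        if (start.indexOf("://", 0) >= 0) return start;   // scheme already present
        return start.startsWith("ftp") ? "ftp://" + start : "http://" + start;
    }
    // withDefaultScheme("example.com")     -> "http://example.com"
    // withDefaultScheme("ftp.example.com") -> "ftp://ftp.example.com"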

@@ -282,7 +282,7 @@ public class HostBrowser {
             Map<String, ReversibleScoreMap<String>> outboundHosts = new HashMap<String, ReversibleScoreMap<String>>();
             Map<String, InfoCacheEntry> infoCache = new HashMap<String, InfoCacheEntry>();
             int hostsize = 0;
-            final List<byte[]> deleteIDs = new ArrayList<byte[]>();
+            final List<String> deleteIDs = new ArrayList<String>();
             long timeout = System.currentTimeMillis() + TIMEOUT;
             while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                 String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
@@ -292,7 +292,7 @@
                 infoCache.put(ids, new InfoCacheEntry(doc));
                 if (u.startsWith(path)) {
                     if (delete) {
-                        deleteIDs.add(ASCII.getBytes(ids));
+                        deleteIDs.add(ids);
                     } else {
                         if (error == null) storedDocs.add(u); else if (admin) errorDocs.put(u, error);
                     }
@@ -328,10 +328,7 @@
                     }
                     if (System.currentTimeMillis() > timeout) break;
                 }
-                if (deleteIDs.size() > 0) {
-                    for (byte[] b: deleteIDs) sb.crawlQueues.urlRemove(b);
-                    sb.index.fulltext().remove(deleteIDs, true);
-                }
+                if (deleteIDs.size() > 0) sb.remove(deleteIDs);
 
                 // collect from crawler
                 List<Request> domainStackReferences = (admin) ? sb.crawlQueues.noticeURL.getDomainStackReferences(StackType.LOCAL, host, 1000, 3000) : new ArrayList<Request>(0);
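HostBrowser now keeps the Solr document ids as Strings instead of converting them to byte[] hashes, and hands the whole batch to the new Switchboard.remove (added further down in this commit), which clears the crawl queue entries as well as the index. The String-to-byte[]-to-String round-trip that disappears, reconstructed from the hunks in this diff:

    // Before: String -> byte[] at collection time ...
    deleteIDs.add(ASCII.getBytes(ids));
    // ... and byte[] -> String again inside Fulltext, one Solr call per id:
    Fulltext.this.getDefaultConnector().deleteById(ASCII.String(urlHash));
    // After: the String id flows through unchanged and is deleted in one batch:
    sb.remove(deleteIDs);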

@@ -45,7 +45,7 @@ import net.yacy.server.serverSwitch;
 public class IndexDeletion_p {
 
-    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
+    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
         // return variable that accumulates replacements
         final Switchboard sb = (Switchboard) env;
         final serverObjects prop = new serverObjects();
@@ -126,8 +126,7 @@ public class IndexDeletion_p {
                     if (urlStub == null || urlStub.length() == 0) continue;
                     int pos = urlStub.indexOf("://",0);
                     if (pos == -1) {
-                        if (urlStub.startsWith("www")) urlStub = "http://" + urlStub;
-                        if (urlStub.startsWith("ftp")) urlStub = "ftp://" + urlStub;
+                        if (urlStub.startsWith("ftp")) urlStub = "ftp://" + urlStub; else urlStub = "http://" + urlStub;
                     }
                     try {
                         DigestURI u = new DigestURI(urlStub);
@@ -140,7 +139,6 @@ public class IndexDeletion_p {
                         }
                     } catch (InterruptedException e) {
                     }
-                    sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, docs matching with " + urldelete);
                 } catch (MalformedURLException e) {}
             }
@@ -148,11 +146,8 @@ public class IndexDeletion_p {
                 count = ids.size();
                 prop.put("urldelete-active", count == 0 ? 2 : 1);
             } else {
-                try {
-                    defaultConnector.deleteByIds(ids);
-                    //webgraphConnector.deleteByQuery(webgraphQuery);
-                } catch (IOException e) {
-                }
+                sb.remove(ids);
+                sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, docs matching with " + urldelete);
                 prop.put("urldelete-active", 2);
             }
         } else {
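Two changes land here: the direct defaultConnector.deleteByIds(ids) call becomes sb.remove(ids), which also purges the webgraph and the crawl queues, and recordAPICall moves out of the per-URL loop (removed in the hunk above) so the deletion is logged once per request instead of once per URL stub. What sb.remove(ids) delegates to, per the Switchboard hunk below:

    // Body of the new Switchboard.remove(Collection<String>):
    this.index.fulltext().remove(deleteIDs);    // Solr cores + urlIndexFile
    for (String id: deleteIDs) {
        this.crawlQueues.urlRemove(ASCII.getBytes(id));
    }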

@@ -30,7 +30,6 @@ import java.util.ConcurrentModificationException;
 import java.util.Iterator;
 import java.util.Map;
 import net.yacy.cora.federate.solr.connector.CachedSolrConnector;
-import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.kelondro.index.Cache;

@@ -2800,6 +2800,18 @@ public final class Switchboard extends serverSwitch {
         }
     }
+
+    public void remove(final Collection<String> deleteIDs) {
+        this.index.fulltext().remove(deleteIDs);
+        for (String id: deleteIDs) {
+            this.crawlQueues.urlRemove(ASCII.getBytes(id));
+        }
+    }
+
+    public void remove(final byte[] urlhash) {
+        this.index.fulltext().remove(urlhash);
+        this.crawlQueues.urlRemove(urlhash);
+    }
 
     public void stackURLs(Set<DigestURI> rootURLs, final CrawlProfile profile, final Set<DigestURI> successurls, final Map<DigestURI,String> failurls) {
         if (rootURLs == null || rootURLs.size() == 0) return;
         List<Thread> stackthreads = new ArrayList<Thread>(); // do this concurrently
@@ -2831,9 +2843,7 @@
             // remove url from the index to be prepared for a re-crawl
             final byte[] urlhash = url.hash();
-            this.index.fulltext().remove(urlhash);
-            this.crawlQueues.noticeURL.removeByURLHash(urlhash);
-            this.crawlQueues.errorURL.remove(urlhash);
+            remove(urlhash);
 
             // special handling of ftp protocol
             if (url.isFTP()) {
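The two new overloads make the Switchboard the single entry point for deletions: the re-crawl path above no longer touches noticeURL and errorURL itself, and one remove(urlhash) call also covers the fulltext removal that previously stood next to the queue calls. A minimal usage sketch, assuming a Switchboard instance sb and a made-up 12-character URL hash:

    // assumes: java.util.Arrays and the ASCII helper used throughout this diff
    sb.remove(Arrays.asList("AAAAAAAAAAAA"));    // batch: index + crawl queue entries
    sb.remove(ASCII.getBytes("AAAAAAAAAAAA"));   // single document, same cleanup path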

@@ -685,37 +685,32 @@ public final class Fulltext {
      * @param deleteIDs a list of urlhashes; each denoting a document
      * @param concurrently if true, then the method returnes immediately and runs concurrently
      */
-    public void remove(final List<byte[]> deleteIDs, final boolean concurrently) {
+    public void remove(final Collection<String> deleteIDs) {
         if (deleteIDs == null || deleteIDs.size() == 0) return;
-        Thread t = new Thread() {
-            public void run() {
-                try {
-                    synchronized (Fulltext.this.solrInstances) {
-                        for (byte[] urlHash: deleteIDs) {
-                            Fulltext.this.getDefaultConnector().deleteById(ASCII.String(urlHash));
-                            Fulltext.this.getWebgraphConnector().deleteByQuery(WebgraphSchema.source_id_s.getSolrFieldName() + ":\"" + ASCII.String(urlHash) + "\"");
-                        }
-                        Fulltext.this.commit(true);
-                    }
-                } catch (final Throwable e) {
-                    Log.logException(e);
-                }
-                if (Fulltext.this.urlIndexFile != null) try {
-                    for (byte[] urlHash: deleteIDs) {
-                        final Row.Entry r = Fulltext.this.urlIndexFile.remove(urlHash);
-                        if (r != null) Fulltext.this.statsDump = null;
-                    }
-                } catch (final IOException e) {}
-            }};
-        if (concurrently) t.start(); else t.run();
+        try {
+            synchronized (Fulltext.this.solrInstances) {
+                this.getDefaultConnector().deleteByIds(deleteIDs);
+                this.getWebgraphConnector().deleteByIds(deleteIDs);
+                this.commit(true);
+            }
+        } catch (final Throwable e) {
+            Log.logException(e);
+        }
+        if (Fulltext.this.urlIndexFile != null) try {
+            for (String id: deleteIDs) {
+                final Row.Entry r = Fulltext.this.urlIndexFile.remove(ASCII.getBytes(id));
+                if (r != null) Fulltext.this.statsDump = null;
+            }
+        } catch (final IOException e) {}
     }
 
     public boolean remove(final byte[] urlHash) {
         if (urlHash == null) return false;
         try {
+            String id = ASCII.String(urlHash);
             synchronized (this.solrInstances) {
-                this.getDefaultConnector().deleteById(ASCII.String(urlHash));
-                this.getWebgraphConnector().deleteByQuery(WebgraphSchema.source_id_s.getSolrFieldName() + ":\"" + ASCII.String(urlHash) + "\"");
+                this.getDefaultConnector().deleteById(id);
+                this.getWebgraphConnector().deleteById(id);
             }
         } catch (final Throwable e) {
             Log.logException(e);
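The collection variant used to spawn a Thread per call and delete ids one at a time, issuing a webgraph deleteByQuery for every single id; the rewrite deletes synchronously, batches both connectors through deleteByIds, and addresses the webgraph by document id instead of a source_id_s query. Callers that still want the old fire-and-forget behavior can wrap the call themselves; a sketch under that assumption (hypothetical caller code, not part of the commit):

    // Concurrency is now the caller's choice instead of being baked into Fulltext:
    final Collection<String> ids = Arrays.asList("AAAAAAAAAAAA");   // made-up hash
    new Thread("fulltext-remove") {
        @Override
        public void run() {
            sb.index.fulltext().remove(ids);   // batch deleteByIds + commit
        }
    }.start();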

@@ -136,6 +136,10 @@ public class Segment {
         this.writeWebgraph = check;
     }
 
     public boolean writeToWebgraph() {
         return this.writeWebgraph;
     }
+
+    public boolean connectedRWI() {
+        return this.termIndex != null;
+    }
