diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index dcb21c0d2..411a32752 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -178,8 +178,7 @@ public class Crawler_p { // add the prefix http:// if necessary int pos = crawlingStart.indexOf("://",0); if (pos == -1) { - if (crawlingStart.startsWith("www")) crawlingStart = "http://" + crawlingStart; - if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart; + if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart; else crawlingStart = "http://" + crawlingStart; } try { DigestURI crawlingStartURL = new DigestURI(crawlingStart); diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index 9d43b68d7..e1d2f169d 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -282,7 +282,7 @@ public class HostBrowser { Map> outboundHosts = new HashMap>(); Map infoCache = new HashMap(); int hostsize = 0; - final List deleteIDs = new ArrayList(); + final List deleteIDs = new ArrayList(); long timeout = System.currentTimeMillis() + TIMEOUT; while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); @@ -292,7 +292,7 @@ public class HostBrowser { infoCache.put(ids, new InfoCacheEntry(doc)); if (u.startsWith(path)) { if (delete) { - deleteIDs.add(ASCII.getBytes(ids)); + deleteIDs.add(ids); } else { if (error == null) storedDocs.add(u); else if (admin) errorDocs.put(u, error); } @@ -328,10 +328,7 @@ public class HostBrowser { } if (System.currentTimeMillis() > timeout) break; } - if (deleteIDs.size() > 0) { - for (byte[] b: deleteIDs) sb.crawlQueues.urlRemove(b); - sb.index.fulltext().remove(deleteIDs, true); - } + if (deleteIDs.size() > 0) sb.remove(deleteIDs); // collect from crawler List domainStackReferences = (admin) ? sb.crawlQueues.noticeURL.getDomainStackReferences(StackType.LOCAL, host, 1000, 3000) : new ArrayList(0); diff --git a/htroot/IndexDeletion_p.java b/htroot/IndexDeletion_p.java index 68d19af5e..d886191b2 100644 --- a/htroot/IndexDeletion_p.java +++ b/htroot/IndexDeletion_p.java @@ -45,7 +45,7 @@ import net.yacy.server.serverSwitch; public class IndexDeletion_p { - public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { + public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { // return variable that accumulates replacements final Switchboard sb = (Switchboard) env; final serverObjects prop = new serverObjects(); @@ -126,8 +126,7 @@ public class IndexDeletion_p { if (urlStub == null || urlStub.length() == 0) continue; int pos = urlStub.indexOf("://",0); if (pos == -1) { - if (urlStub.startsWith("www")) urlStub = "http://" + urlStub; - if (urlStub.startsWith("ftp")) urlStub = "ftp://" + urlStub; + if (urlStub.startsWith("ftp")) urlStub = "ftp://" + urlStub; else urlStub = "http://" + urlStub; } try { DigestURI u = new DigestURI(urlStub); @@ -140,7 +139,6 @@ public class IndexDeletion_p { } } catch (InterruptedException e) { } - sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, docs matching with " + urldelete); } catch (MalformedURLException e) {} } @@ -148,11 +146,8 @@ public class IndexDeletion_p { count = ids.size(); prop.put("urldelete-active", count == 0 ? 2 : 1); } else { - try { - defaultConnector.deleteByIds(ids); - //webgraphConnector.deleteByQuery(webgraphQuery); - } catch (IOException e) { - } + sb.remove(ids); + sb.tables.recordAPICall(post, "IndexDeletion_p.html", WorkTables.TABLE_API_TYPE_DELETION, "deletion, docs matching with " + urldelete); prop.put("urldelete-active", 2); } } else { diff --git a/htroot/PerformanceMemory_p.java b/htroot/PerformanceMemory_p.java index 6c05a438a..357f4d666 100644 --- a/htroot/PerformanceMemory_p.java +++ b/htroot/PerformanceMemory_p.java @@ -30,7 +30,6 @@ import java.util.ConcurrentModificationException; import java.util.Iterator; import java.util.Map; -import net.yacy.cora.federate.solr.connector.CachedSolrConnector; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.index.Cache; diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 1134e48ff..f658e512a 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2800,6 +2800,18 @@ public final class Switchboard extends serverSwitch { } } + public void remove(final Collection deleteIDs) { + this.index.fulltext().remove(deleteIDs); + for (String id: deleteIDs) { + this.crawlQueues.urlRemove(ASCII.getBytes(id)); + } + } + + public void remove(final byte[] urlhash) { + this.index.fulltext().remove(urlhash); + this.crawlQueues.urlRemove(urlhash); + } + public void stackURLs(Set rootURLs, final CrawlProfile profile, final Set successurls, final Map failurls) { if (rootURLs == null || rootURLs.size() == 0) return; List stackthreads = new ArrayList(); // do this concurrently @@ -2831,9 +2843,7 @@ public final class Switchboard extends serverSwitch { // remove url from the index to be prepared for a re-crawl final byte[] urlhash = url.hash(); - this.index.fulltext().remove(urlhash); - this.crawlQueues.noticeURL.removeByURLHash(urlhash); - this.crawlQueues.errorURL.remove(urlhash); + remove(urlhash); // special handling of ftp protocol if (url.isFTP()) { diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index eac2a7e6f..d3a5fdec5 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -685,37 +685,32 @@ public final class Fulltext { * @param deleteIDs a list of urlhashes; each denoting a document * @param concurrently if true, then the method returnes immediately and runs concurrently */ - public void remove(final List deleteIDs, final boolean concurrently) { + public void remove(final Collection deleteIDs) { if (deleteIDs == null || deleteIDs.size() == 0) return; - Thread t = new Thread() { - public void run() { - try { - synchronized (Fulltext.this.solrInstances) { - for (byte[] urlHash: deleteIDs) { - Fulltext.this.getDefaultConnector().deleteById(ASCII.String(urlHash)); - Fulltext.this.getWebgraphConnector().deleteByQuery(WebgraphSchema.source_id_s.getSolrFieldName() + ":\"" + ASCII.String(urlHash) + "\""); - } - Fulltext.this.commit(true); - } - } catch (final Throwable e) { - Log.logException(e); - } - if (Fulltext.this.urlIndexFile != null) try { - for (byte[] urlHash: deleteIDs) { - final Row.Entry r = Fulltext.this.urlIndexFile.remove(urlHash); - if (r != null) Fulltext.this.statsDump = null; - } - } catch (final IOException e) {} - }}; - if (concurrently) t.start(); else t.run(); + try { + synchronized (Fulltext.this.solrInstances) { + this.getDefaultConnector().deleteByIds(deleteIDs); + this.getWebgraphConnector().deleteByIds(deleteIDs); + this.commit(true); + } + } catch (final Throwable e) { + Log.logException(e); + } + if (Fulltext.this.urlIndexFile != null) try { + for (String id: deleteIDs) { + final Row.Entry r = Fulltext.this.urlIndexFile.remove(ASCII.getBytes(id)); + if (r != null) Fulltext.this.statsDump = null; + } + } catch (final IOException e) {} } public boolean remove(final byte[] urlHash) { if (urlHash == null) return false; try { + String id = ASCII.String(urlHash); synchronized (this.solrInstances) { - this.getDefaultConnector().deleteById(ASCII.String(urlHash)); - this.getWebgraphConnector().deleteByQuery(WebgraphSchema.source_id_s.getSolrFieldName() + ":\"" + ASCII.String(urlHash) + "\""); + this.getDefaultConnector().deleteById(id); + this.getWebgraphConnector().deleteById(id); } } catch (final Throwable e) { Log.logException(e); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 4e5d662c5..f8c451e02 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -136,6 +136,10 @@ public class Segment { this.writeWebgraph = check; } + public boolean writeToWebgraph() { + return this.writeWebgraph; + } + public boolean connectedRWI() { return this.termIndex != null; }