From 5fd3b936616cc11f9134b08245375c5bf30ac863 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 13 Nov 2012 16:54:28 +0100 Subject: [PATCH] added deletion of hosts during crawl start if deleteold option was given --- htroot/CrawlResults.java | 2 +- htroot/Crawler_p.java | 15 +- htroot/IndexControlURLs_p.java | 2 +- htroot/api/timeline.java | 1 - htroot/gsa/searchresult.java | 1 - .../solr/connector/MirrorSolrConnector.java | 8 +- .../solr/connector/MultipleSolrConnector.java | 4 +- .../solr/connector/RetrySolrConnector.java | 6 +- .../solr/connector/ShardSolrConnector.java | 6 +- .../solr/connector/SolrConnector.java | 3 +- .../solr/connector/SolrServerConnector.java | 8 +- source/net/yacy/search/index/Fulltext.java | 136 ++++++++++-------- source/net/yacy/search/index/Segment.java | 9 +- 13 files changed, 117 insertions(+), 84 deletions(-) diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java index a40e1d65e..c0ba43b4c 100644 --- a/htroot/CrawlResults.java +++ b/htroot/CrawlResults.java @@ -126,7 +126,7 @@ public class CrawlResults { final String domain = post.get("domain", null); final String hashpart = domain == null ? null : DigestURI.hosthash6(domain); if (hashpart != null) { - sb.index.fulltext().deleteDomain(hashpart, false); + sb.index.fulltext().deleteDomain(hashpart, null, false); ResultURLs.deleteDomain(tabletype, domain, hashpart); } } diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index b5958da94..e225033a1 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -153,8 +153,13 @@ public class Crawler_p { final boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start final boolean restrictedcrawl = fullDomain || subPath || !CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch); - final boolean deleteold = restrictedcrawl && post.getBoolean("deleteold"); final boolean deleteage = restrictedcrawl && "age".equals(post.get("deleteold","off")); + Date deleteageDate = null; + if (deleteage) { + long t = timeParser(true, post.getInt("deleteIfOlderNumber", -1), post.get("deleteIfOlderUnit","year")); // year, month, day, hour + if (t > 0) deleteageDate = new Date(t); + } + final boolean deleteold = (deleteage && deleteageDate != null) || (restrictedcrawl && post.getBoolean("deleteold")); String crawlingStart0 = post.get("crawlingURL","").trim(); // the crawljob start url String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|")); @@ -286,7 +291,10 @@ public class Crawler_p { if (fullDomain) { siteFilter = CrawlProfile.siteFilter(rootURLs); if (deleteold) { - for (DigestURI u: rootURLs) sb.index.fulltext().deleteDomain(u.hosthash(), true); + for (DigestURI u: rootURLs) { + int count = sb.index.fulltext().deleteDomain(u.hosthash(), deleteageDate, rootURLs.size() > 0); + if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost()); + } } } else if (subPath) { siteFilter = CrawlProfile.subpathFilter(rootURLs); @@ -294,7 +302,8 @@ public class Crawler_p { for (DigestURI u: rootURLs) { String subpath = CrawlProfile.mustMatchSubpath(u); if (subpath.endsWith(".*")) subpath = subpath.substring(0, subpath.length() - 2); - sb.index.fulltext().remove(subpath, true); + int count = sb.index.fulltext().remove(subpath, deleteageDate, rootURLs.size() > 0); + if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost()); } } } diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index 19b537348..ddc2bbb71 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -297,7 +297,7 @@ public class IndexControlURLs_p { if (post.containsKey("deletedomain")) { final String hp = post.get("hashpart"); - segment.fulltext().deleteDomain(hp, false); + segment.fulltext().deleteDomain(hp, null, false); // trigger the loading of the table post.put("statistics", ""); } diff --git a/htroot/api/timeline.java b/htroot/api/timeline.java index 07c10a8b8..04ee5950f 100644 --- a/htroot/api/timeline.java +++ b/htroot/api/timeline.java @@ -24,7 +24,6 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -import java.util.Collection; import java.util.Date; import java.util.Iterator; import java.util.List; diff --git a/htroot/gsa/searchresult.java b/htroot/gsa/searchresult.java index 5a39fbc98..8a325ef0f 100644 --- a/htroot/gsa/searchresult.java +++ b/htroot/gsa/searchresult.java @@ -23,7 +23,6 @@ import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Writer; import java.util.ArrayList; -import java.util.Collection; import java.util.List; import java.util.Map; import java.util.regex.Pattern; diff --git a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java index 12f4ad19c..51f25c44d 100644 --- a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java @@ -188,10 +188,12 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo } @Override - public void deleteByQuery(final String querystring) throws IOException { - if (this.solr0 != null) this.solr0.deleteByQuery(querystring); - if (this.solr1 != null) this.solr1.deleteByQuery(querystring); + public int deleteByQuery(final String querystring) throws IOException { + int count = 0; + if (this.solr0 != null) count += this.solr0.deleteByQuery(querystring); + if (this.solr1 != null) count += this.solr1.deleteByQuery(querystring); this.clearCache(); + return count; } /** diff --git a/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java index 9040aebde..c6efa96cf 100644 --- a/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java @@ -139,8 +139,8 @@ public class MultipleSolrConnector extends AbstractSolrConnector implements Solr } @Override - public void deleteByQuery(final String querystring) throws IOException { - this.solr.deleteByQuery(querystring); + public int deleteByQuery(final String querystring) throws IOException { + return this.solr.deleteByQuery(querystring); } @Override diff --git a/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java b/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java index 814a67019..9ecbf2fe1 100644 --- a/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java @@ -115,18 +115,18 @@ public class RetrySolrConnector extends AbstractSolrConnector implements SolrCon } @Override - public void deleteByQuery(final String querystring) throws IOException { + public int deleteByQuery(final String querystring) throws IOException { final long t = System.currentTimeMillis() + this.retryMaxTime; Throwable ee = null; while (System.currentTimeMillis() < t) try { - this.solrConnector.deleteByQuery(querystring); - return; + return this.solrConnector.deleteByQuery(querystring); } catch (final Throwable e) { ee = e; try {Thread.sleep(10);} catch (final InterruptedException e1) {} continue; } if (ee != null) throw (ee instanceof IOException) ? (IOException) ee : new IOException(ee.getMessage()); + return 0; } @Override diff --git a/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java index c09bcb3b6..51ff3e18d 100644 --- a/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java @@ -112,8 +112,10 @@ public class ShardSolrConnector extends AbstractSolrConnector implements SolrCon } @Override - public void deleteByQuery(final String querystring) throws IOException { - for (final SolrConnector connector: this.connectors) connector.deleteByQuery(querystring); + public int deleteByQuery(final String querystring) throws IOException { + int count = 0; + for (final SolrConnector connector: this.connectors) count += connector.deleteByQuery(querystring); + return count; } /** diff --git a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java index f21763b9f..e082436b5 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java @@ -82,9 +82,10 @@ public interface SolrConnector extends Iterable /* Iterable of document /** * delete entries from solr according the given solr query string * @param id the url hash of the entry + * @return the number of deletions * @throws IOException */ - public void deleteByQuery(final String querystring) throws IOException; + public int deleteByQuery(final String querystring) throws IOException; /** * check if a given id exists in solr diff --git a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java index 024591365..8e16fa3fe 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java @@ -40,12 +40,14 @@ import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest; import org.apache.solr.client.solrj.response.FacetField; import org.apache.solr.client.solrj.response.FacetField.Count; import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.client.solrj.response.UpdateResponse; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.params.FacetParams; import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.util.NamedList; public abstract class SolrServerConnector extends AbstractSolrConnector implements SolrConnector { @@ -164,10 +166,14 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen * @throws IOException */ @Override - public void deleteByQuery(final String querystring) throws IOException { + public int deleteByQuery(final String querystring) throws IOException { try { synchronized (this.server) { + long c0 = this.getQueryCount(querystring); this.server.deleteByQuery(querystring, this.commitWithinMs); + this.commit(); + long c1 = this.getQueryCount(querystring); + return (int) (c1 - c0); } } catch (final Throwable e) { throw new IOException(e); diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index b3135a86b..1404432f8 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -36,8 +36,10 @@ import java.util.List; import java.util.Map; import java.util.TreeSet; import java.util.concurrent.BlockingQueue; +import java.util.concurrent.atomic.AtomicInteger; import net.yacy.cora.date.GenericFormatter; +import net.yacy.cora.date.ISO8601Formatter; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.federate.solr.YaCySchema; @@ -305,35 +307,100 @@ public final class Fulltext implements Iterable { if (MemoryControl.shortStatus()) clearCache(); } + /** + * using a fragment of the url hash (6 bytes: bytes 6 to 11) it is possible to address all urls from a specific domain + * here such a fragment can be used to delete all these domains at once + * @param hosthash the hash of the host to be deleted + * @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted + * @return number of deleted domains + * @throws IOException + */ + public int deleteDomain(final String hosthash, Date freshdate, boolean concurrent) { + // first collect all url hashes that belong to the domain + assert hosthash.length() == 6; + final String q = YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" + + ((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : ""); + final AtomicInteger count = new AtomicInteger(0); + Thread t = new Thread() { + public void run() { + // delete in solr + synchronized (Fulltext.this.solr) { + try { + count.addAndGet(Fulltext.this.solr.deleteByQuery(q)); + Fulltext.this.solr.commit(); + } catch (IOException e) {} + } + + // delete in old metadata structure + if (Fulltext.this.urlIndexFile != null) { + final ArrayList l = new ArrayList(); + synchronized (this) { + CloneableIterator i; + try { + i = Fulltext.this.urlIndexFile.keys(true, null); + String hash; + while (i != null && i.hasNext()) { + hash = ASCII.String(i.next()); + if (hosthash.equals(hash.substring(6))) l.add(hash); + } + + // then delete the urls using this list + for (final String h: l) Fulltext.this.urlIndexFile.delete(ASCII.getBytes(h)); + } catch (IOException e) {} + } + } + + // finally remove the line with statistics + if (Fulltext.this.statsDump != null) { + final Iterator hsi = Fulltext.this.statsDump.iterator(); + HostStat hs; + while (hsi.hasNext()) { + hs = hsi.next(); + if (hs.hosthash.equals(hosthash)) { + hsi.remove(); + break; + } + } + } + } + }; + if (concurrent) t.start(); else t.run(); + return count.get(); + } + /** * remove a full subpath from the index * @param subpath the left path of the url; at least until the end of the host + * @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted * @param concurrently if true, then the method returnes immediately and runs concurrently */ - public void remove(String subpath, final boolean concurrently) { + public int remove(String subpath, Date freshdate, final boolean concurrently) { int p = subpath.substring(0, subpath.length() - 1).lastIndexOf('/'); final String path = p > 8 ? subpath.substring(0, p + 1) : subpath; DigestURI uri; - try {uri = new DigestURI(path);} catch (MalformedURLException e) {return;} + try {uri = new DigestURI(path);} catch (MalformedURLException e) {return 0;} final String host = uri.getHost(); + final String q = YaCySchema.host_s.getSolrFieldName() + ":" + host + + ((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : ""); + final AtomicInteger count = new AtomicInteger(0); Thread t = new Thread(){ public void run() { - final BlockingQueue docs = getSolr().concurrentQuery(YaCySchema.host_s.getSolrFieldName() + ":" + host, 0, 1000000, 600000, -1); + final BlockingQueue docs = getSolr().concurrentQuery(q, 0, 1000000, 600000, -1); try { SolrDocument doc; - boolean removed = false; while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName()); if (u.startsWith(path)) { remove(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName()))); - removed = true; + count.incrementAndGet(); } } - if (removed) Fulltext.this.solr.commit(); + if (count.get() > 0) Fulltext.this.solr.commit(); } catch (InterruptedException e) {} } }; if (concurrently) t.start(); else t.run(); + return count.get(); } /** @@ -801,61 +868,4 @@ public final class Fulltext implements Iterable { this.count = count; } } - - /** - * using a fragment of the url hash (6 bytes: bytes 6 to 11) it is possible to address all urls from a specific domain - * here such a fragment can be used to delete all these domains at once - * @param hosthash - * @return number of deleted domains - * @throws IOException - */ - public void deleteDomain(final String hosthash, boolean concurrent) { - // first collect all url hashes that belong to the domain - assert hosthash.length() == 6; - - Thread t = new Thread() { - public void run() { - // delete in solr - synchronized (Fulltext.this.solr) { - try { - Fulltext.this.solr.deleteByQuery(YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\""); - Fulltext.this.solr.commit(); - } catch (IOException e) {} - } - - // delete in old metadata structure - if (Fulltext.this.urlIndexFile != null) { - final ArrayList l = new ArrayList(); - synchronized (this) { - CloneableIterator i; - try { - i = Fulltext.this.urlIndexFile.keys(true, null); - String hash; - while (i != null && i.hasNext()) { - hash = ASCII.String(i.next()); - if (hosthash.equals(hash.substring(6))) l.add(hash); - } - - // then delete the urls using this list - for (final String h: l) Fulltext.this.urlIndexFile.delete(ASCII.getBytes(h)); - } catch (IOException e) {} - } - } - - // finally remove the line with statistics - if (Fulltext.this.statsDump != null) { - final Iterator hsi = Fulltext.this.statsDump.iterator(); - HostStat hs; - while (hsi.hasNext()) { - hs = hsi.next(); - if (hs.hosthash.equals(hosthash)) { - hsi.remove(); - break; - } - } - } - } - }; - if (concurrent) t.start(); else t.run(); - } } diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 03c6a6e5f..cc9ea86f9 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -365,16 +365,21 @@ public class Segment { // STORE TO SOLR final SolrInputDocument solrInputDoc = this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, condenser, referrerURL, language); - tryloop: for (int i = 0; i < 10; i++) { - String error = ""; + String error = null; + tryloop: for (int i = 0; i < 20; i++) { try { + error = null; this.fulltext.putDocument(solrInputDoc); break tryloop; } catch ( final IOException e ) { error = "failed to send " + urlNormalform + " to solr"; Log.logWarning("SOLR", error + e.getMessage()); + if (i == 10) this.fulltext.commit(); try {Thread.sleep(1000);} catch (InterruptedException e1) {} + continue tryloop; } + } + if (error != null) { Log.logWarning("SOLR", error + ", pausing Crawler!"); // pause the crawler!!! Switchboard.getSwitchboard().pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL, error);