added deletion of hosts during crawl start if the deleteold option was given

pull/1/head
Michael Peter Christen 12 years ago
parent d64445c3cb
commit 5fd3b93661

@@ -126,7 +126,7 @@ public class CrawlResults {
         final String domain = post.get("domain", null);
         final String hashpart = domain == null ? null : DigestURI.hosthash6(domain);
         if (hashpart != null) {
-            sb.index.fulltext().deleteDomain(hashpart, false);
+            sb.index.fulltext().deleteDomain(hashpart, null, false);
             ResultURLs.deleteDomain(tabletype, domain, hashpart);
         }
     }

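The deleteDomain calls in this commit address a whole host by a 6-character fragment of the url hash (the hash's characters 6 to 11), which is what DigestURI.hosthash6(domain) produces for a domain name. A minimal, self-contained sketch of that matching rule; the class name and sample hash values are made up for illustration, the 12-char/6-char convention is taken from the Fulltext code further down:

// Illustrative only: HostHashMatch and the sample hashes are hypothetical.
public class HostHashMatch {

    static boolean sameHost(final String urlhash, final String hosthash) {
        // a YaCy url hash has 12 characters; its last 6 identify the host
        return urlhash.length() == 12 && hosthash.length() == 6
                && hosthash.equals(urlhash.substring(6));
    }

    public static void main(final String[] args) {
        System.out.println(sameHost("AbCdEf012345", "012345")); // true: same host fragment
        System.out.println(sameHost("AbCdEf543210", "012345")); // false: different host fragment
    }
}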
@@ -153,8 +153,13 @@ public class Crawler_p {
         final boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start
         final boolean restrictedcrawl = fullDomain || subPath || !CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch);
-        final boolean deleteold = restrictedcrawl && post.getBoolean("deleteold");
         final boolean deleteage = restrictedcrawl && "age".equals(post.get("deleteold","off"));
+        Date deleteageDate = null;
+        if (deleteage) {
+            long t = timeParser(true, post.getInt("deleteIfOlderNumber", -1), post.get("deleteIfOlderUnit","year")); // year, month, day, hour
+            if (t > 0) deleteageDate = new Date(t);
+        }
+        final boolean deleteold = (deleteage && deleteageDate != null) || (restrictedcrawl && post.getBoolean("deleteold"));
         String crawlingStart0 = post.get("crawlingURL","").trim(); // the crawljob start url
         String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
@@ -286,7 +291,10 @@ public class Crawler_p {
             if (fullDomain) {
                 siteFilter = CrawlProfile.siteFilter(rootURLs);
                 if (deleteold) {
-                    for (DigestURI u: rootURLs) sb.index.fulltext().deleteDomain(u.hosthash(), true);
+                    for (DigestURI u: rootURLs) {
+                        int count = sb.index.fulltext().deleteDomain(u.hosthash(), deleteageDate, rootURLs.size() > 0);
+                        if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
+                    }
                 }
             } else if (subPath) {
                 siteFilter = CrawlProfile.subpathFilter(rootURLs);
@@ -294,7 +302,8 @@ public class Crawler_p {
                     for (DigestURI u: rootURLs) {
                         String subpath = CrawlProfile.mustMatchSubpath(u);
                         if (subpath.endsWith(".*")) subpath = subpath.substring(0, subpath.length() - 2);
-                        sb.index.fulltext().remove(subpath, true);
+                        int count = sb.index.fulltext().remove(subpath, deleteageDate, rootURLs.size() > 0);
+                        if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
                     }
                 }
             }

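The timeParser call in the first Crawler_p hunk above is not part of this diff; the sketch below only illustrates what the deleteIfOlderNumber/deleteIfOlderUnit parameters presumably turn into: an absolute cutoff time, so that only documents loaded before that point are deleted. The helper name and its exact behavior are assumptions, not the actual Crawler_p code:

import java.util.Calendar;
import java.util.Date;

// Hypothetical helper mirroring the assumed behavior of Crawler_p's timeParser.
public class DeleteAgeCutoff {

    /** @return epoch millis of "now minus number units", or -1 if the input is unusable */
    public static long cutoffMillis(final int number, final String unit) {
        if (number <= 0) return -1;
        final Calendar cal = Calendar.getInstance();
        if ("year".equals(unit)) cal.add(Calendar.YEAR, -number);
        else if ("month".equals(unit)) cal.add(Calendar.MONTH, -number);
        else if ("day".equals(unit)) cal.add(Calendar.DATE, -number);
        else if ("hour".equals(unit)) cal.add(Calendar.HOUR_OF_DAY, -number);
        else return -1;
        return cal.getTimeInMillis();
    }

    public static void main(final String[] args) {
        final long t = cutoffMillis(2, "month");
        final Date deleteageDate = t > 0 ? new Date(t) : null;
        System.out.println("documents loaded before " + deleteageDate + " would be deleted");
    }
}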
@@ -297,7 +297,7 @@ public class IndexControlURLs_p {
         if (post.containsKey("deletedomain")) {
             final String hp = post.get("hashpart");
-            segment.fulltext().deleteDomain(hp, false);
+            segment.fulltext().deleteDomain(hp, null, false);
             // trigger the loading of the table
             post.put("statistics", "");
         }

@@ -24,7 +24,6 @@
 // along with this program; if not, write to the Free Software
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

-import java.util.Collection;
 import java.util.Date;
 import java.util.Iterator;
 import java.util.List;

@@ -23,7 +23,6 @@ import java.io.OutputStream;
 import java.io.OutputStreamWriter;
 import java.io.Writer;
 import java.util.ArrayList;
-import java.util.Collection;
 import java.util.List;
 import java.util.Map;
 import java.util.regex.Pattern;

@@ -188,10 +188,12 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
     }

     @Override
-    public void deleteByQuery(final String querystring) throws IOException {
-        if (this.solr0 != null) this.solr0.deleteByQuery(querystring);
-        if (this.solr1 != null) this.solr1.deleteByQuery(querystring);
+    public int deleteByQuery(final String querystring) throws IOException {
+        int count = 0;
+        if (this.solr0 != null) count += this.solr0.deleteByQuery(querystring);
+        if (this.solr1 != null) count += this.solr1.deleteByQuery(querystring);
         this.clearCache();
+        return count;
     }

     /**

@@ -139,8 +139,8 @@ public class MultipleSolrConnector extends AbstractSolrConnector implements Solr
     }

     @Override
-    public void deleteByQuery(final String querystring) throws IOException {
-        this.solr.deleteByQuery(querystring);
+    public int deleteByQuery(final String querystring) throws IOException {
+        return this.solr.deleteByQuery(querystring);
     }

     @Override

@@ -115,18 +115,18 @@ public class RetrySolrConnector extends AbstractSolrConnector implements SolrCon
     }

     @Override
-    public void deleteByQuery(final String querystring) throws IOException {
+    public int deleteByQuery(final String querystring) throws IOException {
         final long t = System.currentTimeMillis() + this.retryMaxTime;
         Throwable ee = null;
         while (System.currentTimeMillis() < t) try {
-            this.solrConnector.deleteByQuery(querystring);
-            return;
+            return this.solrConnector.deleteByQuery(querystring);
         } catch (final Throwable e) {
             ee = e;
             try {Thread.sleep(10);} catch (final InterruptedException e1) {}
             continue;
         }
         if (ee != null) throw (ee instanceof IOException) ? (IOException) ee : new IOException(ee.getMessage());
+        return 0;
     }

     @Override

@@ -112,8 +112,10 @@ public class ShardSolrConnector extends AbstractSolrConnector implements SolrCon
     }

     @Override
-    public void deleteByQuery(final String querystring) throws IOException {
-        for (final SolrConnector connector: this.connectors) connector.deleteByQuery(querystring);
+    public int deleteByQuery(final String querystring) throws IOException {
+        int count = 0;
+        for (final SolrConnector connector: this.connectors) count += connector.deleteByQuery(querystring);
+        return count;
     }

     /**

@@ -82,9 +82,10 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
     /**
      * delete entries from solr according the given solr query string
      * @param id the url hash of the entry
+     * @return the number of deletions
      * @throws IOException
      */
-    public void deleteByQuery(final String querystring) throws IOException;
+    public int deleteByQuery(final String querystring) throws IOException;

     /**
      * check if a given id exists in solr

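Since deleteByQuery now reports how many documents were removed, call sites can aggregate or log counts instead of deleting blindly. A tiny stand-in sketch of that contract; the local Deleter interface is hypothetical and mirrors only the changed method, not the full SolrConnector interface:

import java.io.IOException;
import java.util.Arrays;

// Hypothetical stand-in for the changed contract, for illustration only.
public class DeleteByQueryContract {

    interface Deleter {
        int deleteByQuery(String querystring) throws IOException;
    }

    /** Sum the per-connector counts, as ShardSolrConnector does in this commit. */
    static int deleteFromAll(final Iterable<Deleter> connectors, final String query) throws IOException {
        int count = 0;
        for (final Deleter c : connectors) count += c.deleteByQuery(query);
        return count;
    }

    public static void main(final String[] args) throws IOException {
        final Deleter fake = new Deleter() {
            public int deleteByQuery(final String querystring) { return 3; } // pretend: 3 documents removed
        };
        System.out.println(deleteFromAll(Arrays.asList(fake, fake), "host_id_s:\"012345\"")); // prints 6
    }
}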
@@ -40,12 +40,14 @@ import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;
 import org.apache.solr.client.solrj.response.FacetField;
 import org.apache.solr.client.solrj.response.FacetField.Count;
 import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.client.solrj.response.UpdateResponse;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrDocumentList;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.params.FacetParams;
 import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.common.util.NamedList;

 public abstract class SolrServerConnector extends AbstractSolrConnector implements SolrConnector {
@@ -164,10 +166,14 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
      * @throws IOException
      */
     @Override
-    public void deleteByQuery(final String querystring) throws IOException {
+    public int deleteByQuery(final String querystring) throws IOException {
         try {
             synchronized (this.server) {
+                long c0 = this.getQueryCount(querystring);
                 this.server.deleteByQuery(querystring, this.commitWithinMs);
+                this.commit();
+                long c1 = this.getQueryCount(querystring);
+                return (int) (c1 - c0);
             }
         } catch (final Throwable e) {
             throw new IOException(e);

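The server-side connector derives its count by querying how many documents match the delete query before and after the delete-and-commit. A self-contained sketch of that idea written directly against plain SolrJ rather than YaCy's wrapper; in this sketch the count is taken as before minus after, so it comes out non-negative:

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;

// Sketch only: plain SolrJ, error handling reduced to "throws Exception".
public final class CountingDelete {

    private static long countMatches(final SolrServer server, final String query) throws Exception {
        final QueryResponse rsp = server.query(new SolrQuery(query).setRows(0)); // rows=0: we only need numFound
        return rsp.getResults().getNumFound();
    }

    /** Delete all documents matching the query and report how many disappeared. */
    public static int deleteByQueryCounted(final SolrServer server, final String query) throws Exception {
        final long before = countMatches(server, query);
        server.deleteByQuery(query);
        server.commit();                                  // make the deletion visible to the second count
        final long after = countMatches(server, query);   // normally 0 after the commit
        return (int) (before - after);
    }
}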
@@ -36,8 +36,10 @@ import java.util.List;
 import java.util.Map;
 import java.util.TreeSet;
 import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.atomic.AtomicInteger;

 import net.yacy.cora.date.GenericFormatter;
+import net.yacy.cora.date.ISO8601Formatter;
 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.federate.solr.YaCySchema;
@@ -305,35 +307,100 @@ public final class Fulltext implements Iterable<byte[]> {
         if (MemoryControl.shortStatus()) clearCache();
     }

+    /**
+     * using a fragment of the url hash (6 bytes: bytes 6 to 11) it is possible to address all urls from a specific domain
+     * here such a fragment can be used to delete all these domains at once
+     * @param hosthash the hash of the host to be deleted
+     * @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted
+     * @return number of deleted domains
+     * @throws IOException
+     */
+    public int deleteDomain(final String hosthash, Date freshdate, boolean concurrent) {
+        // first collect all url hashes that belong to the domain
+        assert hosthash.length() == 6;
+        final String q = YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" +
+            ((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
+        final AtomicInteger count = new AtomicInteger(0);
+        Thread t = new Thread() {
+            public void run() {
+                // delete in solr
+                synchronized (Fulltext.this.solr) {
+                    try {
+                        count.addAndGet(Fulltext.this.solr.deleteByQuery(q));
+                        Fulltext.this.solr.commit();
+                    } catch (IOException e) {}
+                }
+                // delete in old metadata structure
+                if (Fulltext.this.urlIndexFile != null) {
+                    final ArrayList<String> l = new ArrayList<String>();
+                    synchronized (this) {
+                        CloneableIterator<byte[]> i;
+                        try {
+                            i = Fulltext.this.urlIndexFile.keys(true, null);
+                            String hash;
+                            while (i != null && i.hasNext()) {
+                                hash = ASCII.String(i.next());
+                                if (hosthash.equals(hash.substring(6))) l.add(hash);
+                            }
+                            // then delete the urls using this list
+                            for (final String h: l) Fulltext.this.urlIndexFile.delete(ASCII.getBytes(h));
+                        } catch (IOException e) {}
+                    }
+                }
+                // finally remove the line with statistics
+                if (Fulltext.this.statsDump != null) {
+                    final Iterator<HostStat> hsi = Fulltext.this.statsDump.iterator();
+                    HostStat hs;
+                    while (hsi.hasNext()) {
+                        hs = hsi.next();
+                        if (hs.hosthash.equals(hosthash)) {
+                            hsi.remove();
+                            break;
+                        }
+                    }
+                }
+            }
+        };
+        if (concurrent) t.start(); else t.run();
+        return count.get();
+    }
+
     /**
      * remove a full subpath from the index
      * @param subpath the left path of the url; at least until the end of the host
+     * @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted
      * @param concurrently if true, then the method returnes immediately and runs concurrently
      */
-    public void remove(String subpath, final boolean concurrently) {
+    public int remove(String subpath, Date freshdate, final boolean concurrently) {
         int p = subpath.substring(0, subpath.length() - 1).lastIndexOf('/');
         final String path = p > 8 ? subpath.substring(0, p + 1) : subpath;
         DigestURI uri;
-        try {uri = new DigestURI(path);} catch (MalformedURLException e) {return;}
+        try {uri = new DigestURI(path);} catch (MalformedURLException e) {return 0;}
         final String host = uri.getHost();
+        final String q = YaCySchema.host_s.getSolrFieldName() + ":" + host +
+            ((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
+        final AtomicInteger count = new AtomicInteger(0);
         Thread t = new Thread(){
             public void run() {
-                final BlockingQueue<SolrDocument> docs = getSolr().concurrentQuery(YaCySchema.host_s.getSolrFieldName() + ":" + host, 0, 1000000, 600000, -1);
+                final BlockingQueue<SolrDocument> docs = getSolr().concurrentQuery(q, 0, 1000000, 600000, -1);
                 try {
                     SolrDocument doc;
-                    boolean removed = false;
                     while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                         String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
                         if (u.startsWith(path)) {
                             remove(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName())));
-                            removed = true;
+                            count.incrementAndGet();
                         }
                     }
-                    if (removed) Fulltext.this.solr.commit();
+                    if (count.get() > 0) Fulltext.this.solr.commit();
                 } catch (InterruptedException e) {}
             }
         };
         if (concurrently) t.start(); else t.run();
+        return count.get();
     }

     /**
@@ -801,61 +868,4 @@ public final class Fulltext implements Iterable<byte[]> {
             this.count = count;
         }
     }

-    /**
-     * using a fragment of the url hash (6 bytes: bytes 6 to 11) it is possible to address all urls from a specific domain
-     * here such a fragment can be used to delete all these domains at once
-     * @param hosthash
-     * @return number of deleted domains
-     * @throws IOException
-     */
-    public void deleteDomain(final String hosthash, boolean concurrent) {
-        // first collect all url hashes that belong to the domain
-        assert hosthash.length() == 6;
-        Thread t = new Thread() {
-            public void run() {
-                // delete in solr
-                synchronized (Fulltext.this.solr) {
-                    try {
-                        Fulltext.this.solr.deleteByQuery(YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"");
-                        Fulltext.this.solr.commit();
-                    } catch (IOException e) {}
-                }
-                // delete in old metadata structure
-                if (Fulltext.this.urlIndexFile != null) {
-                    final ArrayList<String> l = new ArrayList<String>();
-                    synchronized (this) {
-                        CloneableIterator<byte[]> i;
-                        try {
-                            i = Fulltext.this.urlIndexFile.keys(true, null);
-                            String hash;
-                            while (i != null && i.hasNext()) {
-                                hash = ASCII.String(i.next());
-                                if (hosthash.equals(hash.substring(6))) l.add(hash);
-                            }
-                            // then delete the urls using this list
-                            for (final String h: l) Fulltext.this.urlIndexFile.delete(ASCII.getBytes(h));
-                        } catch (IOException e) {}
-                    }
-                }
-                // finally remove the line with statistics
-                if (Fulltext.this.statsDump != null) {
-                    final Iterator<HostStat> hsi = Fulltext.this.statsDump.iterator();
-                    HostStat hs;
-                    while (hsi.hasNext()) {
-                        hs = hsi.next();
-                        if (hs.hosthash.equals(hosthash)) {
-                            hsi.remove();
-                            break;
-                        }
-                    }
-                }
-            }
-        };
-        if (concurrent) t.start(); else t.run();
-    }
 }

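Both Fulltext.deleteDomain and Fulltext.remove build their query the same way: match the host, and, when a freshdate is given, restrict the match to documents whose load_date_dt lies before that cutoff. A self-contained sketch of that query construction; the field names follow the YaCySchema identifiers used above, and a plain SimpleDateFormat stands in for YaCy's ISO8601Formatter:

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.TimeZone;

// Sketch only: DeleteQueryBuilder is hypothetical; field names taken from the diff above.
public class DeleteQueryBuilder {

    public static String hostDeleteQuery(final String hosthash, final Date freshdate) {
        final StringBuilder q = new StringBuilder("host_id_s:\"").append(hosthash).append('"');
        if (freshdate != null && freshdate.before(new Date())) {
            final SimpleDateFormat iso = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.US);
            iso.setTimeZone(TimeZone.getTimeZone("UTC"));
            q.append(" AND load_date_dt:[* TO ").append(iso.format(freshdate)).append(']');
        }
        return q.toString();
    }

    public static void main(final String[] args) {
        System.out.println(hostDeleteQuery("012345", new Date(0))); // only documents loaded before 1970, i.e. effectively none
        System.out.println(hostDeleteQuery("012345", null));        // the whole host, no age limit
    }
}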
@@ -365,16 +365,21 @@ public class Segment {
         // STORE TO SOLR
         final SolrInputDocument solrInputDoc = this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, condenser, referrerURL, language);
-        tryloop: for (int i = 0; i < 10; i++) {
-            String error = "";
+        String error = null;
+        tryloop: for (int i = 0; i < 20; i++) {
             try {
+                error = null;
                 this.fulltext.putDocument(solrInputDoc);
                 break tryloop;
             } catch ( final IOException e ) {
                 error = "failed to send " + urlNormalform + " to solr";
                 Log.logWarning("SOLR", error + e.getMessage());
+                if (i == 10) this.fulltext.commit();
                 try {Thread.sleep(1000);} catch (InterruptedException e1) {}
+                continue tryloop;
             }
+        }
+        if (error != null) {
             Log.logWarning("SOLR", error + ", pausing Crawler!");
             // pause the crawler!!!
             Switchboard.getSwitchboard().pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL, error);

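The Segment change doubles the retry budget to 20 attempts, forces a commit halfway through, and only escalates (pausing the crawler) when the error is still set after the loop. A compact sketch of that strategy with hypothetical push/commit hooks; Segment itself uses fulltext.putDocument and fulltext.commit:

import java.io.IOException;

// Sketch only: Sink and its hooks are placeholders, not the Segment API.
public class RetryWithMidwayCommit {

    interface Sink {
        void push() throws IOException;
        void commit();
    }

    /** @return null on success, otherwise the last error message (caller escalates, e.g. pauses the crawler) */
    static String tryPush(final Sink sink) {
        String error = null;
        for (int i = 0; i < 20; i++) {
            try {
                error = null;
                sink.push();
                break;                       // success: leave the loop with error == null
            } catch (final IOException e) {
                error = "failed to send document: " + e.getMessage();
                if (i == 10) sink.commit();  // halfway through: flush pending work, then keep retrying
                try { Thread.sleep(1000); } catch (final InterruptedException e1) {}
            }
        }
        return error;
    }
}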