added deletion of hosts during crawl start if deleteold option was given

pull/1/head
Michael Peter Christen 12 years ago
parent d64445c3cb
commit 5fd3b93661

@ -126,7 +126,7 @@ public class CrawlResults {
final String domain = post.get("domain", null);
final String hashpart = domain == null ? null : DigestURI.hosthash6(domain);
if (hashpart != null) {
sb.index.fulltext().deleteDomain(hashpart, false);
sb.index.fulltext().deleteDomain(hashpart, null, false);
ResultURLs.deleteDomain(tabletype, domain, hashpart);
}
}

@ -153,8 +153,13 @@ public class Crawler_p {
final boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start
final boolean restrictedcrawl = fullDomain || subPath || !CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch);
final boolean deleteold = restrictedcrawl && post.getBoolean("deleteold");
final boolean deleteage = restrictedcrawl && "age".equals(post.get("deleteold","off"));
Date deleteageDate = null;
if (deleteage) {
long t = timeParser(true, post.getInt("deleteIfOlderNumber", -1), post.get("deleteIfOlderUnit","year")); // year, month, day, hour
if (t > 0) deleteageDate = new Date(t);
}
final boolean deleteold = (deleteage && deleteageDate != null) || (restrictedcrawl && post.getBoolean("deleteold"));
String crawlingStart0 = post.get("crawlingURL","").trim(); // the crawljob start url
String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
@ -286,7 +291,10 @@ public class Crawler_p {
if (fullDomain) {
siteFilter = CrawlProfile.siteFilter(rootURLs);
if (deleteold) {
for (DigestURI u: rootURLs) sb.index.fulltext().deleteDomain(u.hosthash(), true);
for (DigestURI u: rootURLs) {
int count = sb.index.fulltext().deleteDomain(u.hosthash(), deleteageDate, rootURLs.size() > 0);
if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
}
}
} else if (subPath) {
siteFilter = CrawlProfile.subpathFilter(rootURLs);
@ -294,7 +302,8 @@ public class Crawler_p {
for (DigestURI u: rootURLs) {
String subpath = CrawlProfile.mustMatchSubpath(u);
if (subpath.endsWith(".*")) subpath = subpath.substring(0, subpath.length() - 2);
sb.index.fulltext().remove(subpath, true);
int count = sb.index.fulltext().remove(subpath, deleteageDate, rootURLs.size() > 0);
if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
}
}
}

@ -297,7 +297,7 @@ public class IndexControlURLs_p {
if (post.containsKey("deletedomain")) {
final String hp = post.get("hashpart");
segment.fulltext().deleteDomain(hp, false);
segment.fulltext().deleteDomain(hp, null, false);
// trigger the loading of the table
post.put("statistics", "");
}

@ -24,7 +24,6 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.List;

@ -23,7 +23,6 @@ import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

@ -188,10 +188,12 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
}
@Override
public void deleteByQuery(final String querystring) throws IOException {
if (this.solr0 != null) this.solr0.deleteByQuery(querystring);
if (this.solr1 != null) this.solr1.deleteByQuery(querystring);
public int deleteByQuery(final String querystring) throws IOException {
int count = 0;
if (this.solr0 != null) count += this.solr0.deleteByQuery(querystring);
if (this.solr1 != null) count += this.solr1.deleteByQuery(querystring);
this.clearCache();
return count;
}
/**

@ -139,8 +139,8 @@ public class MultipleSolrConnector extends AbstractSolrConnector implements Solr
}
@Override
public void deleteByQuery(final String querystring) throws IOException {
this.solr.deleteByQuery(querystring);
public int deleteByQuery(final String querystring) throws IOException {
return this.solr.deleteByQuery(querystring);
}
@Override

@ -115,18 +115,18 @@ public class RetrySolrConnector extends AbstractSolrConnector implements SolrCon
}
@Override
public void deleteByQuery(final String querystring) throws IOException {
public int deleteByQuery(final String querystring) throws IOException {
final long t = System.currentTimeMillis() + this.retryMaxTime;
Throwable ee = null;
while (System.currentTimeMillis() < t) try {
this.solrConnector.deleteByQuery(querystring);
return;
return this.solrConnector.deleteByQuery(querystring);
} catch (final Throwable e) {
ee = e;
try {Thread.sleep(10);} catch (final InterruptedException e1) {}
continue;
}
if (ee != null) throw (ee instanceof IOException) ? (IOException) ee : new IOException(ee.getMessage());
return 0;
}
@Override

@ -112,8 +112,10 @@ public class ShardSolrConnector extends AbstractSolrConnector implements SolrCon
}
@Override
public void deleteByQuery(final String querystring) throws IOException {
for (final SolrConnector connector: this.connectors) connector.deleteByQuery(querystring);
public int deleteByQuery(final String querystring) throws IOException {
int count = 0;
for (final SolrConnector connector: this.connectors) count += connector.deleteByQuery(querystring);
return count;
}
/**

@ -82,9 +82,10 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
/**
* delete entries from solr according to the given solr query string
* @param querystring the solr query string that selects the entries to delete
* @return the number of deletions
* @throws IOException
*/
public void deleteByQuery(final String querystring) throws IOException;
public int deleteByQuery(final String querystring) throws IOException;
/**
* check if a given id exists in solr

@ -40,12 +40,14 @@ import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;
import org.apache.solr.client.solrj.response.FacetField;
import org.apache.solr.client.solrj.response.FacetField.Count;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;
public abstract class SolrServerConnector extends AbstractSolrConnector implements SolrConnector {
@ -164,10 +166,14 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
* @throws IOException
*/
@Override
public void deleteByQuery(final String querystring) throws IOException {
public int deleteByQuery(final String querystring) throws IOException {
try {
synchronized (this.server) {
long c0 = this.getQueryCount(querystring);
this.server.deleteByQuery(querystring, this.commitWithinMs);
this.commit();
long c1 = this.getQueryCount(querystring);
return (int) (c1 - c0);
}
} catch (final Throwable e) {
throw new IOException(e);

@ -36,8 +36,10 @@ import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.federate.solr.YaCySchema;
@ -305,35 +307,100 @@ public final class Fulltext implements Iterable<byte[]> {
if (MemoryControl.shortStatus()) clearCache();
}
/**
 * Delete all documents of one host, addressed by the host fragment of the url hash
 * (6 bytes: bytes 6 to 11 of the url hash).
 * The deletion covers three places, in this order: the Solr index, the old
 * metadata structure (urlIndexFile, if present) and the host statistics (statsDump, if present).
 * @param hosthash the hash of the host to be deleted; must have length 6 (checked by an assert only)
 * @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents
 *        with a load date up to this date are deleted from Solr. A date that is not in the past is
 *        ignored, i.e. the Solr deletion is then not restricted by date. The date never restricts
 *        the deletion in urlIndexFile or statsDump.
 * @param concurrent if true, the deletion runs in a newly started thread and this method returns
 *        without waiting for it; if false, the deletion runs on the calling thread
 * @return the value reported by the Solr deleteByQuery call if concurrent is false. If concurrent is
 *         true the counter is read right after the thread was started, so the result is normally 0.
 *         Deletions in urlIndexFile are not counted.
 */
public int deleteDomain(final String hosthash, Date freshdate, boolean concurrent) {
// sanity check on the hash fragment; only active when assertions are enabled
assert hosthash.length() == 6;
// Solr query: match the host id, optionally restricted to load dates in the range [* TO freshdate]
final String q = YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" +
((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
// counter shared between the worker below and the return statement of this method
final AtomicInteger count = new AtomicInteger(0);
Thread t = new Thread() {
public void run() {
// delete in solr
synchronized (Fulltext.this.solr) {
try {
count.addAndGet(Fulltext.this.solr.deleteByQuery(q));
Fulltext.this.solr.commit();
} catch (IOException e) {} // NOTE(review): any IOException raised in this try block is silently dropped
}
// delete in old metadata structure
if (Fulltext.this.urlIndexFile != null) {
final ArrayList<String> l = new ArrayList<String>();
// NOTE(review): 'this' is the anonymous Thread instance here, not the Fulltext object,
// so this lock is private to the current call -- confirm whether Fulltext.this was intended
synchronized (this) {
CloneableIterator<byte[]> i;
try {
// first collect all url hashes that belong to the domain
i = Fulltext.this.urlIndexFile.keys(true, null);
String hash;
while (i != null && i.hasNext()) {
hash = ASCII.String(i.next());
// compares everything from position 6 onwards with the host hash
// (presumably url hashes are 12 characters long -- TODO confirm)
if (hosthash.equals(hash.substring(6))) l.add(hash);
}
// then delete the urls using this list
for (final String h: l) Fulltext.this.urlIndexFile.delete(ASCII.getBytes(h));
} catch (IOException e) {} // NOTE(review): silently dropped; urls collected so far may remain undeleted
}
}
// finally remove the line with statistics (only the first entry matching the host hash)
if (Fulltext.this.statsDump != null) {
final Iterator<HostStat> hsi = Fulltext.this.statsDump.iterator();
HostStat hs;
while (hsi.hasNext()) {
hs = hsi.next();
if (hs.hosthash.equals(hosthash)) {
hsi.remove();
break;
}
}
}
}
};
// t.run() executes the body synchronously on the calling thread; t.start() spawns a new thread
if (concurrent) t.start(); else t.run();
// in the concurrent case the worker has usually not updated the counter yet
return count.get();
}
/**
* remove a full subpath from the index
* @param subpath the left path of the url; at least until the end of the host
* @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted
* @param concurrently if true, then the method returns immediately and runs concurrently
*/
public void remove(String subpath, final boolean concurrently) {
public int remove(String subpath, Date freshdate, final boolean concurrently) {
int p = subpath.substring(0, subpath.length() - 1).lastIndexOf('/');
final String path = p > 8 ? subpath.substring(0, p + 1) : subpath;
DigestURI uri;
try {uri = new DigestURI(path);} catch (MalformedURLException e) {return;}
try {uri = new DigestURI(path);} catch (MalformedURLException e) {return 0;}
final String host = uri.getHost();
final String q = YaCySchema.host_s.getSolrFieldName() + ":" + host +
((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
final AtomicInteger count = new AtomicInteger(0);
Thread t = new Thread(){
public void run() {
final BlockingQueue<SolrDocument> docs = getSolr().concurrentQuery(YaCySchema.host_s.getSolrFieldName() + ":" + host, 0, 1000000, 600000, -1);
final BlockingQueue<SolrDocument> docs = getSolr().concurrentQuery(q, 0, 1000000, 600000, -1);
try {
SolrDocument doc;
boolean removed = false;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
if (u.startsWith(path)) {
remove(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName())));
removed = true;
count.incrementAndGet();
}
}
if (removed) Fulltext.this.solr.commit();
if (count.get() > 0) Fulltext.this.solr.commit();
} catch (InterruptedException e) {}
}
};
if (concurrently) t.start(); else t.run();
return count.get();
}
/**
@ -801,61 +868,4 @@ public final class Fulltext implements Iterable<byte[]> {
this.count = count;
}
}
/**
 * Delete all documents of one host, addressed by the host fragment of the url hash
 * (6 bytes: bytes 6 to 11 of the url hash).
 * The deletion covers three places, in this order: the Solr index, the old
 * metadata structure (urlIndexFile, if present) and the host statistics (statsDump, if present).
 * Nothing is returned and IOExceptions raised during the deletion are dropped silently.
 * @param hosthash the hash of the host to be deleted; must have length 6 (checked by an assert only)
 * @param concurrent if true, the deletion runs in a newly started thread and this method returns
 *        without waiting for it; if false, the deletion runs on the calling thread
 */
public void deleteDomain(final String hosthash, boolean concurrent) {
// sanity check on the hash fragment; only active when assertions are enabled
assert hosthash.length() == 6;
Thread t = new Thread() {
public void run() {
// delete in solr: all documents whose host id equals the given host hash
synchronized (Fulltext.this.solr) {
try {
Fulltext.this.solr.deleteByQuery(YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"");
Fulltext.this.solr.commit();
} catch (IOException e) {} // NOTE(review): any IOException raised in this try block is silently dropped
}
// delete in old metadata structure
if (Fulltext.this.urlIndexFile != null) {
final ArrayList<String> l = new ArrayList<String>();
// NOTE(review): 'this' is the anonymous Thread instance here, not the Fulltext object,
// so this lock is private to the current call -- confirm whether Fulltext.this was intended
synchronized (this) {
CloneableIterator<byte[]> i;
try {
// first collect all url hashes that belong to the domain
i = Fulltext.this.urlIndexFile.keys(true, null);
String hash;
while (i != null && i.hasNext()) {
hash = ASCII.String(i.next());
// compares everything from position 6 onwards with the host hash
// (presumably url hashes are 12 characters long -- TODO confirm)
if (hosthash.equals(hash.substring(6))) l.add(hash);
}
// then delete the urls using this list
for (final String h: l) Fulltext.this.urlIndexFile.delete(ASCII.getBytes(h));
} catch (IOException e) {} // NOTE(review): silently dropped; urls collected so far may remain undeleted
}
}
// finally remove the line with statistics (only the first entry matching the host hash)
if (Fulltext.this.statsDump != null) {
final Iterator<HostStat> hsi = Fulltext.this.statsDump.iterator();
HostStat hs;
while (hsi.hasNext()) {
hs = hsi.next();
if (hs.hosthash.equals(hosthash)) {
hsi.remove();
break;
}
}
}
}
};
// t.run() executes the body synchronously on the calling thread; t.start() spawns a new thread
if (concurrent) t.start(); else t.run();
}
}

@ -365,16 +365,21 @@ public class Segment {
// STORE TO SOLR
final SolrInputDocument solrInputDoc = this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, condenser, referrerURL, language);
tryloop: for (int i = 0; i < 10; i++) {
String error = "";
String error = null;
tryloop: for (int i = 0; i < 20; i++) {
try {
error = null;
this.fulltext.putDocument(solrInputDoc);
break tryloop;
} catch ( final IOException e ) {
error = "failed to send " + urlNormalform + " to solr";
Log.logWarning("SOLR", error + e.getMessage());
if (i == 10) this.fulltext.commit();
try {Thread.sleep(1000);} catch (InterruptedException e1) {}
continue tryloop;
}
}
if (error != null) {
Log.logWarning("SOLR", error + ", pausing Crawler!");
// pause the crawler!!!
Switchboard.getSwitchboard().pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL, error);

Loading…
Cancel
Save