|
|
@ -35,11 +35,8 @@ import net.yacy.crawler.retrieval.Request;
|
|
|
|
import net.yacy.kelondro.workflow.AbstractBusyThread;
|
|
|
|
import net.yacy.kelondro.workflow.AbstractBusyThread;
|
|
|
|
import net.yacy.search.Switchboard;
|
|
|
|
import net.yacy.search.Switchboard;
|
|
|
|
import net.yacy.search.schema.CollectionSchema;
|
|
|
|
import net.yacy.search.schema.CollectionSchema;
|
|
|
|
import org.apache.solr.client.solrj.SolrQuery;
|
|
|
|
|
|
|
|
import org.apache.solr.client.solrj.response.QueryResponse;
|
|
|
|
|
|
|
|
import org.apache.solr.common.SolrDocument;
|
|
|
|
import org.apache.solr.common.SolrDocument;
|
|
|
|
import org.apache.solr.common.SolrDocumentList;
|
|
|
|
import org.apache.solr.common.SolrDocumentList;
|
|
|
|
import org.apache.solr.common.params.CommonParams;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
/**
|
|
|
|
* Selects documents by a query from the local index
|
|
|
|
* Selects documents by a query from the local index
|
|
|
@ -63,10 +60,10 @@ public class RecrawlBusyThread extends AbstractBusyThread {
|
|
|
|
super(3000, 1000); // set lower limits of cycle delay
|
|
|
|
super(3000, 1000); // set lower limits of cycle delay
|
|
|
|
this.setIdleSleep(10*60000); // set actual cycle delays
|
|
|
|
this.setIdleSleep(10*60000); // set actual cycle delays
|
|
|
|
this.setBusySleep(2*60000);
|
|
|
|
this.setBusySleep(2*60000);
|
|
|
|
|
|
|
|
this.setPriority(Thread.MIN_PRIORITY);
|
|
|
|
|
|
|
|
|
|
|
|
this.sb = xsb;
|
|
|
|
this.sb = xsb;
|
|
|
|
urlstack = new HashSet<DigestURL>();
|
|
|
|
urlstack = new HashSet<DigestURL>();
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
/**
|
|
|
@ -102,11 +99,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
this.urlstack.clear();
|
|
|
|
this.urlstack.clear();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return (added > 0);
|
|
|
|
if (added > 0) {
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
/**
|
|
|
@ -116,13 +109,13 @@ public class RecrawlBusyThread extends AbstractBusyThread {
|
|
|
|
*/
|
|
|
|
*/
|
|
|
|
@Override
|
|
|
|
@Override
|
|
|
|
public boolean job() {
|
|
|
|
public boolean job() {
|
|
|
|
|
|
|
|
// other crawls are running, do nothing
|
|
|
|
if (sb.crawlQueues.coreCrawlJobSize() > 0) {
|
|
|
|
if (sb.crawlQueues.coreCrawlJobSize() > 0) {
|
|
|
|
return false;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (this.urlstack.isEmpty()) {
|
|
|
|
if (this.urlstack.isEmpty()) {
|
|
|
|
processSingleQuery();
|
|
|
|
return processSingleQuery();
|
|
|
|
return true;
|
|
|
|
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
return feedToCrawler();
|
|
|
|
return feedToCrawler();
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -131,27 +124,24 @@ public class RecrawlBusyThread extends AbstractBusyThread {
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
/**
|
|
|
|
* Selects documents to recrawl the urls
|
|
|
|
* Selects documents to recrawl the urls
|
|
|
|
|
|
|
|
* @return true if query has more results
|
|
|
|
*/
|
|
|
|
*/
|
|
|
|
private void processSingleQuery() {
|
|
|
|
private boolean processSingleQuery() {
|
|
|
|
if (!this.urlstack.isEmpty()) {
|
|
|
|
if (!this.urlstack.isEmpty()) {
|
|
|
|
return;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
SolrDocumentList docList = null;
|
|
|
|
SolrDocumentList docList = null;
|
|
|
|
SolrQuery solrQuery = new SolrQuery();
|
|
|
|
|
|
|
|
solrQuery.set(CommonParams.Q, currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)"); // except this yacy special
|
|
|
|
|
|
|
|
solrQuery.set("sort", CollectionSchema.fresh_date_dt.getSolrFieldName() + " asc");
|
|
|
|
|
|
|
|
solrQuery.set(CommonParams.FL, CollectionSchema.sku.getSolrFieldName());
|
|
|
|
|
|
|
|
solrQuery.set(CommonParams.ROWS, this.chunksize);
|
|
|
|
|
|
|
|
solrQuery.set(CommonParams.START, this.chunkstart);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
|
|
|
|
SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
|
|
|
|
if (!solrConnector.isClosed()) {
|
|
|
|
if (!solrConnector.isClosed()) {
|
|
|
|
try {
|
|
|
|
try {
|
|
|
|
QueryResponse rsp = solrConnector.getResponseByParams(solrQuery);
|
|
|
|
docList = solrConnector.getDocumentListByQuery(currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)",
|
|
|
|
docList = rsp.getResults();
|
|
|
|
CollectionSchema.fresh_date_dt.getSolrFieldName() + " asc", this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName());
|
|
|
|
this.urlsfound = docList.getNumFound();
|
|
|
|
this.urlsfound = docList.getNumFound();
|
|
|
|
} catch (Throwable e) {
|
|
|
|
} catch (Throwable e) {
|
|
|
|
|
|
|
|
this.urlsfound = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
this.urlsfound =0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (docList != null) {
|
|
|
|
if (docList != null) {
|
|
|
@ -161,14 +151,15 @@ public class RecrawlBusyThread extends AbstractBusyThread {
|
|
|
|
} catch (MalformedURLException ex) {
|
|
|
|
} catch (MalformedURLException ex) {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
this.chunkstart = this.chunkstart + this.chunksize;
|
|
|
|
this.chunkstart = this.chunkstart + urlstack.size();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (docList.getNumFound() <= this.chunkstart) {
|
|
|
|
|
|
|
|
this.chunkstart = 0;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (this.urlsfound <= this.chunkstart) {
|
|
|
|
|
|
|
|
this.chunkstart = 0;
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// TODO: add a stop condition
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
@Override
|
|
|
|