Merge branch 'master' of git@github.com:yacy/yacy_search_server.git

pull/8/head
Michael Peter Christen 10 years ago
commit dbf9e3503d

@ -17,7 +17,7 @@
<tr> <tr>
<td>Documents in current queue</td> <td>Documents in current queue</td>
<td>#[querysize]#</td> <td>#[querysize]#</td>
<td>#(reindexjobrunning)#::<input type="submit" value="refresh page" class="btn btn-primary"/>#(/reindexjobrunning)#</td> <td>#(reindexjobrunning)#::<input type="submit" value="refresh page" class="btn btn-success"/>#(/reindexjobrunning)#</td>
</tr> </tr>
<tr> <tr>
<td>Documents processed</td> <td>Documents processed</td>
@ -37,7 +37,7 @@
</table> </table>
#(reindexjobrunning)# #(reindexjobrunning)#
<input type="submit" name="reindexnow" value="start reindex job now" class="btn btn-primary"/> <input type="submit" name="reindexnow" value="start reindex job now" class="btn btn-primary"/>
::<input type="submit" name="stopreindex" value="stop reindexing" class="btn btn-primary"/> ::<input type="submit" name="stopreindex" value="stop reindexing" class="btn btn-danger"/>
#(/reindexjobrunning)# #(/reindexjobrunning)#
<p class="info">#[infomessage]#</p> <p class="info">#[infomessage]#</p>
</fieldset> </fieldset>
@ -57,6 +57,24 @@
#(/reindexjobrunning)# #(/reindexjobrunning)#
</td></tr></table> </td></tr></table>
</form> </form>
<h2>Re-Crawl Index Documents</h2>
<p>Searches the local index and selects documents to add to the crawler (recrawl the document).
This runs transparent as background job. Documents are added to the crawler only if no other crawls are active
and are added in small chunks.</p>
<form action="IndexReIndexMonitor_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
#(recrawljobrunning)#
<input type="submit" name="recrawlnow" value="start recrawl job now" class="btn btn-primary"/>
to re-crawl documents with fresh_date_dt before today.
::<input type="submit" name="stoprecrawl" value="stop recrawl job" class="btn btn-danger"/>
<table>
<tr>
<td>Documents to process</td> <td>#[docCount]#</td> <td> with fresh_date_dt before today</td>
</tr>
</table>
#(/recrawljobrunning)#
</fieldset>
</form>
#%env/templates/footer.template%# #%env/templates/footer.template%#
</body> </body>
</html> </html>

@ -21,6 +21,7 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.OrderedScoreMap; import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.kelondro.workflow.BusyThread; import net.yacy.kelondro.workflow.BusyThread;
import net.yacy.migration; import net.yacy.migration;
import net.yacy.crawler.RecrawlBusyThread;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import net.yacy.search.index.ReindexSolrBusyThread; import net.yacy.search.index.ReindexSolrBusyThread;
@ -36,26 +37,26 @@ public class IndexReIndexMonitor_p {
prop.put("docsprocessed", "0"); prop.put("docsprocessed", "0");
prop.put("currentselectquery",""); prop.put("currentselectquery","");
BusyThread bt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME); BusyThread reidxbt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME);
if (bt == null) { if (reidxbt == null) {
if (post != null && post.containsKey("reindexnow") && sb.index.fulltext().connectedLocalSolr()) { if (post != null && post.containsKey("reindexnow") && sb.index.fulltext().connectedLocalSolr()) {
migration.reindexToschema(sb); migration.reindexToschema(sb);
prop.put("querysize", "0"); prop.put("querysize", "0");
prop.put("infomessage","reindex job started"); prop.put("infomessage","reindex job started");
bt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME); //get new created job for following posts reidxbt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME); //get new created job for following posts
} }
} }
if (bt != null) { if (reidxbt != null) {
prop.put("reindexjobrunning", 1); prop.put("reindexjobrunning", 1);
prop.put("querysize", bt.getJobCount()); prop.put("querysize", reidxbt.getJobCount());
if (bt instanceof ReindexSolrBusyThread) { if (reidxbt instanceof ReindexSolrBusyThread) {
prop.put("docsprocessed", ((ReindexSolrBusyThread) bt).getProcessed()); prop.put("docsprocessed", ((ReindexSolrBusyThread) reidxbt).getProcessed());
prop.put("currentselectquery","q="+((ReindexSolrBusyThread) bt).getCurrentQuery()); prop.put("currentselectquery","q="+((ReindexSolrBusyThread) reidxbt).getCurrentQuery());
// prepare list of fields in queue // prepare list of fields in queue
final OrderedScoreMap<String> querylist = ((ReindexSolrBusyThread) bt).getQueryList(); final OrderedScoreMap<String> querylist = ((ReindexSolrBusyThread) reidxbt).getQueryList();
if (querylist != null) { if (querylist != null) {
int i = 0; int i = 0;
for (String oneqs : querylist) { // just use fieldname from query (fieldname:[* TO *]) for (String oneqs : querylist) { // just use fieldname from query (fieldname:[* TO *])
@ -86,6 +87,34 @@ public class IndexReIndexMonitor_p {
prop.putHTML("infomessage", "! reindex works only with embedded Solr index !"); prop.putHTML("infomessage", "! reindex works only with embedded Solr index !");
} }
} }
// recrawl job handling
BusyThread recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
if (recrawlbt == null) {
if (post != null && post.containsKey("recrawlnow") && sb.index.fulltext().connectedLocalSolr()) {
sb.deployThread(RecrawlBusyThread.THREAD_NAME,
"ReCrawl",
"recrawl existing documents",
null,
new RecrawlBusyThread(Switchboard.getSwitchboard()),
1000);
recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
}
}
if (recrawlbt != null) {
if (post != null && post.containsKey("stoprecrawl")) {
sb.terminateThread(RecrawlBusyThread.THREAD_NAME, false);
prop.put("recrawljobrunning",0);
} else {
prop.put("recrawljobrunning", 1);
prop.put("recrawljobrunning_docCount", ((RecrawlBusyThread) recrawlbt).urlsfound);
}
} else {
prop.put("recrawljobrunning", 0);
}
// return rewrite properties // return rewrite properties
return prop; return prop;
} }

@ -519,7 +519,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
final StringBuilder qtmp = new StringBuilder(this.searchpart.length() + 10); final StringBuilder qtmp = new StringBuilder(this.searchpart.length() + 10);
for (final Map.Entry<String, String> element: getAttributes().entrySet()) { for (final Map.Entry<String, String> element: getAttributes().entrySet()) {
qtmp.append('&'); qtmp.append('&');
qtmp.append(element.getKey()); qtmp.append(escape(element.getKey()));
qtmp.append('='); qtmp.append('=');
qtmp.append(escape(element.getValue())); qtmp.append(escape(element.getValue()));
} }
@ -1007,6 +1007,11 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
return token; return token;
} }
/**
* Evaluates url search part and returns attribute '=' value pairs
*
* @return map key=attribute name, value=string after '='
*/
public Map<String, String> getAttributes() { public Map<String, String> getAttributes() {
Map<String, String > map = new LinkedHashMap<>(); Map<String, String > map = new LinkedHashMap<>();
if (this.searchpart == null) return map; if (this.searchpart == null) return map;
@ -1016,7 +1021,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
if (p != -1) { if (p != -1) {
map.put(element.substring(0, p), element.substring(p + 1)); map.put(element.substring(0, p), element.substring(p + 1));
} else { } else {
map.put(element.substring(0, p), ""); if (!element.isEmpty()) map.put(element, "");
} }
} }
return map; return map;

@ -0,0 +1,175 @@
/**
* RecrawlBusyThread.java
* Copyright 2015 by Burkhard Buelte
* First released 15.05.2015 at http://yacy.net
*
* This is a part of YaCy, a peer-to-peer based web search engine
*
* LICENSE
*
* This library is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt If not, see
* <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.workflow.AbstractBusyThread;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionSchema;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
/**
 * Selects documents by a query from the local index
 * and feeds the found urls to the crawler to recrawl the documents.
 * This is intended to keep the index up-to-date.
 * Currently the documents are selected by an expired fresh_date_dt field
 * and added to the crawler in smaller chunks (see chunksize) as long as no other crawl is running.
 */
public class RecrawlBusyThread extends AbstractBusyThread {

    /** Name under which this thread is deployed and looked up. */
    public final static String THREAD_NAME = "recrawlindex";

    /** Solr query selecting the documents to recrawl (fresh_date_dt up to NOW/DAY-1DAY). */
    public String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]"; // current query
    private int chunkstart = 0; // offset of the next chunk within the query result
    private final int chunksize = 200; // number of documents requested per query
    final Switchboard sb;
    private final Set<DigestURL> urlstack; // buffer of urls to recrawl
    /**
     * Number of documents reported by the last query (0 if the query could not be run).
     * Declared volatile because it is read directly as a public field by code outside
     * this thread (status display) while job() updates it.
     */
    public volatile long urlsfound = 0;

    /**
     * Creates the recrawl job with minimum thread priority, an idle sleep of
     * 10 minutes and a busy sleep of 2 minutes.
     *
     * @param xsb the switchboard providing index, crawler and loader access
     */
    public RecrawlBusyThread(Switchboard xsb) {
        super(3000, 1000); // set lower limits of cycle delay
        this.setIdleSleep(10*60000); // set actual cycle delays
        this.setBusySleep(2*60000);
        this.setPriority(Thread.MIN_PRIORITY);

        this.sb = xsb;
        this.urlstack = new HashSet<DigestURL>();
    }

    /**
     * Feeds all buffered urls to the local crawler and empties the buffer afterwards.
     * Urls rejected by the crawl stacker or by the notice queue are logged and dropped.
     *
     * @return true if urls were added/accepted to the crawler
     */
    private boolean feedToCrawler() {

        int added = 0;

        if (!this.urlstack.isEmpty()) {
            final CrawlProfile profile = sb.crawler.defaultTextSnippetGlobalProfile;

            for (DigestURL url : this.urlstack) {
                final Request request = sb.loader.request(url, true, true);
                String acceptedError = sb.crawlStacker.checkAcceptanceChangeable(url, profile, 0);
                if (acceptedError == null) {
                    acceptedError = sb.crawlStacker.checkAcceptanceInitially(url, profile);
                }
                if (acceptedError != null) {
                    ConcurrentLog.info(THREAD_NAME, "addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);
                    continue;
                }
                // push returns an error description, or null if the url was accepted
                final String s = sb.crawlQueues.noticeURL.push(NoticedURL.StackType.LOCAL, request, profile, sb.robots);
                if (s != null) {
                    ConcurrentLog.info(THREAD_NAME, "addToCrawler: failed to add " + url.toNormalform(true) + ": " + s);
                } else {
                    added++;
                }
            }
            // the whole buffer is consumed, whether or not every url was accepted
            this.urlstack.clear();
        }
        return (added > 0);
    }

    /**
     * Process query and hand over urls to the crawler.
     * One call either fills the url buffer from the index (buffer empty)
     * or feeds the buffered urls to the crawler (buffer filled).
     *
     * @return true if something processed
     */
    @Override
    public boolean job() {
        // other crawls are running, do nothing
        if (sb.crawlQueues.coreCrawlJobSize() > 0) {
            return false;
        }

        if (this.urlstack.isEmpty()) {
            return processSingleQuery();
        } else {
            return feedToCrawler();
        }
    }

    /**
     * Selects documents to recrawl the urls.
     * Queries the next chunk of documents and buffers their urls.
     * Failures are logged and treated as an empty result.
     *
     * @return true if query has more results
     */
    private boolean processSingleQuery() {
        if (!this.urlstack.isEmpty()) {
            return true;
        }
        SolrDocumentList docList = null;
        SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
        if (!solrConnector.isClosed()) {
            try {
                docList = solrConnector.getDocumentListByQuery(currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)",
                        CollectionSchema.fresh_date_dt.getSolrFieldName() + " asc", this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName());
                this.urlsfound = docList.getNumFound();
            } catch (Throwable e) {
                // best effort: keep the job alive, but leave a trace of why nothing was selected
                ConcurrentLog.info(THREAD_NAME, "processSingleQuery: query failed: " + e);
                this.urlsfound = 0;
            }
        } else {
            this.urlsfound = 0;
        }

        if (docList != null) {
            for (SolrDocument doc : docList) {
                final String sku = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
                try {
                    this.urlstack.add(new DigestURL(sku));
                } catch (MalformedURLException ex) {
                    // an unparsable url cannot be recrawled; skip it but report it
                    ConcurrentLog.info(THREAD_NAME, "processSingleQuery: skipping invalid url " + sku + ": " + ex.getMessage());
                }
            }
            this.chunkstart = this.chunkstart + this.chunksize;
        }

        if (this.urlsfound <= this.chunkstart) {
            // all results visited (or nothing found): start over with the first chunk
            // TODO: add a stop condition
            this.chunkstart = 0;
            return false;
        }
        return true;
    }

    /**
     * @return number of urls currently buffered for recrawl
     */
    @Override
    public int getJobCount() {
        return this.urlstack.size();
    }

    /**
     * Releases memory by discarding the buffered urls.
     */
    @Override
    public void freemem() {
        this.urlstack.clear();
    }

}

@ -60,7 +60,7 @@ public class FileLoader {
public Response load(final Request request, boolean acceptOnlyParseable) throws IOException { public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
DigestURL url = request.url(); DigestURL url = request.url();
if (!url.getProtocol().equals("file")) throw new IOException("wrong loader for FileLoader: " + url.getProtocol()); if (!url.getProtocol().equals("file")) throw new IOException("wrong protocol for FileLoader: " + url.getProtocol());
RequestHeader requestHeader = new RequestHeader(); RequestHeader requestHeader = new RequestHeader();
if (request.referrerhash() != null) { if (request.referrerhash() != null) {

@ -252,7 +252,7 @@ public abstract class AbstractBusyThread extends AbstractThread implements BusyT
// do a clean-up // do a clean-up
this.freemem(); this.freemem();
// sleep a while // sleep a while
ratz(this.idlePause + 1000*(outofmemoryCycles++)); ratz(this.idlePause + 1000*(outofmemoryCycles++ % 0x0F)); // limit extra sleep time (oomCycles can grow big over time)
idletime += System.currentTimeMillis() - timestamp; idletime += System.currentTimeMillis() - timestamp;
} }
} }

@ -3294,7 +3294,7 @@ public final class Switchboard extends serverSwitch {
if (acceptedError == null) acceptedError = this.crawlStacker.checkAcceptanceInitially(url, profile); if (acceptedError == null) acceptedError = this.crawlStacker.checkAcceptanceInitially(url, profile);
if (acceptedError != null) { if (acceptedError != null) {
this.log.info("addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError); this.log.info("addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);
return; continue;
} }
final String s; final String s;
if (asglobal) { if (asglobal) {

@ -1888,7 +1888,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
List<String> a = new ArrayList<String>(dimension); List<String> a = new ArrayList<String>(dimension);
for (int i = 0; i < dimension; i++) a.add("http"); for (int i = 0; i < dimension; i++) a.add("http");
if (iplist == null) return a; if (iplist == null) return a;
for (Object ip: iplist) a.set(Integer.parseInt(((String) ip).substring(0, 3)), ((String) ip).substring(4)); for (Object ip : iplist) {
// ip format is 001-https but can be 4 digits 1011-https
int i = ((String) ip).indexOf('-');
a.set(Integer.parseInt(((String) ip).substring(0, i)), ((String) ip).substring(i+1));
}
return a; return a;
} }

@ -4,6 +4,7 @@ import static org.junit.Assert.*;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
import java.util.Map;
import java.util.TreeSet; import java.util.TreeSet;
import org.junit.Test; import org.junit.Test;
@ -158,7 +159,7 @@ public class MultiProtocolURLTest {
for (String[] testString : testStrings) { for (String[] testString : testStrings) {
// desired conversion result // desired conversion result
System.out.print("orig uri: " + testString[0]); System.out.print("toNormalform orig uri: " + testString[0]);
String shouldBe = testString[1]; String shouldBe = testString[1];
// conversion result // conversion result
String resultUrl = new MultiProtocolURL(testString[0]).toNormalform(true); String resultUrl = new MultiProtocolURL(testString[0]).toNormalform(true);
@ -167,6 +168,30 @@ public class MultiProtocolURLTest {
System.out.println(" -> " + resultUrl); System.out.println(" -> " + resultUrl);
} }
} }
/**
 * Test of getAttributes method, of class MultiProtocolURL.
 */
@Test
public void testGetAttribute() throws Exception {
    // each row: { url to parse, name of an attribute expected to map to an empty value }
    final String[][] cases = {
        {"http://yacy.net?&test", "test"}
    };

    for (final String[] row : cases) {
        final String uri = row[0];
        final String attributeName = row[1];
        System.out.print("test getAttribute: " + uri);

        final MultiProtocolURL parsed = new MultiProtocolURL(uri);
        final Map<String, String> attributes = parsed.getAttributes();

        // an attribute given without '=' must be present with an empty string value
        assertEquals("", attributes.get(attributeName));
        System.out.println(" -> " + parsed.toNormalform(false));
    }
}
} }

Loading…
Cancel
Save