Record recrawl calls to make them schedulable

pull/154/head
luccioman 7 years ago
parent 433e241e4f
commit 0c9e0b3566

@ -26,7 +26,6 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.util.Date; import java.util.Date;
import java.util.Iterator;
import org.apache.http.Header; import org.apache.http.Header;
import org.apache.http.HttpResponse; import org.apache.http.HttpResponse;
@ -40,7 +39,6 @@ import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.data.TransactionManager; import net.yacy.data.TransactionManager;
import net.yacy.data.WorkTables; import net.yacy.data.WorkTables;
import net.yacy.document.importer.MediawikiImporter; import net.yacy.document.importer.MediawikiImporter;
@ -112,7 +110,7 @@ public class IndexImportMediawiki_p {
MultiProtocolURL sourceURL = null; MultiProtocolURL sourceURL = null;
int status = 0; int status = 0;
String sourceFilePath = ""; String sourceFilePath = "";
final Row lastExecutedCall = selectLastExecutedCall(post, sb); final Row lastExecutedCall = WorkTables.selectLastExecutedApiCall("IndexImportMediawiki_p.html", post, sb);
Date lastExecutionDate = null; Date lastExecutionDate = null;
if (lastExecutedCall != null) { if (lastExecutedCall != null) {
lastExecutionDate = lastExecutedCall.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null); lastExecutionDate = lastExecutedCall.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null);
@ -192,47 +190,6 @@ public class IndexImportMediawiki_p {
return prop; return prop;
} }
/**
 * Finds the latest recorded execution of the "IndexImportMediawiki_p.html" API call
 * matching the current request.
 * <p>
 * When the request carries the recorded call primary key, the row is read directly by
 * that key. Otherwise every row recorded under the same API URL (parameters included)
 * is examined and the one with the most recent last execution date is kept; rows
 * without a last execution date are ignored. I/O and space errors are logged and not
 * propagated.
 *
 * @param post Servlet request parameters. Must not be null.
 * @param sb the {@link Switchboard} instance. Must not be null.
 * @return the most recently recorded call to this API with the same parameters, or
 *         null when the tables are not available or no matching row was found
 */
private static Row selectLastExecutedCall(final serverObjects post, final Switchboard sb) {
    if (sb.tables == null) {
        return null;
    }
    Row newest = null;
    try {
        if (post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) {
            /* Re-execution of a recorded call : direct lookup on the primary key */
            final byte[] pk = UTF8.getBytes(post.get(WorkTables.TABLE_API_COL_APICALL_PK));
            newest = sb.tables.select(WorkTables.TABLE_API_NAME, pk);
        } else {
            /* No primary key : match on the API URL as recorded (including parameters) */
            final byte[] recordedUrl = UTF8
                    .getBytes(WorkTables.generateRecordedURL(post, "IndexImportMediawiki_p.html"));
            final Iterator<Row> candidates = sb.tables.iterator(WorkTables.TABLE_API_NAME,
                    WorkTables.TABLE_API_COL_URL, recordedUrl);
            while (candidates.hasNext()) {
                final Row candidate = candidates.next();
                if (candidate == null) {
                    continue;
                }
                final Date executedAt = candidate.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null);
                if (executedAt == null) {
                    /* Never executed : cannot be the last executed call */
                    continue;
                }
                /* Keep scanning after a first hit : a later row may be more recent */
                if (newest == null
                        || newest.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null).before(executedAt)) {
                    newest = candidate;
                }
            }
        }
    } catch (final IOException | SpaceExceededException e) {
        ConcurrentLog.logException(e);
    }
    return newest;
}
/** /**
* @param fileURL the file URL. Must not be null. * @param fileURL the file URL. Must not be null.
* @return the last modified date for the file at fileURL, or 0L when unknown or when an error occurred * @return the last modified date for the file at fileURL, or 0L when unknown or when an error occurred

@ -25,12 +25,15 @@ import java.time.format.FormatStyle;
import java.util.Locale; import java.util.Locale;
import net.yacy.migration; import net.yacy.migration;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.OrderedScoreMap; import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.RecrawlBusyThread; import net.yacy.crawler.RecrawlBusyThread;
import net.yacy.data.TransactionManager; import net.yacy.data.TransactionManager;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.blob.Tables.Row;
import net.yacy.kelondro.workflow.BusyThread; import net.yacy.kelondro.workflow.BusyThread;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import net.yacy.search.index.ReindexSolrBusyThread; import net.yacy.search.index.ReindexSolrBusyThread;
@ -38,8 +41,11 @@ import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch; import net.yacy.server.serverSwitch;
public class IndexReIndexMonitor_p { public class IndexReIndexMonitor_p {
/** This servlet name, used for identifying recorded API calls */
private static final String SERVLET_NAME = IndexReIndexMonitor_p.class.getSimpleName() + ".html";
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
final Switchboard sb = (Switchboard) env; final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects(); final serverObjects prop = new serverObjects();
@ -132,6 +138,22 @@ public class IndexReIndexMonitor_p {
sb.deployThread(RecrawlBusyThread.THREAD_NAME, "ReCrawl", "recrawl existing documents", null, sb.deployThread(RecrawlBusyThread.THREAD_NAME, "ReCrawl", "recrawl existing documents", null,
new RecrawlBusyThread(Switchboard.getSwitchboard(), recrawlQuery, inclerrdoc), 1000); new RecrawlBusyThread(Switchboard.getSwitchboard(), recrawlQuery, inclerrdoc), 1000);
recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME); recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
/* store this call as an api call for easy scheduling possibility */
if(sb.tables != null) {
/* We avoid creating a duplicate of any already recorded API call with the same parameters */
final Row lastExecutedCall = WorkTables
.selectLastExecutedApiCall(IndexReIndexMonitor_p.SERVLET_NAME, post, sb);
if (lastExecutedCall != null && !post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) {
byte[] lastExecutedCallPk = lastExecutedCall.getPK();
if (lastExecutedCallPk != null) {
post.add(WorkTables.TABLE_API_COL_APICALL_PK, UTF8.String(lastExecutedCallPk));
}
}
sb.tables.recordAPICall(post, IndexReIndexMonitor_p.SERVLET_NAME, WorkTables.TABLE_API_TYPE_CRAWLER,
"Recrawl documents matching selection query : " + recrawlQuery);
}
} else if(post.containsKey("simulateRecrawl") && sb.index.fulltext().connectedLocalSolr()) { } else if(post.containsKey("simulateRecrawl") && sb.index.fulltext().connectedLocalSolr()) {
SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector(); SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
if (!solrConnector.isClosed()) { if (!solrConnector.isClosed()) {

@ -100,31 +100,93 @@ public class WorkTables extends Tables {
/** /**
* *
* @param post the api call request parameters. Must not be null. * @param post the api call eventual request parameters.
* @param servletName the name of the servlet * @param servletName the name of the servlet. Must not be null.
* @return the API URL to be recorded * @return the API URL to be recorded, formatted to include request parameters as URL query parameters
*/ */
public static String generateRecordedURL(final serverObjects post, final String servletName) { public static String generateRecordedURL(final serverObjects post, final String servletName) {
/* Before API URL serialization, we set any eventual transaction token value to empty : /* Before API URL serialization, we set any eventual transaction token value to empty :
* this will later help identify a new valid transaction token will be necessary, * this will later help identify a new valid transaction token will be necessary,
* but without revealing it in the URL displayed in the process scheduler and storing an invalid value */ * but prevents revealing it in the URL displayed in the process scheduler and prevents storing an outdated value */
final String transactionToken = post.get(TransactionManager.TRANSACTION_TOKEN_PARAM); final String transactionToken;
if(transactionToken != null) { if(post != null) {
transactionToken = post.get(TransactionManager.TRANSACTION_TOKEN_PARAM);
} else {
transactionToken = null;
}
if(transactionToken != null && post != null) {
post.put(TransactionManager.TRANSACTION_TOKEN_PARAM, ""); post.put(TransactionManager.TRANSACTION_TOKEN_PARAM, "");
} }
// generate the apicall url - without the apicall attributes // generate the apicall url - without the apicall attributes
final String apiurl = /*"http://localhost:" + getConfig("port", "8090") +*/ "/" + servletName + "?" + post.toString(); String apiurl = "/" + servletName;
if(post != null) {
apiurl += "?" + post.toString();
}
/* Now restore the eventual transaction token to prevent side effects on the post object eventually still used by the caller */ /* Now restore the eventual transaction token to prevent side effects on the post object eventually still used by the caller */
if(transactionToken != null) { if(post != null) {
post.put(TransactionManager.TRANSACTION_TOKEN_PARAM, transactionToken); if(transactionToken != null) {
} else { post.put(TransactionManager.TRANSACTION_TOKEN_PARAM, transactionToken);
post.remove(TransactionManager.TRANSACTION_TOKEN_PARAM); } else {
post.remove(TransactionManager.TRANSACTION_TOKEN_PARAM);
}
} }
return apiurl; return apiurl;
} }
/**
 * Retrieves the latest recorded execution of an API call matching the given request.
 * <p>
 * If the request parameters hold the recorded call primary key, the row is fetched
 * directly by that key. Otherwise the rows recorded under the same API URL (request
 * parameters included) are scanned and the one with the most recent last execution
 * date is retained; rows with no last execution date are skipped. I/O and space
 * errors are logged and not propagated.
 *
 * @param servletName the servlet name used to identify the API when the call is
 *            recorded. When null, nothing is searched.
 * @param post Servlet request parameters. A null value is tolerated here : the search
 *            is then done on the API URL only.
 * @param sb the {@link Switchboard} instance. When null, or when its tables are not
 *            available, nothing is searched.
 * @return the most recently recorded call to the given API with the same parameters,
 *         or null when none was found or data is not accessible
 */
public static Row selectLastExecutedApiCall(final String servletName, final serverObjects post, final Switchboard sb) {
    if (servletName == null || sb == null || sb.tables == null) {
        return null;
    }
    Row mostRecent = null;
    try {
        final boolean hasPrimaryKey = post != null && post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK);
        if (hasPrimaryKey) {
            /* Re-execution of a recorded call : direct lookup on its primary key */
            final byte[] primaryKey = UTF8.getBytes(post.get(WorkTables.TABLE_API_COL_APICALL_PK));
            mostRecent = sb.tables.select(WorkTables.TABLE_API_NAME, primaryKey);
        } else {
            /* Otherwise match on the API URL as recorded (including parameters) */
            final byte[] urlBytes = UTF8.getBytes(WorkTables.generateRecordedURL(post, servletName));
            for (final Iterator<Row> rows = sb.tables.iterator(WorkTables.TABLE_API_NAME,
                    WorkTables.TABLE_API_COL_URL, urlBytes); rows.hasNext();) {
                final Row row = rows.next();
                if (row == null) {
                    continue;
                }
                final Date rowLastExec = row.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null);
                if (rowLastExec == null) {
                    /* Never executed : not a candidate */
                    continue;
                }
                /*
                 * No early exit on the first match : several rows may share the same URL
                 * and only the most recently executed one is wanted
                 */
                if (mostRecent == null
                        || mostRecent.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null).before(rowLastExec)) {
                    mostRecent = row;
                }
            }
        }
    } catch (final IOException | SpaceExceededException e) {
        ConcurrentLog.logException(e);
    }
    return mostRecent;
}
/** /**
* recording of a api call. stores the call parameters into the API database table * recording of a api call. stores the call parameters into the API database table

Loading…
Cancel
Save