Record recrawl calls to make them schedulable

pull/154/head
luccioman 7 years ago
parent 433e241e4f
commit 0c9e0b3566

@ -26,7 +26,6 @@ import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.Iterator;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
@ -40,7 +39,6 @@ import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.data.TransactionManager;
import net.yacy.data.WorkTables;
import net.yacy.document.importer.MediawikiImporter;
@ -112,7 +110,7 @@ public class IndexImportMediawiki_p {
MultiProtocolURL sourceURL = null;
int status = 0;
String sourceFilePath = "";
final Row lastExecutedCall = selectLastExecutedCall(post, sb);
final Row lastExecutedCall = WorkTables.selectLastExecutedApiCall("IndexImportMediawiki_p.html", post, sb);
Date lastExecutionDate = null;
if (lastExecutedCall != null) {
lastExecutionDate = lastExecutedCall.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null);
@ -192,47 +190,6 @@ public class IndexImportMediawiki_p {
return prop;
}
/**
 * Looks up, in the table of recorded API calls, the latest execution of this
 * servlet matching the given request.
 *
 * @param post Servlet request parameters. Must not be null.
 * @param sb the {@link Switchboard} instance. Must not be null.
 * @return the row selected by primary key when the request carries one,
 *         otherwise the row recorded on the same API URL having the most
 *         recent last execution date; null when nothing matches or when
 *         the tables are not available
 */
private static Row selectLastExecutedCall(final serverObjects post, final Switchboard sb) {
    if (sb.tables == null) {
        /* No work tables available : nothing can be looked up */
        return null;
    }
    Row mostRecent = null;
    try {
        if (!post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) {
            /* No primary key in the request : search on the API URL as recorded (including parameters) */
            final String recordedUrl = WorkTables.generateRecordedURL(post, "IndexImportMediawiki_p.html");
            final Iterator<Row> candidates = sb.tables.iterator(WorkTables.TABLE_API_NAME,
                    WorkTables.TABLE_API_COL_URL, UTF8.getBytes(recordedUrl));
            Date mostRecentExec = null;
            while (candidates.hasNext()) {
                final Row candidate = candidates.next();
                if (candidate == null) {
                    continue;
                }
                final Date candidateExec = candidate.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null);
                if (candidateExec == null) {
                    /* Rows without a last execution date are ignored */
                    continue;
                }
                /* Strict comparison : on equal dates the first row encountered is kept */
                if (mostRecent == null || mostRecentExec.before(candidateExec)) {
                    mostRecent = candidate;
                    mostRecentExec = candidateExec;
                }
            }
        } else {
            /* Primary key present (re-execution of a recorded call) : direct selection on that key */
            mostRecent = sb.tables.select(WorkTables.TABLE_API_NAME,
                    UTF8.getBytes(post.get(WorkTables.TABLE_API_COL_APICALL_PK)));
        }
    } catch (final IOException e) {
        ConcurrentLog.logException(e);
    } catch (final SpaceExceededException e) {
        ConcurrentLog.logException(e);
    }
    return mostRecent;
}
/**
* @param fileURL the file URL. Must not be null.
* @return the last modified date for the file at fileURL, or 0L when unknown or when an error occurred

@ -25,12 +25,15 @@ import java.time.format.FormatStyle;
import java.util.Locale;
import net.yacy.migration;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.RecrawlBusyThread;
import net.yacy.data.TransactionManager;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.blob.Tables.Row;
import net.yacy.kelondro.workflow.BusyThread;
import net.yacy.search.Switchboard;
import net.yacy.search.index.ReindexSolrBusyThread;
@ -38,8 +41,11 @@ import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
public class IndexReIndexMonitor_p {
/** This servlet name, used for identifying recorded API calls */
private static final String SERVLET_NAME = IndexReIndexMonitor_p.class.getSimpleName() + ".html";
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
@ -132,6 +138,22 @@ public class IndexReIndexMonitor_p {
sb.deployThread(RecrawlBusyThread.THREAD_NAME, "ReCrawl", "recrawl existing documents", null,
new RecrawlBusyThread(Switchboard.getSwitchboard(), recrawlQuery, inclerrdoc), 1000);
recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
/* store this call as an api call for easy scheduling possibility */
if(sb.tables != null) {
/* We avoid creating a duplicate of any already recorded API call with the same parameters */
final Row lastExecutedCall = WorkTables
.selectLastExecutedApiCall(IndexReIndexMonitor_p.SERVLET_NAME, post, sb);
if (lastExecutedCall != null && !post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) {
byte[] lastExecutedCallPk = lastExecutedCall.getPK();
if (lastExecutedCallPk != null) {
post.add(WorkTables.TABLE_API_COL_APICALL_PK, UTF8.String(lastExecutedCallPk));
}
}
sb.tables.recordAPICall(post, IndexReIndexMonitor_p.SERVLET_NAME, WorkTables.TABLE_API_TYPE_CRAWLER,
"Recrawl documents matching selection query : " + recrawlQuery);
}
} else if(post.containsKey("simulateRecrawl") && sb.index.fulltext().connectedLocalSolr()) {
SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
if (!solrConnector.isClosed()) {

@ -100,31 +100,93 @@ public class WorkTables extends Tables {
/**
*
* @param post the api call request parameters. Must not be null.
* @param servletName the name of the servlet
* @return the API URL to be recorded
* @param post the api call request parameters, if any. May be null.
* @param servletName the name of the servlet. Must not be null.
* @return the API URL to be recorded, formatted to include request parameters as URL query parameters
*/
public static String generateRecordedURL(final serverObjects post, final String servletName) {
/* Before API URL serialization, we set any eventual transaction token value to empty :
* this will later help identify that a new valid transaction token will be necessary,
* but without revealing it in the URL displayed in the process scheduler and storing an invalid value */
final String transactionToken = post.get(TransactionManager.TRANSACTION_TOKEN_PARAM);
if(transactionToken != null) {
* but prevents revealing it in the URL displayed in the process scheduler and prevents storing an outdated value */
final String transactionToken;
if(post != null) {
transactionToken = post.get(TransactionManager.TRANSACTION_TOKEN_PARAM);
} else {
transactionToken = null;
}
if(transactionToken != null && post != null) {
post.put(TransactionManager.TRANSACTION_TOKEN_PARAM, "");
}
// generate the apicall url - without the apicall attributes
final String apiurl = /*"http://localhost:" + getConfig("port", "8090") +*/ "/" + servletName + "?" + post.toString();
String apiurl = "/" + servletName;
if(post != null) {
apiurl += "?" + post.toString();
}
/* Now restore the eventual transaction token to prevent side effects on the post object eventually still used by the caller */
if(transactionToken != null) {
post.put(TransactionManager.TRANSACTION_TOKEN_PARAM, transactionToken);
} else {
post.remove(TransactionManager.TRANSACTION_TOKEN_PARAM);
if(post != null) {
if(transactionToken != null) {
post.put(TransactionManager.TRANSACTION_TOKEN_PARAM, transactionToken);
} else {
post.remove(TransactionManager.TRANSACTION_TOKEN_PARAM);
}
}
return apiurl;
}
/**
 * Looks up, in the table of recorded API calls, the latest execution of the
 * given servlet matching the given request.
 *
 * @param servletName the servlet name used to identify the API when the call is recorded.
 *            When null, no lookup is performed.
 * @param post Servlet request parameters. May be null : the primary key lookup is then skipped
 *            and the search is done on the recorded API URL.
 * @param sb the {@link Switchboard} instance. When null or without tables, no lookup is performed.
 * @return the row selected by primary key when the request carries one, otherwise the row
 *         recorded on the same API URL having the most recent last execution date;
 *         null when none was found or data is not accessible
 */
public static Row selectLastExecutedApiCall(final String servletName, final serverObjects post, final Switchboard sb) {
    if (servletName == null || sb == null || sb.tables == null) {
        return null;
    }
    Row mostRecent = null;
    try {
        final boolean hasPrimaryKey = post != null && post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK);
        if (hasPrimaryKey) {
            /* Re-execution of a recorded call : select directly on the primary key */
            final byte[] pk = UTF8.getBytes(post.get(WorkTables.TABLE_API_COL_APICALL_PK));
            mostRecent = sb.tables.select(WorkTables.TABLE_API_NAME, pk);
        } else {
            /* Otherwise search on the API URL as recorded (including parameters) */
            final byte[] recordedUrl = UTF8.getBytes(WorkTables.generateRecordedURL(post, servletName));
            for (final Iterator<Row> it = sb.tables.iterator(WorkTables.TABLE_API_NAME,
                    WorkTables.TABLE_API_COL_URL, recordedUrl); it.hasNext();) {
                final Row candidate = it.next();
                if (candidate == null) {
                    continue;
                }
                final Date candidateExec = candidate.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null);
                if (candidateExec == null) {
                    /* Rows without a last execution date are ignored */
                    continue;
                }
                /*
                 * The whole iteration is always walked : we want the most recent call on
                 * this URL, not the first one. On equal dates the first row encountered is kept.
                 */
                final boolean isMoreRecent = mostRecent == null
                        || mostRecent.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null)
                                .before(candidateExec);
                if (isMoreRecent) {
                    mostRecent = candidate;
                }
            }
        }
    } catch (final IOException e) {
        ConcurrentLog.logException(e);
    } catch (final SpaceExceededException e) {
        ConcurrentLog.logException(e);
    }
    return mostRecent;
}
/**
* recording of a api call. stores the call parameters into the API database table

Loading…
Cancel
Save