From 0c9e0b3566031da00e35f2f318780d6787b8c2c8 Mon Sep 17 00:00:00 2001 From: luccioman Date: Wed, 10 Jan 2018 17:05:53 +0100 Subject: [PATCH] Record recrawl calls to make them schedulable --- htroot/IndexImportMediawiki_p.java | 45 +-------------- htroot/IndexReIndexMonitor_p.java | 24 +++++++- source/net/yacy/data/WorkTables.java | 84 ++++++++++++++++++++++++---- 3 files changed, 97 insertions(+), 56 deletions(-) diff --git a/htroot/IndexImportMediawiki_p.java b/htroot/IndexImportMediawiki_p.java index 80eb1d452..c2279255e 100644 --- a/htroot/IndexImportMediawiki_p.java +++ b/htroot/IndexImportMediawiki_p.java @@ -26,7 +26,6 @@ import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.util.Date; -import java.util.Iterator; import org.apache.http.Header; import org.apache.http.HttpResponse; @@ -40,7 +39,6 @@ import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.util.ConcurrentLog; -import net.yacy.cora.util.SpaceExceededException; import net.yacy.data.TransactionManager; import net.yacy.data.WorkTables; import net.yacy.document.importer.MediawikiImporter; @@ -112,7 +110,7 @@ public class IndexImportMediawiki_p { MultiProtocolURL sourceURL = null; int status = 0; String sourceFilePath = ""; - final Row lastExecutedCall = selectLastExecutedCall(post, sb); + final Row lastExecutedCall = WorkTables.selectLastExecutedApiCall("IndexImportMediawiki_p.html", post, sb); Date lastExecutionDate = null; if (lastExecutedCall != null) { lastExecutionDate = lastExecutedCall.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null); @@ -192,47 +190,6 @@ public class IndexImportMediawiki_p { return prop; } - /** - * @param post Servlet request parameters. Must not be null. - * @param sb the {@link Switchboard} instance. Must not be null. 
- * @return the most recently recorded call to this API with the same parameters - */ - private static Row selectLastExecutedCall(final serverObjects post, final Switchboard sb) { - Row lastRecordedCall = null; - if (sb.tables != null) { - try { - if(post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) { - /* Search the table on the primary key when when present (re-execution of a recorded call) */ - lastRecordedCall = sb.tables.select(WorkTables.TABLE_API_NAME, UTF8.getBytes(post.get(WorkTables.TABLE_API_COL_APICALL_PK))); - } else { - /* Else search the table on the API URL as recorded (including parameters) */ - final String apiURL = WorkTables.generateRecordedURL(post, "IndexImportMediawiki_p.html"); - Iterator<Row> rowsIt = sb.tables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_URL, - UTF8.getBytes(apiURL)); - while (rowsIt.hasNext()) { - Row currentRow = rowsIt.next(); - if (currentRow != null) { - Date currentLastExec = currentRow.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null); - if(currentLastExec != null) { - if(lastRecordedCall == null) { - lastRecordedCall = currentRow; - } else if(lastRecordedCall.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null).before(currentLastExec)) { - lastRecordedCall = currentRow; - } - } - } - } - } - - } catch (final IOException e) { - ConcurrentLog.logException(e); - } catch(final SpaceExceededException e) { - ConcurrentLog.logException(e); - } - } - return lastRecordedCall; - } - /** * @param fileURL the file URL. Must not be null. 
* @return the last modified date for the file at fileURL, or 0L when unknown or when an error occurred diff --git a/htroot/IndexReIndexMonitor_p.java b/htroot/IndexReIndexMonitor_p.java index 750444220..5d9ff4cfc 100644 --- a/htroot/IndexReIndexMonitor_p.java +++ b/htroot/IndexReIndexMonitor_p.java @@ -25,12 +25,15 @@ import java.time.format.FormatStyle; import java.util.Locale; import net.yacy.migration; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.sorting.OrderedScoreMap; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.RecrawlBusyThread; import net.yacy.data.TransactionManager; +import net.yacy.data.WorkTables; +import net.yacy.kelondro.blob.Tables.Row; import net.yacy.kelondro.workflow.BusyThread; import net.yacy.search.Switchboard; import net.yacy.search.index.ReindexSolrBusyThread; @@ -38,8 +41,11 @@ import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; public class IndexReIndexMonitor_p { + + /** This servlet name, used for identifying recorded API calls */ + private static final String SERVLET_NAME = IndexReIndexMonitor_p.class.getSimpleName() + ".html"; - public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { + public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { final Switchboard sb = (Switchboard) env; final serverObjects prop = new serverObjects(); @@ -132,6 +138,22 @@ public class IndexReIndexMonitor_p { sb.deployThread(RecrawlBusyThread.THREAD_NAME, "ReCrawl", "recrawl existing documents", null, new RecrawlBusyThread(Switchboard.getSwitchboard(), recrawlQuery, inclerrdoc), 1000); recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME); + + /* store this call as an api call for easy scheduling possibility */ + if(sb.tables != null) { + /* We 
avoid creating a duplicate of any already recorded API call with the same parameters */ + final Row lastExecutedCall = WorkTables + .selectLastExecutedApiCall(IndexReIndexMonitor_p.SERVLET_NAME, post, sb); + if (lastExecutedCall != null && !post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) { + byte[] lastExecutedCallPk = lastExecutedCall.getPK(); + if (lastExecutedCallPk != null) { + post.add(WorkTables.TABLE_API_COL_APICALL_PK, UTF8.String(lastExecutedCallPk)); + } + } + sb.tables.recordAPICall(post, IndexReIndexMonitor_p.SERVLET_NAME, WorkTables.TABLE_API_TYPE_CRAWLER, + "Recrawl documents matching selection query : " + recrawlQuery); + } + } else if(post.containsKey("simulateRecrawl") && sb.index.fulltext().connectedLocalSolr()) { SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector(); if (!solrConnector.isClosed()) { diff --git a/source/net/yacy/data/WorkTables.java b/source/net/yacy/data/WorkTables.java index 1de8b7431..2405ed206 100644 --- a/source/net/yacy/data/WorkTables.java +++ b/source/net/yacy/data/WorkTables.java @@ -100,31 +100,93 @@ public class WorkTables extends Tables { /** * - * @param post the api call request parameters. Must not be null. - * @param servletName the name of the servlet - * @return the API URL to be recorded + * @param post the api call eventual request parameters. + * @param servletName the name of the servlet. Must not be null. 
+ * @return the API URL to be recorded, formatted to include request parameters as URL query parameters */ public static String generateRecordedURL(final serverObjects post, final String servletName) { /* Before API URL serialization, we set any eventual transaction token value to empty : * this will later help identify a new valid transaction token will be necessary, - * but without revealing it in the URL displayed in the process scheduler and storing an invalid value */ - final String transactionToken = post.get(TransactionManager.TRANSACTION_TOKEN_PARAM); - if(transactionToken != null) { + * but prevents revealing it in the URL displayed in the process scheduler and prevents storing an outdated value */ + final String transactionToken; + if(post != null) { + transactionToken = post.get(TransactionManager.TRANSACTION_TOKEN_PARAM); + } else { + transactionToken = null; + } + if(transactionToken != null && post != null) { post.put(TransactionManager.TRANSACTION_TOKEN_PARAM, ""); } // generate the apicall url - without the apicall attributes - final String apiurl = /*"http://localhost:" + getConfig("port", "8090") +*/ "/" + servletName + "?" + post.toString(); + String apiurl = "/" + servletName; + if(post != null) { + apiurl += "?" + post.toString(); + } /* Now restore the eventual transaction token to prevent side effects on the post object eventually still used by the caller */ - if(transactionToken != null) { - post.put(TransactionManager.TRANSACTION_TOKEN_PARAM, transactionToken); - } else { - post.remove(TransactionManager.TRANSACTION_TOKEN_PARAM); + if(post != null) { + if(transactionToken != null) { + post.put(TransactionManager.TRANSACTION_TOKEN_PARAM, transactionToken); + } else { + post.remove(TransactionManager.TRANSACTION_TOKEN_PARAM); + } } return apiurl; } + + /** + * @param servletName the servlet name used to identify the API when the call is recorded. + * @param post Servlet request parameters. Must not be null. 
+ * @param sb the {@link Switchboard} instance. Must not be null. + * @return the most recently recorded call to the given API with the same parameters, or null when none was found or data is not accessible + */ + public static Row selectLastExecutedApiCall(final String servletName, final serverObjects post, final Switchboard sb) { + Row lastRecordedCall = null; + if (servletName != null && sb != null && sb.tables != null) { + try { + if (post != null && post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) { + /* + * Search the table on the primary key when present (re-execution of a + * recorded call) + */ + lastRecordedCall = sb.tables.select(WorkTables.TABLE_API_NAME, + UTF8.getBytes(post.get(WorkTables.TABLE_API_COL_APICALL_PK))); + } else { + /* Else search the table on the API URL as recorded (including parameters) */ + final String apiURL = WorkTables.generateRecordedURL(post, servletName); + final Iterator<Row> rowsIt = sb.tables.iterator(WorkTables.TABLE_API_NAME, + WorkTables.TABLE_API_COL_URL, UTF8.getBytes(apiURL)); + while (rowsIt.hasNext()) { + final Row currentRow = rowsIt.next(); + if (currentRow != null) { + final Date currentLastExec = currentRow.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, + (Date) null); + if (currentLastExec != null) { + if (lastRecordedCall == null) { + /* + * Do not break the loop now : we are looking for the most recent API call on + * the same URL + */ + lastRecordedCall = currentRow; + } else if (lastRecordedCall.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null) + .before(currentLastExec)) { + lastRecordedCall = currentRow; + } + } + } + } + } + + } catch (final IOException e) { + ConcurrentLog.logException(e); + } catch (final SpaceExceededException e) { + ConcurrentLog.logException(e); + } + } + return lastRecordedCall; + } /** * recording of a api call. stores the call parameters into the API database table