From a87281b4988bcf20934b5a6d95a66b297f00fbff Mon Sep 17 00:00:00 2001
From: luccioman
Date: Wed, 3 May 2017 18:53:01 +0200
Subject: [PATCH] Added MediaWiki dump import scheduling feature.
Checking the last modified date by default to prevent unnecessarily
long-running operations.
---
htroot/IndexImportMediawiki_p.html | 17 ++++
htroot/IndexImportMediawiki_p.java | 125 +++++++++++++++++++++++++++
source/net/yacy/data/WorkTables.java | 38 +++++---
3 files changed, 167 insertions(+), 13 deletions(-)
diff --git a/htroot/IndexImportMediawiki_p.html b/htroot/IndexImportMediawiki_p.html
index 907b597ff..c37583d4e 100644
--- a/htroot/IndexImportMediawiki_p.html
+++ b/htroot/IndexImportMediawiki_p.html
@@ -19,6 +19,7 @@
::Error : file not found "#[sourceFile]#"
::Error : can not read file "#[sourceFile]#"
::Error : you selected a directory ("#[sourceFile]#")
+ ::Error : dump file ("#[sourceFile]#") was not modified since last import (#[lastImportDate]#).
#(/status)#
diff --git a/htroot/IndexImportMediawiki_p.java b/htroot/IndexImportMediawiki_p.java
index 8e0967b85..80eb1d452 100644
--- a/htroot/IndexImportMediawiki_p.java
+++ b/htroot/IndexImportMediawiki_p.java
@@ -23,12 +23,28 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.File;
+import java.io.IOException;
import java.net.MalformedURLException;
+import java.util.Date;
+import java.util.Iterator;
+import org.apache.http.Header;
+import org.apache.http.HttpResponse;
+import org.apache.http.HttpStatus;
+
+import net.yacy.cora.date.GenericFormatter;
+import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.cora.protocol.ClientIdentification;
+import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
+import net.yacy.cora.protocol.http.HTTPClient;
+import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.cora.util.SpaceExceededException;
import net.yacy.data.TransactionManager;
+import net.yacy.data.WorkTables;
import net.yacy.document.importer.MediawikiImporter;
+import net.yacy.kelondro.blob.Tables.Row;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@@ -46,6 +62,7 @@ public class IndexImportMediawiki_p {
* @param post request parameters. Supported keys :
*
* - file : a dump URL or file path on this YaCy server local file system
+ * - iffresh : when set to true, the dump file is imported only if its last modified date is unknown or after the last import trial date on this same file.
* - report : when set, display the currently running thread monitoring info, or the last import report when no one is running.
* Ignored when no import thread is known.
*
@@ -95,6 +112,11 @@ public class IndexImportMediawiki_p {
MultiProtocolURL sourceURL = null;
int status = 0;
String sourceFilePath = "";
+ final Row lastExecutedCall = selectLastExecutedCall(post, sb);
+ Date lastExecutionDate = null;
+ if (lastExecutedCall != null) {
+ lastExecutionDate = lastExecutedCall.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null);
+ }
try {
sourceURL = new MultiProtocolURL(file);
if(sourceURL.isFile()) {
@@ -108,10 +130,41 @@ public class IndexImportMediawiki_p {
status = 4;
}
}
+
+ if (status == 0 && post.getBoolean("iffresh")) {
+ long lastModified = getLastModified(sourceURL);
+ if (lastExecutionDate != null && lastModified != 0L
+ && lastModified <= lastExecutionDate.getTime()) {
+ status = 5;
+ prop.put("import_status_lastImportDate",
+ GenericFormatter.FORMAT_SIMPLE.format(lastExecutionDate));
+
+ /* the import is not performed, but we increase here the api call count */
+ if(sb.tables != null) {
+ byte[] lastExecutedCallPk = lastExecutedCall.getPK();
+ if(lastExecutedCallPk != null && !post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) {
+ post.add(WorkTables.TABLE_API_COL_APICALL_PK, UTF8.String(lastExecutedCallPk));
+ }
+ sb.tables.recordAPICall(post, "IndexImportMediawiki_p.html", WorkTables.TABLE_API_TYPE_DUMP, "MediaWiki Dump Import for " + sourceURL);
+ }
+ }
+ }
} catch (MalformedURLException e) {
status = 1;
}
if (status == 0) {
+ /* store this call as an api call */
+ if(sb.tables != null) {
+ /* We avoid creating a duplicate of any already recorded API call with the same parameters */
+ if(lastExecutedCall != null && !post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) {
+ byte[] lastExecutedCallPk = lastExecutedCall.getPK();
+ if(lastExecutedCallPk != null) {
+ post.add(WorkTables.TABLE_API_COL_APICALL_PK, UTF8.String(lastExecutedCallPk));
+ }
+ }
+ sb.tables.recordAPICall(post, "IndexImportMediawiki_p.html", WorkTables.TABLE_API_TYPE_DUMP, "MediaWiki Dump Import for " + sourceURL);
+ }
+
MediawikiImporter.job = new MediawikiImporter(sourceURL, sb.surrogatesInPath);
MediawikiImporter.job.start();
prop.put("import_dump", MediawikiImporter.job.source());
@@ -138,4 +191,76 @@ public class IndexImportMediawiki_p {
}
return prop;
}
+
+ /**
+ * @param post Servlet request parameters. Must not be null.
+ * @param sb the {@link Switchboard} instance. Must not be null.
+ * @return the most recently executed recorded call to this API with the same parameters, or null when no such call is available
+ */
+ private static Row selectLastExecutedCall(final serverObjects post, final Switchboard sb) {
+ Row lastRecordedCall = null;
+ if (sb.tables != null) {
+ try {
+ if(post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) {
+ /* Search the table on the primary key when present (re-execution of a recorded call) */
+ lastRecordedCall = sb.tables.select(WorkTables.TABLE_API_NAME, UTF8.getBytes(post.get(WorkTables.TABLE_API_COL_APICALL_PK)));
+ } else {
+ /* Else search the table on the API URL as recorded (including parameters) */
+ final String apiURL = WorkTables.generateRecordedURL(post, "IndexImportMediawiki_p.html");
+ Iterator<Row> rowsIt = sb.tables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_URL,
+ UTF8.getBytes(apiURL));
+ while (rowsIt.hasNext()) {
+ Row currentRow = rowsIt.next();
+ if (currentRow != null) {
+ Date currentLastExec = currentRow.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null);
+ if(currentLastExec != null) {
+ if(lastRecordedCall == null) {
+ lastRecordedCall = currentRow;
+ } else if(lastRecordedCall.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null).before(currentLastExec)) {
+ lastRecordedCall = currentRow;
+ }
+ }
+ }
+ }
+ }
+
+ } catch (final IOException e) {
+ ConcurrentLog.logException(e);
+ } catch(final SpaceExceededException e) {
+ ConcurrentLog.logException(e);
+ }
+ }
+ return lastRecordedCall;
+ }
+
+ /**
+ * @param fileURL the file URL. Must not be null.
+ * @return the last modified date for the file at fileURL, or 0L when unknown or when an error occurred (the error is then logged as a warning)
+ */
+ private static long getLastModified(final MultiProtocolURL fileURL) {
+ long lastModified = 0L;
+ try {
+ if (fileURL.isHTTP() || fileURL.isHTTPS()) {
+ /* http(s) : we do not use MultiProtocolURL.lastModified() which always returns 0L for these protocols */
+ HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent);
+ HttpResponse headResponse = httpClient.HEADResponse(fileURL, false);
+ if (headResponse != null && headResponse.getStatusLine() != null
+ && headResponse.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
+ Header lastModifiedHeader = headResponse
+ .getFirstHeader(HeaderFramework.LAST_MODIFIED);
+ if (lastModifiedHeader != null) {
+ Date lastModifiedDate = HeaderFramework.parseHTTPDate(lastModifiedHeader.getValue());
+ if(lastModifiedDate != null) {
+ lastModified = lastModifiedDate.getTime();
+ }
+ }
+ }
+ } else {
+ lastModified = fileURL.lastModified();
+ }
+ } catch (final IOException e) {
+ ConcurrentLog.warn("IndexImportMediawiki_p", "Could not retrieve last modified date for dump file at " + fileURL + " : " + e);
+ }
+ return lastModified;
+ }
}
diff --git a/source/net/yacy/data/WorkTables.java b/source/net/yacy/data/WorkTables.java
index 35a2e2484..1de8b7431 100644
--- a/source/net/yacy/data/WorkTables.java
+++ b/source/net/yacy/data/WorkTables.java
@@ -97,22 +97,14 @@ public class WorkTables extends Tables {
super(workPath, 12);
this.bookmarks = new YMarkTables(this);
}
-
+
/**
- * recording of a api call. stores the call parameters into the API database table
- * @param post the post arguments of the api call
+ * Generates the API URL to be recorded for a call to the given servlet.
+ * @param post the api call request parameters. Must not be null.
* @param servletName the name of the servlet
- * @param type name of the servlet category
- * @param comment visual description of the process
- * @return the pk of the new entry in the api table
+ * @return the API URL to be recorded
*/
- public byte[] recordAPICall(final serverObjects post, final String servletName, final String type, final String comment) {
- // remove the apicall attributes from the post object
- String[] pks = post.remove(TABLE_API_COL_APICALL_PK);
-
- byte[] pk = pks == null ? null : UTF8.getBytes(pks[0]);
-
-
+ public static String generateRecordedURL(final serverObjects post, final String servletName) {
/* Before API URL serialization, we set any eventual transaction token value to empty :
* this will later help identify a new valid transaction token will be necessary,
* but without revealing it in the URL displayed in the process scheduler and storing an invalid value */
@@ -130,6 +122,26 @@ public class WorkTables extends Tables {
} else {
post.remove(TransactionManager.TRANSACTION_TOKEN_PARAM);
}
+
+ return apiurl;
+ }
+
+ /**
+ * recording of a api call. stores the call parameters into the API database table
+ * @param post the post arguments of the api call. Must not be null.
+ * @param servletName the name of the servlet
+ * @param type name of the servlet category
+ * @param comment visual description of the process
+ * @return the pk of the new entry in the api table
+ */
+ public byte[] recordAPICall(final serverObjects post, final String servletName, final String type, final String comment) {
+ // remove the apicall attributes from the post object
+ String[] pks = post.remove(TABLE_API_COL_APICALL_PK);
+
+ byte[] pk = pks == null ? null : UTF8.getBytes(pks[0]);
+
+ // generate the apicall url - without the apicall attributes
+ final String apiurl = generateRecordedURL(post, servletName);
// read old entry from the apicall table (if exists)
Row row = null;