From a87281b4988bcf20934b5a6d95a66b297f00fbff Mon Sep 17 00:00:00 2001 From: luccioman Date: Wed, 3 May 2017 18:53:01 +0200 Subject: [PATCH] Added MediaWiki dump import scheduling feature. Checking the last modified date by default to prevent unnecessary long running operations. --- htroot/IndexImportMediawiki_p.html | 17 ++++ htroot/IndexImportMediawiki_p.java | 125 +++++++++++++++++++++++++++ source/net/yacy/data/WorkTables.java | 38 +++++--- 3 files changed, 167 insertions(+), 13 deletions(-) diff --git a/htroot/IndexImportMediawiki_p.html b/htroot/IndexImportMediawiki_p.html index 907b597ff..c37583d4e 100644 --- a/htroot/IndexImportMediawiki_p.html +++ b/htroot/IndexImportMediawiki_p.html @@ -19,6 +19,7 @@ :: :: :: + :: #(/status)#

@@ -40,6 +41,22 @@ +
+
+
+ +
+
+
+ When checked, the dump file is imported only if its last modified date is unknown or is after the last import execution date on this same file + (see recorded API calls with the "dump" type). +
+
diff --git a/htroot/IndexImportMediawiki_p.java b/htroot/IndexImportMediawiki_p.java index 8e0967b85..80eb1d452 100644 --- a/htroot/IndexImportMediawiki_p.java +++ b/htroot/IndexImportMediawiki_p.java @@ -23,12 +23,28 @@ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA import java.io.File; +import java.io.IOException; import java.net.MalformedURLException; +import java.util.Date; +import java.util.Iterator; +import org.apache.http.Header; +import org.apache.http.HttpResponse; +import org.apache.http.HttpStatus; + +import net.yacy.cora.date.GenericFormatter; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.MultiProtocolURL; +import net.yacy.cora.protocol.ClientIdentification; +import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.protocol.http.HTTPClient; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.cora.util.SpaceExceededException; import net.yacy.data.TransactionManager; +import net.yacy.data.WorkTables; import net.yacy.document.importer.MediawikiImporter; +import net.yacy.kelondro.blob.Tables.Row; import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; @@ -46,6 +62,7 @@ public class IndexImportMediawiki_p { * @param post request parameters. 
Supported keys : * @@ -95,6 +112,11 @@ public class IndexImportMediawiki_p { MultiProtocolURL sourceURL = null; int status = 0; String sourceFilePath = ""; + final Row lastExecutedCall = selectLastExecutedCall(post, sb); + Date lastExecutionDate = null; + if (lastExecutedCall != null) { + lastExecutionDate = lastExecutedCall.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null); + } try { sourceURL = new MultiProtocolURL(file); if(sourceURL.isFile()) { @@ -108,10 +130,41 @@ public class IndexImportMediawiki_p { status = 4; } } + + if (status == 0 && post.getBoolean("iffresh")) { + long lastModified = getLastModified(sourceURL); + if (lastExecutionDate != null && lastModified != 0L + && lastModified <= lastExecutionDate.getTime()) { + status = 5; + prop.put("import_status_lastImportDate", + GenericFormatter.FORMAT_SIMPLE.format(lastExecutionDate)); + + /* the import is not performed, but we increase here the api call count */ + if(sb.tables != null) { + byte[] lastExecutedCallPk = lastExecutedCall.getPK(); + if(lastExecutedCallPk != null && !post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) { + post.add(WorkTables.TABLE_API_COL_APICALL_PK, UTF8.String(lastExecutedCallPk)); + } + sb.tables.recordAPICall(post, "IndexImportMediawiki_p.html", WorkTables.TABLE_API_TYPE_DUMP, "MediaWiki Dump Import for " + sourceURL); + } + } + } } catch (MalformedURLException e) { status = 1; } if (status == 0) { + /* store this call as an api call */ + if(sb.tables != null) { + /* We avoid creating a duplicate of any already recorded API call with the same parameters */ + if(lastExecutedCall != null && !post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) { + byte[] lastExecutedCallPk = lastExecutedCall.getPK(); + if(lastExecutedCallPk != null) { + post.add(WorkTables.TABLE_API_COL_APICALL_PK, UTF8.String(lastExecutedCallPk)); + } + } + sb.tables.recordAPICall(post, "IndexImportMediawiki_p.html", WorkTables.TABLE_API_TYPE_DUMP, "MediaWiki Dump Import for " + sourceURL); + } + 
MediawikiImporter.job = new MediawikiImporter(sourceURL, sb.surrogatesInPath); MediawikiImporter.job.start(); prop.put("import_dump", MediawikiImporter.job.source()); @@ -138,4 +191,76 @@ public class IndexImportMediawiki_p { } return prop; } + + /** + * @param post Servlet request parameters. Must not be null. + * @param sb the {@link Switchboard} instance. Must not be null. + * @return the most recently recorded call to this API with the same parameters, or null when none could be found + */ + private static Row selectLastExecutedCall(final serverObjects post, final Switchboard sb) { + Row lastRecordedCall = null; + if (sb.tables != null) { + try { + if(post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) { + /* Search the table on the primary key when present (re-execution of a recorded call) */ + lastRecordedCall = sb.tables.select(WorkTables.TABLE_API_NAME, UTF8.getBytes(post.get(WorkTables.TABLE_API_COL_APICALL_PK))); + } else { + /* Else search the table on the API URL as recorded (including parameters) */ + final String apiURL = WorkTables.generateRecordedURL(post, "IndexImportMediawiki_p.html"); + Iterator<Row> rowsIt = sb.tables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_URL, + UTF8.getBytes(apiURL)); + while (rowsIt.hasNext()) { + Row currentRow = rowsIt.next(); + if (currentRow != null) { + Date currentLastExec = currentRow.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null); + if(currentLastExec != null) { + if(lastRecordedCall == null) { + lastRecordedCall = currentRow; + } else if(lastRecordedCall.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null).before(currentLastExec)) { + lastRecordedCall = currentRow; + } + } + } + } + } + + } catch (final IOException e) { + ConcurrentLog.logException(e); + } catch(final SpaceExceededException e) { + ConcurrentLog.logException(e); + } + } + return lastRecordedCall; + } + + /** + * @param fileURL the file URL. Must not be null. 
+ * @return the last modified date for the file at fileURL, or 0L when unknown or when an error occurred + */ + private static long getLastModified(MultiProtocolURL fileURL) { + long lastModified = 0L; + try { + if (fileURL.isHTTP() || fileURL.isHTTPS()) { + /* http(s) : we do not use MultiProtocolURL.lastModified() which always returns 0L for these protocols */ + HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent); + HttpResponse headResponse = httpClient.HEADResponse(fileURL, false); + if (headResponse != null && headResponse.getStatusLine() != null + && headResponse.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { + Header lastModifiedHeader = headResponse + .getFirstHeader(HeaderFramework.LAST_MODIFIED); + if (lastModifiedHeader != null) { + Date lastModifiedDate = HeaderFramework.parseHTTPDate(lastModifiedHeader.getValue()); + if(lastModifiedDate != null) { + lastModified = lastModifiedDate.getTime(); + } + } + } + } else { + lastModified = fileURL.lastModified(); + } + } catch (IOException ignored) { + ConcurrentLog.warn("IndexImportMediawiki_p", "Could not retrieve last modified date for dump file at " + fileURL); + } + return lastModified; + } } diff --git a/source/net/yacy/data/WorkTables.java b/source/net/yacy/data/WorkTables.java index 35a2e2484..1de8b7431 100644 --- a/source/net/yacy/data/WorkTables.java +++ b/source/net/yacy/data/WorkTables.java @@ -97,22 +97,14 @@ public class WorkTables extends Tables { super(workPath, 12); this.bookmarks = new YMarkTables(this); } - + /** - * recording of a api call. stores the call parameters into the API database table - * @param post the post arguments of the api call + * + * @param post the api call request parameters. Must not be null. 
* @param servletName the name of the servlet - * @param type name of the servlet category - * @param comment visual description of the process - * @return the pk of the new entry in the api table + * @return the API URL to be recorded */ - public byte[] recordAPICall(final serverObjects post, final String servletName, final String type, final String comment) { - // remove the apicall attributes from the post object - String[] pks = post.remove(TABLE_API_COL_APICALL_PK); - - byte[] pk = pks == null ? null : UTF8.getBytes(pks[0]); - - + public static String generateRecordedURL(final serverObjects post, final String servletName) { /* Before API URL serialization, we set any eventual transaction token value to empty : * this will later help identify a new valid transaction token will be necessary, * but without revealing it in the URL displayed in the process scheduler and storing an invalid value */ @@ -130,6 +122,26 @@ public class WorkTables extends Tables { } else { post.remove(TransactionManager.TRANSACTION_TOKEN_PARAM); } + + return apiurl; + } + + /** + * recording of a api call. stores the call parameters into the API database table + * @param post the post arguments of the api call. Must not be null. + * @param servletName the name of the servlet + * @param type name of the servlet category + * @param comment visual description of the process + * @return the pk of the new entry in the api table + */ + public byte[] recordAPICall(final serverObjects post, final String servletName, final String type, final String comment) { + // remove the apicall attributes from the post object + String[] pks = post.remove(TABLE_API_COL_APICALL_PK); + + byte[] pk = pks == null ? null : UTF8.getBytes(pks[0]); + + // generate the apicall url - without the apicall attributes + final String apiurl = generateRecordedURL(post, servletName); // read old entry from the apicall table (if exists) Row row = null;