You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
225 lines
10 KiB
225 lines
10 KiB
// IndexImportMediawiki_p.java
|
|
// -------------------------
|
|
// (C) 2009 by Michael Peter Christen; mc@yacy.net
|
|
// first published 04.05.2009 on http://yacy.net
|
|
// Frankfurt, Germany
|
|
//
|
|
// $LastChangedDate$
|
|
// $LastChangedRevision$
|
|
// $LastChangedBy$
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
import java.io.File;
|
|
import java.io.IOException;
|
|
import java.net.MalformedURLException;
|
|
import java.time.Instant;
|
|
import java.util.Date;
|
|
|
|
import org.apache.http.Header;
|
|
import org.apache.http.HttpResponse;
|
|
import org.apache.http.HttpStatus;
|
|
|
|
import net.yacy.cora.date.GenericFormatter;
|
|
import net.yacy.cora.document.encoding.UTF8;
|
|
import net.yacy.cora.document.id.MultiProtocolURL;
|
|
import net.yacy.cora.protocol.ClientIdentification;
|
|
import net.yacy.cora.protocol.HeaderFramework;
|
|
import net.yacy.cora.protocol.RequestHeader;
|
|
import net.yacy.cora.protocol.http.HTTPClient;
|
|
import net.yacy.cora.util.ConcurrentLog;
|
|
import net.yacy.data.TransactionManager;
|
|
import net.yacy.data.WorkTables;
|
|
import net.yacy.document.importer.MediawikiImporter;
|
|
import net.yacy.kelondro.blob.Tables.Row;
|
|
import net.yacy.search.Switchboard;
|
|
import net.yacy.server.serverObjects;
|
|
import net.yacy.server.serverSwitch;
|
|
|
|
/**
|
|
* Import of MediaWiki dump files in the local index.
|
|
*/
|
|
public class IndexImportMediawiki_p {
|
|
|
|
/**
|
|
* Run conditions :
|
|
* - no MediaWiki import thread is running : allow to start a new import by filling the "file" parameter
|
|
* - the MediaWiki import thread is running : returns monitoring information.
|
|
* @param header servlet request header
|
|
* @param post request parameters. Supported keys :
|
|
* <ul>
|
|
* <li>file : a dump URL or file path on this YaCy server local file system</li>
|
|
* <li>iffresh : when set to true, the dump file is imported only if its last modified date is unknown or after the last import trial date on this same file. </li>
|
|
* <li>report : when set, display the currently running thread monitoring info, or the last import report when no one is running.
|
|
* Ignored when no import thread is known.</li>
|
|
* </ul>
|
|
* @param env server environment
|
|
* @return the servlet answer object
|
|
*/
|
|
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
|
|
final serverObjects prop = new serverObjects();
|
|
final Switchboard sb = (Switchboard) env;
|
|
|
|
if (MediawikiImporter.job != null && (MediawikiImporter.job.isAlive() || (post != null && post.containsKey("report")))) {
|
|
/* one import is running, or report was explicitly requested : no option to insert anything */
|
|
prop.put("import", 1);
|
|
/* Only refresh automatically when the job is running */
|
|
prop.put("refresh", MediawikiImporter.job.isAlive() ? 1 : 0);
|
|
final String jobErrorMessage = MediawikiImporter.job.status();
|
|
if( jobErrorMessage != null && !jobErrorMessage.isEmpty()) {
|
|
prop.put("import_status", 1);
|
|
prop.put("import_status_message", jobErrorMessage);
|
|
}
|
|
prop.put("import_thread", MediawikiImporter.job.isAlive() ? 2 : 0);
|
|
prop.put("import_dump", MediawikiImporter.job.source());
|
|
prop.put("import_count", MediawikiImporter.job.count());
|
|
prop.put("import_speed", MediawikiImporter.job.speed());
|
|
prop.put("import_runningHours", (MediawikiImporter.job.runningTime() / 60) / 60);
|
|
prop.put("import_runningMinutes", (MediawikiImporter.job.runningTime() / 60) % 60);
|
|
prop.put("import_remainingHours", (MediawikiImporter.job.remainingTime() / 60) / 60);
|
|
prop.put("import_remainingMinutes", (MediawikiImporter.job.remainingTime() / 60) % 60);
|
|
} else {
|
|
prop.put("import", 0);
|
|
prop.put("refresh", 0);
|
|
prop.put("import_prevReport", MediawikiImporter.job != null ? 1 : 0);
|
|
if (post == null) {
|
|
prop.put("import_status", 0);
|
|
|
|
/* Acquire a transaction token for the next POST form submission */
|
|
final String token = TransactionManager.getTransactionToken(header);
|
|
prop.put(TransactionManager.TRANSACTION_TOKEN_PARAM, token);
|
|
prop.put("import_" + TransactionManager.TRANSACTION_TOKEN_PARAM, token);
|
|
|
|
} else {
|
|
if (post.containsKey("file")) {
|
|
/* Check the transaction is valid */
|
|
TransactionManager.checkPostTransaction(header, post);
|
|
|
|
String file = post.get("file");
|
|
MultiProtocolURL sourceURL = null;
|
|
int status = 0;
|
|
String sourceFilePath = "";
|
|
final Row lastExecutedCall = WorkTables.selectLastExecutedApiCall("IndexImportMediawiki_p.html", post, sb);
|
|
Date lastExecutionDate = null;
|
|
if (lastExecutedCall != null) {
|
|
lastExecutionDate = lastExecutedCall.get(WorkTables.TABLE_API_COL_DATE_LAST_EXEC, (Date) null);
|
|
}
|
|
try {
|
|
sourceURL = new MultiProtocolURL(file);
|
|
if(sourceURL.isFile()) {
|
|
final File sourcefile = sourceURL.getFSFile();
|
|
sourceFilePath = sourcefile.getAbsolutePath();
|
|
if (!sourcefile.exists()) {
|
|
status = 2;
|
|
} else if (!sourcefile.canRead()) {
|
|
status = 3;
|
|
} else if (sourcefile.isDirectory()) {
|
|
status = 4;
|
|
}
|
|
}
|
|
|
|
if (status == 0 && post.getBoolean("iffresh")) {
|
|
long lastModified = getLastModified(sourceURL);
|
|
if (lastExecutionDate != null && lastModified != 0L && Instant.ofEpochMilli(lastModified)
|
|
.isBefore(lastExecutionDate.toInstant())) {
|
|
status = 5;
|
|
prop.put("import_status_lastImportDate", GenericFormatter
|
|
.formatSafely(lastExecutionDate.toInstant(), GenericFormatter.FORMAT_SIMPLE));
|
|
|
|
/* the import is not performed, but we increase here the api call count */
|
|
if(sb.tables != null) {
|
|
byte[] lastExecutedCallPk = lastExecutedCall.getPK();
|
|
if(lastExecutedCallPk != null && !post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) {
|
|
post.add(WorkTables.TABLE_API_COL_APICALL_PK, UTF8.String(lastExecutedCallPk));
|
|
}
|
|
sb.tables.recordAPICall(post, "IndexImportMediawiki_p.html", WorkTables.TABLE_API_TYPE_DUMP, "MediaWiki Dump Import for " + sourceURL);
|
|
}
|
|
}
|
|
}
|
|
} catch (MalformedURLException e) {
|
|
status = 1;
|
|
}
|
|
if (status == 0) {
|
|
/* store this call as an api call */
|
|
if(sb.tables != null) {
|
|
/* We avoid creating a duplicate of any already recorded API call with the same parameters */
|
|
if(lastExecutedCall != null && !post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) {
|
|
byte[] lastExecutedCallPk = lastExecutedCall.getPK();
|
|
if(lastExecutedCallPk != null) {
|
|
post.add(WorkTables.TABLE_API_COL_APICALL_PK, UTF8.String(lastExecutedCallPk));
|
|
}
|
|
}
|
|
sb.tables.recordAPICall(post, "IndexImportMediawiki_p.html", WorkTables.TABLE_API_TYPE_DUMP, "MediaWiki Dump Import for " + sourceURL);
|
|
}
|
|
|
|
MediawikiImporter.job = new MediawikiImporter(sourceURL, sb.surrogatesInPath);
|
|
MediawikiImporter.job.start();
|
|
prop.put("import_dump", MediawikiImporter.job.source());
|
|
prop.put("import_thread", 1);
|
|
prop.put("import", 1);
|
|
prop.put("refresh", 1);
|
|
} else {
|
|
prop.put("import_status", status);
|
|
prop.put("import_status_sourceFile", sourceFilePath);
|
|
|
|
/* Acquire a transaction token for the next POST form submission */
|
|
final String token = TransactionManager.getTransactionToken(header);
|
|
prop.put(TransactionManager.TRANSACTION_TOKEN_PARAM, token);
|
|
prop.put("import_" + TransactionManager.TRANSACTION_TOKEN_PARAM, token);
|
|
}
|
|
prop.put("import_count", 0);
|
|
prop.put("import_speed", 0);
|
|
prop.put("import_runningHours", 0);
|
|
prop.put("import_runningMinutes", 0);
|
|
prop.put("import_remainingHours", 0);
|
|
prop.put("import_remainingMinutes", 0);
|
|
}
|
|
}
|
|
}
|
|
return prop;
|
|
}
|
|
|
|
/**
|
|
* @param fileURL the file URL. Must not be null.
|
|
* @return the last modified date for the file at fileURL, or 0L when unknown or when an error occurred
|
|
*/
|
|
private static long getLastModified(MultiProtocolURL fileURL) {
|
|
long lastModified = 0l;
|
|
try {
|
|
if (fileURL.isHTTP() || fileURL.isHTTPS()) {
|
|
/* http(s) : we do not use MultiprotocolURL.lastModified() which always returns 0L for these protocols */
|
|
HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent);
|
|
HttpResponse headResponse = httpClient.HEADResponse(fileURL, false);
|
|
if (headResponse != null && headResponse.getStatusLine() != null
|
|
&& headResponse.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
|
|
Header lastModifiedHeader = headResponse
|
|
.getFirstHeader(HeaderFramework.LAST_MODIFIED);
|
|
if (lastModifiedHeader != null) {
|
|
Date lastModifiedDate = HeaderFramework.parseHTTPDate(lastModifiedHeader.getValue());
|
|
if(lastModifiedDate != null) {
|
|
lastModified = lastModifiedDate.getTime();
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
lastModified = fileURL.lastModified();
|
|
}
|
|
} catch (IOException ignored) {
|
|
ConcurrentLog.warn("IndexImportMediawiki_p", "Could not retrieve last modified date for dump file at " + fileURL);
|
|
}
|
|
return lastModified;
|
|
}
|
|
}
|