- remote proxy configuration can now be "really" changed on the fly and takes effect immediately - adding possibility to disable remote proxy usage for yacy->yacy communication - adding possibility to disable remote proxy usage for ssl - restructuring proxy configuration so that it is stored in a single place now *) Adding possibility to import a foreign word DB (or even more of them in parallel) at runtime into the peer's DB - this can be done by calling IndexImport_p.html - ATTENTION: please note that at the moment this thread must be aborted via gui before a normal server shutdown is done. - TODO: integrating IndexImport Thread into normal server shutdown - TODO: Adding possibility to import crawl-queues, etc. from foreign peers - TODO: removing old import function from yacy.java and calling the new routines instead git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@968 6c8d7289-2bf4-0310-a012-ef5d649a1542pull/1/head
parent
222607ef0f
commit
02d9af1a70
@ -0,0 +1,105 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<title>YaCy '#[clientname]#': Index Import</title>
|
||||
#[metas]#
|
||||
<meta http-equiv="REFRESH" content="30">
|
||||
</head>
|
||||
<body marginheight="0" marginwidth="0" leftmargin="0" topmargin="0">
|
||||
#[header]#
|
||||
<br><br>
|
||||
<h2>Index DB Import</h2>
|
||||
|
||||
<p>The local index currently consists of (at least) #[wcount]# reverse word indexes and #[ucount]# URL references</p>
|
||||
<hr>
|
||||
#(error)#<!-- 0 -->
|
||||
::<!-- 1 -->
|
||||
<p><font color="red"><b>#[error_msg]#</b></font></p>
|
||||
::<!-- 2 -->
|
||||
<p><font color="red"><b>Import Job with the same path already started</b></font></p>
|
||||
#(/error)#
|
||||
<h3>Starting new Job</h3>
|
||||
<form action="IndexImport_p.html" method="post" enctype="multipart/form-data">
|
||||
<table border="0" cellpadding="2" cellspacing="1">
|
||||
<tr>
|
||||
<td title="Path to the PLASMADB directory of the foreign peer">Import Path:</td>
|
||||
<td><input name="importPath" type="text" size="50" value=""></td>
|
||||
<td><input type="submit" name="startIndexDbImport" value="Start New Import"></td>
|
||||
</tr>
|
||||
</table>
|
||||
</form>
|
||||
|
||||
<hr>
|
||||
<form action="IndexImport_p.html" method="post" enctype="multipart/form-data">
|
||||
<h3>Currently running jobs</h3>
|
||||
<p>
|
||||
<table border="0" cellpadding="2" cellspacing="1">
|
||||
<tr class="TableHeader" valign="bottom">
|
||||
<td class="small" width="150">Path</td>
|
||||
<td class="small" >Status</td>
|
||||
<td class="small" >%</td>
|
||||
<td class="small" >Elapsed<br>Time</td>
|
||||
<td class="small" >Estimated<br>Time</td>
|
||||
<td class="small" >Word Hash</td>
|
||||
<td class="small" ># URLs</td>
|
||||
<td class="small" ># Word<br>Entities</td>
|
||||
<td class="small" ># Word<br>Entries</td>
|
||||
<td class="small" >Stop Import</td>
|
||||
</tr>
|
||||
#{running.jobs}#
|
||||
<tr class="TableCellLight">
|
||||
<td class="small">#[path]#</td>
|
||||
<td class="small"><font color="#(stopped)#red::green#(/stopped)#">#(stopped)#Finished::Running#(/stopped)#</font></td>
|
||||
<td class="small" align="right">#[percent]#</td>
|
||||
<td class="small" align="right">#[elapsed]#</td>
|
||||
<td class="small" align="right">#[estimated]#</td>
|
||||
<td class="small" align="right"><tt>#[wordHash]#</tt></td>
|
||||
<td class="small" align="right">#[url_num]#</td>
|
||||
<td class="small" align="right">#[word_entity_num]#</td>
|
||||
<td class="small" align="right">#[word_entry_num]#</td>
|
||||
<td class="small">
|
||||
#(stopped)#::
|
||||
<input type="submit" name="stopIndexDbImport" value="Stop Index Transfer">
|
||||
<input type="hidden" name="jobNr" value="#[job_nr]#">
|
||||
#(/stopped)#
|
||||
</td>
|
||||
</tr>
|
||||
#{/running.jobs}#
|
||||
</table>
|
||||
</form>
|
||||
|
||||
<hr>
|
||||
<form action="IndexImport_p.html" method="post" enctype="multipart/form-data">
|
||||
<h3>Finished jobs</h3>
|
||||
<p>
|
||||
<table border="0" cellpadding="2" cellspacing="1">
|
||||
<tr class="TableHeader" valign="bottom">
|
||||
<td class="small" width="150">Path</td>
|
||||
<td class="small" >Status</td>
|
||||
<td class="small" >%</td>
|
||||
<td class="small" >Elapsed<br>Time</td>
|
||||
<td class="small" >Word Hash</td>
|
||||
<td class="small" ># URLs</td>
|
||||
<td class="small" ># Word<br>Entities</td>
|
||||
<td class="small" ># Word<br>Entries</td>
|
||||
</tr>
|
||||
#{finished.jobs}#
|
||||
<tr class="TableCellLight">
|
||||
<td class="small">#[path]#</td>
|
||||
<td class="small"><font color="#(stopped)#red::green::red#(/stopped)#">#(stopped)#Finished::::<b>Error:</b> #[errorMsg]##(/stopped)#</font></td>
|
||||
<td class="small" align="right">#[percent]#</td>
|
||||
<td class="small" align="right">#[elapsed]#</td>
|
||||
<td class="small" align="right"><tt>#[wordHash]#</tt></td>
|
||||
<td class="small" align="right">#[url_num]#</td>
|
||||
<td class="small" align="right">#[word_entity_num]#</td>
|
||||
<td class="small" align="right">#[word_entry_num]#</td>
|
||||
</tr>
|
||||
#{/finished.jobs}#
|
||||
</table>
|
||||
<input type="submit" name="clearFinishedJobList" value="Clear List">
|
||||
</form>
|
||||
<p><font size="-3"><i>Last Refresh:</i> #[date]#</font></p>
|
||||
|
||||
#[footer]#
|
||||
</body>
|
||||
</html>
|
@ -0,0 +1,196 @@
|
||||
//IndexTransfer_p.java
|
||||
//-----------------------
|
||||
//part of the AnomicHTTPD caching proxy
|
||||
//(C) by Michael Peter Christen; mc@anomic.de
|
||||
//first published on http://www.anomic.de
|
||||
//Frankfurt, Germany, 2005
|
||||
//
|
||||
//This file is contributed by Martin Thelian
|
||||
//
|
||||
// $LastChangedDate: 2005-10-17 17:46:12 +0200 (Mo, 17 Okt 2005) $
|
||||
// $LastChangedRevision: 947 $
|
||||
// $LastChangedBy: borg-0300 $
|
||||
//
|
||||
//This program is free software; you can redistribute it and/or modify
|
||||
//it under the terms of the GNU General Public License as published by
|
||||
//the Free Software Foundation; either version 2 of the License, or
|
||||
//(at your option) any later version.
|
||||
//
|
||||
//This program is distributed in the hope that it will be useful,
|
||||
//but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
//GNU General Public License for more details.
|
||||
//
|
||||
//You should have received a copy of the GNU General Public License
|
||||
//along with this program; if not, write to the Free Software
|
||||
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
//
|
||||
//Using this software in any meaning (reading, learning, copying, compiling,
|
||||
//running) means that you agree that the Author(s) is (are) not responsible
|
||||
//for cost, loss of data or any harm that may be caused directly or indirectly
|
||||
//by usage of this softare or this documentation. The usage of this software
|
||||
//is on your own risk. The installation and usage (starting/running) of this
|
||||
//software may allow other people or application to access your computer and
|
||||
//any attached devices and is highly dependent on the configuration of the
|
||||
//software which must be done by the user of the software; the author(s) is
|
||||
//(are) also not responsible for proper configuration and usage of the
|
||||
//software, even if provoked by documentation provided together with
|
||||
//the software.
|
||||
//
|
||||
//Any changes to this file according to the GPL as documented in the file
|
||||
//gpl.txt aside this file in the shipment you received can be done to the
|
||||
//lines that follows this copyright notice here, but changes must not be
|
||||
//done inside the copyright notive above. A re-distribution must contain
|
||||
//the intact and unchanged copyright notice.
|
||||
//Contributions and changes to the program code must be marked as such.
|
||||
|
||||
//You must compile this file with
|
||||
//javac -classpath .:../Classes IndexImport_p.java
|
||||
//if the shell's current path is HTROOT
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Date;
|
||||
import java.util.Vector;
|
||||
|
||||
import de.anomic.http.httpHeader;
|
||||
import de.anomic.plasma.plasmaDbImporter;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.server.serverDate;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
|
||||
public final class IndexImport_p {
|
||||
|
||||
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
|
||||
// return variable that accumulates replacements
|
||||
plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
|
||||
serverObjects prop = new serverObjects();
|
||||
|
||||
int activeCount = 0;
|
||||
|
||||
if (post != null) {
|
||||
if (post.containsKey("startIndexDbImport")) {
|
||||
try {
|
||||
// getting the import path
|
||||
String importPath = (String) post.get("importPath");
|
||||
boolean startImport = true;
|
||||
|
||||
// check if there is an already running thread with the same import path
|
||||
Thread[] importThreads = new Thread[plasmaDbImporter.runningJobs.activeCount()*2];
|
||||
activeCount = plasmaDbImporter.runningJobs.enumerate(importThreads);
|
||||
|
||||
for (int i=0; i < activeCount; i++) {
|
||||
plasmaDbImporter currThread = (plasmaDbImporter) importThreads[i];
|
||||
if (currThread.getImportRoot().equals(new File(importPath))) {
|
||||
prop.put("error",2);
|
||||
startImport = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (startImport) {
|
||||
plasmaDbImporter newImporter = new plasmaDbImporter(switchboard.wordIndex,switchboard.urlPool.loadedURL,importPath);
|
||||
newImporter.start();
|
||||
|
||||
prop.put("LOCATION","");
|
||||
return prop;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
prop.put("error",1);
|
||||
prop.put("error_error_msg",e.toString());
|
||||
}
|
||||
} else if (post.containsKey("clearFinishedJobList")) {
|
||||
plasmaDbImporter.finishedJobs.clear();
|
||||
prop.put("LOCATION","");
|
||||
return prop;
|
||||
} else if (post.containsKey("stopIndexDbImport")) {
|
||||
// getting the job nr of the thread that should be stopped
|
||||
String jobNr = (String) post.get("jobNr");
|
||||
|
||||
Thread[] importThreads = new Thread[plasmaDbImporter.runningJobs.activeCount()*2];
|
||||
activeCount = plasmaDbImporter.runningJobs.enumerate(importThreads);
|
||||
|
||||
for (int i=0; i < activeCount; i++) {
|
||||
plasmaDbImporter currThread = (plasmaDbImporter) importThreads[i];
|
||||
if (currThread.getJobNr() == Integer.valueOf(jobNr).intValue()) {
|
||||
currThread.stoppIt();
|
||||
try {
|
||||
currThread.join();
|
||||
} catch (InterruptedException e) {
|
||||
// TODO Auto-generated catch block
|
||||
e.printStackTrace();
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
prop.put("LOCATION","");
|
||||
return prop;
|
||||
}
|
||||
}
|
||||
|
||||
prop.put("wcount", Integer.toString(switchboard.wordIndex.size()));
|
||||
prop.put("ucount", Integer.toString(switchboard.urlPool.loadedURL.size()));
|
||||
|
||||
/*
|
||||
* Loop over all currently running jobs
|
||||
*/
|
||||
Thread[] importThreads = new Thread[plasmaDbImporter.runningJobs.activeCount()*2];
|
||||
activeCount = plasmaDbImporter.runningJobs.enumerate(importThreads);
|
||||
|
||||
for (int i=0; i < activeCount; i++) {
|
||||
plasmaDbImporter currThread = (plasmaDbImporter) importThreads[i];
|
||||
|
||||
File importPath = currThread.getImportRoot();
|
||||
String currWordHash = currThread.getCurrentWordhash();
|
||||
long currWordEntryCount = currThread.getWordEntryCounter();
|
||||
long currWordEntityCounter = currThread.getWordEntityCounter();
|
||||
long currUrlCounter = currThread.getUrlCounter();
|
||||
long currImportDbSize = currThread.getImportWordDbSize();
|
||||
long estimatedTime = currThread.getEstimatedTime();
|
||||
long elapsedTime = currThread.getElapsedTime();
|
||||
int jobNr = currThread.getJobNr();
|
||||
int percent = currThread.getProcessingStatus();
|
||||
|
||||
boolean isRunning = currThread.isAlive();
|
||||
|
||||
prop.put("running.jobs_" + i + "_path", importPath.toString());
|
||||
prop.put("running.jobs_" + i + "_stopped", isRunning ? 1:0);
|
||||
prop.put("running.jobs_" + i + "_percent", Integer.toString(percent));
|
||||
prop.put("running.jobs_" + i + "_elapsed", serverDate.intervalToString(elapsedTime));
|
||||
prop.put("running.jobs_" + i + "_estimated", serverDate.intervalToString(estimatedTime));
|
||||
prop.put("running.jobs_" + i + "_wordHash", currWordHash);
|
||||
prop.put("running.jobs_" + i + "_url_num", Long.toString(currUrlCounter));
|
||||
prop.put("running.jobs_" + i + "_word_entity_num", Long.toString(currWordEntityCounter));
|
||||
prop.put("running.jobs_" + i + "_word_entry_num", Long.toString(currWordEntryCount));
|
||||
prop.put("running.jobs_" + i + "_stopped_job_nr", Integer.toString(jobNr));
|
||||
}
|
||||
prop.put("running.jobs",activeCount);
|
||||
|
||||
/*
|
||||
* Loop over all finished jobs
|
||||
*/
|
||||
Vector finishedJobs = (Vector) plasmaDbImporter.finishedJobs.clone();
|
||||
for (int i=0; i<finishedJobs.size(); i++) {
|
||||
plasmaDbImporter currThread = (plasmaDbImporter) finishedJobs.get(i);
|
||||
String error = currThread.getError();
|
||||
prop.put("finished.jobs_" + i + "_path", currThread.getImportRoot().toString());
|
||||
if (error != null) {
|
||||
prop.put("finished.jobs_" + i + "_stopped", 2);
|
||||
prop.put("finished.jobs_" + i + "_stopped_errorMsg", error);
|
||||
} else {
|
||||
prop.put("finished.jobs_" + i + "_stopped", 0);
|
||||
}
|
||||
prop.put("finished.jobs_" + i + "_percent", Integer.toString(currThread.getProcessingStatus()));
|
||||
prop.put("finished.jobs_" + i + "_elapsed", serverDate.intervalToString(currThread.getElapsedTime()));
|
||||
prop.put("finished.jobs_" + i + "_wordHash", currThread.getCurrentWordhash());
|
||||
prop.put("finished.jobs_" + i + "_url_num", Long.toString(currThread.getUrlCounter()));
|
||||
prop.put("finished.jobs_" + i + "_word_entity_num", Long.toString(currThread.getWordEntityCounter()));
|
||||
prop.put("finished.jobs_" + i + "_word_entry_num", Long.toString(currThread.getWordEntryCounter()));
|
||||
}
|
||||
prop.put("finished.jobs",finishedJobs.size());
|
||||
|
||||
prop.put("date",(new Date()).toString());
|
||||
return prop;
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,181 @@
|
||||
//httpRemoteProxyConfig.java
|
||||
//-----------------------
|
||||
//part of the AnomicHTTPD caching proxy
|
||||
//(C) by Michael Peter Christen; mc@anomic.de
|
||||
//first published on http://www.anomic.de
|
||||
//Frankfurt, Germany, 2004
|
||||
//
|
||||
//this file was contributed by Martin Thelian
|
||||
//$LastChangedDate$
|
||||
//$LastChangedBy$
|
||||
//$LastChangedRevision$
|
||||
//
|
||||
//This program is free software; you can redistribute it and/or modify
|
||||
//it under the terms of the GNU General Public License as published by
|
||||
//the Free Software Foundation; either version 2 of the License, or
|
||||
//(at your option) any later version.
|
||||
//
|
||||
//This program is distributed in the hope that it will be useful,
|
||||
//but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
//GNU General Public License for more details.
|
||||
//
|
||||
//You should have received a copy of the GNU General Public License
|
||||
//along with this program; if not, write to the Free Software
|
||||
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
//
|
||||
//Using this software in any meaning (reading, learning, copying, compiling,
|
||||
//running) means that you agree that the Author(s) is (are) not responsible
|
||||
//for cost, loss of data or any harm that may be caused directly or indirectly
|
||||
//by usage of this softare or this documentation. The usage of this software
|
||||
//is on your own risk. The installation and usage (starting/running) of this
|
||||
//software may allow other people or application to access your computer and
|
||||
//any attached devices and is highly dependent on the configuration of the
|
||||
//software which must be done by the user of the software; the author(s) is
|
||||
//(are) also not responsible for proper configuration and usage of the
|
||||
//software, even if provoked by documentation provided together with
|
||||
//the software.
|
||||
//
|
||||
//Any changes to this file according to the GPL as documented in the file
|
||||
//gpl.txt aside this file in the shipment you received can be done to the
|
||||
//lines that follows this copyright notice here, but changes must not be
|
||||
//done inside the copyright notive above. A re-distribution must contain
|
||||
//the intact and unchanged copyright notice.
|
||||
//Contributions and changes to the program code must be marked as such.
|
||||
|
||||
//You must compile this file with
|
||||
//javac -classpath .:../Classes Settings_p.java
|
||||
//if the shell's current path is HTROOT
|
||||
|
||||
package de.anomic.http;
|
||||
|
||||
import java.util.HashSet;
|
||||
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
|
||||
public final class httpRemoteProxyConfig {
|
||||
|
||||
/*
|
||||
* Remote Proxy configuration
|
||||
*/
|
||||
private boolean remoteProxyUse;
|
||||
private boolean remoteProxyUse4Yacy;
|
||||
private boolean remoteProxyUse4SSL;
|
||||
|
||||
private String remoteProxyHost;
|
||||
private int remoteProxyPort;
|
||||
private String remoteProxyUser;
|
||||
private String remoteProxyPwd;
|
||||
|
||||
private String remoteProxyNoProxy = "";
|
||||
private String[] remoteProxyNoProxyPatterns = null;
|
||||
|
||||
public final HashSet remoteProxyAllowProxySet = new HashSet();
|
||||
public final HashSet remoteProxyDisallowProxySet = new HashSet();
|
||||
|
||||
public boolean useProxy() {
|
||||
return this.remoteProxyUse;
|
||||
}
|
||||
|
||||
public boolean useProxy4Yacy() {
|
||||
return this.remoteProxyUse4Yacy;
|
||||
}
|
||||
|
||||
public boolean useProxy4SSL() {
|
||||
return this.remoteProxyUse4SSL;
|
||||
}
|
||||
|
||||
public String getProxyHost() {
|
||||
return this.remoteProxyHost;
|
||||
}
|
||||
|
||||
public int getProxyPort() {
|
||||
return this.remoteProxyPort;
|
||||
}
|
||||
|
||||
public String getProxyUser() {
|
||||
return this.remoteProxyUser;
|
||||
}
|
||||
|
||||
public String getProxyPwd() {
|
||||
return this.remoteProxyPwd;
|
||||
}
|
||||
|
||||
public String getProxyNoProxy() {
|
||||
return this.remoteProxyNoProxy;
|
||||
}
|
||||
|
||||
public String[] getProxyNoProxyPatterns() {
|
||||
return this.remoteProxyNoProxyPatterns;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuffer toStrBuf = new StringBuffer();
|
||||
|
||||
toStrBuf
|
||||
.append("Status: ").append(this.remoteProxyUse?"ON":"OFF").append(" | ")
|
||||
.append("Host: ");
|
||||
if ((this.remoteProxyUser != null) && (this.remoteProxyUser.length() > 0)) {
|
||||
toStrBuf.append(this.remoteProxyUser)
|
||||
.append("@");
|
||||
}
|
||||
toStrBuf
|
||||
.append((this.remoteProxyHost==null)?"unknown":this.remoteProxyHost).append(":").append(this.remoteProxyPort).append(" | ")
|
||||
.append("Usage: HTTP");
|
||||
if (this.remoteProxyUse4Yacy) toStrBuf.append(" YACY");
|
||||
if (this.remoteProxyUse4SSL) toStrBuf.append(" SSL");
|
||||
toStrBuf.append(" | ")
|
||||
.append("No Proxy for: ")
|
||||
.append(this.remoteProxyNoProxy);
|
||||
|
||||
|
||||
return toStrBuf.toString();
|
||||
}
|
||||
|
||||
public static httpRemoteProxyConfig init(
|
||||
String proxyHostName,
|
||||
int proxyHostPort
|
||||
) {
|
||||
httpRemoteProxyConfig newConfig = new httpRemoteProxyConfig();
|
||||
|
||||
newConfig.remoteProxyUse = true;
|
||||
newConfig.remoteProxyUse4SSL = true;
|
||||
newConfig.remoteProxyUse4Yacy = true;
|
||||
newConfig.remoteProxyHost = proxyHostName;
|
||||
newConfig.remoteProxyPort = proxyHostPort;
|
||||
|
||||
return newConfig;
|
||||
}
|
||||
|
||||
public static httpRemoteProxyConfig init(plasmaSwitchboard sb) {
|
||||
httpRemoteProxyConfig newConfig = new httpRemoteProxyConfig();
|
||||
|
||||
// determining if remote proxy usage is enabled
|
||||
newConfig.remoteProxyUse = sb.getConfig("remoteProxyUse", "false").equalsIgnoreCase("true");
|
||||
|
||||
// determining if remote proxy should be used for yacy -> yacy communication
|
||||
newConfig.remoteProxyUse4Yacy = sb.getConfig("remoteProxyUse4Yacy", "true").equalsIgnoreCase("true");
|
||||
|
||||
// determining if remote proxy should be used for ssl connections
|
||||
newConfig.remoteProxyUse4SSL = sb.getConfig("remoteProxyUse4SSL", "true").equalsIgnoreCase("true");
|
||||
|
||||
// reading the proxy host name
|
||||
newConfig.remoteProxyHost = sb.getConfig("remoteProxyHost", "").trim();
|
||||
|
||||
// reading the proxy host port
|
||||
try {
|
||||
newConfig.remoteProxyPort = Integer.parseInt(sb.getConfig("remoteProxyPort", "3128"));
|
||||
} catch (NumberFormatException e) {
|
||||
newConfig.remoteProxyPort = 3128;
|
||||
}
|
||||
|
||||
newConfig.remoteProxyUser = sb.getConfig("remoteProxyUser", "").trim();
|
||||
newConfig.remoteProxyPwd = sb.getConfig("remoteProxyPwd", "").trim();
|
||||
|
||||
// determining addresses for which the remote proxy should not be used
|
||||
newConfig.remoteProxyNoProxy = sb.getConfig("remoteProxyNoProxy","").trim();
|
||||
newConfig.remoteProxyNoProxyPatterns = newConfig.remoteProxyNoProxy.split(",");
|
||||
|
||||
return newConfig;
|
||||
}
|
||||
}
|
@ -0,0 +1,255 @@
|
||||
package de.anomic.plasma;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.Vector;
|
||||
|
||||
import de.anomic.server.serverDate;
|
||||
import de.anomic.server.logging.serverLog;
|
||||
|
||||
public class plasmaDbImporter extends Thread {
|
||||
|
||||
public static final Vector finishedJobs = new Vector();
|
||||
public static final ThreadGroup runningJobs = new ThreadGroup("DbImport");
|
||||
public static int currMaxJobNr = 0;
|
||||
|
||||
private final int jobNr;
|
||||
private final plasmaCrawlLURL homeUrlDB;
|
||||
private final plasmaWordIndex homeWordIndex;
|
||||
|
||||
private final plasmaCrawlLURL importUrlDB;
|
||||
private final plasmaWordIndex importWordIndex;
|
||||
private final String importPath;
|
||||
private final File importRoot;
|
||||
private final int importStartSize;
|
||||
|
||||
private final serverLog log;
|
||||
private boolean stopped = false;
|
||||
private boolean paused = false;
|
||||
private String wordHash = "------------";
|
||||
|
||||
long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = wordChunkStart;
|
||||
String wordChunkStartHash = "------------", wordChunkEndHash;
|
||||
private long urlCounter = 0, wordCounter = 0, entryCounter = 0;
|
||||
|
||||
private long globalStart = System.currentTimeMillis();
|
||||
private long globalEnd;
|
||||
|
||||
private String error;
|
||||
|
||||
public void stoppIt() {
|
||||
this.stopped = true;
|
||||
}
|
||||
|
||||
public String getError() {
|
||||
return this.error;
|
||||
}
|
||||
|
||||
public int getJobNr() {
|
||||
return this.jobNr;
|
||||
}
|
||||
|
||||
public String getCurrentWordhash() {
|
||||
return this.wordHash;
|
||||
}
|
||||
|
||||
public long getUrlCounter() {
|
||||
return this.urlCounter;
|
||||
}
|
||||
|
||||
public long getWordEntityCounter() {
|
||||
return this.wordCounter;
|
||||
}
|
||||
|
||||
public long getWordEntryCounter() {
|
||||
return this.entryCounter;
|
||||
}
|
||||
|
||||
public File getImportRoot() {
|
||||
return this.importRoot;
|
||||
}
|
||||
|
||||
public int getImportWordDbSize() {
|
||||
return this.importWordIndex.size();
|
||||
}
|
||||
|
||||
public plasmaDbImporter(plasmaWordIndex theHomeIndexDB, plasmaCrawlLURL theHomeUrlDB, String theImportPath) throws IOException {
|
||||
super(runningJobs,"DB-Import_" + theImportPath);
|
||||
|
||||
this.log = new serverLog("DB-IMPORT");
|
||||
|
||||
synchronized(runningJobs) {
|
||||
this.jobNr = currMaxJobNr;
|
||||
currMaxJobNr++;
|
||||
}
|
||||
|
||||
if (theImportPath == null) throw new NullPointerException();
|
||||
this.importPath = theImportPath;
|
||||
this.importRoot = new File(theImportPath);
|
||||
|
||||
if (theHomeIndexDB == null) throw new NullPointerException();
|
||||
this.homeWordIndex = theHomeIndexDB;
|
||||
|
||||
if (theHomeUrlDB == null) throw new NullPointerException();
|
||||
this.homeUrlDB = theHomeUrlDB;
|
||||
|
||||
if (this.homeWordIndex.getRoot().equals(importRoot)) {
|
||||
throw new IllegalArgumentException("Import and home DB directory must not be equal");
|
||||
}
|
||||
|
||||
// configure import DB
|
||||
String errorMsg = null;
|
||||
if (!this.importRoot.exists()) errorMsg = "Import directory does not exist.";
|
||||
if (!this.importRoot.canRead()) errorMsg = "Import directory is not readable.";
|
||||
if (!this.importRoot.canWrite()) errorMsg = "Import directory is not writeable";
|
||||
if (!this.importRoot.isDirectory()) errorMsg = "ImportDirectory is not a directory.";
|
||||
if (errorMsg != null) {
|
||||
this.log.logSevere(errorMsg + "\nName: " + this.importRoot.getAbsolutePath());
|
||||
throw new IllegalArgumentException(errorMsg);
|
||||
}
|
||||
|
||||
this.log.logFine("Initializing source word index db.");
|
||||
this.importWordIndex = new plasmaWordIndex(this.importRoot, 8*1024*1024, this.log);
|
||||
this.log.logFine("Initializing import URL db.");
|
||||
this.importUrlDB = new plasmaCrawlLURL(new File(this.importRoot, "urlHash.db"), 4*1024*1024);
|
||||
this.importStartSize = this.importWordIndex.size();
|
||||
}
|
||||
|
||||
public void run() {
|
||||
try {
|
||||
importWordsDB();
|
||||
} finally {
|
||||
globalEnd = System.currentTimeMillis();
|
||||
finishedJobs.add(this);
|
||||
}
|
||||
}
|
||||
|
||||
public long getTotalRuntime() {
|
||||
return (this.globalEnd == 0)?System.currentTimeMillis()-this.globalStart:this.globalEnd-this.globalStart;
|
||||
}
|
||||
|
||||
public int getProcessingStatus() {
|
||||
return (this.importStartSize-this.importWordIndex.size())/(this.importStartSize/100);
|
||||
}
|
||||
|
||||
public long getElapsedTime() {
|
||||
return System.currentTimeMillis()-this.globalStart;
|
||||
}
|
||||
|
||||
public long getEstimatedTime() {
|
||||
return (this.wordCounter==0)?0:this.importWordIndex.size()*((System.currentTimeMillis()-this.globalStart)/this.wordCounter);
|
||||
}
|
||||
|
||||
public void importWordsDB() {
|
||||
this.log.logInfo("STARTING DB-IMPORT");
|
||||
|
||||
try {
|
||||
this.log.logInfo("Importing DB from '" + this.importRoot.getAbsolutePath() + "' to '" + this.homeWordIndex.getRoot().getAbsolutePath() + "'.");
|
||||
this.log.logInfo("Home word index contains " + this.homeWordIndex.size() + " words and " + this.homeUrlDB.size() + " URLs.");
|
||||
this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importUrlDB.size() + " URLs.");
|
||||
|
||||
// iterate over all words from import db
|
||||
|
||||
Iterator importWordHashIterator = this.importWordIndex.wordHashes(wordChunkStartHash, true, true);
|
||||
while (!isAborted() && importWordHashIterator.hasNext()) {
|
||||
|
||||
plasmaWordIndexEntity importWordIdxEntity = null;
|
||||
try {
|
||||
wordCounter++;
|
||||
wordHash = (String) importWordHashIterator.next();
|
||||
importWordIdxEntity = importWordIndex.getEntity(wordHash, true);
|
||||
|
||||
if (importWordIdxEntity.size() == 0) {
|
||||
importWordIdxEntity.deleteComplete();
|
||||
continue;
|
||||
}
|
||||
|
||||
// creating a container used to hold the imported entries
|
||||
plasmaWordIndexEntryContainer newContainer = new plasmaWordIndexEntryContainer(wordHash,importWordIdxEntity.size());
|
||||
|
||||
// the combined container will fit, read the container
|
||||
Iterator importWordIdxEntries = importWordIdxEntity.elements(true);
|
||||
plasmaWordIndexEntry importWordIdxEntry;
|
||||
while (importWordIdxEntries.hasNext()) {
|
||||
|
||||
// testing if import process was aborted
|
||||
if (isAborted()) break;
|
||||
|
||||
// getting next word index entry
|
||||
entryCounter++;
|
||||
importWordIdxEntry = (plasmaWordIndexEntry) importWordIdxEntries.next();
|
||||
String urlHash = importWordIdxEntry.getUrlHash();
|
||||
if ((this.importUrlDB.exists(urlHash)) && (!this.homeUrlDB.exists(urlHash))) {
|
||||
urlCounter++;
|
||||
|
||||
// importing the new url
|
||||
plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.getEntry(urlHash);
|
||||
this.homeUrlDB.newEntry(urlEntry);
|
||||
|
||||
if (urlCounter % 500 == 0) {
|
||||
this.log.logFine(urlCounter + " URLs processed so far.");
|
||||
}
|
||||
}
|
||||
|
||||
// adding word index entity to container
|
||||
newContainer.add(importWordIdxEntry,System.currentTimeMillis());
|
||||
|
||||
if (entryCounter % 500 == 0) {
|
||||
this.log.logFine(entryCounter + " word entries and " + wordCounter + " word entries processed so far.");
|
||||
}
|
||||
}
|
||||
|
||||
// testing if import process was aborted
|
||||
if (isAborted()) break;
|
||||
|
||||
// importing entity container to home db
|
||||
homeWordIndex.addEntries(newContainer, true);
|
||||
|
||||
// delete complete index entity file
|
||||
importWordIdxEntity.close();
|
||||
importWordIndex.deleteIndex(wordHash);
|
||||
|
||||
// print out some statistical information
|
||||
if (wordCounter%500 == 0) {
|
||||
wordChunkEndHash = wordHash;
|
||||
wordChunkEnd = System.currentTimeMillis();
|
||||
long duration = wordChunkEnd - wordChunkStart;
|
||||
log.logInfo(wordCounter + " word entities imported " +
|
||||
"[" + wordChunkStartHash + " .. " + wordChunkEndHash + "] " +
|
||||
this.getProcessingStatus() + "%\n" +
|
||||
"Speed: "+ 500*1000/duration + " word entities/s" +
|
||||
" | Elapsed time: " + serverDate.intervalToString(getElapsedTime()) +
|
||||
" | Estimated time: " + serverDate.intervalToString(getEstimatedTime()) + "\n" +
|
||||
"Home Words = " + homeWordIndex.size() +
|
||||
" | Import Words = " + importWordIndex.size());
|
||||
wordChunkStart = wordChunkEnd;
|
||||
wordChunkStartHash = wordChunkEndHash;
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
log.logSevere("Import of word entity '" + wordHash + "' failed.",e);
|
||||
} finally {
|
||||
if (importWordIdxEntity != null) try { importWordIdxEntity.close(); } catch (Exception e) {}
|
||||
}
|
||||
}
|
||||
|
||||
this.log.logInfo("Home word index contains " + homeWordIndex.size() + " words and " + homeUrlDB.size() + " URLs.");
|
||||
this.log.logInfo("Import word index contains " + importWordIndex.size() + " words and " + importUrlDB.size() + " URLs.");
|
||||
|
||||
this.log.logInfo("DB-IMPORT FINISHED");
|
||||
} catch (Exception e) {
|
||||
this.log.logSevere("Database import failed.",e);
|
||||
e.printStackTrace();
|
||||
this.error = e.toString();
|
||||
} finally {
|
||||
if (importUrlDB != null) try { importUrlDB.close(); } catch (Exception e){}
|
||||
if (importWordIndex != null) try { importWordIndex.close(5000); } catch (Exception e){}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isAborted() {
|
||||
return (this.stopped) || Thread.currentThread().isInterrupted();
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in new issue