- moved all index generation servlets to their own main menu item, including proxy indexing
- removed the external index import, because this operation is no longer recommended: joining an index can simply be done by moving the index files from one peer to the other; they will be merged automatically
- fix to prevent endless loops when disconnecting HTTP sessions
- fix to prevent the application of bad blacklist entries that can cause a 'Dangling meta character' exception

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6558 6c8d7289-2bf4-0310-a012-ef5d649a1542
parent ab3cf60dbe
commit a3b8b7b5c5
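A note on the blacklist fix mentioned in the commit message: "Dangling meta character" is the message of java.util.regex.PatternSyntaxException, thrown when an entry such as "+host.com" is compiled as a regular expression. A minimal sketch (not YaCy's actual code) of rejecting such entries before they are applied:

    import java.util.regex.Pattern;
    import java.util.regex.PatternSyntaxException;

    public final class BlacklistEntryCheck {

        /** Returns true only if the entry compiles as a regular expression. */
        static boolean isValidEntry(final String entry) {
            try {
                Pattern.compile(entry);
                return true;
            } catch (final PatternSyntaxException e) {
                // e.g. "Dangling meta character '+' near index 0"
                System.err.println("rejecting blacklist entry: " + e.getMessage());
                return false;
            }
        }

        public static void main(final String[] args) {
            System.out.println(isValidEntry(".*\\.example\\.com/.*")); // true
            System.out.println(isValidEntry("+host.com"));             // false
        }
    }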
@@ -1,270 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head>
-<!--
-    <title>YaCy '#[clientname]#': Index Import</title>
-//-->
-    <title>YaCy '#[clientname]#': Crawling Queue Import</title>
-    #%env/templates/metas.template%#
-    <meta http-equiv="REFRESH" content="30" />
-</head>
-<body id="IndexImport">
-    #%env/templates/header.template%#
-    #%env/templates/submenuIndexControl.template%#
-<!--
-    <h2>Index DB Import</h2>
-    <p>The local index currently consists of (at least) #[wcount]# reverse word indexes and #[ucount]# URL references.</p>
-//-->
-    <h2>Crawling Queue Import</h2>
-    #(error)#<!-- 0 -->
-    ::<!-- 1 -->
-    <p class="error">#[error_msg]#</p>
-    ::<!-- 2 -->
-    <p class="error">Import Job with the same path already started.</p>
-    ::<!-- 3 -->
-    <p class="error">#[error_msg]#</p>
-    <p class="error"><code>#[error_stackTrace]#</code></p>
-    #(/error)#
-    <form action="IndexImport_p.html" method="post" enctype="multipart/form-data">
-      <h3>Starting new Job</h3>
-      <table border="0" cellpadding="2" cellspacing="1">
-        <tr class="TableCellLight">
-          <td>Import Type:</td>
-          <td title="the path to the database that should be imported"></td>
-          <td title="the cache size that should be used for the import db">Cache Size</td>
-          <td>
-            <select name="cacheSize" size="1">
-              <option value="2097152">2 MB</option>
-              <option value="4194304">4 MB</option>
-              <option value="8388608" selected="selected">8 MB</option>
-              <option value="16777216">16 MB</option>
-              <option value="33554432">32 MB</option>
-              <option value="67108864">64 MB</option>
-              <option value="134217728">128 MB</option>
-            </select>
-          </td>
-          <td><a href="#usage">Usage Examples</a></td>
-        </tr>
-        <tr class="TableCellLight">
-          <td title="Path to the PLASMADB directory of the foreign peer">Import Path:</td>
-          <td colspan="3"><input name="importPlasmaPath" type="text" size="50" value="" /></td>
-          <td></td>
-        </tr>
-        <tr class="TableCellLight">
-          <td title="Path to the primary Index directory of the foreign peer">Import Path:</td>
-          <td colspan="3"><input name="importIndexPrimaryPath" type="text" size="50" value="" /></td>
-          <td></td>
-        </tr>
-        <tr class="TableCellLight">
-          <td title="Path to the secondary Index directory of the foreign peer">Import Path:</td>
-          <td colspan="3"><input name="importIndexSecondaryPath" type="text" size="50" value="" /></td>
-          <td><input type="submit" name="startIndexDbImport" value="Start Import" /></td>
-        </tr>
-      </table>
-      <p class="warning"><strong>Attention:</strong><br />Always do a backup of your source and destination database before starting to use this import function.</p>
-    </form>
-
-    <hr />
-    <h3>Currently running jobs</h3>
-    <table border="0" cellpadding="2" cellspacing="1">
-      <colgroup>
-        <col />
-        <col width="150" />
-        <col span="7" />
-      </colgroup>
-      <tr class="TableHeader" valign="bottom">
-        <td>Job Type</td>
-        <td>Job Name</td>
-        <td>Status</td>
-        <td>%</td>
-        <td>Elapsed<br />Time</td>
-        <td>Time<br />Left</td>
-        <td>Import Status</td>
-        <td>Abort Import</td>
-        <td>Pause Import</td>
-      </tr>
-      #{running.jobs}#
-      <form action="IndexImport_p.html" method="post" enctype="multipart/form-data">
-        <tr class="TableCellLight">
-          <td>#[type]#</td>
-          <td title="#[fullName]#">#[shortName]#</td>
-          <td style="color:#(runningStatus)#red::green::blue#(/runningStatus)#;">#(runningStatus)#Finished::Running::Paused#(/runningStatus)#</td>
-          <td align="right">#[percent]#</td>
-          <td align="right">#[elapsed]#</td>
-          <td align="right">#[estimated]#</td>
-          <td align="left"><tt>#[status]#</tt></td>
-          <td>
-            <input type="hidden" name="jobNr" value="#[job_nr]#" />
-            #(stopped)#::
-            <input type="submit" name="stopIndexDbImport" value="Abort" />
-            #(/stopped)#
-          </td>
-          <td>
-            #(paused)#
-            <input type="submit" name="pauseIndexDbImport" value="Pause" />
-            ::
-            <input type="submit" name="continueIndexDbImport" value="Continue" />
-            #(/paused)#
-          </td>
-        </tr>
-      </form>
-      #{/running.jobs}#
-    </table>
-
-    <hr />
-    <form action="IndexImport_p.html" method="post" enctype="multipart/form-data">
-      <h3>Finished jobs</h3>
-      <table border="0" cellpadding="2" cellspacing="1">
-        <colgroup>
-          <col />
-          <col width="150" />
-          <col span="4" />
-        </colgroup>
-        <tr class="TableHeader" valign="bottom">
-          <td>Job Type</td>
-          <td>Path</td>
-          <td>Status</td>
-          <td>%</td>
-          <td>Elapsed<br />Time</td>
-          <td>Import Status</td>
-        </tr>
-        #{finished.jobs}#
-        <tr class="TableCellLight">
-          <td>#[type]#</td>
-          <td title="#[fullName]#">#[shortName]#</td>
-          <td><font color="#(runningStatus)#green::red::blue#(/runningStatus)#">#(runningStatus)#Finished::<b>Error:</b> #[errorMsg]#::Paused#(/runningStatus)#</font></td>
-          <td align="right">#[percent]#</td>
-          <td align="right">#[elapsed]#</td>
-          <td align="left"><tt>#[status]#</tt></td>
-        </tr>
-        #{/finished.jobs}#
-      </table>
-      <fieldset>
-        <input type="submit" name="clearFinishedJobList" value="Clear List" />
-      </fieldset>
-    </form>
-    <p><em>Last Refresh:</em> #[date]#</p>
-    <hr />
-    <h2 id="usage">Usage Examples:</h2>
-
-    <!--<h3>Plasma DB Import:</h3>
-    <p>
-      <strong>Example Path:</strong> <tt>E:\PLASMADB\</tt>
-    </p>
-    <p>
-      <strong>Requirements:</strong>
-    </p>
-    <p>
-      You need to have at least the following directories and files in this path:
-    </p>
-    <table border="1" cellpadding="2" cellspacing="1">
-      <tr class="example">
-        <td>Name</td>
-        <td>Type</td>
-        <td>Writeable</td>
-        <td>Description</td>
-      </tr>
-      <tr>
-        <td><tt>urlHash.db</tt></td>
-        <td>File</td>
-        <td>No</td>
-        <td>The LoadedURL Database containing all loaded and indexed URLs</td>
-      </tr>
-      <tr>
-        <td><tt>ACLUSTER</tt></td>
-        <td>Directory</td>
-        <td>Yes</td>
-        <td>The assortment directory containing parts of the word index.</td>
-      </tr>
-      <tr>
-        <td><tt>WORDS</tt></td>
-        <td>Directory</td>
-        <td>Yes</td>
-        <td>The words directory containing parts of the word index.</td>
-      </tr>
-    </table>
-
-    <h3>Assortment Import:</h3>
-    <p>
-      <strong>Example Path:</strong> <tt>E:\PLASMADB\ACLUSTER\indexAssortment001.db</tt>
-    </p>
-    <p>
-      <strong>Requirements:</strong>
-    </p>
-    <p>
-      You need to have at least the following directories and files in this path:
-    </p>
-    <table border="1" cellpadding="2" cellspacing="1">
-      <tr class="example">
-        <td>Name</td>
-        <td>Type</td>
-        <td>Writeable</td>
-        <td>Description</td>
-      </tr>
-      <tr>
-        <td><tt>indexAssortment001.db</tt></td>
-        <td>File</td>
-        <td>No</td>
-        <td>The assortment file that should be imported.<br />
-          <strong>Attention:</strong> The assortment file must have the postfix "[0-9]{3}\.db".
-          If you would like to import an assortment file from the <tt>PLASMADB\ACLUSTER\ABKP</tt> directory,
-          you have to rename it first.
-        </td>
-      </tr>
-    </table>
-    <p>
-      <strong>Notes:</strong>
-    </p>
-    <p>
-      Please note that the imported words are useless if the destination peer doesn't know
-      the URLs the imported words belong to.
-    </p>-->
-
-    <h3>Crawling Queue Import:</h3>
-    <p>
-      <strong>Example Path:</strong> <tt>E:\PLASMADB\</tt>
-    </p>
-    <p>
-      <strong>Requirements:</strong>
-    </p>
-    <p>
-      You need to have at least the following directories and files in this path:
-    </p>
-    <table border="1" cellpadding="2" cellspacing="1">
-      <tr class="example">
-        <td>Name</td>
-        <td>Type</td>
-        <td>Writeable</td>
-        <td>Description</td>
-      </tr>
-      <tr>
-        <td><tt>crawlProfiles0.db</tt></td>
-        <td>File</td>
-        <td>No</td>
-        <td>Contains data about the crawl job a URL belongs to</td>
-      </tr>
-      <tr>
-        <td><tt>urlNotice1.db</tt></td>
-        <td>File</td>
-        <td>Yes</td>
-        <td>The crawling queue</td>
-      </tr>
-      <tr>
-        <td><tt>urlNoticeImage0.stack</tt></td>
-        <td rowspan="8">File</td>
-        <td rowspan="8">Yes</td>
-        <td rowspan="8">Various stack files that belong to the crawling queue</td>
-      </tr>
-      <tr><td><tt>urlNoticeImage0.stack</tt></td></tr>
-      <tr><td><tt>urlNoticeLimit0.stack</tt></td></tr>
-      <tr><td><tt>urlNoticeLocal0.stack</tt></td></tr>
-      <tr><td><tt>urlNoticeMovie0.stack</tt></td></tr>
-      <tr><td><tt>urlNoticeMusic0.stack</tt></td></tr>
-      <tr><td><tt>urlNoticeOverhang0.stack</tt></td></tr>
-      <tr><td><tt>urlNoticeRemote0.stack</tt></td></tr>
-    </table>
-
-    #%env/templates/footer.template%#
-</body>
-</html>
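The #(error)#...::...#(/error)# blocks in the deleted template above are alternatives separated by "::"; the servlet below selects one of them by putting an index into the template properties (e.g. prop.put("error", "3")). An illustrative sketch of that selection step only; YaCy's real template engine is more involved:

    public final class TemplateSwitchSketch {

        /** Picks one of the "::"-separated branches of a #(key)#...#(/key)# block. */
        static String resolveSwitch(final String body, final int selected) {
            final String[] branches = body.split("::", -1);
            return (selected >= 0 && selected < branches.length) ? branches[selected] : "";
        }

        public static void main(final String[] args) {
            final String errorBlock = "<!-- 0 -->"
                    + "::<p class=\"error\">#[error_msg]#</p>"
                    + "::<p class=\"error\">Import Job with the same path already started.</p>";
            // prop.put("error", "2") would render the third branch:
            System.out.println(resolveSwitch(errorBlock, 2));
        }
    }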

@@ -1,197 +0,0 @@
-//IndexTransfer_p.java
-//-----------------------
-//part of the AnomicHTTPD caching proxy
-//(C) by Michael Peter Christen; mc@yacy.net
-//first published on http://www.anomic.de
-//Frankfurt, Germany, 2005
-//
-//This file is contributed by Martin Thelian
-//
-// $LastChangedDate$
-// $LastChangedRevision$
-// $LastChangedBy$
-//
-//This program is free software; you can redistribute it and/or modify
-//it under the terms of the GNU General Public License as published by
-//the Free Software Foundation; either version 2 of the License, or
-//(at your option) any later version.
-//
-//This program is distributed in the hope that it will be useful,
-//but WITHOUT ANY WARRANTY; without even the implied warranty of
-//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-//GNU General Public License for more details.
-//
-//You should have received a copy of the GNU General Public License
-//along with this program; if not, write to the Free Software
-//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-//You must compile this file with
-//javac -classpath .:../Classes IndexControl_p.java
-//if the shell's current path is HTROOT
-
-import java.io.PrintStream;
-import java.util.Date;
-
-import net.yacy.kelondro.logging.Log;
-import net.yacy.kelondro.util.ByteBuffer;
-import net.yacy.kelondro.util.DateFormatter;
-
-import de.anomic.crawler.Importer;
-import de.anomic.crawler.NoticeURLImporter;
-import de.anomic.http.server.RequestHeader;
-import de.anomic.search.Segment;
-import de.anomic.search.Segments;
-import de.anomic.search.Switchboard;
-import de.anomic.server.serverObjects;
-import de.anomic.server.serverSwitch;
-
-public final class IndexImport_p {
-
-    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
-        // return variable that accumulates replacements
-        final Switchboard sb = (Switchboard) env;
-        final serverObjects prop = new serverObjects();
-
-        int activeCount = 0;
-
-        // get segment
-        Segment indexSegment = null;
-        if (post != null && post.containsKey("segment")) {
-            String segmentName = post.get("segment");
-            if (sb.indexSegments.segmentExist(segmentName)) {
-                indexSegment = sb.indexSegments.segment(segmentName);
-            }
-        } else {
-            // take default segment
-            indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
-        }
-
-        if (post != null) {
-            if (post.containsKey("startIndexDbImport")) {
-                try {
-                    final boolean startImport = true;
-                    if (startImport) {
-                        final Importer importerThread = new NoticeURLImporter(
-                                sb.queuesRoot,
-                                sb.crawlQueues,
-                                sb.crawler.profilesActiveCrawls,
-                                sb.dbImportManager);
-
-                        if (importerThread != null) {
-                            importerThread.setJobID(sb.dbImportManager.generateUniqueJobID());
-                            importerThread.startIt();
-                        }
-                        prop.put("LOCATION", "");
-                        return prop;
-                    }
-                } catch (final Exception e) {
-                    final ByteBuffer errorMsg = new ByteBuffer(100);
-                    final PrintStream errorOut = new PrintStream(errorMsg);
-                    Log.logException(e);
-
-                    prop.put("error", "3");
-                    prop.putHTML("error_error_msg", e.toString());
-                    prop.putHTML("error_error_stackTrace", errorMsg.toString().replaceAll("\n", "<br>"));
-
-                    errorOut.close();
-                }
-            } else if (post.containsKey("clearFinishedJobList")) {
-                sb.dbImportManager.finishedJobs.clear();
-                prop.put("LOCATION", "");
-                return prop;
-            } else if (
-                    (post.containsKey("stopIndexDbImport")) ||
-                    (post.containsKey("pauseIndexDbImport")) ||
-                    (post.containsKey("continueIndexDbImport"))
-                    ) {
-                // get the job nr of the thread
-                final String jobID = post.get("jobNr");
-                final Importer importer = sb.dbImportManager.getImporterByID(Integer.valueOf(jobID).intValue());
-                if (importer != null) {
-                    if (post.containsKey("stopIndexDbImport")) {
-                        try {
-                            importer.stopIt();
-                        } catch (final InterruptedException e) {
-                            // TODO Auto-generated catch block
-                            Log.logException(e);
-                        }
-                    } else if (post.containsKey("pauseIndexDbImport")) {
-                        importer.pauseIt();
-                    } else if (post.containsKey("continueIndexDbImport")) {
-                        importer.continueIt();
-                    }
-                }
-                prop.put("LOCATION", "");
-                return prop;
-            }
-        }
-
-        prop.putNum("wcount", indexSegment.termIndex().sizesMax());
-        prop.putNum("ucount", indexSegment.urlMetadata().size());
-
-        /*
-         * Loop over all currently running jobs
-         */
-        final Importer[] importThreads = sb.dbImportManager.getRunningImporter();
-        activeCount = importThreads.length;
-
-        for (int i = 0; i < activeCount; i++) {
-            final Importer currThread = importThreads[i];
-
-            // get import type
-            prop.put("running.jobs_" + i + "_type", currThread.getJobType());
-
-            // root path of the source db
-            final String fullName = currThread.getJobName();
-            final String shortName = (fullName.length() > 30) ? fullName.substring(0, 12) + "..." + fullName.substring(fullName.length() - 22, fullName.length()) : fullName;
-            prop.put("running.jobs_" + i + "_fullName", fullName);
-            prop.put("running.jobs_" + i + "_shortName", shortName);
-
-            // specifies if the importer is still running
-            prop.put("running.jobs_" + i + "_stopped", currThread.isStopped() ? "0" : "1");
-
-            // specifies if the importer was paused
-            prop.put("running.jobs_" + i + "_paused", currThread.isPaused() ? "1" : "0");
-
-            // setting the status
-            prop.put("running.jobs_" + i + "_runningStatus", currThread.isPaused() ? "2" : currThread.isStopped() ? "0" : "1");
-
-            // other information
-            prop.putNum("running.jobs_" + i + "_percent", currThread.getProcessingStatusPercent());
-            prop.put("running.jobs_" + i + "_elapsed", DateFormatter.formatInterval(currThread.getElapsedTime()));
-            prop.put("running.jobs_" + i + "_estimated", DateFormatter.formatInterval(currThread.getEstimatedTime()));
-            prop.putHTML("running.jobs_" + i + "_status", currThread.getStatus().replaceAll("\n", "<br>"));
-
-            // job number of the importer thread
-            prop.put("running.jobs_" + i + "_job_nr", currThread.getJobID());
-        }
-        prop.put("running.jobs", activeCount);
-
-        /*
-         * Loop over all finished jobs
-         */
-        final Importer[] finishedJobs = sb.dbImportManager.getFinishedImporter();
-        for (int i = 0; i < finishedJobs.length; i++) {
-            final Importer currThread = finishedJobs[i];
-            final String error = currThread.getError();
-            final String fullName = currThread.getJobName();
-            final String shortName = (fullName.length() > 30) ? fullName.substring(0, 12) + "..." + fullName.substring(fullName.length() - 22, fullName.length()) : fullName;
-            prop.put("finished.jobs_" + i + "_type", currThread.getJobType());
-            prop.put("finished.jobs_" + i + "_fullName", fullName);
-            prop.put("finished.jobs_" + i + "_shortName", shortName);
-            if (error != null) {
-                prop.put("finished.jobs_" + i + "_runningStatus", "1");
-                prop.putHTML("finished.jobs_" + i + "_runningStatus_errorMsg", error.replaceAll("\n", "<br>"));
-            } else {
-                prop.put("finished.jobs_" + i + "_runningStatus", "0");
-            }
-            prop.putNum("finished.jobs_" + i + "_percent", currThread.getProcessingStatusPercent());
-            prop.put("finished.jobs_" + i + "_elapsed", DateFormatter.formatInterval(currThread.getElapsedTime()));
-            prop.putHTML("finished.jobs_" + i + "_status", currThread.getStatus().replaceAll("\n", "<br>"));
-        }
-        prop.put("finished.jobs", finishedJobs.length);

-        prop.put("date", (new Date()).toString());
-        return prop;
-    }
-}
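The servlet above encodes each job's state into the runningStatus template switch, which the deleted HTML maps to the colors red/green/blue and the labels Finished/Running/Paused. A hypothetical restatement of that encoding, for clarity only (this enum is not part of the original code):

    /** Hypothetical enum mirroring the servlet's runningStatus encoding. */
    enum RunningStatus {
        FINISHED(0), // template renders red, "Finished"
        RUNNING(1),  // template renders green, "Running"
        PAUSED(2);   // template renders blue, "Paused"

        final int templateValue;

        RunningStatus(final int value) { this.templateValue = value; }

        /** Mirrors: currThread.isPaused() ? "2" : currThread.isStopped() ? "0" : "1" */
        static RunningStatus of(final boolean stopped, final boolean paused) {
            if (paused) return PAUSED;
            return stopped ? FINISHED : RUNNING;
        }
    }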

@@ -0,0 +1,29 @@
+<div class="SubMenu">
+  <h3>Web Crawler</h3>
+</div>
+<div class="SubMenu">
+  <div class="SubMenugroup">
+    <h3>Processing Monitor</h3>
+    <ul class="SubMenu">
+      <li><a href="/WatchCrawler_p.html" class="MenuItemLink lock">Crawler Queues</a></li>
+      <li><a href="/IndexCreateLoaderQueue_p.html" class="MenuItemLink lock">Loader</a></li>
+      <li><a href="/IndexCreateParserErrors_p.html" class="MenuItemLink lock">Parser Errors</a></li>
+    </ul>
+  </div>
+
+  <div class="SubMenugroup">
+    <h3>Queues</h3>
+    <ul class="SubMenu">
+      <li><a href="/IndexCreateWWWLocalQueue_p.html" class="MenuItemLink lock">Local</a></li>
+      <li><a href="/IndexCreateWWWGlobalQueue_p.html" class="MenuItemLink lock">Global</a></li>
+      <li><a href="/IndexCreateWWWRemoteQueue_p.html" class="MenuItemLink lock">Remote</a></li>
+    </ul>
+  </div>
+
+  <div class="SubMenugroup">
+    <h3>Crawler Steering</h3>
+    <ul class="SubMenu">
+      <li><a href="/CrawlProfileEditor_p.html" class="MenuItemLink lock">Crawl Profile Editor</a></li>
+    </ul>
+  </div>
+</div>

@@ -1,43 +1,9 @@
-<div class="SubMenu">
-  <h3>Web Crawler</h3>
-</div>
-<div class="SubMenu">
-
-  <div class="SubMenugroup">
-    <h3>Crawler Steering</h3>
-    <ul class="SubMenu">
-      <li><a href="/CrawlStart_p.html" class="MenuItemLink lock">Crawl Start</a></li>
-      <li><a href="/CrawlProfileEditor_p.html" class="MenuItemLink lock">Crawl Profile Editor</a></li>
-    </ul>
-  </div>
-
-  <div class="SubMenugroup">
-    <h3>Processing Monitor</h3>
-    <ul class="SubMenu">
-      <li><a href="/WatchCrawler_p.html" class="MenuItemLink lock">Crawler Queues</a></li>
-      <li><a href="/IndexCreateLoaderQueue_p.html" class="MenuItemLink lock">Loader</a></li>
-      <li><a href="/IndexCreateParserErrors_p.html" class="MenuItemLink lock">Parser Errors</a></li>
-    </ul>
-  </div>
-
-  <div class="SubMenugroup">
-    <h3>Queues</h3>
-    <ul class="SubMenu">
-      <li><a href="/IndexCreateWWWLocalQueue_p.html" class="MenuItemLink lock">Local</a></li>
-      <li><a href="/IndexCreateWWWGlobalQueue_p.html" class="MenuItemLink lock">Global</a></li>
-      <li><a href="/IndexCreateWWWRemoteQueue_p.html" class="MenuItemLink lock">Remote</a></li>
-      <!--<li><a href="/IndexCreateWWWOverhangQueue_p.html" class="MenuItemLink"><em class="lock">Overhang</em></a></li>-->
-    </ul>
-  </div>
-
-  <!---
-  <div class="SubMenugroup">
-    <h3>Media Crawl Queues</h3>
-    <ul class="SubMenu">
-      <li><a href="/IndexCreateImageQueue_p.html" class="MenuItemLink"><em class="lock">Images</em></a></li>
-      <li><a href="/IndexCreateMovieQueue_p.html" class="MenuItemLink"><em class="lock">Movies</em></a></li>
-      <li><a href="/IndexCreateMusicQueue_p.html" class="MenuItemLink"><em class="lock">Music</em></a></li>
-    </ul>
-  </div>
-  -->
-</div>
+<div class="SubMenu">
+  <h3>Index Creation</h3>
+  <ul class="SubMenu">
+    <li><a href="/ConfigWikiSearch.html" class="MenuItemLink">Indexing of Media Wikis</a></li>
+    <li><a href="/ConfigPHPBB3Search.html" class="MenuItemLink">Indexing of phpBB3 Forums</a></li>
+    <li><a href="/CrawlStart_p.html" class="MenuItemLink lock">Crawl Start (Advanced)</a></li>
+    <li><a href="/ProxyIndexingMonitor_p.html" class="MenuItemLink lock">Scraping Proxy Configuration</a></li>
+  </ul>
+</div>

@@ -1,228 +0,0 @@
-package de.anomic.crawler;
-
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.TreeSet;
-
-import net.yacy.kelondro.data.meta.URIMetadataRow;
-import net.yacy.kelondro.data.word.WordReference;
-import net.yacy.kelondro.logging.Log;
-import net.yacy.kelondro.rwi.Reference;
-import net.yacy.kelondro.rwi.ReferenceContainer;
-import net.yacy.kelondro.util.DateFormatter;
-
-import de.anomic.search.Segment;
-
-public class ExternalIndexImporter extends AbstractImporter implements Importer {
-
-    /**
-     * the source word index (the DB to import)
-     */
-    private final Segment importWordIndex;
-
-    /**
-     * the destination word index (the home DB)
-     */
-    protected Segment homeWordIndex;
-    private final int importStartSize;
-
-    private byte[] wordHash = "------------".getBytes();
-
-    long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = this.wordChunkStart;
-    byte[] wordChunkStartHash = "------------".getBytes(), wordChunkEndHash;
-    private long urlCounter = 0, wordCounter = 0, entryCounter = 0, notBoundEntryCounter = 0;
-
-    public ExternalIndexImporter(final Segment homeWI, final Segment importWI) {
-        super("PLASMADB");
-        this.homeWordIndex = homeWI;
-        this.importWordIndex = importWI;
-        this.importStartSize = this.importWordIndex.termIndex().sizesMax();
-    }
-
-    /**
-     * @see Importer#getJobName()
-     */
-    public String getJobName() {
-        return this.importWordIndex.getLocation().toString();
-    }
-
-    /**
-     * @see Importer#getStatus()
-     */
-    public String getStatus() {
-        final StringBuilder theStatus = new StringBuilder();
-
-        theStatus.append("Hash=").append(this.wordHash).append("\n");
-        theStatus.append("#URL=").append(this.urlCounter).append("\n");
-        theStatus.append("#Word Entity=").append(this.wordCounter).append("\n");
-        theStatus.append("#Word Entry={").append(this.entryCounter);
-        theStatus.append(" ,NotBound=").append(this.notBoundEntryCounter).append("}");
-
-        return theStatus.toString();
-    }
-
-    public void run() {
-        try {
-            importWordsDB();
-        } finally {
-            this.globalEnd = System.currentTimeMillis();
-            //this.sb.dbImportManager.finishedJobs.add(this);
-        }
-    }
-
-    /**
-     * @see Importer#getProcessingStatusPercent()
-     */
-    public int getProcessingStatusPercent() {
-        // this seems to be better:
-        // (this.importStartSize-this.importWordIndex.size())*100/((this.importStartSize==0)?1:this.importStartSize);
-        // but maxint (2,147,483,647) could be exceeded when WordIndexes reach 20M entries
-        //return (this.importStartSize-this.importWordIndex.size())/((this.importStartSize<100)?1:(this.importStartSize)/100);
-        return (int)(this.wordCounter)/((this.importStartSize<100)?1:(this.importStartSize)/100);
-    }
-
-    /**
-     * @see Importer#getElapsedTime()
-     */
-    public long getEstimatedTime() {
-        return (this.wordCounter==0)?0:((this.importStartSize*getElapsedTime())/this.wordCounter)-getElapsedTime();
-    }
-
-    public void importWordsDB() {
-        this.log.logInfo("STARTING DB-IMPORT");
-
-        try {
-            this.log.logInfo("Importing DB from '" + this.importWordIndex.getLocation().getAbsolutePath() + "'");
-            this.log.logInfo("Home word index contains " + homeWordIndex.termIndex().sizesMax() + " words and " + homeWordIndex.urlMetadata().size() + " URLs.");
-            this.log.logInfo("Import word index contains " + this.importWordIndex.termIndex().sizesMax() + " words and " + this.importWordIndex.urlMetadata().size() + " URLs.");
-
-            final HashSet<String> unknownUrlBuffer = new HashSet<String>();
-            final HashSet<String> importedUrlBuffer = new HashSet<String>();
-
-            // iterate over all words from import db
-            //Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, CrawlSwitchboard.RL_WORDFILES, false);
-            Iterator<ReferenceContainer<WordReference>> indexContainerIterator = this.importWordIndex.termIndex().references(this.wordChunkStartHash, false, 100, false).iterator();
-            while (!isAborted() && indexContainerIterator.hasNext()) {
-
-                final TreeSet<String> entityUrls = new TreeSet<String>();
-                ReferenceContainer<WordReference> newContainer = null;
-                try {
-                    this.wordCounter++;
-                    newContainer = indexContainerIterator.next();
-                    this.wordHash = newContainer.getTermHash();
-
-                    // loop through the entities of the container and get the
-                    // urlhash
-                    final Iterator<WordReference> importWordIdxEntries = newContainer.entries();
-                    Reference importWordIdxEntry;
-                    while (importWordIdxEntries.hasNext()) {
-                        // testing if import process was aborted
-                        if (isAborted()) break;
-
-                        // getting next word index entry
-                        importWordIdxEntry = importWordIdxEntries.next();
-                        final String urlHash = importWordIdxEntry.metadataHash();
-                        entityUrls.add(urlHash);
-                    }
-
-                    final Iterator<String> urlIter = entityUrls.iterator();
-                    while (urlIter.hasNext()) {
-                        if (isAborted()) break;
-                        final String urlHash = urlIter.next();
-
-                        if (!importedUrlBuffer.contains(urlHash)) {
-                            if (unknownUrlBuffer.contains(urlHash)) {
-                                // url known as unknown
-                                unknownUrlBuffer.add(urlHash);
-                                notBoundEntryCounter++;
-                                newContainer.remove(urlHash);
-                                continue;
-                            }
-                            // we need to import the url
-
-                            // getting the url entry
-                            final URIMetadataRow urlEntry = this.importWordIndex.urlMetadata().load(urlHash, null, 0);
-                            if (urlEntry != null) {
-
-                                /* write it into the home url db */
-                                homeWordIndex.urlMetadata().store(urlEntry);
-                                importedUrlBuffer.add(urlHash);
-                                this.urlCounter++;
-
-                                if (this.urlCounter % 500 == 0) {
-                                    this.log.logFine(this.urlCounter + " URLs processed so far.");
-                                }
-
-                            } else {
-                                unknownUrlBuffer.add(urlHash);
-                                notBoundEntryCounter++;
-                                newContainer.remove(urlHash);
-                                continue;
-                            }
-                        //} else {
-                            // already known url
-                        }
-                        this.entryCounter++;
-                    }
-
-                    // testing if import process was aborted
-                    if (isAborted()) break;
-
-                    // importing entity container to home db
-                    if (!newContainer.isEmpty()) { homeWordIndex.termIndex().add(newContainer); }
-
-                    // delete complete index entity file
-                    this.importWordIndex.termIndex().delete(this.wordHash);
-
-                    // print out some statistical information
-                    if (this.entryCounter % 500 == 0) {
-                        this.log.logFine(this.entryCounter + " word entries and " + this.wordCounter + " word entities processed so far.");
-                    }
-
-                    if (this.wordCounter % 500 == 0) {
-                        this.wordChunkEndHash = this.wordHash;
-                        this.wordChunkEnd = System.currentTimeMillis();
-                        final long duration = this.wordChunkEnd - this.wordChunkStart;
-                        this.log.logInfo(this.wordCounter + " word entities imported " +
-                                "[" + this.wordChunkStartHash + " .. " + this.wordChunkEndHash + "] " +
-                                this.getProcessingStatusPercent() + "%\n" +
-                                "Speed: " + 500*1000/duration + " word entities/s" +
-                                " | Elapsed time: " + DateFormatter.formatInterval(getElapsedTime()) +
-                                " | Estimated time: " + DateFormatter.formatInterval(getEstimatedTime()) + "\n" +
-                                "Home Words = " + homeWordIndex.termIndex().sizesMax() +
-                                " | Import Words = " + this.importWordIndex.termIndex().sizesMax());
-                        this.wordChunkStart = this.wordChunkEnd;
-                        this.wordChunkStartHash = this.wordChunkEndHash;
-                    }
-
-                } catch (final Exception e) {
-                    this.log.logSevere("Import of word entity '" + this.wordHash + "' failed.", e);
-                } finally {
-                    if (newContainer != null) newContainer.clear();
-                }
-
-                if (!indexContainerIterator.hasNext()) {
-                    // We may not be finished yet, try to get the next chunk of wordHashes
-                    final TreeSet<ReferenceContainer<WordReference>> containers = this.importWordIndex.termIndex().references(this.wordHash, false, 100, false);
-                    indexContainerIterator = containers.iterator();
-                    // Make sure we don't get the same wordhash twice, but don't skip a word
-                    if ((indexContainerIterator.hasNext()) && (!this.wordHash.equals((indexContainerIterator.next()).getTermHash()))) {
-                        indexContainerIterator = containers.iterator();
-                    }
-                }
-            }
-
-            this.log.logInfo("Home word index contains " + homeWordIndex.termIndex().sizesMax() + " words and " + homeWordIndex.urlMetadata().size() + " URLs.");
-            this.log.logInfo("Import word index contains " + this.importWordIndex.termIndex().sizesMax() + " words and " + this.importWordIndex.urlMetadata().size() + " URLs.");
-        } catch (final Exception e) {
-            this.log.logSevere("Database import failed.", e);
-            Log.logException(e);
-            this.error = e.toString();
-        } finally {
-            this.log.logInfo("Import process finished.");
-            if (this.importWordIndex != null) try { this.importWordIndex.close(); } catch (final Exception e) {}
-        }
-    }
-
-}
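ExternalIndexImporter is the class removed by this commit: per the commit message, joining two indexes no longer needs an import job, because moving the index files from one peer into the other peer's data directory is enough; the receiving peer merges them automatically. A rough sketch of that file move, with hypothetical paths (the real YaCy data layout may differ):

    import java.io.File;

    public final class IndexJoinSketch {

        public static void main(final String[] args) {
            // Hypothetical locations; substitute the real DATA directories of both peers.
            final File source = new File("/peers/peerA/DATA/INDEX");
            final File target = new File("/peers/peerB/DATA/INDEX");

            final File[] indexFiles = source.listFiles();
            if (indexFiles == null) {
                System.err.println(source + " is not a readable directory");
                return;
            }
            for (final File f : indexFiles) {
                // Move each index file over; the target peer picks it up and merges it.
                if (!f.renameTo(new File(target, f.getName()))) {
                    System.err.println("could not move " + f.getName());
                }
            }
        }
    }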

@@ -1,228 +0,0 @@
-package de.anomic.crawler;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-
-import net.yacy.kelondro.logging.Log;
-import net.yacy.kelondro.util.FileUtils;
-
-import de.anomic.crawler.CrawlSwitchboard;
-import de.anomic.crawler.retrieval.Request;
-
-public class NoticeURLImporter extends AbstractImporter implements Importer {
-
-    private File plasmaPath = null;
-    private final HashSet<String> importProfileHandleCache = new HashSet<String>();
-    private CrawlProfile importProfileDB;
-    private final NoticedURL importNurlDB;
-    private final int importStartSize;
-    private int urlCount = 0;
-    private int profileCount = 0;
-    private final CrawlQueues crawlQueues;
-    private final CrawlProfile activeCrawls;
-    private final ImporterManager dbImportManager;
-
-    public NoticeURLImporter(final File crawlerPath, final CrawlQueues crawlQueues, final CrawlProfile activeCrawls, final ImporterManager dbImportManager) {
-        super("NURL");
-        this.crawlQueues = crawlQueues;
-        this.activeCrawls = activeCrawls;
-        this.dbImportManager = dbImportManager;
-
-        // TODO: we need more error handling here
-        this.plasmaPath = crawlerPath;
-        final File noticeUrlDbFile = new File(plasmaPath, "urlNotice1.db");
-        final File profileDbFile = new File(plasmaPath, CrawlSwitchboard.DBFILE_ACTIVE_CRAWL_PROFILES);
-
-        String errorMsg = null;
-        if (!plasmaPath.exists())
-            errorMsg = "The import path '" + plasmaPath + "' does not exist.";
-        else if (!plasmaPath.isDirectory())
-            errorMsg = "The import path '" + plasmaPath + "' is not a directory.";
-        else if (!plasmaPath.canRead())
-            errorMsg = "The import path '" + plasmaPath + "' is not readable.";
-        else if (!plasmaPath.canWrite())
-            errorMsg = "The import path '" + plasmaPath + "' is not writeable.";
-
-        else if (!noticeUrlDbFile.exists())
-            errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' does not exist.";
-        else if (noticeUrlDbFile.isDirectory())
-            errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' is not a file.";
-        else if (!noticeUrlDbFile.canRead())
-            errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' is not readable.";
-        else if (!noticeUrlDbFile.canWrite())
-            errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' is not writeable.";
-
-        else if (!profileDbFile.exists())
-            errorMsg = "The profileDB file '" + profileDbFile + "' does not exist.";
-        else if (profileDbFile.isDirectory())
-            errorMsg = "The profileDB file '" + profileDbFile + "' is not a file.";
-        else if (!profileDbFile.canRead())
-            errorMsg = "The profileDB file '" + profileDbFile + "' is not readable.";
-//        else if (!profileDbFile.canWrite())
-//            errorMsg = "The profileDB file '" + profileDbFile + "' is not writeable.";
-
-        if (errorMsg != null) {
-            this.log.logSevere(errorMsg);
-            throw new IllegalArgumentException(errorMsg);
-        }
-
-        // init noticeUrlDB
-        this.log.logInfo("Initializing the source noticeUrlDB");
-        this.importNurlDB = new NoticedURL(plasmaPath, false, false);
-        this.importStartSize = this.importNurlDB.size();
-        //int stackSize = this.importNurlDB.stackSize();
-
-        // init profile DB
-        this.log.logInfo("Initializing the source profileDB");
-        try {
-            this.importProfileDB = new CrawlProfile(profileDbFile);
-        } catch (IOException e) {
-            FileUtils.deletedelete(profileDbFile);
-            try {
-                this.importProfileDB = new CrawlProfile(profileDbFile);
-            } catch (IOException e1) {
-                Log.logException(e1);
-                this.importProfileDB = null;
-            }
-        }
-    }
-
-    public long getEstimatedTime() {
-        return (this.urlCount==0)?0:((this.importStartSize*getElapsedTime())/(this.urlCount))-getElapsedTime();
-    }
-
-    public String getJobName() {
-        return this.plasmaPath.toString();
-    }
-
-    public int getProcessingStatusPercent() {
-        return (this.urlCount)/((this.importStartSize<100)?1:(this.importStartSize)/100);
-    }
-
-    public String getStatus() {
-        final StringBuilder theStatus = new StringBuilder();
-
-        theStatus.append("#URLs=").append(this.urlCount).append("\n");
-        theStatus.append("#Profiles=").append(this.profileCount);
-
-        return theStatus.toString();
-    }
-
-    public void run() {
-        try {
-            // waiting on init thread to finish
-            //this.importNurlDB.waitOnInitThread();
-
-            // the stack types we want to import
-            final int[] stackTypes = new int[] {
-                    NoticedURL.STACK_TYPE_CORE,
-                    NoticedURL.STACK_TYPE_LIMIT,
-                    NoticedURL.STACK_TYPE_REMOTE,
-                    -1};
-
-            // looping through the various stacks
-            for (int stackType = 0; stackType < stackTypes.length; stackType++) {
-                if (stackTypes[stackType] != -1) {
-                    this.log.logInfo("Starting to import stacktype '" + stackTypes[stackType] + "' containing '" + this.importNurlDB.stackSize(stackTypes[stackType]) + "' entries.");
-                } else {
-                    this.log.logInfo("Starting to import '" + this.importNurlDB.size() + "' entries not available in any stack.");
-                }
-
-                // getting an iterator and loop through the URL entries
-                final Iterator<Request> entryIter = (stackTypes[stackType] == -1) ? this.importNurlDB.iterator(stackType) : null;
-                while (true) {
-
-                    String nextHash = null;
-                    Request nextEntry = null;
-
-                    try {
-                        if (stackTypes[stackType] != -1) {
-                            if (this.importNurlDB.stackSize(stackTypes[stackType]) == 0) break;
-
-                            this.urlCount++;
-                            nextEntry = this.importNurlDB.pop(stackTypes[stackType], false, null);
-                            nextHash = nextEntry.url().hash();
-                        } else {
-                            if (!entryIter.hasNext()) break;
-
-                            this.urlCount++;
-                            nextEntry = entryIter.next();
-                            nextHash = nextEntry.url().hash();
-                        }
-                    } catch (final IOException e) {
-                        this.log.logWarning("Unable to import entry: " + e.toString());
-
-                        if ((stackTypes[stackType] != -1) && (this.importNurlDB.stackSize(stackTypes[stackType]) == 0)) break;
-                        continue;
-                    }
-
-                    // getting a handle to the crawling profile the url belongs to
-                    try {
-                        final String profileHandle = nextEntry.profileHandle();
-                        if (profileHandle == null) {
-                            this.log.logWarning("Profile handle of url entry '" + nextHash + "' unknown.");
-                            continue;
-                        }
-
-                        // if we haven't imported the profile yet we need to do it now
-                        if (!this.importProfileHandleCache.contains(profileHandle)) {
-
-                            // testing if the profile is already known
-                            final CrawlProfile.entry profileEntry = this.activeCrawls.getEntry(profileHandle);
-
-                            // if not we need to import it
-                            if (profileEntry == null) {
-                                // copy and store the source profile entry into the destination db
-                                final CrawlProfile.entry sourceEntry = this.importProfileDB.getEntry(profileHandle);
-                                if (sourceEntry != null) {
-                                    this.profileCount++;
-                                    this.importProfileHandleCache.add(profileHandle);
-                                    HashMap<String, String> mapclone = new HashMap<String, String>();
-                                    mapclone.putAll(sourceEntry.map());
-                                    this.activeCrawls.newEntry(mapclone);
-                                } else {
-                                    this.log.logWarning("Profile '" + profileHandle + "' of url entry '" + nextHash + "' unknown.");
-                                    continue;
-                                }
-                            }
-                        }
-
-                        // if the url does not already exist in the destination stack we insert it now
-                        if (!this.crawlQueues.noticeURL.existsInStack(nextHash)) {
-                            this.crawlQueues.noticeURL.push((stackTypes[stackType] != -1) ? stackTypes[stackType] : NoticedURL.STACK_TYPE_CORE, nextEntry);
-                        }
-
-                        // removing hash from the import db
-                    } finally {
-                        this.importNurlDB.removeByURLHash(nextHash);
-                    }
-
-                    if (this.urlCount % 100 == 0) {
-                        if (this.log.isFine()) this.log.logFine(this.urlCount + " URLs and '" + this.profileCount + "' profile entries processed so far.");
-                    }
-                    if (this.isAborted()) break;
-                }
-                this.log.logInfo("Finished to import stacktype '" + stackTypes[stackType] + "'");
-            }
-
-            //int size = this.importNurlDB.size();
-            //int stackSize = this.importNurlDB.stackSize();
-
-            // TODO: what to do with nurlDB entries that do not exist in any stack?
-
-        } catch (final Exception e) {
-            this.error = e.toString();
-            this.log.logSevere("Import process had detected an error", e);
-        } finally {
-            this.log.logInfo("Import process finished.");
-            this.globalEnd = System.currentTimeMillis();
-            this.dbImportManager.finishedJobs.add(this);
-            this.importNurlDB.close();
-            this.importProfileDB.close();
-        }
-    }
-
-}
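Both deleted importers estimate the remaining time the same way, by linear extrapolation: (importStartSize * elapsed / processed) - elapsed. A worked example of that formula, as a standalone sketch:

    public final class EstimatedTimeExample {

        /** Mirrors getEstimatedTime() in the importers above. */
        static long estimatedRemainingMs(final long startSize, final long processed, final long elapsedMs) {
            return (processed == 0) ? 0 : (startSize * elapsedMs / processed) - elapsedMs;
        }

        public static void main(final String[] args) {
            // 10,000 queued entries, 2,500 done after 60 s:
            // projected total = 240 s, so about 180 s remain.
            System.out.println(estimatedRemainingMs(10000, 2500, 60000)); // 180000
        }
    }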