some redesign of the main menu structure:

- moved all index generation servlets to their own main menu item, including proxy indexing
- removed the external index import because this operation is no longer recommended; an index can be joined simply by moving the index files from one peer to the other, where they are merged automatically
- fix to prevent endless loops when disconnecting HTTP sessions
- fix to prevent the application of bad blacklist entries that can cause a 'Dangling meta character' exception

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6558 6c8d7289-2bf4-0310-a012-ef5d649a1542

@@ -6,7 +6,7 @@
</head>
<body id="IndexCreate">
#%env/templates/header.template%#
#%env/templates/submenuPortalIntegration.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>Integration in phpBB3</h2>
<p>
It is possible to insert forum pages into the YaCy index using a database import of forum postings.

@@ -6,7 +6,7 @@
</head>
<body id="IndexCreate">
#%env/templates/header.template%#
#%env/templates/submenuPortalIntegration.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>Integration in MediaWiki</h2>
<p>
It is possible to insert wiki pages into the YaCy index using a web crawl of those pages.

@@ -6,7 +6,7 @@
</head>
<body id="IndexCreateWWWGlobalQueue">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
#%env/templates/submenuCrawlMonitor.template%#
<h2>Crawl Profile Editor</h2>
<p>
Crawl profiles hold information about a specific URL; this information is used internally to perform the crawl the URL belongs to.

@@ -6,7 +6,7 @@
</head>
<body id="IndexCreateLoaderQueue">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
#%env/templates/submenuCrawlMonitor.template%#
<h2>Loader Queue</h2>
<p>

@@ -6,7 +6,7 @@
</head>
<body id="IndexCreateIndexingQueue">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
#%env/templates/submenuCrawlMonitor.template%#
<h2>Parser Errors</h2>
#(rejected)#

@@ -6,7 +6,7 @@
</head>
<body id="IndexCreateWWWGlobalQueue">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
#%env/templates/submenuCrawlMonitor.template%#
<h2>Global Crawl Queue</h2>
<p>
This queue stores the URLs that shall be sent to other peers to perform a remote crawl.

@@ -6,7 +6,7 @@
</head>
<body id="IndexCreateWWWLocalQueue">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
#%env/templates/submenuCrawlMonitor.template%#
<h2>Local Crawl Queue</h2>
<p>
This queue stores the URLs that shall be crawled locally by this peer.

@@ -6,7 +6,7 @@
</head>
<body id="IndexCreateWWWGlobalQueue">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
#%env/templates/submenuCrawlMonitor.template%#
<h2>Remote Crawl Queue</h2>
<p>
This queue stores the URLs that other peers sent to you in order to perform a remote crawl for them.

@@ -1,270 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<!--
<title>YaCy '#[clientname]#': Index Import</title>
//-->
<title>YaCy '#[clientname]#': Crawling Queue Import</title>
#%env/templates/metas.template%#
<meta http-equiv="REFRESH" content="30" />
</head>
<body id="IndexImport">
#%env/templates/header.template%#
#%env/templates/submenuIndexControl.template%#
<!--
<h2>Index DB Import</h2>
<p>The local index currently consists of (at least) #[wcount]# reverse word indexes and #[ucount]# URL references.</p>
//-->
<h2>Crawling Queue Import</h2>
#(error)#<!-- 0 -->
::<!-- 1 -->
<p class="error">#[error_msg]#</p>
::<!-- 2 -->
<p class="error">Import Job with the same path already started.</p>
::<!-- 3 -->
<p class="error">#[error_msg]#</p>
<p class="error"><code>#[error_stackTrace]#</code></p>
#(/error)#
<form action="IndexImport_p.html" method="post" enctype="multipart/form-data">
<h3>Starting new Job</h3>
<table border="0" cellpadding="2" cellspacing="1">
<tr class="TableCellLight">
<td>Import&nbsp;Type:</td>
<td title="the path to the database that should be imported"></td>
<td title="the cache size that should be used for the import db">Cache Size</td>
<td>
<select name="cacheSize" size="1">
<option value="2097152">2 MB</option>
<option value="4194304">4 MB</option>
<option value="8388608" selected="selected">8 MB</option>
<option value="16777216">16 MB</option>
<option value="33554432">32 MB</option>
<option value="67108864">64 MB</option>
<option value="134217728">128 MB</option>
</select>
</td>
<td><a href="#usage">Usage Examples</a></td>
</tr>
<tr class="TableCellLight">
<td title="Path to the PLASMADB directory of the foreign peer">Import&nbsp;Path:</td>
<td colspan="3"><input name="importPlasmaPath" type="text" size="50" value="" /></td>
<td></td>
</tr>
<tr class="TableCellLight">
<td title="Path to the primary Index directory of the foreign peer">Import&nbsp;Path:</td>
<td colspan="3"><input name="importIndexPrimaryPath" type="text" size="50" value="" /></td>
<td></td>
</tr>
<tr class="TableCellLight">
<td title="Path to the secondary Index directory of the foreign peer">Import&nbsp;Path:</td>
<td colspan="3"><input name="importIndexSecondaryPath" type="text" size="50" value="" /></td>
<td><input type="submit" name="startIndexDbImport" value="Start Import" /></td>
</tr>
</table>
<p class="warning"><strong>Attention:</strong><br />Always do a backup of your source and destination database before starting to use this import function.</p>
</form>
<hr />
<h3>Currently running jobs</h3>
<table border="0" cellpadding="2" cellspacing="1">
<colgroup>
<col />
<col width="150" />
<col span="7" />
</colgroup>
<tr class="TableHeader" valign="bottom">
<td>Job Type</td>
<td>Job Name</td>
<td>Status</td>
<td>%</td>
<td>Elapsed<br />Time</td>
<td>Time<br />Left</td>
<td>Import Status</td>
<td>Abort Import</td>
<td>Pause Import</td>
</tr>
#{running.jobs}#
<form action="IndexImport_p.html" method="post" enctype="multipart/form-data">
<tr class="TableCellLight">
<td>#[type]#</td>
<td title="#[fullName]#">#[shortName]#</td>
<td style="color:#(runningStatus)#red::green::blue#(/runningStatus)#;">#(runningStatus)#Finished::Running::Paused#(/runningStatus)#</td>
<td align="right">#[percent]#</td>
<td align="right">#[elapsed]#</td>
<td align="right">#[estimated]#</td>
<td align="left"><tt>#[status]#</tt></td>
<td>
<input type="hidden" name="jobNr" value="#[job_nr]#" />
#(stopped)#::
<input type="submit" name="stopIndexDbImport" value="Abort" />
#(/stopped)#
</td>
<td>
#(paused)#
<input type="submit" name="pauseIndexDbImport" value="Pause" />
::
<input type="submit" name="continueIndexDbImport" value="Continue" />
#(/paused)#
</td>
</tr>
</form>
#{/running.jobs}#
</table>
<hr />
<form action="IndexImport_p.html" method="post" enctype="multipart/form-data">
<h3>Finished jobs</h3>
<table border="0" cellpadding="2" cellspacing="1">
<colgroup>
<col />
<col width="150" />
<col span="4" />
</colgroup>
<tr class="TableHeader" valign="bottom">
<td>Job Type</td>
<td>Path</td>
<td>Status</td>
<td>%</td>
<td>Elapsed<br />Time</td>
<td>Import Status</td>
</tr>
#{finished.jobs}#
<tr class="TableCellLight">
<td>#[type]#</td>
<td title="#[fullName]#">#[shortName]#</td>
<td><font color="#(runningStatus)#green::red::blue#(/runningStatus)#">#(runningStatus)#Finished::<b>Error:</b> #[errorMsg]#::Paused#(/runningStatus)#</font></td>
<td align="right">#[percent]#</td>
<td align="right">#[elapsed]#</td>
<td align="left"><tt>#[status]#</tt></td>
</tr>
#{/finished.jobs}#
</table>
<fieldset>
<input type="submit" name="clearFinishedJobList" value="Clear List" />
</fieldset>
</form>
<p><em>Last Refresh:</em> #[date]#</p>
<hr />
<h2 id="usage">Usage Examples:</h2>
<!--<h3>Plasma DB Import:</h3>
<p>
<strong>Example Path:</strong> <tt>E:\PLASMADB\</tt>
</p>
<p>
<strong>Requirements:</strong>
</p>
<p>
You need to have at least the following directories and files in this path:
</p>
<table border="1" cellpadding="2" cellspacing="1">
<tr class="example">
<td>Name</td>
<td>Type</td>
<td>Writeable</td>
<td>Description</td>
</tr>
<tr>
<td><tt>urlHash.db</tt></td>
<td>File</td>
<td>No</td>
<td>The LoadedURL Database containing all loaded and indexed URLs</td>
</tr>
<tr>
<td><tt>ACLUSTER</tt></td>
<td>Directory</td>
<td>Yes</td>
<td>The assortment directory containing parts of the word index.</td>
</tr>
<tr>
<td><tt>WORDS</tt></td>
<td>Directory</td>
<td>Yes</td>
<td>The words directory containing parts of the word index.</td>
</tr>
</table>
<h3>Assortment Import:</h3>
<p>
<strong>Example Path:</strong> <tt>E:\PLASMADB\ACLUSTER\indexAssortment001.db</tt>
</p>
<p>
<strong>Requirements:</strong>
</p>
<p>
You need to have at least the following directories and files in this path:
</p>
<table border="1" cellpadding="2" cellspacing="1">
<tr class="example">
<td>Name</td>
<td>Type</td>
<td>Writeable</td>
<td>Description</td>
</tr>
<tr>
<td><tt>indexAssortment001.db</tt></td>
<td>File</td>
<td>No</td>
<td>The assortment file that should be imported.<br />
<strong>Attention:</strong> The assortment file must have the postfix "[0-9]{3}\.db".
If you would like to import an assortment file from the <tt>PLASMADB\ACLUSTER\ABKP</tt>
you have to rename it first.
</td>
</tr>
</table>
<p>
<strong>Notes:</strong>
</p>
<p>
Please note that the imported words are useless if the destination peer doesn't know
the URLs the imported words belong to.
</p>-->
<h3>Crawling Queue Import:</h3>
<p>
<strong>Example Path:</strong> <tt>E:\PLASMADB\</tt>
</p>
<p>
<strong>Requirements:</strong>
</p>
<p>
You need to have at least the following directories and files in this path:
</p>
<table border="1" cellpadding="2" cellspacing="1">
<tr class="example">
<td>Name</td>
<td>Type</td>
<td>Writeable</td>
<td>Description</td>
</tr>
<tr>
<td><tt>crawlProfiles0.db</tt></td>
<td>File</td>
<td>No</td>
<td>Contains data about the crawl job a URL belongs to</td>
</tr>
<tr>
<td><tt>urlNotice1.db</tt></td>
<td>File</td>
<td>Yes</td>
<td>The crawling queue</td>
</tr>
<tr>
<td><tt>urlNoticeImage0.stack</tt></td>
<td rowspan="8">File</td>
<td rowspan="8">Yes</td>
<td rowspan="8">Various stack files that belong to the crawling queue</td>
</tr>
<tr><td><tt>urlNoticeImage0.stack</tt></td></tr>
<tr><td><tt>urlNoticeLimit0.stack</tt></td></tr>
<tr><td><tt>urlNoticeLocal0.stack</tt></td></tr>
<tr><td><tt>urlNoticeMovie0.stack</tt></td></tr>
<tr><td><tt>urlNoticeMusic0.stack</tt></td></tr>
<tr><td><tt>urlNoticeOverhang0.stack</tt></td></tr>
<tr><td><tt>urlNoticeRemote0.stack</tt></td></tr>
</table>
#%env/templates/footer.template%#
</body>
</html>
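
The #[...]#, #(...)#...::...#(/...)# and #{...}#...#{/...}# markers in the template above are YaCy servlet template placeholders: the servlet fills a serverObjects map (see the prop.put("error", "3") and prop.put("running.jobs_" + i + "_type", ...) calls in the IndexImport_p.java code below) and the HTTP daemon substitutes those values into the HTML before delivery. The following stand-alone sketch only illustrates the simplest case, the #[key]# value substitution, using a plain HashMap in place of serverObjects; it is a simplified illustration of the convention, not YaCy's actual template engine.

import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class TemplateSketch {
    public static void main(String[] args) {
        // stand-in for the serverObjects instance that the servlet fills
        Map<String, String> prop = new HashMap<String, String>();
        prop.put("date", new java.util.Date().toString()); // cf. prop.put("date", ...) in IndexImport_p.java
        // a line taken from the template above
        String template = "<p><em>Last Refresh:</em> #[date]#</p>";
        // replace every #[key]# marker with the corresponding value from prop
        Matcher m = Pattern.compile("#\\[([^\\]]+)\\]#").matcher(template);
        StringBuffer out = new StringBuffer();
        while (m.find()) {
            String value = prop.containsKey(m.group(1)) ? prop.get(m.group(1)) : "";
            m.appendReplacement(out, Matcher.quoteReplacement(value));
        }
        m.appendTail(out);
        System.out.println(out.toString());
    }
}

In the real templates, #(key)# selects one of the ::-separated alternatives by the numeric value stored under key (prop.put("error", "3") picks the branch labelled <!-- 3 -->), and #{key}# repeats its body as many times as the value of key, reading sub-keys such as running.jobs_0_type; both pairings can be traced in the servlet code below.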

@@ -1,197 +0,0 @@
//IndexTransfer_p.java
//-----------------------
//part of the AnomicHTTPD caching proxy
//(C) by Michael Peter Christen; mc@yacy.net
//first published on http://www.anomic.de
//Frankfurt, Germany, 2005
//
//This file is contributed by Martin Thelian
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//You must compile this file with
//javac -classpath .:../Classes IndexControl_p.java
//if the shell's current path is HTROOT
import java.io.PrintStream;
import java.util.Date;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.kelondro.util.DateFormatter;
import de.anomic.crawler.Importer;
import de.anomic.crawler.NoticeURLImporter;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Segment;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
public final class IndexImport_p {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
int activeCount = 0;
// get segment
Segment indexSegment = null;
if (post != null && post.containsKey("segment")) {
String segmentName = post.get("segment");
if (sb.indexSegments.segmentExist(segmentName)) {
indexSegment = sb.indexSegments.segment(segmentName);
}
} else {
// take default segment
indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC);
}
if (post != null) {
if (post.containsKey("startIndexDbImport")) {
try {
final boolean startImport = true;
if (startImport) {
final Importer importerThread = new NoticeURLImporter(
sb.queuesRoot,
sb.crawlQueues,
sb.crawler.profilesActiveCrawls,
sb.dbImportManager);
if (importerThread != null) {
importerThread.setJobID(sb.dbImportManager.generateUniqueJobID());
importerThread.startIt();
}
prop.put("LOCATION","");
return prop;
}
} catch (final Exception e) {
final ByteBuffer errorMsg = new ByteBuffer(100);
final PrintStream errorOut = new PrintStream(errorMsg);
Log.logException(e);
prop.put("error", "3");
prop.putHTML("error_error_msg",e.toString());
prop.putHTML("error_error_stackTrace",errorMsg.toString().replaceAll("\n","<br>"));
errorOut.close();
}
} else if (post.containsKey("clearFinishedJobList")) {
sb.dbImportManager.finishedJobs.clear();
prop.put("LOCATION", "");
return prop;
} else if (
(post.containsKey("stopIndexDbImport")) ||
(post.containsKey("pauseIndexDbImport")) ||
(post.containsKey("continueIndexDbImport"))
) {
// get the job nr of the thread
final String jobID = post.get("jobNr");
final Importer importer = sb.dbImportManager.getImporterByID(Integer.valueOf(jobID).intValue());
if (importer != null) {
if (post.containsKey("stopIndexDbImport")) {
try {
importer.stopIt();
} catch (final InterruptedException e) {
// TODO Auto-generated catch block
Log.logException(e);
}
} else if (post.containsKey("pauseIndexDbImport")) {
importer.pauseIt();
} else if (post.containsKey("continueIndexDbImport")) {
importer.continueIt();
}
}
prop.put("LOCATION","");
return prop;
}
}
prop.putNum("wcount", indexSegment.termIndex().sizesMax());
prop.putNum("ucount", indexSegment.urlMetadata().size());
/*
* Loop over all currently running jobs
*/
final Importer[] importThreads = sb.dbImportManager.getRunningImporter();
activeCount = importThreads.length;
for (int i=0; i < activeCount; i++) {
final Importer currThread = importThreads[i];
// get import type
prop.put("running.jobs_" + i + "_type", currThread.getJobType());
// root path of the source db
final String fullName = currThread.getJobName();
final String shortName = (fullName.length()>30)?fullName.substring(0,12) + "..." + fullName.substring(fullName.length()-22,fullName.length()):fullName;
prop.put("running.jobs_" + i + "_fullName",fullName);
prop.put("running.jobs_" + i + "_shortName",shortName);
// specifies if the importer is still running
prop.put("running.jobs_" + i + "_stopped", currThread.isStopped() ? "0" : "1");
// specifies if the importer was paused
prop.put("running.jobs_" + i + "_paused", currThread.isPaused() ? "1" : "0");
// setting the status
prop.put("running.jobs_" + i + "_runningStatus", currThread.isPaused() ? "2" : currThread.isStopped() ? "0" : "1");
// other information
prop.putNum("running.jobs_" + i + "_percent", currThread.getProcessingStatusPercent());
prop.put("running.jobs_" + i + "_elapsed", DateFormatter.formatInterval(currThread.getElapsedTime()));
prop.put("running.jobs_" + i + "_estimated", DateFormatter.formatInterval(currThread.getEstimatedTime()));
prop.putHTML("running.jobs_" + i + "_status", currThread.getStatus().replaceAll("\n", "<br>"));
// job number of the importer thread
prop.put("running.jobs_" + i + "_job_nr", currThread.getJobID());
}
prop.put("running.jobs", activeCount);
/*
* Loop over all finished jobs
*/
final Importer[] finishedJobs = sb.dbImportManager.getFinishedImporter();
for (int i=0; i<finishedJobs.length; i++) {
final Importer currThread = finishedJobs[i];
final String error = currThread.getError();
final String fullName = currThread.getJobName();
final String shortName = (fullName.length()>30)?fullName.substring(0,12) + "..." + fullName.substring(fullName.length()-22,fullName.length()):fullName;
prop.put("finished.jobs_" + i + "_type", currThread.getJobType());
prop.put("finished.jobs_" + i + "_fullName", fullName);
prop.put("finished.jobs_" + i + "_shortName", shortName);
if (error != null) {
prop.put("finished.jobs_" + i + "_runningStatus", "1");
prop.putHTML("finished.jobs_" + i + "_runningStatus_errorMsg", error.replaceAll("\n", "<br>"));
} else {
prop.put("finished.jobs_" + i + "_runningStatus", "0");
}
prop.putNum("finished.jobs_" + i + "_percent", currThread.getProcessingStatusPercent());
prop.put("finished.jobs_" + i + "_elapsed", DateFormatter.formatInterval(currThread.getElapsedTime()));
prop.putHTML("finished.jobs_" + i + "_status", currThread.getStatus().replaceAll("\n", "<br>"));
}
prop.put("finished.jobs",finishedJobs.length);
prop.put("date",(new Date()).toString());
return prop;
}
}

@@ -6,14 +6,13 @@
</head>
<body id="ProxyIndexingMonitor">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>Indexing with Proxy</h2>
<p>
This is the control page for web pages that your peer has indexed during the current application run-time
as a result of proxy fetch/prefetch.
<strong>No personal or protected page is indexed</strong>;
YaCy can be used to 'scrape' content from pages that pass the integrated caching HTTP proxy.
When scraping proxy pages, <strong>no personal or protected page is indexed</strong>;
those pages are detected by properties in the HTTP header (like Cookie-Use, or HTTP Authorization)
or by POST-Parameters (either in URL or as HTTP protocol)
and automatically excluded from indexing.
or by POST-Parameters (either in URL or as HTTP protocol) and automatically excluded from indexing.
</p>
<form action="ProxyIndexingMonitor_p.html" method="post" enctype="multipart/form-data">

@@ -1,6 +1,5 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
#(forwardToCrawlStart)#::<meta http-equiv="REFRESH" content="5; url=/CrawlStart_p.html">#(/forwardToCrawlStart)#
<head>
<title>YaCy '#[clientname]#': Crawler Queues</title>
#%env/templates/metas.template%#
@@ -10,7 +9,7 @@
<script type="text/javascript" src="/js/WatchCrawler.js"></script></head>
<body id="watchCrawler" onload="initWatchCrawler();">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
#%env/templates/submenuCrawlMonitor.template%#
<h2>Crawler Queues</h2>
<noscript><p>(Please enable JavaScript to automatically update this page!)</p></noscript>
<p> Next update in <input type="text" id="nextUpdate" onfocus="changeInterval()" onblur="newInterval()" size="2" /> seconds. <img src="/env/grafics/empty.gif" id="ajax" alt="empty"/>

@@ -105,13 +105,6 @@ public class WatchCrawler_p {
if (post != null) {
// a crawl start
if ((post.containsKey("autoforward")) &&
(sb.crawlQueues.coreCrawlJobSize() == 0) &&
(sb.crawlQueues.remoteTriggeredCrawlJobSize() == 0) &&
(sb.getIndexingProcessorsQueueSize() < 30)) {
prop.put("forwardToCrawlStart", "1");
}
if (post.containsKey("continue")) {
// continue queue
final String queue = post.get("continue", "");

@@ -60,12 +60,12 @@
<li class="menugroup" id="menugroupCrawlerControl">
<h3>Index&nbsp;Control</h3>
<ul class="menu">
<li><a href="/WatchCrawler_p.html?autoforward=" class="MenuItemLink lock">Web Crawler</a></li>
<li><a href="/CrawlStart_p.html" class="MenuItemLink lock">Index Creation</a></li>
<li><a href="/WatchCrawler_p.html" class="MenuItemLink lock">Crawler Monitor</a></li>
<li><a href="/CrawlResults.html?process=5&amp;autoforward=" class="MenuItemLink">Crawl Results</a></li>
<li><a href="/ContentIntegrationPHPBB3_p.html" class="MenuItemLink lock">External Content</a></li>
<li><a href="/ContentIntegrationPHPBB3_p.html" class="MenuItemLink lock">Content Import</a></li>
<li><a href="/IndexControlRWIs_p.html" class="MenuItemLink lock">Index Administration</a></li>
<li><a href="/Blacklist_p.html" class="MenuItemLink lock">Filter &amp; Blacklists</a></li>
<li><a href="/ProxyIndexingMonitor_p.html" class="MenuItemLink lock">Indexing with Proxy</a></li>
</ul>
</li>
<li class="menugroup" id="menugroupIntegration">

@@ -0,0 +1,29 @@
<div class="SubMenu">
<h3>Web Crawler</h3>
</div>
<div class="SubMenu">
<div class="SubMenugroup">
<h3>Processing Monitor</h3>
<ul class="SubMenu">
<li><a href="/WatchCrawler_p.html" class="MenuItemLink lock">Crawler Queues</a></li>
<li><a href="/IndexCreateLoaderQueue_p.html" class="MenuItemLink lock">Loader</a></li>
<li><a href="/IndexCreateParserErrors_p.html" class="MenuItemLink lock">Parser Errors</a></li>
</ul>
</div>
<div class="SubMenugroup">
<h3>Queues</h3>
<ul class="SubMenu">
<li><a href="/IndexCreateWWWLocalQueue_p.html" class="MenuItemLink lock">Local</a></li>
<li><a href="/IndexCreateWWWGlobalQueue_p.html" class="MenuItemLink lock">Global</a></li>
<li><a href="/IndexCreateWWWRemoteQueue_p.html" class="MenuItemLink lock">Remote</a></li>
</ul>
</div>
<div class="SubMenugroup">
<h3>Crawler Steering</h3>
<ul class="SubMenu">
<li><a href="/CrawlProfileEditor_p.html" class="MenuItemLink lock">Crawl Profile Editor</a></li>
</ul>
</div>
</div>

@@ -3,6 +3,5 @@
<ul class="SubMenu">
<li><a href="/IndexControlRWIs_p.html" class="MenuItemLink lock">RWI Admin</a></li>
<li><a href="/IndexControlURLs_p.html" class="MenuItemLink lock">URL Reference Admin</a></li>
<li><a href="/IndexImport_p.html" class="MenuItemLink lock">Queue Import</a></li>
</ul>
</div>

@@ -1,43 +1,9 @@
<div class="SubMenu">
<h3>Web Crawler</h3>
</div>
<div class="SubMenu">
<div class="SubMenugroup">
<h3>Crawler Steering</h3>
<ul class="SubMenu">
<li><a href="/CrawlStart_p.html" class="MenuItemLink lock">Crawl Start</a></li>
<li><a href="/CrawlProfileEditor_p.html" class="MenuItemLink lock">Crawl Profile Editor</a></li>
</ul>
</div>
<div class="SubMenugroup">
<h3>Processing Monitor</h3>
<ul class="SubMenu">
<li><a href="/WatchCrawler_p.html" class="MenuItemLink lock">Crawler Queues</a></li>
<li><a href="/IndexCreateLoaderQueue_p.html" class="MenuItemLink lock">Loader</a></li>
<li><a href="/IndexCreateParserErrors_p.html" class="MenuItemLink lock">Parser Errors</a></li>
</ul>
</div>
<div class="SubMenugroup">
<h3>Queues</h3>
<ul class="SubMenu">
<li><a href="/IndexCreateWWWLocalQueue_p.html" class="MenuItemLink lock">Local</a></li>
<li><a href="/IndexCreateWWWGlobalQueue_p.html" class="MenuItemLink lock">Global</a></li>
<li><a href="/IndexCreateWWWRemoteQueue_p.html" class="MenuItemLink lock">Remote</a></li>
<!--<li><a href="/IndexCreateWWWOverhangQueue_p.html" class="MenuItemLink"><em class="lock">Overhang</em></a></li>-->
</ul>
</div>
<!---
<div class="SubMenugroup">
<h3>Media Crawl Queues</h3>
<ul class="SubMenu">
<li><a href="/IndexCreateImageQueue_p.html" class="MenuItemLink"><em class="lock">Images</em></a></li>
<li><a href="/IndexCreateMovieQueue_p.html" class="MenuItemLink"><em class="lock">Movies</em></a></li>
<li><a href="/IndexCreateMusicQueue_p.html" class="MenuItemLink"><em class="lock">Music</em></a></li>
</ul>
</div>
-->
<h3>Index Creation</h3>
<ul class="SubMenu">
<li><a href="/ConfigWikiSearch.html" class="MenuItemLink">Indexing of Media Wikis</a></li>
<li><a href="/ConfigPHPBB3Search.html" class="MenuItemLink">Indexing of phpBB3 Forums</a></li>
<li><a href="/CrawlStart_p.html" class="MenuItemLink lock">Crawl Start (Advanced)</a></li>
<li><a href="/ProxyIndexingMonitor_p.html" class="MenuItemLink lock">Scraping Proxy Configuration</a></li>
</ul>
</div>

@@ -4,7 +4,5 @@
<li><a href="/ConfigLiveSearch.html" class="MenuItemLink">Live Search Anywhere</a></li>
<li><a href="/ConfigPortal.html" class="MenuItemLink">Generic Search Portal</a></li>
<li><a href="/ConfigSearchBox.html" class="MenuItemLink">Search Box Anywhere</a></li>
<li><a href="/ConfigWikiSearch.html" class="MenuItemLink">Search Integration for Wikis</a></li>
<li><a href="/ConfigPHPBB3Search.html" class="MenuItemLink">Search Integration for phpBB3</a></li>
</ul>
</div>

@@ -1,228 +0,0 @@
package de.anomic.crawler;
import java.util.HashSet;
import java.util.Iterator;
import java.util.TreeSet;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.Reference;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.DateFormatter;
import de.anomic.search.Segment;
public class ExternalIndexImporter extends AbstractImporter implements Importer {
/**
* the source word index (the DB to import)
*/
private final Segment importWordIndex;
/**
* the destination word index (the home DB)
*/
protected Segment homeWordIndex;
private final int importStartSize;
private byte[] wordHash = "------------".getBytes();
long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = this.wordChunkStart;
byte[] wordChunkStartHash = "------------".getBytes(), wordChunkEndHash;
private long urlCounter = 0, wordCounter = 0, entryCounter = 0, notBoundEntryCounter = 0;
public ExternalIndexImporter(final Segment homeWI, final Segment importWI) {
super("PLASMADB");
this.homeWordIndex = homeWI;
this.importWordIndex = importWI;
this.importStartSize = this.importWordIndex.termIndex().sizesMax();
}
/**
* @see Importer#getJobName()
*/
public String getJobName() {
return this.importWordIndex.getLocation().toString();
}
/**
* @see Importer#getStatus()
*/
public String getStatus() {
final StringBuilder theStatus = new StringBuilder();
theStatus.append("Hash=").append(this.wordHash).append("\n");
theStatus.append("#URL=").append(this.urlCounter).append("\n");
theStatus.append("#Word Entity=").append(this.wordCounter).append("\n");
theStatus.append("#Word Entry={").append(this.entryCounter);
theStatus.append(" ,NotBound=").append(this.notBoundEntryCounter).append("}");
return theStatus.toString();
}
public void run() {
try {
importWordsDB();
} finally {
this.globalEnd = System.currentTimeMillis();
//this.sb.dbImportManager.finishedJobs.add(this);
}
}
/**
* @see Importer#getProcessingStatusPercent()
*/
public int getProcessingStatusPercent() {
// this seems to be better:
// (this.importStartSize-this.importWordIndex.size())*100/((this.importStartSize==0)?1:this.importStartSize);
// but maxint (2,147,483,647) could be exceeded when WordIndexes reach 20M entries
//return (this.importStartSize-this.importWordIndex.size())/((this.importStartSize<100)?1:(this.importStartSize)/100);
return (int)(this.wordCounter)/((this.importStartSize<100)?1:(this.importStartSize)/100);
}
/**
* @see Importer#getEstimatedTime()
*/
public long getEstimatedTime() {
return (this.wordCounter==0)?0:((this.importStartSize*getElapsedTime())/this.wordCounter)-getElapsedTime();
}
public void importWordsDB() {
this.log.logInfo("STARTING DB-IMPORT");
try {
this.log.logInfo("Importing DB from '" + this.importWordIndex.getLocation().getAbsolutePath() + "'");
this.log.logInfo("Home word index contains " + homeWordIndex.termIndex().sizesMax() + " words and " + homeWordIndex.urlMetadata().size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.termIndex().sizesMax() + " words and " + this.importWordIndex.urlMetadata().size() + " URLs.");
final HashSet<String> unknownUrlBuffer = new HashSet<String>();
final HashSet<String> importedUrlBuffer = new HashSet<String>();
// iterate over all words from import db
//Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, CrawlSwitchboard.RL_WORDFILES, false);
Iterator<ReferenceContainer<WordReference>> indexContainerIterator = this.importWordIndex.termIndex().references(this.wordChunkStartHash, false, 100, false).iterator();
while (!isAborted() && indexContainerIterator.hasNext()) {
final TreeSet<String> entityUrls = new TreeSet<String>();
ReferenceContainer<WordReference> newContainer = null;
try {
this.wordCounter++;
newContainer = indexContainerIterator.next();
this.wordHash = newContainer.getTermHash();
// loop through the entities of the container and get the
// urlhash
final Iterator<WordReference> importWordIdxEntries = newContainer.entries();
Reference importWordIdxEntry;
while (importWordIdxEntries.hasNext()) {
// testing if import process was aborted
if (isAborted()) break;
// getting next word index entry
importWordIdxEntry = importWordIdxEntries.next();
final String urlHash = importWordIdxEntry.metadataHash();
entityUrls.add(urlHash);
}
final Iterator<String> urlIter = entityUrls.iterator();
while (urlIter.hasNext()) {
if (isAborted()) break;
final String urlHash = urlIter.next();
if (!importedUrlBuffer.contains(urlHash)) {
if (unknownUrlBuffer.contains(urlHash)) {
// url known as unknown
unknownUrlBuffer.add(urlHash);
notBoundEntryCounter++;
newContainer.remove(urlHash);
continue;
}
// we need to import the url
// getting the url entry
final URIMetadataRow urlEntry = this.importWordIndex.urlMetadata().load(urlHash, null, 0);
if (urlEntry != null) {
/* write it into the home url db */
homeWordIndex.urlMetadata().store(urlEntry);
importedUrlBuffer.add(urlHash);
this.urlCounter++;
if (this.urlCounter % 500 == 0) {
this.log.logFine(this.urlCounter + " URLs processed so far.");
}
} else {
unknownUrlBuffer.add(urlHash);
notBoundEntryCounter++;
newContainer.remove(urlHash);
continue;
}
//} else {
// already known url
}
this.entryCounter++;
}
// testing if import process was aborted
if (isAborted()) break;
// importing entity container to home db
if (!newContainer.isEmpty()) { homeWordIndex.termIndex().add(newContainer); }
// delete complete index entity file
this.importWordIndex.termIndex().delete(this.wordHash);
// print out some statistical information
if (this.entryCounter % 500 == 0) {
this.log.logFine(this.entryCounter + " word entries and " + this.wordCounter + " word entities processed so far.");
}
if (this.wordCounter%500 == 0) {
this.wordChunkEndHash = this.wordHash;
this.wordChunkEnd = System.currentTimeMillis();
final long duration = this.wordChunkEnd - this.wordChunkStart;
this.log.logInfo(this.wordCounter + " word entities imported " +
"[" + this.wordChunkStartHash + " .. " + this.wordChunkEndHash + "] " +
this.getProcessingStatusPercent() + "%\n" +
"Speed: "+ 500*1000/duration + " word entities/s" +
" | Elapsed time: " + DateFormatter.formatInterval(getElapsedTime()) +
" | Estimated time: " + DateFormatter.formatInterval(getEstimatedTime()) + "\n" +
"Home Words = " + homeWordIndex.termIndex().sizesMax() +
" | Import Words = " + this.importWordIndex.termIndex().sizesMax());
this.wordChunkStart = this.wordChunkEnd;
this.wordChunkStartHash = this.wordChunkEndHash;
}
} catch (final Exception e) {
this.log.logSevere("Import of word entity '" + this.wordHash + "' failed.",e);
} finally {
if (newContainer != null) newContainer.clear();
}
if (!indexContainerIterator.hasNext()) {
// We may not be finished yet, try to get the next chunk of wordHashes
final TreeSet<ReferenceContainer<WordReference>> containers = this.importWordIndex.termIndex().references(this.wordHash, false, 100, false);
indexContainerIterator = containers.iterator();
// Make sure we don't get the same wordhash twice, but don't skip a word
if ((indexContainerIterator.hasNext())&&(!this.wordHash.equals((indexContainerIterator.next()).getTermHash()))) {
indexContainerIterator = containers.iterator();
}
}
}
this.log.logInfo("Home word index contains " + homeWordIndex.termIndex().sizesMax() + " words and " + homeWordIndex.urlMetadata().size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.termIndex().sizesMax() + " words and " + this.importWordIndex.urlMetadata().size() + " URLs.");
} catch (final Exception e) {
this.log.logSevere("Database import failed.",e);
Log.logException(e);
this.error = e.toString();
} finally {
this.log.logInfo("Import process finished.");
if (this.importWordIndex != null) try { this.importWordIndex.close(); } catch (final Exception e){}
}
}
}

@@ -1,228 +0,0 @@
package de.anomic.crawler;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.crawler.retrieval.Request;
public class NoticeURLImporter extends AbstractImporter implements Importer {
private File plasmaPath = null;
private final HashSet<String> importProfileHandleCache = new HashSet<String>();
private CrawlProfile importProfileDB;
private final NoticedURL importNurlDB;
private final int importStartSize;
private int urlCount = 0;
private int profileCount = 0;
private final CrawlQueues crawlQueues;
private final CrawlProfile activeCrawls;
private final ImporterManager dbImportManager;
public NoticeURLImporter(final File crawlerPath, final CrawlQueues crawlQueues, final CrawlProfile activeCrawls, final ImporterManager dbImportManager) {
super("NURL");
this.crawlQueues = crawlQueues;
this.activeCrawls = activeCrawls;
this.dbImportManager = dbImportManager;
// TODO: we need more error handling here
this.plasmaPath = crawlerPath;
final File noticeUrlDbFile = new File(plasmaPath,"urlNotice1.db");
final File profileDbFile = new File(plasmaPath, CrawlSwitchboard.DBFILE_ACTIVE_CRAWL_PROFILES);
String errorMsg = null;
if (!plasmaPath.exists())
errorMsg = "The import path '" + plasmaPath + "' does not exist.";
else if (!plasmaPath.isDirectory())
errorMsg = "The import path '" + plasmaPath + "' is not a directory.";
else if (!plasmaPath.canRead())
errorMsg = "The import path '" + plasmaPath + "' is not readable.";
else if (!plasmaPath.canWrite())
errorMsg = "The import path '" + plasmaPath + "' is not writeable.";
else if (!noticeUrlDbFile.exists())
errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' does not exist.";
else if (noticeUrlDbFile.isDirectory())
errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' is not a file.";
else if (!noticeUrlDbFile.canRead())
errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' is not readable.";
else if (!noticeUrlDbFile.canWrite())
errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' is not writeable.";
else if (!profileDbFile.exists())
errorMsg = "The profileDB file '" + profileDbFile + "' does not exist.";
else if (profileDbFile.isDirectory())
errorMsg = "The profileDB file '" + profileDbFile + "' is not a file.";
else if (!profileDbFile.canRead())
errorMsg = "The profileDB file '" + profileDbFile + "' is not readable.";
// else if (!profileDbFile.canWrite())
// errorMsg = "The profileDB file '" + profileDbFile + "' is not writeable.";
if (errorMsg != null) {
this.log.logSevere(errorMsg);
throw new IllegalArgumentException(errorMsg);
}
// init noticeUrlDB
this.log.logInfo("Initializing the source noticeUrlDB");
this.importNurlDB = new NoticedURL(plasmaPath, false, false);
this.importStartSize = this.importNurlDB.size();
//int stackSize = this.importNurlDB.stackSize();
// init profile DB
this.log.logInfo("Initializing the source profileDB");
try {
this.importProfileDB = new CrawlProfile(profileDbFile);
} catch (IOException e) {
FileUtils.deletedelete(profileDbFile);
try {
this.importProfileDB = new CrawlProfile(profileDbFile);
} catch (IOException e1) {
Log.logException(e1);
this.importProfileDB = null;
}
}
}
public long getEstimatedTime() {
return (this.urlCount==0)?0:((this.importStartSize*getElapsedTime())/(this.urlCount))-getElapsedTime();
}
public String getJobName() {
return this.plasmaPath.toString();
}
public int getProcessingStatusPercent() {
return (this.urlCount)/((this.importStartSize<100)?1:(this.importStartSize)/100);
}
public String getStatus() {
final StringBuilder theStatus = new StringBuilder();
theStatus.append("#URLs=").append(this.urlCount).append("\n");
theStatus.append("#Profiles=").append(this.profileCount);
return theStatus.toString();
}
public void run() {
try {
// waiting on init thread to finish
//this.importNurlDB.waitOnInitThread();
// the stack types we want to import
final int[] stackTypes = new int[] {
NoticedURL.STACK_TYPE_CORE,
NoticedURL.STACK_TYPE_LIMIT,
NoticedURL.STACK_TYPE_REMOTE,
-1};
// looping through the various stacks
for (int stackType=0; stackType< stackTypes.length; stackType++) {
if (stackTypes[stackType] != -1) {
this.log.logInfo("Starting to import stacktype '" + stackTypes[stackType] + "' containing '" + this.importNurlDB.stackSize(stackTypes[stackType]) + "' entries.");
} else {
this.log.logInfo("Starting to import '" + this.importNurlDB.size() + "' entries not available in any stack.");
}
// getting an iterator and loop through the URL entries
final Iterator<Request> entryIter = (stackTypes[stackType] == -1) ? this.importNurlDB.iterator(stackType) : null;
while (true) {
String nextHash = null;
Request nextEntry = null;
try {
if (stackTypes[stackType] != -1) {
if (this.importNurlDB.stackSize(stackTypes[stackType]) == 0) break;
this.urlCount++;
nextEntry = this.importNurlDB.pop(stackTypes[stackType], false, null);
nextHash = nextEntry.url().hash();
} else {
if (!entryIter.hasNext()) break;
this.urlCount++;
nextEntry = entryIter.next();
nextHash = nextEntry.url().hash();
}
} catch (final IOException e) {
this.log.logWarning("Unable to import entry: " + e.toString());
if ((stackTypes[stackType] != -1) &&(this.importNurlDB.stackSize(stackTypes[stackType]) == 0)) break;
continue;
}
// getting a handler to the crawling profile the url belongs to
try {
final String profileHandle = nextEntry.profileHandle();
if (profileHandle == null) {
this.log.logWarning("Profile handle of url entry '" + nextHash + "' unknown.");
continue;
}
// if we haven't imported the profile yet, we need to do it now
if (!this.importProfileHandleCache.contains(profileHandle)) {
// testing if the profile is already known
final CrawlProfile.entry profileEntry = this.activeCrawls.getEntry(profileHandle);
// if not we need to import it
if (profileEntry == null) {
// copy and store the source profile entry into the destination db
final CrawlProfile.entry sourceEntry = this.importProfileDB.getEntry(profileHandle);
if (sourceEntry != null) {
this.profileCount++;
this.importProfileHandleCache.add(profileHandle);
HashMap<String, String> mapclone = new HashMap<String, String>();
mapclone.putAll(sourceEntry.map());
this.activeCrawls.newEntry(mapclone);
} else {
this.log.logWarning("Profile '" + profileHandle + "' of url entry '" + nextHash + "' unknown.");
continue;
}
}
}
// if the url does not already exist in the destination stack, we insert it now
if (!this.crawlQueues.noticeURL.existsInStack(nextHash)) {
this.crawlQueues.noticeURL.push((stackTypes[stackType] != -1) ? stackTypes[stackType] : NoticedURL.STACK_TYPE_CORE, nextEntry);
}
// removing hash from the import db
} finally {
this.importNurlDB.removeByURLHash(nextHash);
}
if (this.urlCount % 100 == 0) {
if (this.log.isFine()) this.log.logFine(this.urlCount + " URLs and '" + this.profileCount + "' profile entries processed so far.");
}
if (this.isAborted()) break;
}
this.log.logInfo("Finished to import stacktype '" + stackTypes[stackType] + "'");
}
//int size = this.importNurlDB.size();
//int stackSize = this.importNurlDB.stackSize();
// TODO: what to do with nurlDB entries that do not exist in any stack?
} catch (final Exception e) {
this.error = e.toString();
this.log.logSevere("Import process had detected an error",e);
} finally {
this.log.logInfo("Import process finished.");
this.globalEnd = System.currentTimeMillis();
this.dbImportManager.finishedJobs.add(this);
this.importNurlDB.close();
this.importProfileDB.close();
}
}
}

@@ -542,7 +542,7 @@ public final class serverCore extends AbstractBusyThread implements BusyThread {
if (this.controlSocket != null) try {
this.controlSocket.close();
log.logInfo("Closing main socket of thread '" + this.getName() + "'");
//this.controlSocket = null;
this.controlSocket = null;
} catch (final Exception e) {}
}
@@ -808,7 +808,7 @@ public final class serverCore extends AbstractBusyThread implements BusyThread {
}
public boolean isSSL() {
return this.controlSocket instanceof SSLSocket;
return this.controlSocket != null && this.controlSocket instanceof SSLSocket;
}
}

@@ -42,6 +42,7 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.SetTools;
@@ -329,6 +330,11 @@ public class Blacklist {
if (!matched && (app = blacklistMapMatched.get(hostlow)) != null) {
for (int i=app.size()-1; !matched && i>-1; i--) {
pp = app.get(i);
if (pp.indexOf("?*") > 0) {
// prevent "Dangling meta character '*'" exception
Log.logWarning("Blacklist", "ignored blacklist path to prevent 'Dangling meta character' exception: " + pp);
continue;
}
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
}
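
For reference, a minimal stand-alone sketch of the failure this new guard avoids (the entry and path below are hypothetical, not taken from the repository): a blacklist path containing "?*" is not a valid java.util.regex pattern, so path.matches(pp) would throw the PatternSyntaxException named in the commit message instead of returning a match result.

import java.util.regex.PatternSyntaxException;

public class DanglingMetaDemo {
    public static void main(String[] args) {
        String pp = "forum.php?*";          // hypothetical bad blacklist path entry
        String path = "forum.php?topic=1";  // hypothetical request path to be checked
        try {
            // String.matches() compiles pp as a regular expression; the '*'
            // following the '?' quantifier has nothing left to repeat
            boolean matched = path.matches(pp);
            System.out.println("matched: " + matched);
        } catch (PatternSyntaxException e) {
            // e.g. "Dangling meta character '*' near index 10"; with the
            // indexOf("?*") check above, such an entry is skipped with a
            // warning instead of aborting the blacklist lookup
            System.out.println("bad blacklist entry: " + e.getMessage());
        }
    }
}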
