diff --git a/htroot/ConfigPHPBB3Search.html b/htroot/ConfigPHPBB3Search.html index 39013786c..575258602 100644 --- a/htroot/ConfigPHPBB3Search.html +++ b/htroot/ConfigPHPBB3Search.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuPortalIntegration.template%# + #%env/templates/submenuIndexCreate.template%#

Integration in phpBB3

It is possible to insert forum pages into the YaCy index using a database import of forum postings. diff --git a/htroot/ConfigWikiSearch.html b/htroot/ConfigWikiSearch.html index d156e9e0e..f4bd8d35b 100644 --- a/htroot/ConfigWikiSearch.html +++ b/htroot/ConfigWikiSearch.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuPortalIntegration.template%# + #%env/templates/submenuIndexCreate.template%#

Integration in MediaWiki

It is possible to insert wiki pages into the YaCy index using a web crawl on those pages. diff --git a/htroot/CrawlProfileEditor_p.html b/htroot/CrawlProfileEditor_p.html index df04fae36..b2ddb86e0 100644 --- a/htroot/CrawlProfileEditor_p.html +++ b/htroot/CrawlProfileEditor_p.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuIndexCreate.template%# + #%env/templates/submenuCrawlMonitor.template%#

Crawl Profile Editor

Crawl profiles hold information about a specific URL; this information is used internally to perform the crawl the profile belongs to. diff --git a/htroot/IndexCreateLoaderQueue_p.html b/htroot/IndexCreateLoaderQueue_p.html index c68116386..5257fecf3 100644 --- a/htroot/IndexCreateLoaderQueue_p.html +++ b/htroot/IndexCreateLoaderQueue_p.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuIndexCreate.template%# + #%env/templates/submenuCrawlMonitor.template%#

Loader Queue

diff --git a/htroot/IndexCreateParserErrors_p.html b/htroot/IndexCreateParserErrors_p.html index 1cffbc24e..b3e8c60b1 100644 --- a/htroot/IndexCreateParserErrors_p.html +++ b/htroot/IndexCreateParserErrors_p.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuIndexCreate.template%# + #%env/templates/submenuCrawlMonitor.template%#

Parser Errors

#(rejected)# diff --git a/htroot/IndexCreateWWWGlobalQueue_p.html b/htroot/IndexCreateWWWGlobalQueue_p.html index b2c8d8dec..f5258fbc2 100644 --- a/htroot/IndexCreateWWWGlobalQueue_p.html +++ b/htroot/IndexCreateWWWGlobalQueue_p.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuIndexCreate.template%# + #%env/templates/submenuCrawlMonitor.template%#

Global Crawl Queue

This queue stores the urls that shall be sent to other peers to perform a remote crawl. diff --git a/htroot/IndexCreateWWWLocalQueue_p.html b/htroot/IndexCreateWWWLocalQueue_p.html index fbfd4d714..06c82a121 100644 --- a/htroot/IndexCreateWWWLocalQueue_p.html +++ b/htroot/IndexCreateWWWLocalQueue_p.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuIndexCreate.template%# + #%env/templates/submenuCrawlMonitor.template%#

Local Crawl Queue

This queue stores the urls that shall be crawled locally by this peer. diff --git a/htroot/IndexCreateWWWRemoteQueue_p.html b/htroot/IndexCreateWWWRemoteQueue_p.html index cdde8bb54..d771bf227 100644 --- a/htroot/IndexCreateWWWRemoteQueue_p.html +++ b/htroot/IndexCreateWWWRemoteQueue_p.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuIndexCreate.template%# + #%env/templates/submenuCrawlMonitor.template%#

Remote Crawl Queue

This queue stores the urls that other peers sent to you in order to perform a remote crawl for them. diff --git a/htroot/IndexImport_p.html b/htroot/IndexImport_p.html deleted file mode 100644 index 172b7957b..000000000 --- a/htroot/IndexImport_p.html +++ /dev/null @@ -1,270 +0,0 @@ - - - - - YaCy '#[clientname]#': Crawling Queue Import - #%env/templates/metas.template%# - - - - #%env/templates/header.template%# - #%env/templates/submenuIndexControl.template%# - -

Crawling Queue Import

- #(error)# - :: -

#[error_msg]#

- :: -

Import Job with the same path already started.

- :: -

#[error_msg]#

-

#[error_stackTrace]#

- #(/error)# -
-

Starting new Job

- - - - - - - - - - - - - - - - - - - - - - - -
Import Type:Cache Size - - Usage Examples
Import Path:
Import Path:
Import Path:
-

Attention:
Always do a backup of your source and destination database before starting to use this import function.

-
- -
-

Currently running jobs

- - - - - - - - - - - - - - - - - - #{running.jobs}# - - - - - - - - - - - - - - #{/running.jobs}# -
Job Type | Job Name | Status | % | Elapsed Time | Time Left | Import Status | Abort Import | Pause Import
#[type]##[shortName]##(runningStatus)#Finished::Running::Paused#(/runningStatus)##[percent]##[elapsed]##[estimated]##[status]# - - #(stopped)#:: - - #(/stopped)# - - #(paused)# - - :: - - #(/paused)# -
- - -
-
-

Finished jobs

- - - - - - - - - - - - - - - #{finished.jobs}# - - - - - - - - - #{/finished.jobs}# -
Job Type | Path | Status | % | Elapsed Time | Import Status
#[type]##[shortName]##(runningStatus)#Finished::Error: #[errorMsg]#::Paused#(/runningStatus)##[percent]##[elapsed]##[status]#
-
- -
-
-

Last Refresh: #[date]#

-
-

Usage Examples:

- - - -

Crawling Queue Import:

-

- Example Path: E:\PLASMADB\ -

-

- Requirements: -

-

- You need to have at least the following directories and files in this path: -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Name | Type | Writeable | Description
crawlProfiles0.db | File | No | Contains data about the crawljob an URL belongs to
urlNotice1.db | File | Yes | The crawling queue
urlNoticeImage0.stack | File | Yes | Various stack files that belong to the crawling queue
urlNoticeImage0.stack
urlNoticeLimit0.stack
urlNoticeLocal0.stack
urlNoticeMovie0.stack
urlNoticeMusic0.stack
urlNoticeOverhang0.stack
urlNoticeRemote0.stack
- - #%env/templates/footer.template%# - - \ No newline at end of file diff --git a/htroot/IndexImport_p.java b/htroot/IndexImport_p.java deleted file mode 100644 index 327c34dac..000000000 --- a/htroot/IndexImport_p.java +++ /dev/null @@ -1,197 +0,0 @@ -//IndexTransfer_p.java -//----------------------- -//part of the AnomicHTTPD caching proxy -//(C) by Michael Peter Christen; mc@yacy.net -//first published on http://www.anomic.de -//Frankfurt, Germany, 2005 -// -//This file is contributed by Martin Thelian -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -//This program is free software; you can redistribute it and/or modify -//it under the terms of the GNU General Public License as published by -//the Free Software Foundation; either version 2 of the License, or -//(at your option) any later version. -// -//This program is distributed in the hope that it will be useful, -//but WITHOUT ANY WARRANTY; without even the implied warranty of -//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -//GNU General Public License for more details. -// -//You should have received a copy of the GNU General Public License -//along with this program; if not, write to the Free Software -//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -//You must compile this file with -//javac -classpath .:../Classes IndexControl_p.java -//if the shell's current path is HTROOT - -import java.io.PrintStream; -import java.util.Date; - -import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.ByteBuffer; -import net.yacy.kelondro.util.DateFormatter; - -import de.anomic.crawler.Importer; -import de.anomic.crawler.NoticeURLImporter; -import de.anomic.http.server.RequestHeader; -import de.anomic.search.Segment; -import de.anomic.search.Segments; -import de.anomic.search.Switchboard; -import de.anomic.server.serverObjects; -import de.anomic.server.serverSwitch; - -public final class IndexImport_p { - - public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { - // return variable that accumulates replacements - final Switchboard sb = (Switchboard) env; - final serverObjects prop = new serverObjects(); - - int activeCount = 0; - - // get segment - Segment indexSegment = null; - if (post != null && post.containsKey("segment")) { - String segmentName = post.get("segment"); - if (sb.indexSegments.segmentExist(segmentName)) { - indexSegment = sb.indexSegments.segment(segmentName); - } - } else { - // take default segment - indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC); - } - - if (post != null) { - if (post.containsKey("startIndexDbImport")) { - try { - final boolean startImport = true; - if (startImport) { - final Importer importerThread = new NoticeURLImporter( - sb.queuesRoot, - sb.crawlQueues, - sb.crawler.profilesActiveCrawls, - sb.dbImportManager); - - if (importerThread != null) { - importerThread.setJobID(sb.dbImportManager.generateUniqueJobID()); - importerThread.startIt(); - } - prop.put("LOCATION",""); - return prop; - } - } catch (final Exception e) { - final ByteBuffer errorMsg = new ByteBuffer(100); - final PrintStream errorOut = new PrintStream(errorMsg); - Log.logException(e); - - prop.put("error", "3"); - prop.putHTML("error_error_msg",e.toString()); - prop.putHTML("error_error_stackTrace",errorMsg.toString().replaceAll("\n","
")); - - errorOut.close(); - } - } else if (post.containsKey("clearFinishedJobList")) { - sb.dbImportManager.finishedJobs.clear(); - prop.put("LOCATION", ""); - return prop; - } else if ( - (post.containsKey("stopIndexDbImport")) || - (post.containsKey("pauseIndexDbImport")) || - (post.containsKey("continueIndexDbImport")) - ) { - // get the job nr of the thread - final String jobID = post.get("jobNr"); - final Importer importer = sb.dbImportManager.getImporterByID(Integer.valueOf(jobID).intValue()); - if (importer != null) { - if (post.containsKey("stopIndexDbImport")) { - try { - importer.stopIt(); - } catch (final InterruptedException e) { - // TODO Auto-generated catch block - Log.logException(e); - } - } else if (post.containsKey("pauseIndexDbImport")) { - importer.pauseIt(); - } else if (post.containsKey("continueIndexDbImport")) { - importer.continueIt(); - } - } - prop.put("LOCATION",""); - return prop; - } - } - - prop.putNum("wcount", indexSegment.termIndex().sizesMax()); - prop.putNum("ucount", indexSegment.urlMetadata().size()); - - /* - * Loop over all currently running jobs - */ - final Importer[] importThreads = sb.dbImportManager.getRunningImporter(); - activeCount = importThreads.length; - - for (int i=0; i < activeCount; i++) { - final Importer currThread = importThreads[i]; - - // get import type - prop.put("running.jobs_" + i + "_type", currThread.getJobType()); - - // root path of the source db - final String fullName = currThread.getJobName(); - final String shortName = (fullName.length()>30)?fullName.substring(0,12) + "..." + fullName.substring(fullName.length()-22,fullName.length()):fullName; - prop.put("running.jobs_" + i + "_fullName",fullName); - prop.put("running.jobs_" + i + "_shortName",shortName); - - // specifies if the importer is still running - prop.put("running.jobs_" + i + "_stopped", currThread.isStopped() ? "0" : "1"); - - // specifies if the importer was paused - prop.put("running.jobs_" + i + "_paused", currThread.isPaused() ? "1" : "0"); - - // setting the status - prop.put("running.jobs_" + i + "_runningStatus", currThread.isPaused() ? "2" : currThread.isStopped() ? "0" : "1"); - - // other information - prop.putNum("running.jobs_" + i + "_percent", currThread.getProcessingStatusPercent()); - prop.put("running.jobs_" + i + "_elapsed", DateFormatter.formatInterval(currThread.getElapsedTime())); - prop.put("running.jobs_" + i + "_estimated", DateFormatter.formatInterval(currThread.getEstimatedTime())); - prop.putHTML("running.jobs_" + i + "_status", currThread.getStatus().replaceAll("\n", "
")); - - // job number of the importer thread - prop.put("running.jobs_" + i + "_job_nr", currThread.getJobID()); - } - prop.put("running.jobs", activeCount); - - /* - * Loop over all finished jobs - */ - final Importer[] finishedJobs = sb.dbImportManager.getFinishedImporter(); - for (int i=0; i30)?fullName.substring(0,12) + "..." + fullName.substring(fullName.length()-22,fullName.length()):fullName; - prop.put("finished.jobs_" + i + "_type", currThread.getJobType()); - prop.put("finished.jobs_" + i + "_fullName", fullName); - prop.put("finished.jobs_" + i + "_shortName", shortName); - if (error != null) { - prop.put("finished.jobs_" + i + "_runningStatus", "1"); - prop.putHTML("finished.jobs_" + i + "_runningStatus_errorMsg", error.replaceAll("\n", "
")); - } else { - prop.put("finished.jobs_" + i + "_runningStatus", "0"); - } - prop.putNum("finished.jobs_" + i + "_percent", currThread.getProcessingStatusPercent()); - prop.put("finished.jobs_" + i + "_elapsed", DateFormatter.formatInterval(currThread.getElapsedTime())); - prop.putHTML("finished.jobs_" + i + "_status", currThread.getStatus().replaceAll("\n", "
")); - } - prop.put("finished.jobs",finishedJobs.length); - - prop.put("date",(new Date()).toString()); - return prop; - } -} diff --git a/htroot/ProxyIndexingMonitor_p.html b/htroot/ProxyIndexingMonitor_p.html index 7cd786a5f..865fd1c31 100644 --- a/htroot/ProxyIndexingMonitor_p.html +++ b/htroot/ProxyIndexingMonitor_p.html @@ -6,14 +6,13 @@ #%env/templates/header.template%# + #%env/templates/submenuIndexCreate.template%#

Indexing with Proxy

- This is the control page for web pages that your peer has indexed during the current application run-time - as result of proxy fetch/prefetch. - No personal or protected page is indexed; + YaCy can be used to 'scrape' content from pages that pass the integrated caching HTTP proxy. + When scraping proxy pages, no personal or protected pages are indexed; those pages are detected by properties in the HTTP header (like Cookie-Use, or HTTP Authorization) - or by POST-Parameters (either in URL or as HTTP protocol) - and automatically excluded from indexing. + or by POST-Parameters (either in URL or as HTTP protocol) and automatically excluded from indexing.

diff --git a/htroot/WatchCrawler_p.html b/htroot/WatchCrawler_p.html index 65d0456dd..a6c7e32d8 100644 --- a/htroot/WatchCrawler_p.html +++ b/htroot/WatchCrawler_p.html @@ -1,6 +1,5 @@ -#(forwardToCrawlStart)#::#(/forwardToCrawlStart)# YaCy '#[clientname]#': Crawler Queues #%env/templates/metas.template%# @@ -10,7 +9,7 @@ #%env/templates/header.template%# -#%env/templates/submenuIndexCreate.template%# +#%env/templates/submenuCrawlMonitor.template%#

Crawler Queues

Next update in seconds. empty diff --git a/htroot/WatchCrawler_p.java b/htroot/WatchCrawler_p.java index 0c753eb9b..a5335725f 100644 --- a/htroot/WatchCrawler_p.java +++ b/htroot/WatchCrawler_p.java @@ -105,13 +105,6 @@ public class WatchCrawler_p { if (post != null) { // a crawl start - if ((post.containsKey("autoforward")) && - (sb.crawlQueues.coreCrawlJobSize() == 0) && - (sb.crawlQueues.remoteTriggeredCrawlJobSize() == 0) && - (sb.getIndexingProcessorsQueueSize() < 30)) { - prop.put("forwardToCrawlStart", "1"); - } - if (post.containsKey("continue")) { // continue queue final String queue = post.get("continue", ""); diff --git a/htroot/env/templates/header.template b/htroot/env/templates/header.template index 99a87c6dd..8b91ea940 100644 --- a/htroot/env/templates/header.template +++ b/htroot/env/templates/header.template @@ -60,12 +60,12 @@

  • Live Search Anywhere
  • Generic Search Portal
  • Search Box Anywhere
  • -
  • Search Integration for Wikis
  • -
  • Search Integration for phpBB3
  • \ No newline at end of file diff --git a/source/de/anomic/crawler/ExternalIndexImporter.java b/source/de/anomic/crawler/ExternalIndexImporter.java deleted file mode 100644 index c59245d7f..000000000 --- a/source/de/anomic/crawler/ExternalIndexImporter.java +++ /dev/null @@ -1,228 +0,0 @@ -package de.anomic.crawler; - -import java.util.HashSet; -import java.util.Iterator; -import java.util.TreeSet; - -import net.yacy.kelondro.data.meta.URIMetadataRow; -import net.yacy.kelondro.data.word.WordReference; -import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.rwi.Reference; -import net.yacy.kelondro.rwi.ReferenceContainer; -import net.yacy.kelondro.util.DateFormatter; - -import de.anomic.search.Segment; - -public class ExternalIndexImporter extends AbstractImporter implements Importer { - - /** - * the source word index (the DB to import) - */ - private final Segment importWordIndex; - - /** - * the destination word index (the home DB) - */ - protected Segment homeWordIndex; - private final int importStartSize; - - private byte[] wordHash = "------------".getBytes(); - - long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = this.wordChunkStart; - byte[] wordChunkStartHash = "------------".getBytes(), wordChunkEndHash; - private long urlCounter = 0, wordCounter = 0, entryCounter = 0, notBoundEntryCounter = 0; - - - public ExternalIndexImporter(final Segment homeWI, final Segment importWI) { - super("PLASMADB"); - this.homeWordIndex = homeWI; - this.importWordIndex = importWI; - this.importStartSize = this.importWordIndex.termIndex().sizesMax(); - } - - /** - * @see Importer#getJobName() - */ - public String getJobName() { - return this.importWordIndex.getLocation().toString(); - } - - /** - * @see Importer#getStatus() - */ - public String getStatus() { - final StringBuilder theStatus = new StringBuilder(); - - theStatus.append("Hash=").append(this.wordHash).append("\n"); - theStatus.append("#URL=").append(this.urlCounter).append("\n"); - theStatus.append("#Word Entity=").append(this.wordCounter).append("\n"); - theStatus.append("#Word Entry={").append(this.entryCounter); - theStatus.append(" ,NotBound=").append(this.notBoundEntryCounter).append("}"); - - return theStatus.toString(); - } - - public void run() { - try { - importWordsDB(); - } finally { - this.globalEnd = System.currentTimeMillis(); - //this.sb.dbImportManager.finishedJobs.add(this); - } - } - - /** - * @see Importer#getProcessingStatusPercent() - */ - public int getProcessingStatusPercent() { - // thid seems to be better: - // (this.importStartSize-this.importWordIndex.size())*100/((this.importStartSize==0)?1:this.importStartSize); - // but maxint (2,147,483,647) could be exceeded when WordIndexes reach 20M entries - //return (this.importStartSize-this.importWordIndex.size())/((this.importStartSize<100)?1:(this.importStartSize)/100); - return (int)(this.wordCounter)/((this.importStartSize<100)?1:(this.importStartSize)/100); - } - - /** - * @see Importer#getElapsedTime() - */ - public long getEstimatedTime() { - return (this.wordCounter==0)?0:((this.importStartSize*getElapsedTime())/this.wordCounter)-getElapsedTime(); - } - - public void importWordsDB() { - this.log.logInfo("STARTING DB-IMPORT"); - - try { - this.log.logInfo("Importing DB from '" + this.importWordIndex.getLocation().getAbsolutePath() + "'"); - this.log.logInfo("Home word index contains " + homeWordIndex.termIndex().sizesMax() + " words and " + homeWordIndex.urlMetadata().size() + " URLs."); - this.log.logInfo("Import word index contains " + 
this.importWordIndex.termIndex().sizesMax() + " words and " + this.importWordIndex.urlMetadata().size() + " URLs."); - - final HashSet unknownUrlBuffer = new HashSet(); - final HashSet importedUrlBuffer = new HashSet(); - - // iterate over all words from import db - //Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, CrawlSwitchboard.RL_WORDFILES, false); - Iterator> indexContainerIterator = this.importWordIndex.termIndex().references(this.wordChunkStartHash, false, 100, false).iterator(); - while (!isAborted() && indexContainerIterator.hasNext()) { - - final TreeSet entityUrls = new TreeSet(); - ReferenceContainer newContainer = null; - try { - this.wordCounter++; - newContainer = indexContainerIterator.next(); - this.wordHash = newContainer.getTermHash(); - - // loop throug the entities of the container and get the - // urlhash - final Iterator importWordIdxEntries = newContainer.entries(); - Reference importWordIdxEntry; - while (importWordIdxEntries.hasNext()) { - // testing if import process was aborted - if (isAborted()) break; - - // getting next word index entry - importWordIdxEntry = importWordIdxEntries.next(); - final String urlHash = importWordIdxEntry.metadataHash(); - entityUrls.add(urlHash); - } - - final Iterator urlIter = entityUrls.iterator(); - while (urlIter.hasNext()) { - if (isAborted()) break; - final String urlHash = urlIter.next(); - - if (!importedUrlBuffer.contains(urlHash)) { - if (unknownUrlBuffer.contains(urlHash)) { - // url known as unknown - unknownUrlBuffer.add(urlHash); - notBoundEntryCounter++; - newContainer.remove(urlHash); - continue; - } - // we need to import the url - - // getting the url entry - final URIMetadataRow urlEntry = this.importWordIndex.urlMetadata().load(urlHash, null, 0); - if (urlEntry != null) { - - /* write it into the home url db */ - homeWordIndex.urlMetadata().store(urlEntry); - importedUrlBuffer.add(urlHash); - this.urlCounter++; - - if (this.urlCounter % 500 == 0) { - this.log.logFine(this.urlCounter + " URLs processed so far."); - } - - } else { - unknownUrlBuffer.add(urlHash); - notBoundEntryCounter++; - newContainer.remove(urlHash); - continue; - } - //} else { - // already known url - } - this.entryCounter++; - } - - // testing if import process was aborted - if (isAborted()) break; - - // importing entity container to home db - if (!newContainer.isEmpty()) { homeWordIndex.termIndex().add(newContainer); } - - // delete complete index entity file - this.importWordIndex.termIndex().delete(this.wordHash); - - // print out some statistical information - if (this.entryCounter % 500 == 0) { - this.log.logFine(this.entryCounter + " word entries and " + this.wordCounter + " word entities processed so far."); - } - - if (this.wordCounter%500 == 0) { - this.wordChunkEndHash = this.wordHash; - this.wordChunkEnd = System.currentTimeMillis(); - final long duration = this.wordChunkEnd - this.wordChunkStart; - this.log.logInfo(this.wordCounter + " word entities imported " + - "[" + this.wordChunkStartHash + " .. 
" + this.wordChunkEndHash + "] " + - this.getProcessingStatusPercent() + "%\n" + - "Speed: "+ 500*1000/duration + " word entities/s" + - " | Elapsed time: " + DateFormatter.formatInterval(getElapsedTime()) + - " | Estimated time: " + DateFormatter.formatInterval(getEstimatedTime()) + "\n" + - "Home Words = " + homeWordIndex.termIndex().sizesMax() + - " | Import Words = " + this.importWordIndex.termIndex().sizesMax()); - this.wordChunkStart = this.wordChunkEnd; - this.wordChunkStartHash = this.wordChunkEndHash; - } - - } catch (final Exception e) { - this.log.logSevere("Import of word entity '" + this.wordHash + "' failed.",e); - } finally { - if (newContainer != null) newContainer.clear(); - } - - if (!indexContainerIterator.hasNext()) { - // We may not be finished yet, try to get the next chunk of wordHashes - final TreeSet> containers = this.importWordIndex.termIndex().references(this.wordHash, false, 100, false); - indexContainerIterator = containers.iterator(); - // Make sure we don't get the same wordhash twice, but don't skip a word - if ((indexContainerIterator.hasNext())&&(!this.wordHash.equals((indexContainerIterator.next()).getTermHash()))) { - indexContainerIterator = containers.iterator(); - } - } - } - - this.log.logInfo("Home word index contains " + homeWordIndex.termIndex().sizesMax() + " words and " + homeWordIndex.urlMetadata().size() + " URLs."); - this.log.logInfo("Import word index contains " + this.importWordIndex.termIndex().sizesMax() + " words and " + this.importWordIndex.urlMetadata().size() + " URLs."); - } catch (final Exception e) { - this.log.logSevere("Database import failed.",e); - Log.logException(e); - this.error = e.toString(); - } finally { - this.log.logInfo("Import process finished."); - if (this.importWordIndex != null) try { this.importWordIndex.close(); } catch (final Exception e){} - } - } - -} diff --git a/source/de/anomic/crawler/NoticeURLImporter.java b/source/de/anomic/crawler/NoticeURLImporter.java deleted file mode 100644 index 6b4f04f58..000000000 --- a/source/de/anomic/crawler/NoticeURLImporter.java +++ /dev/null @@ -1,228 +0,0 @@ -package de.anomic.crawler; - -import java.io.File; -import java.io.IOException; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; - -import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.FileUtils; - -import de.anomic.crawler.CrawlSwitchboard; -import de.anomic.crawler.retrieval.Request; - -public class NoticeURLImporter extends AbstractImporter implements Importer { - - private File plasmaPath = null; - private final HashSet importProfileHandleCache = new HashSet(); - private CrawlProfile importProfileDB; - private final NoticedURL importNurlDB; - private final int importStartSize; - private int urlCount = 0; - private int profileCount = 0; - private final CrawlQueues crawlQueues; - private final CrawlProfile activeCrawls; - private final ImporterManager dbImportManager; - - public NoticeURLImporter(final File crawlerPath, final CrawlQueues crawlQueues, final CrawlProfile activeCrawls, final ImporterManager dbImportManager) { - super("NURL"); - this.crawlQueues = crawlQueues; - this.activeCrawls = activeCrawls; - this.dbImportManager = dbImportManager; - - // TODO: we need more error handling here - this.plasmaPath = crawlerPath; - final File noticeUrlDbFile = new File(plasmaPath,"urlNotice1.db"); - final File profileDbFile = new File(plasmaPath, CrawlSwitchboard.DBFILE_ACTIVE_CRAWL_PROFILES); - - String errorMsg = null; - if (!plasmaPath.exists()) - errorMsg = "The 
import path '" + plasmaPath + "' does not exist."; - else if (!plasmaPath.isDirectory()) - errorMsg = "The import path '" + plasmaPath + "' is not a directory."; - else if (!plasmaPath.canRead()) - errorMsg = "The import path '" + plasmaPath + "' is not readable."; - else if (!plasmaPath.canWrite()) - errorMsg = "The import path '" + plasmaPath + "' is not writeable."; - - else if (!noticeUrlDbFile.exists()) - errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' does not exist."; - else if (noticeUrlDbFile.isDirectory()) - errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' is not a file."; - else if (!noticeUrlDbFile.canRead()) - errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' is not readable."; - else if (!noticeUrlDbFile.canWrite()) - errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' is not writeable."; - - else if (!profileDbFile.exists()) - errorMsg = "The profileDB file '" + profileDbFile + "' does not exist."; - else if (profileDbFile.isDirectory()) - errorMsg = "The profileDB file '" + profileDbFile + "' is not a file."; - else if (!profileDbFile.canRead()) - errorMsg = "The profileDB file '" + profileDbFile + "' is not readable."; -// else if (!profileDbFile.canWrite()) -// errorMsg = "The profileDB file '" + profileDbFile + "' is not writeable."; - - if (errorMsg != null) { - this.log.logSevere(errorMsg); - throw new IllegalArgumentException(errorMsg); - } - - // init noticeUrlDB - this.log.logInfo("Initializing the source noticeUrlDB"); - this.importNurlDB = new NoticedURL(plasmaPath, false, false); - this.importStartSize = this.importNurlDB.size(); - //int stackSize = this.importNurlDB.stackSize(); - - // init profile DB - this.log.logInfo("Initializing the source profileDB"); - try { - this.importProfileDB = new CrawlProfile(profileDbFile); - } catch (IOException e) { - FileUtils.deletedelete(profileDbFile); - try { - this.importProfileDB = new CrawlProfile(profileDbFile); - } catch (IOException e1) { - Log.logException(e1); - this.importProfileDB = null; - } - } - } - - public long getEstimatedTime() { - return (this.urlCount==0)?0:((this.importStartSize*getElapsedTime())/(this.urlCount))-getElapsedTime(); - } - - public String getJobName() { - return this.plasmaPath.toString(); - } - - public int getProcessingStatusPercent() { - return (this.urlCount)/((this.importStartSize<100)?1:(this.importStartSize)/100); - } - - public String getStatus() { - final StringBuilder theStatus = new StringBuilder(); - - theStatus.append("#URLs=").append(this.urlCount).append("\n"); - theStatus.append("#Profiles=").append(this.profileCount); - - return theStatus.toString(); - } - - public void run() { - try { - // waiting on init thread to finish - //this.importNurlDB.waitOnInitThread(); - - // the stack types we want to import - final int[] stackTypes = new int[] { - NoticedURL.STACK_TYPE_CORE, - NoticedURL.STACK_TYPE_LIMIT, - NoticedURL.STACK_TYPE_REMOTE, - -1}; - - // looping through the various stacks - for (int stackType=0; stackType< stackTypes.length; stackType++) { - if (stackTypes[stackType] != -1) { - this.log.logInfo("Starting to import stacktype '" + stackTypes[stackType] + "' containing '" + this.importNurlDB.stackSize(stackTypes[stackType]) + "' entries."); - } else { - this.log.logInfo("Starting to import '" + this.importNurlDB.size() + "' entries not available in any stack."); - } - - // getting an iterator and loop through the URL entries - final Iterator entryIter = (stackTypes[stackType] == -1) ? 
this.importNurlDB.iterator(stackType) : null; - while (true) { - - String nextHash = null; - Request nextEntry = null; - - try { - if (stackTypes[stackType] != -1) { - if (this.importNurlDB.stackSize(stackTypes[stackType]) == 0) break; - - this.urlCount++; - nextEntry = this.importNurlDB.pop(stackTypes[stackType], false, null); - nextHash = nextEntry.url().hash(); - } else { - if (!entryIter.hasNext()) break; - - this.urlCount++; - nextEntry = entryIter.next(); - nextHash = nextEntry.url().hash(); - } - } catch (final IOException e) { - this.log.logWarning("Unable to import entry: " + e.toString()); - - if ((stackTypes[stackType] != -1) &&(this.importNurlDB.stackSize(stackTypes[stackType]) == 0)) break; - continue; - } - - // getting a handler to the crawling profile the url belongs to - try { - final String profileHandle = nextEntry.profileHandle(); - if (profileHandle == null) { - this.log.logWarning("Profile handle of url entry '" + nextHash + "' unknown."); - continue; - } - - // if we havn't imported the profile until yet we need to do it now - if (!this.importProfileHandleCache.contains(profileHandle)) { - - // testing if the profile is already known - final CrawlProfile.entry profileEntry = this.activeCrawls.getEntry(profileHandle); - - // if not we need to import it - if (profileEntry == null) { - // copy and store the source profile entry into the destination db - final CrawlProfile.entry sourceEntry = this.importProfileDB.getEntry(profileHandle); - if (sourceEntry != null) { - this.profileCount++; - this.importProfileHandleCache.add(profileHandle); - HashMap mapclone = new HashMap(); - mapclone.putAll(sourceEntry.map()); - this.activeCrawls.newEntry(mapclone); - } else { - this.log.logWarning("Profile '" + profileHandle + "' of url entry '" + nextHash + "' unknown."); - continue; - } - } - } - - // if the url does not alredy exists in the destination stack we insert it now - if (!this.crawlQueues.noticeURL.existsInStack(nextHash)) { - this.crawlQueues.noticeURL.push((stackTypes[stackType] != -1) ? stackTypes[stackType] : NoticedURL.STACK_TYPE_CORE, nextEntry); - } - - // removing hash from the import db - } finally { - this.importNurlDB.removeByURLHash(nextHash); - } - - if (this.urlCount % 100 == 0) { - if (this.log.isFine()) this.log.logFine(this.urlCount + " URLs and '" + this.profileCount + "' profile entries processed so far."); - } - if (this.isAborted()) break; - } - this.log.logInfo("Finished to import stacktype '" + stackTypes[stackType] + "'"); - } - - //int size = this.importNurlDB.size(); - //int stackSize = this.importNurlDB.stackSize(); - - // TODO: what todo with nurlDB entries that do not exist in any stack? 
- - } catch (final Exception e) { - this.error = e.toString(); - this.log.logSevere("Import process had detected an error",e); - } finally { - this.log.logInfo("Import process finished."); - this.globalEnd = System.currentTimeMillis(); - this.dbImportManager.finishedJobs.add(this); - this.importNurlDB.close(); - this.importProfileDB.close(); - } - } - -} diff --git a/source/de/anomic/server/serverCore.java b/source/de/anomic/server/serverCore.java index f2bed9440..03f4a7f17 100644 --- a/source/de/anomic/server/serverCore.java +++ b/source/de/anomic/server/serverCore.java @@ -542,7 +542,7 @@ public final class serverCore extends AbstractBusyThread implements BusyThread { if (this.controlSocket != null) try { this.controlSocket.close(); log.logInfo("Closing main socket of thread '" + this.getName() + "'"); - //this.controlSocket = null; + this.controlSocket = null; } catch (final Exception e) {} } @@ -808,7 +808,7 @@ public final class serverCore extends AbstractBusyThread implements BusyThread { } public boolean isSSL() { - return this.controlSocket instanceof SSLSocket; + return this.controlSocket != null && this.controlSocket instanceof SSLSocket; } } diff --git a/source/net/yacy/repository/Blacklist.java b/source/net/yacy/repository/Blacklist.java index c2d4ef03f..002d355bd 100644 --- a/source/net/yacy/repository/Blacklist.java +++ b/source/net/yacy/repository/Blacklist.java @@ -42,6 +42,7 @@ import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.SetTools; @@ -329,6 +330,11 @@ public class Blacklist { if (!matched && (app = blacklistMapMatched.get(hostlow)) != null) { for (int i=app.size()-1; !matched && i>-1; i--) { pp = app.get(i); + if (pp.indexOf("?*") > 0) { + // prevent "Dangling meta character '*'" exception + Log.logWarning("Blacklist", "ignored blacklist path to prevent 'Dangling meta character' exception: " + pp); + continue; + } matched |= ((pp.equals("*")) || (path.matches(pp))); } }
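
Note on the final Blacklist.java hunk: the added indexOf("?*") guard skips blacklist paths whose "?*" suffix cannot be compiled as a Java regex. Below is a minimal standalone sketch (not YaCy code; the class name, sample path, and sample blacklist entry are invented for illustration) of the PatternSyntaxException that the guard works around, assuming such entries are wildcard-style paths meant to match "any query string".

import java.util.regex.PatternSyntaxException;

// Illustration only: shows why a blacklist entry containing "?*" breaks path.matches(pp).
// In Java regex the '*' directly after the '?' quantifier has nothing to repeat, so the
// pattern does not compile and String.matches() throws an unchecked PatternSyntaxException.
public class DanglingMetaCharacterDemo {
    public static void main(String[] args) {
        final String path = "/forum/index.php?topic=42";   // hypothetical URL path being checked
        final String pp = "/forum/index.php?*";            // hypothetical blacklist entry

        try {
            // Throws before returning: "Dangling meta character '*' near index ..."
            System.out.println("matched: " + path.matches(pp));
        } catch (final PatternSyntaxException e) {
            System.out.println("invalid pattern: " + e.getMessage());
        }

        // The guard added in the diff side-steps the exception by skipping such entries
        // instead of passing them to the regex engine.
        if (pp.indexOf("?*") > 0) {
            System.out.println("blacklist path ignored, as the patched loop now does: " + pp);
        }
    }
}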