From a3b8b7b5c5e9cd4ff78e1b0e8380f185797dcce3 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Sun, 10 Jan 2010 00:10:43 +0000
Subject: [PATCH] some redesign of the main menu structure:

- moved all index generation servlets to their own main menu item, including proxy indexing
- removed external index import because this operation is not recommended any more. Joining an index can simply be done by moving the index files from one peer to the other peer; they will be merged automatically
- fix to prevent endless loops when disconnecting HTTP sessions
- fix to prevent application of bad blacklist entries that can cause a 'Dangling meta character' exception

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6558 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/ConfigPHPBB3Search.html | 2 +-
 htroot/ConfigWikiSearch.html | 2 +-
 htroot/CrawlProfileEditor_p.html | 2 +-
 htroot/IndexCreateLoaderQueue_p.html | 2 +-
 htroot/IndexCreateParserErrors_p.html | 2 +-
 htroot/IndexCreateWWWGlobalQueue_p.html | 2 +-
 htroot/IndexCreateWWWLocalQueue_p.html | 2 +-
 htroot/IndexCreateWWWRemoteQueue_p.html | 2 +-
 htroot/IndexImport_p.html | 270 ------------------
 htroot/IndexImport_p.java | 197 -------------
 htroot/ProxyIndexingMonitor_p.html | 9 +-
 htroot/WatchCrawler_p.html | 3 +-
 htroot/WatchCrawler_p.java | 7 -
 htroot/env/templates/header.template | 6 +-
 .../templates/submenuCrawlMonitor.template | 29 ++
 .../templates/submenuIndexControl.template | 1 -
 .../env/templates/submenuIndexCreate.template | 48 +---
 .../submenuPortalIntegration.template | 2 -
 .../anomic/crawler/ExternalIndexImporter.java | 228 ---------------
 .../de/anomic/crawler/NoticeURLImporter.java | 228 ---------------
 source/de/anomic/server/serverCore.java | 4 +-
 source/net/yacy/repository/Blacklist.java | 6 +
 22 files changed, 60 insertions(+), 994 deletions(-)
 delete mode 100644 htroot/IndexImport_p.html
 delete mode 100644 htroot/IndexImport_p.java
 create mode 100644 htroot/env/templates/submenuCrawlMonitor.template
 delete mode 100644 source/de/anomic/crawler/ExternalIndexImporter.java
 delete mode 100644 source/de/anomic/crawler/NoticeURLImporter.java

diff --git a/htroot/ConfigPHPBB3Search.html b/htroot/ConfigPHPBB3Search.html index 39013786c..575258602 100644 --- a/htroot/ConfigPHPBB3Search.html +++ b/htroot/ConfigPHPBB3Search.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuPortalIntegration.template%# + #%env/templates/submenuIndexCreate.template%#

Integration in phpBB3

It is possible to insert forum pages into the YaCy index using a database import of forum postings. diff --git a/htroot/ConfigWikiSearch.html b/htroot/ConfigWikiSearch.html index d156e9e0e..f4bd8d35b 100644 --- a/htroot/ConfigWikiSearch.html +++ b/htroot/ConfigWikiSearch.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuPortalIntegration.template%# + #%env/templates/submenuIndexCreate.template%#

Integration in MediaWiki

It is possible to insert wiki pages into the YaCy index using a web crawl on those pages. diff --git a/htroot/CrawlProfileEditor_p.html b/htroot/CrawlProfileEditor_p.html index df04fae36..b2ddb86e0 100644 --- a/htroot/CrawlProfileEditor_p.html +++ b/htroot/CrawlProfileEditor_p.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuIndexCreate.template%# + #%env/templates/submenuCrawlMonitor.template%#

Crawl Profile Editor

Crawl profiles hold information about a specific URL which is internally used to perform the crawl it belongs to. diff --git a/htroot/IndexCreateLoaderQueue_p.html b/htroot/IndexCreateLoaderQueue_p.html index c68116386..5257fecf3 100644 --- a/htroot/IndexCreateLoaderQueue_p.html +++ b/htroot/IndexCreateLoaderQueue_p.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuIndexCreate.template%# + #%env/templates/submenuCrawlMonitor.template%#

Loader Queue

diff --git a/htroot/IndexCreateParserErrors_p.html b/htroot/IndexCreateParserErrors_p.html index 1cffbc24e..b3e8c60b1 100644 --- a/htroot/IndexCreateParserErrors_p.html +++ b/htroot/IndexCreateParserErrors_p.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuIndexCreate.template%# + #%env/templates/submenuCrawlMonitor.template%#

Parser Errors

#(rejected)# diff --git a/htroot/IndexCreateWWWGlobalQueue_p.html b/htroot/IndexCreateWWWGlobalQueue_p.html index b2c8d8dec..f5258fbc2 100644 --- a/htroot/IndexCreateWWWGlobalQueue_p.html +++ b/htroot/IndexCreateWWWGlobalQueue_p.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuIndexCreate.template%# + #%env/templates/submenuCrawlMonitor.template%#

Global Crawl Queue

This queue stores the urls that shall be sent to other peers to perform a remote crawl. diff --git a/htroot/IndexCreateWWWLocalQueue_p.html b/htroot/IndexCreateWWWLocalQueue_p.html index fbfd4d714..06c82a121 100644 --- a/htroot/IndexCreateWWWLocalQueue_p.html +++ b/htroot/IndexCreateWWWLocalQueue_p.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuIndexCreate.template%# + #%env/templates/submenuCrawlMonitor.template%#

Local Crawl Queue

This queue stores the urls that shall be crawled locally by this peer. diff --git a/htroot/IndexCreateWWWRemoteQueue_p.html b/htroot/IndexCreateWWWRemoteQueue_p.html index cdde8bb54..d771bf227 100644 --- a/htroot/IndexCreateWWWRemoteQueue_p.html +++ b/htroot/IndexCreateWWWRemoteQueue_p.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - #%env/templates/submenuIndexCreate.template%# + #%env/templates/submenuCrawlMonitor.template%#

Remote Crawl Queue

This queue stores the urls that other peers sent to you in order to perform a remote crawl for them. diff --git a/htroot/IndexImport_p.html b/htroot/IndexImport_p.html deleted file mode 100644 index 172b7957b..000000000 --- a/htroot/IndexImport_p.html +++ /dev/null @@ -1,270 +0,0 @@ - - - - - YaCy '#[clientname]#': Crawling Queue Import - #%env/templates/metas.template%# - - - - #%env/templates/header.template%# - #%env/templates/submenuIndexControl.template%# - -

Crawling Queue Import

- #(error)# - :: -

#[error_msg]#

- :: -

Import Job with the same path already started.

- :: -

#[error_msg]#

-

#[error_stackTrace]#

- #(/error)# -
-

Starting new Job

- - - - - - - - - - - - - - - - - - - - - - - -
Import Type:Cache Size - - Usage Examples
Import Path:
Import Path:
Import Path:
-

Attention:
Always do a backup of your source and destination database before starting to use this import function.

-
- -
-

Currently running jobs

- - - - - - - - - - - - - - - - - - #{running.jobs}# - - - - - - - - - - - - - - #{/running.jobs}# -
Job TypeJob NameStatus%Elapsed
Time
Time
Left
Import StatusAbort ImportPause Import
#[type]##[shortName]##(runningStatus)#Finished::Running::Paused#(/runningStatus)##[percent]##[elapsed]##[estimated]##[status]# - - #(stopped)#:: - - #(/stopped)# - - #(paused)# - - :: - - #(/paused)# -
- - -
-
-

Finished jobs

- - - - - - - - - - - - - - - #{finished.jobs}# - - - - - - - - - #{/finished.jobs}# -
Job TypePathStatus%Elapsed
Time
Import Status
#[type]##[shortName]##(runningStatus)#Finished::Error: #[errorMsg]#::Paused#(/runningStatus)##[percent]##[elapsed]##[status]#
-
- -
-
-

Last Refresh: #[date]#

-
-

Usage Examples:

- - - -

Crawling Queue Import:

-

- Example Path: E:\PLASMADB\ -

-

- Requirements: -

-

- You need to have at least the following directories and files in this path: -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeWriteableDescription
crawlProfiles0.dbFileNoContains data about the crawljob an URL belongs to
urlNotice1.dbFileYesThe crawling queue
urlNoticeImage0.stackFileYesVarious stack files that belong to the crawling queue
urlNoticeImage0.stack
urlNoticeLimit0.stack
urlNoticeLocal0.stack
urlNoticeMovie0.stack
urlNoticeMusic0.stack
urlNoticeOverhang0.stack
urlNoticeRemote0.stack
- - #%env/templates/footer.template%# - - \ No newline at end of file diff --git a/htroot/IndexImport_p.java b/htroot/IndexImport_p.java deleted file mode 100644 index 327c34dac..000000000 --- a/htroot/IndexImport_p.java +++ /dev/null @@ -1,197 +0,0 @@ -//IndexTransfer_p.java -//----------------------- -//part of the AnomicHTTPD caching proxy -//(C) by Michael Peter Christen; mc@yacy.net -//first published on http://www.anomic.de -//Frankfurt, Germany, 2005 -// -//This file is contributed by Martin Thelian -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -//This program is free software; you can redistribute it and/or modify -//it under the terms of the GNU General Public License as published by -//the Free Software Foundation; either version 2 of the License, or -//(at your option) any later version. -// -//This program is distributed in the hope that it will be useful, -//but WITHOUT ANY WARRANTY; without even the implied warranty of -//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -//GNU General Public License for more details. -// -//You should have received a copy of the GNU General Public License -//along with this program; if not, write to the Free Software -//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -//You must compile this file with -//javac -classpath .:../Classes IndexControl_p.java -//if the shell's current path is HTROOT - -import java.io.PrintStream; -import java.util.Date; - -import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.ByteBuffer; -import net.yacy.kelondro.util.DateFormatter; - -import de.anomic.crawler.Importer; -import de.anomic.crawler.NoticeURLImporter; -import de.anomic.http.server.RequestHeader; -import de.anomic.search.Segment; -import de.anomic.search.Segments; -import de.anomic.search.Switchboard; -import de.anomic.server.serverObjects; -import de.anomic.server.serverSwitch; - -public final class IndexImport_p { - - public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { - // return variable that accumulates replacements - final Switchboard sb = (Switchboard) env; - final serverObjects prop = new serverObjects(); - - int activeCount = 0; - - // get segment - Segment indexSegment = null; - if (post != null && post.containsKey("segment")) { - String segmentName = post.get("segment"); - if (sb.indexSegments.segmentExist(segmentName)) { - indexSegment = sb.indexSegments.segment(segmentName); - } - } else { - // take default segment - indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC); - } - - if (post != null) { - if (post.containsKey("startIndexDbImport")) { - try { - final boolean startImport = true; - if (startImport) { - final Importer importerThread = new NoticeURLImporter( - sb.queuesRoot, - sb.crawlQueues, - sb.crawler.profilesActiveCrawls, - sb.dbImportManager); - - if (importerThread != null) { - importerThread.setJobID(sb.dbImportManager.generateUniqueJobID()); - importerThread.startIt(); - } - prop.put("LOCATION",""); - return prop; - } - } catch (final Exception e) { - final ByteBuffer errorMsg = new ByteBuffer(100); - final PrintStream errorOut = new PrintStream(errorMsg); - Log.logException(e); - - prop.put("error", "3"); - prop.putHTML("error_error_msg",e.toString()); - prop.putHTML("error_error_stackTrace",errorMsg.toString().replaceAll("\n","
")); - - errorOut.close(); - } - } else if (post.containsKey("clearFinishedJobList")) { - sb.dbImportManager.finishedJobs.clear(); - prop.put("LOCATION", ""); - return prop; - } else if ( - (post.containsKey("stopIndexDbImport")) || - (post.containsKey("pauseIndexDbImport")) || - (post.containsKey("continueIndexDbImport")) - ) { - // get the job nr of the thread - final String jobID = post.get("jobNr"); - final Importer importer = sb.dbImportManager.getImporterByID(Integer.valueOf(jobID).intValue()); - if (importer != null) { - if (post.containsKey("stopIndexDbImport")) { - try { - importer.stopIt(); - } catch (final InterruptedException e) { - // TODO Auto-generated catch block - Log.logException(e); - } - } else if (post.containsKey("pauseIndexDbImport")) { - importer.pauseIt(); - } else if (post.containsKey("continueIndexDbImport")) { - importer.continueIt(); - } - } - prop.put("LOCATION",""); - return prop; - } - } - - prop.putNum("wcount", indexSegment.termIndex().sizesMax()); - prop.putNum("ucount", indexSegment.urlMetadata().size()); - - /* - * Loop over all currently running jobs - */ - final Importer[] importThreads = sb.dbImportManager.getRunningImporter(); - activeCount = importThreads.length; - - for (int i=0; i < activeCount; i++) { - final Importer currThread = importThreads[i]; - - // get import type - prop.put("running.jobs_" + i + "_type", currThread.getJobType()); - - // root path of the source db - final String fullName = currThread.getJobName(); - final String shortName = (fullName.length()>30)?fullName.substring(0,12) + "..." + fullName.substring(fullName.length()-22,fullName.length()):fullName; - prop.put("running.jobs_" + i + "_fullName",fullName); - prop.put("running.jobs_" + i + "_shortName",shortName); - - // specifies if the importer is still running - prop.put("running.jobs_" + i + "_stopped", currThread.isStopped() ? "0" : "1"); - - // specifies if the importer was paused - prop.put("running.jobs_" + i + "_paused", currThread.isPaused() ? "1" : "0"); - - // setting the status - prop.put("running.jobs_" + i + "_runningStatus", currThread.isPaused() ? "2" : currThread.isStopped() ? "0" : "1"); - - // other information - prop.putNum("running.jobs_" + i + "_percent", currThread.getProcessingStatusPercent()); - prop.put("running.jobs_" + i + "_elapsed", DateFormatter.formatInterval(currThread.getElapsedTime())); - prop.put("running.jobs_" + i + "_estimated", DateFormatter.formatInterval(currThread.getEstimatedTime())); - prop.putHTML("running.jobs_" + i + "_status", currThread.getStatus().replaceAll("\n", "
")); - - // job number of the importer thread - prop.put("running.jobs_" + i + "_job_nr", currThread.getJobID()); - } - prop.put("running.jobs", activeCount); - - /* - * Loop over all finished jobs - */ - final Importer[] finishedJobs = sb.dbImportManager.getFinishedImporter(); - for (int i=0; i30)?fullName.substring(0,12) + "..." + fullName.substring(fullName.length()-22,fullName.length()):fullName; - prop.put("finished.jobs_" + i + "_type", currThread.getJobType()); - prop.put("finished.jobs_" + i + "_fullName", fullName); - prop.put("finished.jobs_" + i + "_shortName", shortName); - if (error != null) { - prop.put("finished.jobs_" + i + "_runningStatus", "1"); - prop.putHTML("finished.jobs_" + i + "_runningStatus_errorMsg", error.replaceAll("\n", "
")); - } else { - prop.put("finished.jobs_" + i + "_runningStatus", "0"); - } - prop.putNum("finished.jobs_" + i + "_percent", currThread.getProcessingStatusPercent()); - prop.put("finished.jobs_" + i + "_elapsed", DateFormatter.formatInterval(currThread.getElapsedTime())); - prop.putHTML("finished.jobs_" + i + "_status", currThread.getStatus().replaceAll("\n", "
")); - } - prop.put("finished.jobs",finishedJobs.length); - - prop.put("date",(new Date()).toString()); - return prop; - } -} diff --git a/htroot/ProxyIndexingMonitor_p.html b/htroot/ProxyIndexingMonitor_p.html index 7cd786a5f..865fd1c31 100644 --- a/htroot/ProxyIndexingMonitor_p.html +++ b/htroot/ProxyIndexingMonitor_p.html @@ -6,14 +6,13 @@ #%env/templates/header.template%# + #%env/templates/submenuIndexCreate.template%#

Indexing with Proxy

- This is the control page for web pages that your peer has indexed during the current application run-time - as result of proxy fetch/prefetch. - No personal or protected page is indexed; + YaCy can be used to 'scrape' content from pages that pass the integrated caching HTTP proxy. + When scraping proxy pages, no personal or protected pages are indexed; those pages are detected by properties in the HTTP header (like Cookie-Use, or HTTP Authorization) - or by POST-Parameters (either in URL or as HTTP protocol) - and automatically excluded from indexing. + or by POST-Parameters (either in URL or as HTTP protocol) and automatically excluded from indexing.

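The exclusion rules described above boil down to a few request/response checks. The following is a minimal sketch of such a filter, assuming hypothetical names; it is not the actual YaCy proxy-indexing code:

import java.util.Map;

// Sketch: decide whether a page fetched through the caching HTTP proxy may be
// indexed, following the criteria described above. Names are illustrative.
public class ProxyIndexingFilterSketch {

    public static boolean mayIndex(String method, String url,
                                   Map<String, String> requestHeaders,
                                   Map<String, String> responseHeaders) {
        // POST requests and URLs that carry query parameters are treated as
        // personalized content and excluded from indexing.
        if ("POST".equalsIgnoreCase(method) || url.contains("?")) {
            return false;
        }
        // Requests that carry credentials or session state are excluded.
        if (requestHeaders.containsKey("Authorization") || requestHeaders.containsKey("Cookie")) {
            return false;
        }
        // Responses that establish a session on the server side are excluded too.
        if (responseHeaders.containsKey("Set-Cookie")) {
            return false;
        }
        return true;
    }
}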
diff --git a/htroot/WatchCrawler_p.html b/htroot/WatchCrawler_p.html index 65d0456dd..a6c7e32d8 100644 --- a/htroot/WatchCrawler_p.html +++ b/htroot/WatchCrawler_p.html @@ -1,6 +1,5 @@ -#(forwardToCrawlStart)#::#(/forwardToCrawlStart)# YaCy '#[clientname]#': Crawler Queues #%env/templates/metas.template%# @@ -10,7 +9,7 @@ #%env/templates/header.template%# -#%env/templates/submenuIndexCreate.template%# +#%env/templates/submenuCrawlMonitor.template%#

Crawler Queues

Next update in seconds. empty diff --git a/htroot/WatchCrawler_p.java b/htroot/WatchCrawler_p.java index 0c753eb9b..a5335725f 100644 --- a/htroot/WatchCrawler_p.java +++ b/htroot/WatchCrawler_p.java @@ -105,13 +105,6 @@ public class WatchCrawler_p { if (post != null) { // a crawl start - if ((post.containsKey("autoforward")) && - (sb.crawlQueues.coreCrawlJobSize() == 0) && - (sb.crawlQueues.remoteTriggeredCrawlJobSize() == 0) && - (sb.getIndexingProcessorsQueueSize() < 30)) { - prop.put("forwardToCrawlStart", "1"); - } - if (post.containsKey("continue")) { // continue queue final String queue = post.get("continue", ""); diff --git a/htroot/env/templates/header.template b/htroot/env/templates/header.template index 99a87c6dd..8b91ea940 100644 --- a/htroot/env/templates/header.template +++ b/htroot/env/templates/header.template @@ -60,12 +60,12 @@

  • Live Search Anywhere
  • Generic Search Portal
  • Search Box Anywhere
  • -
  • Search Integration for Wikis
  • -
  • Search Integration for phpBB3
  • \ No newline at end of file diff --git a/source/de/anomic/crawler/ExternalIndexImporter.java b/source/de/anomic/crawler/ExternalIndexImporter.java deleted file mode 100644 index c59245d7f..000000000 --- a/source/de/anomic/crawler/ExternalIndexImporter.java +++ /dev/null @@ -1,228 +0,0 @@ -package de.anomic.crawler; - -import java.util.HashSet; -import java.util.Iterator; -import java.util.TreeSet; - -import net.yacy.kelondro.data.meta.URIMetadataRow; -import net.yacy.kelondro.data.word.WordReference; -import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.rwi.Reference; -import net.yacy.kelondro.rwi.ReferenceContainer; -import net.yacy.kelondro.util.DateFormatter; - -import de.anomic.search.Segment; - -public class ExternalIndexImporter extends AbstractImporter implements Importer { - - /** - * the source word index (the DB to import) - */ - private final Segment importWordIndex; - - /** - * the destination word index (the home DB) - */ - protected Segment homeWordIndex; - private final int importStartSize; - - private byte[] wordHash = "------------".getBytes(); - - long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = this.wordChunkStart; - byte[] wordChunkStartHash = "------------".getBytes(), wordChunkEndHash; - private long urlCounter = 0, wordCounter = 0, entryCounter = 0, notBoundEntryCounter = 0; - - - public ExternalIndexImporter(final Segment homeWI, final Segment importWI) { - super("PLASMADB"); - this.homeWordIndex = homeWI; - this.importWordIndex = importWI; - this.importStartSize = this.importWordIndex.termIndex().sizesMax(); - } - - /** - * @see Importer#getJobName() - */ - public String getJobName() { - return this.importWordIndex.getLocation().toString(); - } - - /** - * @see Importer#getStatus() - */ - public String getStatus() { - final StringBuilder theStatus = new StringBuilder(); - - theStatus.append("Hash=").append(this.wordHash).append("\n"); - theStatus.append("#URL=").append(this.urlCounter).append("\n"); - theStatus.append("#Word Entity=").append(this.wordCounter).append("\n"); - theStatus.append("#Word Entry={").append(this.entryCounter); - theStatus.append(" ,NotBound=").append(this.notBoundEntryCounter).append("}"); - - return theStatus.toString(); - } - - public void run() { - try { - importWordsDB(); - } finally { - this.globalEnd = System.currentTimeMillis(); - //this.sb.dbImportManager.finishedJobs.add(this); - } - } - - /** - * @see Importer#getProcessingStatusPercent() - */ - public int getProcessingStatusPercent() { - // thid seems to be better: - // (this.importStartSize-this.importWordIndex.size())*100/((this.importStartSize==0)?1:this.importStartSize); - // but maxint (2,147,483,647) could be exceeded when WordIndexes reach 20M entries - //return (this.importStartSize-this.importWordIndex.size())/((this.importStartSize<100)?1:(this.importStartSize)/100); - return (int)(this.wordCounter)/((this.importStartSize<100)?1:(this.importStartSize)/100); - } - - /** - * @see Importer#getElapsedTime() - */ - public long getEstimatedTime() { - return (this.wordCounter==0)?0:((this.importStartSize*getElapsedTime())/this.wordCounter)-getElapsedTime(); - } - - public void importWordsDB() { - this.log.logInfo("STARTING DB-IMPORT"); - - try { - this.log.logInfo("Importing DB from '" + this.importWordIndex.getLocation().getAbsolutePath() + "'"); - this.log.logInfo("Home word index contains " + homeWordIndex.termIndex().sizesMax() + " words and " + homeWordIndex.urlMetadata().size() + " URLs."); - this.log.logInfo("Import word index contains " + 
this.importWordIndex.termIndex().sizesMax() + " words and " + this.importWordIndex.urlMetadata().size() + " URLs."); - - final HashSet unknownUrlBuffer = new HashSet(); - final HashSet importedUrlBuffer = new HashSet(); - - // iterate over all words from import db - //Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, CrawlSwitchboard.RL_WORDFILES, false); - Iterator> indexContainerIterator = this.importWordIndex.termIndex().references(this.wordChunkStartHash, false, 100, false).iterator(); - while (!isAborted() && indexContainerIterator.hasNext()) { - - final TreeSet entityUrls = new TreeSet(); - ReferenceContainer newContainer = null; - try { - this.wordCounter++; - newContainer = indexContainerIterator.next(); - this.wordHash = newContainer.getTermHash(); - - // loop throug the entities of the container and get the - // urlhash - final Iterator importWordIdxEntries = newContainer.entries(); - Reference importWordIdxEntry; - while (importWordIdxEntries.hasNext()) { - // testing if import process was aborted - if (isAborted()) break; - - // getting next word index entry - importWordIdxEntry = importWordIdxEntries.next(); - final String urlHash = importWordIdxEntry.metadataHash(); - entityUrls.add(urlHash); - } - - final Iterator urlIter = entityUrls.iterator(); - while (urlIter.hasNext()) { - if (isAborted()) break; - final String urlHash = urlIter.next(); - - if (!importedUrlBuffer.contains(urlHash)) { - if (unknownUrlBuffer.contains(urlHash)) { - // url known as unknown - unknownUrlBuffer.add(urlHash); - notBoundEntryCounter++; - newContainer.remove(urlHash); - continue; - } - // we need to import the url - - // getting the url entry - final URIMetadataRow urlEntry = this.importWordIndex.urlMetadata().load(urlHash, null, 0); - if (urlEntry != null) { - - /* write it into the home url db */ - homeWordIndex.urlMetadata().store(urlEntry); - importedUrlBuffer.add(urlHash); - this.urlCounter++; - - if (this.urlCounter % 500 == 0) { - this.log.logFine(this.urlCounter + " URLs processed so far."); - } - - } else { - unknownUrlBuffer.add(urlHash); - notBoundEntryCounter++; - newContainer.remove(urlHash); - continue; - } - //} else { - // already known url - } - this.entryCounter++; - } - - // testing if import process was aborted - if (isAborted()) break; - - // importing entity container to home db - if (!newContainer.isEmpty()) { homeWordIndex.termIndex().add(newContainer); } - - // delete complete index entity file - this.importWordIndex.termIndex().delete(this.wordHash); - - // print out some statistical information - if (this.entryCounter % 500 == 0) { - this.log.logFine(this.entryCounter + " word entries and " + this.wordCounter + " word entities processed so far."); - } - - if (this.wordCounter%500 == 0) { - this.wordChunkEndHash = this.wordHash; - this.wordChunkEnd = System.currentTimeMillis(); - final long duration = this.wordChunkEnd - this.wordChunkStart; - this.log.logInfo(this.wordCounter + " word entities imported " + - "[" + this.wordChunkStartHash + " .. 
" + this.wordChunkEndHash + "] " + - this.getProcessingStatusPercent() + "%\n" + - "Speed: "+ 500*1000/duration + " word entities/s" + - " | Elapsed time: " + DateFormatter.formatInterval(getElapsedTime()) + - " | Estimated time: " + DateFormatter.formatInterval(getEstimatedTime()) + "\n" + - "Home Words = " + homeWordIndex.termIndex().sizesMax() + - " | Import Words = " + this.importWordIndex.termIndex().sizesMax()); - this.wordChunkStart = this.wordChunkEnd; - this.wordChunkStartHash = this.wordChunkEndHash; - } - - } catch (final Exception e) { - this.log.logSevere("Import of word entity '" + this.wordHash + "' failed.",e); - } finally { - if (newContainer != null) newContainer.clear(); - } - - if (!indexContainerIterator.hasNext()) { - // We may not be finished yet, try to get the next chunk of wordHashes - final TreeSet> containers = this.importWordIndex.termIndex().references(this.wordHash, false, 100, false); - indexContainerIterator = containers.iterator(); - // Make sure we don't get the same wordhash twice, but don't skip a word - if ((indexContainerIterator.hasNext())&&(!this.wordHash.equals((indexContainerIterator.next()).getTermHash()))) { - indexContainerIterator = containers.iterator(); - } - } - } - - this.log.logInfo("Home word index contains " + homeWordIndex.termIndex().sizesMax() + " words and " + homeWordIndex.urlMetadata().size() + " URLs."); - this.log.logInfo("Import word index contains " + this.importWordIndex.termIndex().sizesMax() + " words and " + this.importWordIndex.urlMetadata().size() + " URLs."); - } catch (final Exception e) { - this.log.logSevere("Database import failed.",e); - Log.logException(e); - this.error = e.toString(); - } finally { - this.log.logInfo("Import process finished."); - if (this.importWordIndex != null) try { this.importWordIndex.close(); } catch (final Exception e){} - } - } - -} diff --git a/source/de/anomic/crawler/NoticeURLImporter.java b/source/de/anomic/crawler/NoticeURLImporter.java deleted file mode 100644 index 6b4f04f58..000000000 --- a/source/de/anomic/crawler/NoticeURLImporter.java +++ /dev/null @@ -1,228 +0,0 @@ -package de.anomic.crawler; - -import java.io.File; -import java.io.IOException; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; - -import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.FileUtils; - -import de.anomic.crawler.CrawlSwitchboard; -import de.anomic.crawler.retrieval.Request; - -public class NoticeURLImporter extends AbstractImporter implements Importer { - - private File plasmaPath = null; - private final HashSet importProfileHandleCache = new HashSet(); - private CrawlProfile importProfileDB; - private final NoticedURL importNurlDB; - private final int importStartSize; - private int urlCount = 0; - private int profileCount = 0; - private final CrawlQueues crawlQueues; - private final CrawlProfile activeCrawls; - private final ImporterManager dbImportManager; - - public NoticeURLImporter(final File crawlerPath, final CrawlQueues crawlQueues, final CrawlProfile activeCrawls, final ImporterManager dbImportManager) { - super("NURL"); - this.crawlQueues = crawlQueues; - this.activeCrawls = activeCrawls; - this.dbImportManager = dbImportManager; - - // TODO: we need more error handling here - this.plasmaPath = crawlerPath; - final File noticeUrlDbFile = new File(plasmaPath,"urlNotice1.db"); - final File profileDbFile = new File(plasmaPath, CrawlSwitchboard.DBFILE_ACTIVE_CRAWL_PROFILES); - - String errorMsg = null; - if (!plasmaPath.exists()) - errorMsg = "The 
import path '" + plasmaPath + "' does not exist."; - else if (!plasmaPath.isDirectory()) - errorMsg = "The import path '" + plasmaPath + "' is not a directory."; - else if (!plasmaPath.canRead()) - errorMsg = "The import path '" + plasmaPath + "' is not readable."; - else if (!plasmaPath.canWrite()) - errorMsg = "The import path '" + plasmaPath + "' is not writeable."; - - else if (!noticeUrlDbFile.exists()) - errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' does not exist."; - else if (noticeUrlDbFile.isDirectory()) - errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' is not a file."; - else if (!noticeUrlDbFile.canRead()) - errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' is not readable."; - else if (!noticeUrlDbFile.canWrite()) - errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' is not writeable."; - - else if (!profileDbFile.exists()) - errorMsg = "The profileDB file '" + profileDbFile + "' does not exist."; - else if (profileDbFile.isDirectory()) - errorMsg = "The profileDB file '" + profileDbFile + "' is not a file."; - else if (!profileDbFile.canRead()) - errorMsg = "The profileDB file '" + profileDbFile + "' is not readable."; -// else if (!profileDbFile.canWrite()) -// errorMsg = "The profileDB file '" + profileDbFile + "' is not writeable."; - - if (errorMsg != null) { - this.log.logSevere(errorMsg); - throw new IllegalArgumentException(errorMsg); - } - - // init noticeUrlDB - this.log.logInfo("Initializing the source noticeUrlDB"); - this.importNurlDB = new NoticedURL(plasmaPath, false, false); - this.importStartSize = this.importNurlDB.size(); - //int stackSize = this.importNurlDB.stackSize(); - - // init profile DB - this.log.logInfo("Initializing the source profileDB"); - try { - this.importProfileDB = new CrawlProfile(profileDbFile); - } catch (IOException e) { - FileUtils.deletedelete(profileDbFile); - try { - this.importProfileDB = new CrawlProfile(profileDbFile); - } catch (IOException e1) { - Log.logException(e1); - this.importProfileDB = null; - } - } - } - - public long getEstimatedTime() { - return (this.urlCount==0)?0:((this.importStartSize*getElapsedTime())/(this.urlCount))-getElapsedTime(); - } - - public String getJobName() { - return this.plasmaPath.toString(); - } - - public int getProcessingStatusPercent() { - return (this.urlCount)/((this.importStartSize<100)?1:(this.importStartSize)/100); - } - - public String getStatus() { - final StringBuilder theStatus = new StringBuilder(); - - theStatus.append("#URLs=").append(this.urlCount).append("\n"); - theStatus.append("#Profiles=").append(this.profileCount); - - return theStatus.toString(); - } - - public void run() { - try { - // waiting on init thread to finish - //this.importNurlDB.waitOnInitThread(); - - // the stack types we want to import - final int[] stackTypes = new int[] { - NoticedURL.STACK_TYPE_CORE, - NoticedURL.STACK_TYPE_LIMIT, - NoticedURL.STACK_TYPE_REMOTE, - -1}; - - // looping through the various stacks - for (int stackType=0; stackType< stackTypes.length; stackType++) { - if (stackTypes[stackType] != -1) { - this.log.logInfo("Starting to import stacktype '" + stackTypes[stackType] + "' containing '" + this.importNurlDB.stackSize(stackTypes[stackType]) + "' entries."); - } else { - this.log.logInfo("Starting to import '" + this.importNurlDB.size() + "' entries not available in any stack."); - } - - // getting an iterator and loop through the URL entries - final Iterator entryIter = (stackTypes[stackType] == -1) ? 
this.importNurlDB.iterator(stackType) : null; - while (true) { - - String nextHash = null; - Request nextEntry = null; - - try { - if (stackTypes[stackType] != -1) { - if (this.importNurlDB.stackSize(stackTypes[stackType]) == 0) break; - - this.urlCount++; - nextEntry = this.importNurlDB.pop(stackTypes[stackType], false, null); - nextHash = nextEntry.url().hash(); - } else { - if (!entryIter.hasNext()) break; - - this.urlCount++; - nextEntry = entryIter.next(); - nextHash = nextEntry.url().hash(); - } - } catch (final IOException e) { - this.log.logWarning("Unable to import entry: " + e.toString()); - - if ((stackTypes[stackType] != -1) &&(this.importNurlDB.stackSize(stackTypes[stackType]) == 0)) break; - continue; - } - - // getting a handler to the crawling profile the url belongs to - try { - final String profileHandle = nextEntry.profileHandle(); - if (profileHandle == null) { - this.log.logWarning("Profile handle of url entry '" + nextHash + "' unknown."); - continue; - } - - // if we havn't imported the profile until yet we need to do it now - if (!this.importProfileHandleCache.contains(profileHandle)) { - - // testing if the profile is already known - final CrawlProfile.entry profileEntry = this.activeCrawls.getEntry(profileHandle); - - // if not we need to import it - if (profileEntry == null) { - // copy and store the source profile entry into the destination db - final CrawlProfile.entry sourceEntry = this.importProfileDB.getEntry(profileHandle); - if (sourceEntry != null) { - this.profileCount++; - this.importProfileHandleCache.add(profileHandle); - HashMap mapclone = new HashMap(); - mapclone.putAll(sourceEntry.map()); - this.activeCrawls.newEntry(mapclone); - } else { - this.log.logWarning("Profile '" + profileHandle + "' of url entry '" + nextHash + "' unknown."); - continue; - } - } - } - - // if the url does not alredy exists in the destination stack we insert it now - if (!this.crawlQueues.noticeURL.existsInStack(nextHash)) { - this.crawlQueues.noticeURL.push((stackTypes[stackType] != -1) ? stackTypes[stackType] : NoticedURL.STACK_TYPE_CORE, nextEntry); - } - - // removing hash from the import db - } finally { - this.importNurlDB.removeByURLHash(nextHash); - } - - if (this.urlCount % 100 == 0) { - if (this.log.isFine()) this.log.logFine(this.urlCount + " URLs and '" + this.profileCount + "' profile entries processed so far."); - } - if (this.isAborted()) break; - } - this.log.logInfo("Finished to import stacktype '" + stackTypes[stackType] + "'"); - } - - //int size = this.importNurlDB.size(); - //int stackSize = this.importNurlDB.stackSize(); - - // TODO: what todo with nurlDB entries that do not exist in any stack? 
- - } catch (final Exception e) { - this.error = e.toString(); - this.log.logSevere("Import process had detected an error",e); - } finally { - this.log.logInfo("Import process finished."); - this.globalEnd = System.currentTimeMillis(); - this.dbImportManager.finishedJobs.add(this); - this.importNurlDB.close(); - this.importProfileDB.close(); - } - } - -} diff --git a/source/de/anomic/server/serverCore.java b/source/de/anomic/server/serverCore.java index f2bed9440..03f4a7f17 100644 --- a/source/de/anomic/server/serverCore.java +++ b/source/de/anomic/server/serverCore.java @@ -542,7 +542,7 @@ public final class serverCore extends AbstractBusyThread implements BusyThread { if (this.controlSocket != null) try { this.controlSocket.close(); log.logInfo("Closing main socket of thread '" + this.getName() + "'"); - //this.controlSocket = null; + this.controlSocket = null; } catch (final Exception e) {} } @@ -808,7 +808,7 @@ public final class serverCore extends AbstractBusyThread implements BusyThread { } public boolean isSSL() { - return this.controlSocket instanceof SSLSocket; + return this.controlSocket != null && this.controlSocket instanceof SSLSocket; } } diff --git a/source/net/yacy/repository/Blacklist.java b/source/net/yacy/repository/Blacklist.java index c2d4ef03f..002d355bd 100644 --- a/source/net/yacy/repository/Blacklist.java +++ b/source/net/yacy/repository/Blacklist.java @@ -42,6 +42,7 @@ import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.SetTools; @@ -329,6 +330,11 @@ public class Blacklist { if (!matched && (app = blacklistMapMatched.get(hostlow)) != null) { for (int i=app.size()-1; !matched && i>-1; i--) { pp = app.get(i); + if (pp.indexOf("?*") > 0) { + // prevent "Dangling meta character '*'" exception + Log.logWarning("Blacklist", "ignored blacklist path to prevent 'Dangling meta character' exception: " + pp); + continue; + } matched |= ((pp.equals("*")) || (path.matches(pp))); } }
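The Blacklist.java hunk above guards against path expressions containing "?*": blacklist paths are evaluated with path.matches(pp), i.e. as Java regular expressions, but users tend to write them as shell-style globs such as index.php?*. As a regex that places the '*' quantifier directly after the '?' quantifier, which java.util.regex rejects with a PatternSyntaxException ("Dangling meta character"). A small self-contained sketch of the failure and of a compile-based validation; the helper is hypothetical, the patch itself simply skips entries containing "?*":

import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

public class DanglingMetaCharacterDemo {

    // Returns true if the blacklist path expression compiles as a Java regex.
    static boolean isUsablePattern(String pattern) {
        try {
            Pattern.compile(pattern);
            return true;
        } catch (PatternSyntaxException e) {
            // For "index.php?*" this reports: Dangling meta character '*' near index 10
            return false;
        }
    }

    public static void main(String[] args) {
        String globStyle = "index.php?*";        // glob-style entry as a user might write it
        String regexStyle = "index\\.php\\?.*";  // the equivalent regular expression

        System.out.println(isUsablePattern(globStyle));  // false: would throw inside path.matches(pp)
        System.out.println(isUsablePattern(regexStyle)); // true
        System.out.println("index.php?topic=1".matches(regexStyle)); // true
    }
}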
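The serverCore.java hunk relates to the commit note about endless loops while disconnecting HTTP sessions: the session now clears its socket reference after closing it, and isSSL() tolerates a session that has already been disconnected. A condensed sketch of that pattern, with names abbreviated from the patch:

import java.net.Socket;
import javax.net.ssl.SSLSocket;

// Condensed illustration of the close()/isSSL() behaviour after this patch: once
// the control socket is closed, the reference is cleared so later calls see the
// session as disconnected instead of repeatedly working on a dead socket.
class SessionSketch {
    private Socket controlSocket;

    SessionSketch(Socket socket) {
        this.controlSocket = socket;
    }

    void close() {
        if (this.controlSocket != null) {
            try {
                this.controlSocket.close();
            } catch (Exception e) {
                // the socket is being torn down anyway
            }
            this.controlSocket = null; // mark the session as disconnected
        }
    }

    boolean isSSL() {
        // guard against a session whose socket has already been released
        return this.controlSocket != null && this.controlSocket instanceof SSLSocket;
    }
}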