From 44fa94ac5205c088eed17199efd0b951f5d4d89b Mon Sep 17 00:00:00 2001 From: theli Date: Tue, 6 Dec 2005 10:41:19 +0000 Subject: [PATCH] *) Modifications for dbImport functionality - dbImporter threads are now shutdown by the switchboard on server shutdown - adding possibility to pause a importer thread via GUI - Bugfix for abort function See: http://www.yacy-forum.de/viewtopic.php?p=13363#13363 *) Modification of content parser configuration - now it's possible to configure which parsers should be enabled for the proxy, crawler, icap, etc. separately - *) htmlFilterContentScraper.java - adding regular expression to normalize URLs containing /../ and /./ parts *) httpc.java - adding functionality to unzip gzipped content - requested by roland: should be used later to allow gzipped seed lists git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1170 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexImport_p.html | 25 +- htroot/IndexImport_p.java | 69 +++-- htroot/QuickCrawlLink_p.html | 3 +- htroot/SettingsAck_p.html | 10 +- htroot/SettingsAck_p.java | 94 ++++-- htroot/Settings_Parser.inc | 18 +- htroot/Settings_p.java | 29 +- .../htmlFilter/htmlFilterContentScraper.java | 11 + source/de/anomic/http/httpc.java | 42 ++- source/de/anomic/http/httpdProxyHandler.java | 2 +- source/de/anomic/icap/icapd.java | 3 +- .../parser/mimeType/mimeTypeParser.java | 1 + .../de/anomic/plasma/plasmaCrawlWorker.java | 4 +- source/de/anomic/plasma/plasmaDbImporter.java | 85 +++++- source/de/anomic/plasma/plasmaParser.java | 284 +++++++----------- .../anomic/plasma/plasmaParserDocument.java | 2 +- .../de/anomic/plasma/plasmaSwitchboard.java | 14 +- .../anomic/urlRedirector/urlRedirectord.java | 6 +- source/de/anomic/yacy/yacyVersion.java | 8 + yacy.init | 6 +- 20 files changed, 442 insertions(+), 274 deletions(-) diff --git a/htroot/IndexImport_p.html b/htroot/IndexImport_p.html index acf5435c0..42f4fb415 100644 --- a/htroot/IndexImport_p.html +++ b/htroot/IndexImport_p.html 
@@ -30,7 +30,6 @@
-

Currently running jobs

@@ -44,12 +43,15 @@ - + + #{running.jobs}# + + - + @@ -59,14 +61,21 @@ + + + #{/running.jobs}#
# URLs # Word
Entities
# Word
Entries
Stop ImportAbort ImportPause Import
#[path]##(stopped)#Finished::Running#(/stopped)##(status)#Finished::Running::Paused#(/status)# #[percent]# #[elapsed]# #[estimated]##[word_entry_num]# #(stopped)#:: - - + #(/stopped)# - + #(paused)# + + :: + + #(/paused)# +
- +


@@ -86,7 +95,7 @@ #{finished.jobs}# #[path]# - #(stopped)#Finished::Error: #[errorMsg]##(/stopped)# + #(status)#Finished::Error: #[errorMsg]#::Paused#(/status)# #[percent]# #[elapsed]# #[wordHash]# diff --git a/htroot/IndexImport_p.java b/htroot/IndexImport_p.java index 7dfd44920..6ce30b34c 100644 --- a/htroot/IndexImport_p.java +++ b/htroot/IndexImport_p.java @@ -102,8 +102,12 @@ public final class IndexImport_p { plasmaDbImporter.finishedJobs.clear(); prop.put("LOCATION",""); return prop; - } else if (post.containsKey("stopIndexDbImport")) { - // getting the job nr of the thread that should be stopped + } else if ( + (post.containsKey("stopIndexDbImport")) || + (post.containsKey("pauseIndexDbImport")) || + (post.containsKey("continueIndexDbImport")) + ) { + // getting the job nr of the thread String jobNr = (String) post.get("jobNr"); Thread[] importThreads = new Thread[plasmaDbImporter.runningJobs.activeCount()*2]; @@ -112,12 +116,13 @@ public final class IndexImport_p { for (int i=0; i < activeCount; i++) { plasmaDbImporter currThread = (plasmaDbImporter) importThreads[i]; if (currThread.getJobNr() == Integer.valueOf(jobNr).intValue()) { - currThread.stoppIt(); - try { - currThread.join(); - } catch (InterruptedException e) { - // TODO Auto-generated catch block - e.printStackTrace(); + if (post.containsKey("stopIndexDbImport")) { + currThread.stoppIt(); + try { currThread.join(); } catch (InterruptedException e) {e.printStackTrace();} + } else if (post.containsKey("pauseIndexDbImport")) { + currThread.pauseIt(); + } else if (post.containsKey("continueIndexDbImport")) { + currThread.continueIt(); } break; } @@ -138,30 +143,30 @@ public final class IndexImport_p { for (int i=0; i < activeCount; i++) { plasmaDbImporter currThread = (plasmaDbImporter) importThreads[i]; + + // root path of the source db + prop.put("running.jobs_" + i + "_path", currThread.getImportRoot().toString()); + + // specifies if the importer is still running + prop.put("running.jobs_" + i + 
"_stopped", currThread.isAlive() ? 1:0); + + // specifies if the importer was paused + prop.put("running.jobs_" + i + "_paused", currThread.isPaused() ? 1:0); - File importPath = currThread.getImportRoot(); - String currWordHash = currThread.getCurrentWordhash(); - long currWordEntryCount = currThread.getWordEntryCounter(); - long currWordEntityCounter = currThread.getWordEntityCounter(); - long currUrlCounter = currThread.getUrlCounter(); - //long currImportDbSize = currThread.getImportWordDbSize(); - long estimatedTime = currThread.getEstimatedTime(); - long elapsedTime = currThread.getElapsedTime(); - int jobNr = currThread.getJobNr(); - int percent = currThread.getProcessingStatus(); + // setting the status + prop.put("running.jobs_" + i + "_status", currThread.isPaused() ? 2 : currThread.isAlive() ? 1 : 0); - boolean isRunning = currThread.isAlive(); + // other information + prop.put("running.jobs_" + i + "_percent", Integer.toString(currThread.getProcessingStatus())); + prop.put("running.jobs_" + i + "_elapsed", serverDate.intervalToString(currThread.getElapsedTime())); + prop.put("running.jobs_" + i + "_estimated", serverDate.intervalToString(currThread.getEstimatedTime())); + prop.put("running.jobs_" + i + "_wordHash", currThread.getCurrentWordhash()); + prop.put("running.jobs_" + i + "_url_num", Long.toString(currThread.getUrlCounter())); + prop.put("running.jobs_" + i + "_word_entity_num", Long.toString(currThread.getWordEntityCounter())); + prop.put("running.jobs_" + i + "_word_entry_num", Long.toString(currThread.getWordEntryCounter())); - prop.put("running.jobs_" + i + "_path", importPath.toString()); - prop.put("running.jobs_" + i + "_stopped", isRunning ? 
1:0); - prop.put("running.jobs_" + i + "_percent", Integer.toString(percent)); - prop.put("running.jobs_" + i + "_elapsed", serverDate.intervalToString(elapsedTime)); - prop.put("running.jobs_" + i + "_estimated", serverDate.intervalToString(estimatedTime)); - prop.put("running.jobs_" + i + "_wordHash", currWordHash); - prop.put("running.jobs_" + i + "_url_num", Long.toString(currUrlCounter)); - prop.put("running.jobs_" + i + "_word_entity_num", Long.toString(currWordEntityCounter)); - prop.put("running.jobs_" + i + "_word_entry_num", Long.toString(currWordEntryCount)); - prop.put("running.jobs_" + i + "_stopped_job_nr", Integer.toString(jobNr)); + // job number of the importer thread + prop.put("running.jobs_" + i + "_job_nr", Integer.toString(currThread.getJobNr())); } prop.put("running.jobs",activeCount); @@ -174,10 +179,10 @@ public final class IndexImport_p { String error = currThread.getError(); prop.put("finished.jobs_" + i + "_path", currThread.getImportRoot().toString()); if (error != null) { - prop.put("finished.jobs_" + i + "_stopped", 2); - prop.put("finished.jobs_" + i + "_stopped_errorMsg", error); + prop.put("finished.jobs_" + i + "_status", 2); + prop.put("finished.jobs_" + i + "_status_errorMsg", error); } else { - prop.put("finished.jobs_" + i + "_stopped", 0); + prop.put("finished.jobs_" + i + "_status", 0); } prop.put("finished.jobs_" + i + "_percent", Integer.toString(currThread.getProcessingStatus())); prop.put("finished.jobs_" + i + "_elapsed", serverDate.intervalToString(currThread.getElapsedTime())); diff --git a/htroot/QuickCrawlLink_p.html b/htroot/QuickCrawlLink_p.html index 1dc541f2d..da1c8be4b 100644 --- a/htroot/QuickCrawlLink_p.html +++ b/htroot/QuickCrawlLink_p.html @@ -35,7 +35,8 @@ If you click on it while browsing, the currently viewed website will be inserted -  Crawl with YaCy +  Crawl with YaCy + :: diff --git a/htroot/SettingsAck_p.html b/htroot/SettingsAck_p.html index cf4a1b55b..43e5f7a8d 100644 --- 
a/htroot/SettingsAck_p.html +++ b/htroot/SettingsAck_p.html @@ -79,13 +79,15 @@ Your Peer Language is: #[peerLang]#
Peer names must not contain characters other than (a-z, A-Z, 0-9, '-', '_') and must not be longer than 80 characters. Your Peer Language is: #[peerLang]#
:: +

The new parser settings were changed successfully.
-Parsing of the following mime-types was enabled:
-