You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
yacy_search_server/source/yacy.java

1323 lines
66 KiB

// yacy.java
// -----------------------
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.yacy.net
// Frankfurt, Germany, 2004, 2005
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import de.anomic.data.translator;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.http.httpd;
import de.anomic.http.httpdFileHandler;
import de.anomic.http.httpdProxyHandler;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroMapObjects;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCondenser;
redesigned NURL-handling: - the general NURL-index for all crawl stack types was splitted into separate indexes for these stacks - the new NURL-index is managed by the crawl balancer - the crawl balancer does not need an internal index any more, it is replaced by the NURL-index - the NURL.Entry was generalized and is now a new class plasmaCrawlEntry - the new class plasmaCrawlEntry replaces also the preNURL.Entry class, and will also replace the switchboardEntry class in the future - the new class plasmaCrawlEntry is more accurate for date entries (holds milliseconds) and can contain larger 'name' entries (anchor tag names) - the EURL object was replaced by a new ZURL object, which is a container for the plasmaCrawlEntry and some tracking information - the EURL index is now filled with ZURL objects - a new index delegatedURL holds ZURL objects about plasmaCrawlEntry obects to track which url is handed over to other peers - redesigned handling of plasmaCrawlEntry - handover, because there is no need any more to convert one entry object into another - found and fixed numerous bugs in the context of crawl state handling - fixed a serious bug in kelondroCache which caused that entries could not be removed - fixed some bugs in online interface and adopted monitor output to new entry objects - adopted yacy protocol to handle new delegatedURL entries all old crawl queues will disappear after this update! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3483 6c8d7289-2bf4-0310-a012-ef5d649a1542
18 years ago
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlNURL;
redesigned NURL-handling: - the general NURL-index for all crawl stack types was splitted into separate indexes for these stacks - the new NURL-index is managed by the crawl balancer - the crawl balancer does not need an internal index any more, it is replaced by the NURL-index - the NURL.Entry was generalized and is now a new class plasmaCrawlEntry - the new class plasmaCrawlEntry replaces also the preNURL.Entry class, and will also replace the switchboardEntry class in the future - the new class plasmaCrawlEntry is more accurate for date entries (holds milliseconds) and can contain larger 'name' entries (anchor tag names) - the EURL object was replaced by a new ZURL object, which is a container for the plasmaCrawlEntry and some tracking information - the EURL index is now filled with ZURL objects - a new index delegatedURL holds ZURL objects about plasmaCrawlEntry obects to track which url is handed over to other peers - redesigned handling of plasmaCrawlEntry - handover, because there is no need any more to convert one entry object into another - found and fixed numerous bugs in the context of crawl state handling - fixed a serious bug in kelondroCache which caused that entries could not be removed - fixed some bugs in online interface and adopted monitor output to new entry objects - adopted yacy protocol to handle new delegatedURL entries all old crawl queues will disappear after this update! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3483 6c8d7289-2bf4-0310-a012-ef5d649a1542
18 years ago
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaSwitchboard;
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverMemory;
import de.anomic.server.serverSemaphore;
import de.anomic.server.serverSystem;
import de.anomic.server.serverUpdaterCallback;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.enumerateFiles;
import de.anomic.yacy.yacyClient;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyVersion;
/**
* This is the main class of YaCy. Several threads are started from here:
* <ul>
* <li>one single instance of the plasmaSwitchboard is generated, which itself
* starts a thread with a plasmaHTMLCache object. This object simply counts
* files sizes in the cache and terminates them. It also generates a
* plasmaCrawlerLoader object, which may itself start some more httpc-calling
* threads to load web pages. They terminate automatically when a page has
* loaded.
* <li>one serverCore - thread is started, which implements a multi-threaded
* server. The process may start itself many more processes that handle
* connections.lo
* <li>finally, all idle-dependent processes are written in a queue in
* plasmaSwitchboard which are worked off inside an idle-sensitive loop of the
* main process. (here)
* </ul>
*
* On termination, the following must be done:
* <ul>
* <li>stop feeding of the crawling process because it othervise fills the
* indexing queue.
* <li>say goodbye to connected peers and disable new connections. Don't wait for
* success.
* <li>first terminate the serverCore thread. This prevents that new cache
* objects are queued.
* <li>wait that the plasmaHTMLCache terminates (it should be normal that this
* process already has terminated).
* <li>then wait for termination of all loader process of the
* plasmaCrawlerLoader.
* <li>work off the indexing and cache storage queue. These values are inside a
* RAM cache and would be lost otherwise.
* <li>write all settings.
* <li>terminate.
* </ul>
*/
public final class yacy {
// static objects
private static String vString = "@REPL_VERSION@";
private static double version = 0.1;
private static final String vDATE = "@REPL_DATE@";
private static final String copyright = "[ YaCy v" + vString + ", build " + vDATE + " by Michael Christen / www.yacy.net ]";
private static final String hline = "-------------------------------------------------------------------------------";
/**
* a reference to the {@link plasmaSwitchboard} created by the
* {@link yacy#startup(String, long, long)} method.
*/
private static plasmaSwitchboard sb = null;
/**
* Semaphore needed by {@link yacy#setUpdaterCallback(serverUpdaterCallback)} to block
* until the {@link plasmaSwitchboard }object was created.
*/
private static serverSemaphore sbSync = new serverSemaphore(0);
/**
* Semaphore needed by {@link yacy#waitForFinishedStartup()} to block
* until startup has finished
*/
private static serverSemaphore startupFinishedSync = new serverSemaphore(0);
/**
* Converts combined version-string to a pretty string, e.g. "0.435/01818" or "dev/01818" (development version) or "dev/00000" (in case of wrong input)
*
* @param ver Combined version string matching regular expression: "\A(\d+\.\d{3})(\d{4}|\d{5})\z" <br>
* (i.e.: start of input, 1 or more digits in front of decimal point, decimal point followed by 3 digits as major version, 4 or 5 digits for SVN-Version, end of input)
* @return If the major version is &lt; 0.11 - major version is separated from SVN-version by '/', e.g. "0.435/01818" <br>
* If the major version is &gt;= 0.11 - major version is replaced by "dev" and separated SVN-version by '/', e.g."dev/01818" <br>
* "dev/00000" - If the input does not matcht the regular expression above
*/
public static String combined2prettyVersion(String ver) {
return combined2prettyVersion(ver, "");
}
public static String combined2prettyVersion(String ver, String computerName) {
final Matcher matcher = Pattern.compile("\\A(\\d+\\.\\d{1,3})(\\d{0,5})\\z").matcher(ver);
if (!matcher.find()) {
serverLog.logWarning("STARTUP", "Peer '"+computerName+"': wrong format of version-string: '" + ver + "'. Using default string 'dev/00000' instead");
return "dev/00000";
}
String mainversion = (Double.parseDouble(matcher.group(1)) < 0.11 ? "dev" : matcher.group(1));
String revision = matcher.group(2);
for(int i=revision.length();i<5;++i) revision += "0";
return mainversion+"/"+revision;
}
/**
* Combines the version of YaCy with the versionnumber from SVN to a
* combined version
*
* @param version Current given version.
* @param svn Current version given from SVN.
* @return String with the combined version.
*/
public static double versvn2combinedVersion(double v, int svn) {
return (Math.rint((v*100000000.0) + ((double)svn))/100000000);
}
/**
* Starts up the whole application. Sets up all datastructures and starts
* the main threads.
*
* @param homePath Root-path where all information is to be found.
* @param startupFree free memory at startup time, to be used later for statistics
*/
private static void startup(String homePath, long startupMemFree, long startupMemTotal) {
long startup = System.currentTimeMillis();
int oldRev=0;
int newRev=0;
try {
// start up
System.out.println(copyright);
System.out.println(hline);
// check java version
try {
/*String[] check =*/ "a,b".split(","); // split needs java 1.4
} catch (NoSuchMethodError e) {
System.err.println("STARTUP: Java Version too low. You need at least Java 1.4.2 to run YaCy");
Thread.sleep(3000);
System.exit(-1);
}
// ensure that there is a DATA directory, if not, create one and if that fails warn and die
File f = new File(homePath); if (!(f.exists())) f.mkdirs();
f = new File(homePath, "DATA/"); if (!(f.exists())) f.mkdirs();
if (!(f.exists())) {
System.err.println("Error creating DATA-directory in " + homePath.toString() + " . Please check your write-permission for this folder. YaCy will now terminate.");
System.exit(-1);
}
// setting up logging
f = new File(homePath, "DATA/LOG/"); if (!(f.exists())) f.mkdirs();
if (!((new File(homePath, "DATA/LOG/yacy.logging")).exists())) try {
serverFileUtils.copy(new File(homePath, "yacy.logging"), new File(homePath, "DATA/LOG/yacy.logging"));
}catch (IOException e){
System.out.println("could not copy yacy.logging");
}
try{
serverLog.configureLogging(homePath,new File(homePath, "DATA/LOG/yacy.logging"));
} catch (IOException e) {
System.out.println("could not find logging properties in homePath=" + homePath);
e.printStackTrace();
}
serverLog.logConfig("STARTUP", "java version " + System.getProperty("java.version", "no-java-version"));
serverLog.logConfig("STARTUP", "Application Root Path: " + homePath);
serverLog.logConfig("STARTUP", "Time Zone: UTC" + serverDate.UTCDiffString() + "; UTC+0000 is " + System.currentTimeMillis());
serverLog.logConfig("STARTUP", "Maximum file system path length: " + serverSystem.maxPathLength);
f = new File(homePath, "DATA/yacy.running");
if (f.exists()) { // another instance running? VM crash? User will have to care about this
serverLog.logSevere("STARTUP", "the file " + f + " exists, this usually means that another instance of YaCy is using this DATA-folder.");
serverLog.logSevere("STARTUP", "please make sure that DATA can be used exclusively by one YaCy. Quitting...");
System.exit(-1);
} else {
f.createNewFile();
f.deleteOnExit();
}
/*
// Testing if the yacy archive file were unzipped correctly.
// This test is needed because of classfile-names longer than 100 chars
// which could cause problems with incompatible unzip software.
// See:
// - http://www.yacy-forum.de/viewtopic.php?t=1763
// - http://www.yacy-forum.de/viewtopic.php?t=715
// - http://www.yacy-forum.de/viewtopic.php?t=1674
File unzipTest = new File(homePath,"doc/This_is_a_test_if_the_archive_file_containing_YaCy_was_unpacked_correctly_If_not_please_use_gnu_tar_instead.txt");
if (!unzipTest.exists()) {
String errorMsg = "The archive file containing YaCy was not unpacked correctly. " +
"Please use 'GNU-Tar' or upgrade to a newer version of your unzip software.\n" +
"For detailed information on this bug see: " +
"http://www.yacy-forum.de/viewtopic.php?t=715";
System.err.println(errorMsg);
serverLog.logSevere("STARTUP", errorMsg);
System.exit(1);
}
*/
sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf");
sbSync.V(); // signal that the sb reference was set
// save information about available memory at startup time
sb.setConfig("memoryFreeAfterStartup", startupMemFree);
sb.setConfig("memoryTotalAfterStartup", startupMemTotal);
// hardcoded, forced, temporary value-migration
sb.setConfig("htTemplatePath", "htroot/env/templates");
sb.setConfig("parseableExt", "html,htm,txt,php,shtml,asp");
// if we are running an SVN version, we try to detect the used svn revision now ...
final Properties buildProp = new Properties();
File buildPropFile = null;
try {
buildPropFile = new File(homePath,"build.properties");
buildProp.load(new FileInputStream(buildPropFile));
} catch (Exception e) {
serverLog.logWarning("STARTUP", buildPropFile.toString() + " not found in settings path");
}
oldRev=Integer.parseInt(sb.getConfig("svnRevision", "0"));
try {
if (buildProp.containsKey("releaseNr")) {
// this normally looks like this: $Revision$
final String svnReleaseNrStr = buildProp.getProperty("releaseNr");
final Pattern pattern = Pattern.compile("\\$Revision:\\s(.*)\\s\\$",Pattern.DOTALL+Pattern.CASE_INSENSITIVE);
final Matcher matcher = pattern.matcher(svnReleaseNrStr);
if (matcher.find()) {
final String svrReleaseNr = matcher.group(1);
try {
try {version = Double.parseDouble(vString);} catch (NumberFormatException e) {version = (float) 0.1;}
version = versvn2combinedVersion(version, Integer.parseInt(svrReleaseNr));
} catch (NumberFormatException e) {}
sb.setConfig("svnRevision", svrReleaseNr);
}
}
newRev=Integer.parseInt(sb.getConfig("svnRevision", "0"));
} catch (Exception e) {
System.err.println("Unable to determine the currently used SVN revision number.");
}
sb.setConfig("version", Double.toString(version));
sb.setConfig("vString", combined2prettyVersion(Double.toString(version)));
sb.setConfig("vdate", vDATE);
sb.setConfig("applicationRoot", homePath);
sb.startupTime = startup;
serverLog.logConfig("STARTUP", "YACY Version: " + version + ", Built " + vDATE);
yacyVersion.latestRelease = version;
// read environment
int timeout = Math.max(20000, Integer.parseInt(sb.getConfig("httpdTimeout", "20000")));
// create some directories
final File htRootPath = new File(homePath, sb.getConfig("htRootPath", "htroot"));
final File htDocsPath = new File(homePath, sb.getConfig("htDocsPath", "DATA/HTDOCS"));
if (!(htDocsPath.exists())) htDocsPath.mkdir();
//final File htTemplatePath = new File(homePath, sb.getConfig("htTemplatePath","htdocs"));
// create default notifier picture
//TODO: Use templates instead of copying images ...
if (!((new File(htDocsPath, "notifier.gif")).exists())) try {
serverFileUtils.copy(new File(htRootPath, "env/grafics/empty.gif"),
new File(htDocsPath, "notifier.gif"));
} catch (IOException e) {}
final File htdocsDefaultReadme = new File(htDocsPath, "readme.txt");
if (!(htdocsDefaultReadme.exists())) try {serverFileUtils.write((
"This is your root directory for individual Web Content\r\n" +
"\r\n" +
"Please place your html files into the www subdirectory.\r\n" +
"The URL of that path is either\r\n" +
"http://www.<your-peer-name>.yacy or\r\n" +
"http://<your-ip>:<your-port>/www\r\n" +
"\r\n" +
"Other subdirectories may be created; they map to corresponding sub-domains.\r\n" +
"This directory shares it's content with the applications htroot path, so you\r\n" +
"may access your yacy search page with\r\n" +
"http://<your-peer-name>.yacy/\r\n" +
"\r\n").getBytes(), htdocsDefaultReadme);} catch (IOException e) {
System.out.println("Error creating htdocs readme: " + e.getMessage());
}
final File wwwDefaultPath = new File(htDocsPath, "www");
if (!(wwwDefaultPath.exists())) wwwDefaultPath.mkdir();
final File shareDefaultPath = new File(htDocsPath, "share");
if (!(shareDefaultPath.exists())) shareDefaultPath.mkdir();
migration.migrate(sb, oldRev, newRev);
// start main threads
final String port = sb.getConfig("port", "8080");
try {
final httpd protocolHandler = new httpd(sb, new httpdFileHandler(sb), new httpdProxyHandler(sb));
final serverCore server = new serverCore(
timeout /*control socket timeout in milliseconds*/,
true /* block attacks (wrong protocol) */,
protocolHandler /*command class*/,
sb,
30000 /*command max length incl. GET args*/);
server.setName("httpd:"+port);
server.setPriority(Thread.MAX_PRIORITY);
server.setObeyIntermission(false);
if (server == null) {
serverLog.logSevere("STARTUP", "Failed to start server. Probably port " + port + " already in use.");
} else {
// first start the server
sb.deployThread("10_httpd", "HTTPD Server/Proxy", "the HTTPD, used as web server and proxy", null, server, 0, 0, 0, 0);
//server.start();
// open the browser window
final boolean browserPopUpTrigger = sb.getConfig("browserPopUpTrigger", "true").equals("true");
if (browserPopUpTrigger) {
String browserPopUpPage = sb.getConfig("browserPopUpPage", "ConfigBasic.html");
boolean properPW = (sb.getConfig("adminAccount", "").length() == 0) && (sb.getConfig(httpd.ADMIN_ACCOUNT_B64MD5, "").length() > 0);
if (!properPW) browserPopUpPage = "ConfigBasic.html";
final String browserPopUpApplication = sb.getConfig("browserPopUpApplication", "netscape");
serverSystem.openBrowser((server.withSSL()?"https":"http") + "://localhost:" + serverCore.getPortNr(port) + "/" + browserPopUpPage, browserPopUpApplication);
}
//Copy the shipped locales into DATA
final File localesPath = new File(homePath, sb.getConfig("localesPath", "DATA/LOCALE"));
final File defaultLocalesPath = new File(homePath, "locales");
try{
final File[] defaultLocales = defaultLocalesPath.listFiles();
localesPath.mkdirs();
for(int i=0;i < defaultLocales.length; i++){
if(defaultLocales[i].getName().endsWith(".lng"))
serverFileUtils.copy(defaultLocales[i], new File(localesPath, defaultLocales[i].getName()));
}
serverLog.logInfo("STARTUP", "Copied the default locales to DATA/LOCALE");
}catch(NullPointerException e){
serverLog.logSevere("STARTUP", "Nullpointer Exception while copying the default Locales");
}
//regenerate Locales from Translationlist, if needed
final String lang = sb.getConfig("htLocaleSelection", "");
if(! lang.equals("") && ! lang.equals("default") ){ //locale is used
String currentRev = "";
try{
final BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(new File( sb.getConfig("htLocalePath", "DATA/HTDOCS/locale"), lang+"/version" ))));
currentRev = br.readLine();
br.close();
}catch(IOException e){
//Error
}
try{ //seperate try, because we want this, even if the file "version" does not exist.
if(! currentRev.equals(sb.getConfig("svnRevision", "")) ){ //is this another version?!
final File sourceDir = new File(sb.getConfig("htRootPath", "htroot"));
final File destDir = new File(sb.getConfig("htLocalePath", "DATA/HTDOCS/locale"), lang);
if(translator.translateFilesRecursive(sourceDir, destDir, new File("DATA/LOCALE/"+lang+".lng"), "html,template,inc", "locale")){ //translate it
//write the new Versionnumber
final BufferedWriter bw = new BufferedWriter(new PrintWriter(new FileWriter(new File(destDir, "version"))));
bw.write(sb.getConfig("svnRevision", "Error getting Version"));
bw.close();
}
}
}catch(IOException e){
//Error
}
}
// registering shutdown hook
serverLog.logConfig("STARTUP", "Registering Shutdown Hook");
final Runtime run = Runtime.getRuntime();
run.addShutdownHook(new shutdownHookThread(Thread.currentThread(), sb));
// save information about available memory after all initializations
//try {
sb.setConfig("memoryFreeAfterInitBGC", Runtime.getRuntime().freeMemory());
sb.setConfig("memoryTotalAfterInitBGC", Runtime.getRuntime().totalMemory());
System.gc();
sb.setConfig("memoryFreeAfterInitAGC", Runtime.getRuntime().freeMemory());
sb.setConfig("memoryTotalAfterInitAGC", Runtime.getRuntime().totalMemory());
//} catch (ConcurrentModificationException e) {}
// signal finished startup
startupFinishedSync.V();
// wait for server shutdown
try {
sb.waitForShutdown();
} catch (Exception e) {
serverLog.logSevere("MAIN CONTROL LOOP", "PANIC: " + e.getMessage(),e);
}
// shut down
serverLog.logConfig("SHUTDOWN", "caught termination signal");
server.terminate(false);
server.interrupt();
if (server.isAlive()) try {
URL u = new URL((server.withSSL()?"https":"http")+"://localhost:" + serverCore.getPortNr(port));
httpc.wget(u, u.getHost(), 1000, null, null, null); // kick server
serverLog.logConfig("SHUTDOWN", "sent termination signal to server socket");
} catch (IOException ee) {
serverLog.logConfig("SHUTDOWN", "termination signal to server socket missed (server shutdown, ok)");
}
// idle until the processes are down
while (server.isAlive()) {
Thread.sleep(2000); // wait a while
}
serverLog.logConfig("SHUTDOWN", "server has terminated");
sb.close();
}
} catch (Exception e) {
serverLog.logSevere("STARTUP", "Unexpected Error: " + e.getClass().getName(),e);
//System.exit(1);
}
} catch (Exception ee) {
serverLog.logSevere("STARTUP", "FATAL ERROR: " + ee.getMessage(),ee);
} finally {
startupFinishedSync.V();
}
serverLog.logConfig("SHUTDOWN", "goodbye. (this is the last line)");
//try {
// System.exit(0);
//} catch (Exception e) {} // was once stopped by de.anomic.net.ftpc$sm.checkExit(ftpc.java:1790)
}
/**
* Loads the configuration from the data-folder.
* FIXME: Why is this called over and over again from every method, instead
* of setting the configurationdata once for this class in main?
*
* @param mes Where are we called from, so that the errormessages can be
* more descriptive.
* @param homePath Root-path where all the information is to be found.
* @return Properties read from the configurationfile.
*/
private static Properties configuration(String mes, String homePath) {
serverLog.logConfig(mes, "Application Root Path: " + homePath.toString());
// read data folder
File dataFolder = new File(homePath, "DATA");
if (!(dataFolder.exists())) {
serverLog.logSevere(mes, "Application was never started or root path wrong.");
System.exit(-1);
}
Properties config = new Properties();
try {
config.load(new FileInputStream(new File(homePath, "DATA/SETTINGS/httpProxy.conf")));
} catch (FileNotFoundException e) {
serverLog.logSevere(mes, "could not find configuration file.");
System.exit(-1);
} catch (IOException e) {
serverLog.logSevere(mes, "could not read configuration file.");
System.exit(-1);
}
return config;
}
public static void shutdown() {
if (sb != null) {
// YaCy is running in the same runtime. we can shutdown via interrupt
sb.terminate();
} else {
String applicationRoot = System.getProperty("user.dir").replace('\\', '/');
shutdown(applicationRoot);
}
}
/**
* Function to set the updater callback class
* @param updaterCallback
* @throws InterruptedException
*/
public static void setUpdaterCallback(serverUpdaterCallback updaterCallback) throws InterruptedException {
sbSync.P();
sb.updaterCallback = updaterCallback;
sbSync.V();
}
/**
* Function allowing the updater to block until the startup has finished
* @throws InterruptedException
*/
public static void waitForFinishedStartup() throws InterruptedException {
startupFinishedSync.P();
startupFinishedSync.V();
}
/**
* Call the shutdown-page of YaCy to tell it to shut down. This method is
* called if you start yacy with the argument -shutdown.
*
* @param homePath Root-path where all the information is to be found.
*/
static void shutdown(String homePath) {
// start up
System.out.println(copyright);
System.out.println(hline);
Properties config = configuration("REMOTE-SHUTDOWN", homePath);
// read port
int port = serverCore.getPortNr(config.getProperty("port", "8080"));
// read password
String encodedPassword = (String) config.get(httpd.ADMIN_ACCOUNT_B64MD5);
if (encodedPassword == null) encodedPassword = ""; // not defined
// send 'wget' to web interface
httpHeader requestHeader = new httpHeader();
requestHeader.put("Authorization", "realm=" + encodedPassword); // for http-authentify
try {
httpc con = httpc.getInstance("localhost", "localhost", port, 10000, false);
httpc.response res = con.GET("Steering.html?shutdown=", requestHeader);
// read response
if (res.status.startsWith("2")) {
serverLog.logConfig("REMOTE-SHUTDOWN", "YACY accepted shutdown command.");
serverLog.logConfig("REMOTE-SHUTDOWN", "Stand by for termination, which may last some seconds.");
ByteArrayOutputStream bos = new ByteArrayOutputStream();
res.writeContent(bos, null);
con.close();
} else {
serverLog.logSevere("REMOTE-SHUTDOWN", "error response from YACY socket: " + res.status);
System.exit(-1);
}
} catch (IOException e) {
serverLog.logSevere("REMOTE-SHUTDOWN", "could not establish connection to YACY socket: " + e.getMessage());
System.exit(-1);
}
// finished
serverLog.logConfig("REMOTE-SHUTDOWN", "SUCCESSFULLY FINISHED remote-shutdown:");
serverLog.logConfig("REMOTE-SHUTDOWN", "YACY will terminate after working off all enqueued tasks.");
}
/**
* This method gets all found words and outputs a statistic about the score
* of the words. The output of this method can be used to create stop-word
* lists. This method will be called if you start yacy with the argument
* -genwordstat.
* FIXME: How can stop-word list be created from this output? What type of
* score is output?
*
* @param homePath Root-Path where all the information is to be found.
*/
private static void genWordstat(String homePath) {
// start up
System.out.println(copyright);
System.out.println(hline);
Properties config = configuration("GEN-WORDSTAT", homePath);
// load words
serverLog.logInfo("GEN-WORDSTAT", "loading words...");
HashMap words = loadWordMap(new File(homePath, "yacy.words"));
// find all hashes
serverLog.logInfo("GEN-WORDSTAT", "searching all word-hash databases...");
File dbRoot = new File(homePath, config.getProperty("dbPath"));
enumerateFiles ef = new enumerateFiles(new File(dbRoot, "WORDS"), true, false, true, true);
File f;
String h;
kelondroMScoreCluster hs = new kelondroMScoreCluster();
while (ef.hasMoreElements()) {
f = (File) ef.nextElement();
h = f.getName().substring(0, yacySeedDB.commonHashLength);
hs.addScore(h, (int) f.length());
}
// list the hashes in reverse order
serverLog.logInfo("GEN-WORDSTAT", "listing words in reverse size order...");
String w;
Iterator i = hs.scores(false);
while (i.hasNext()) {
h = (String) i.next();
w = (String) words.get(h);
if (w == null) System.out.print("# " + h); else System.out.print(w);
System.out.println(" - " + hs.getScore(h));
}
// finished
serverLog.logConfig("GEN-WORDSTAT", "FINISHED");
}
/**
* @param homePath path to the YaCy directory
* @param dbcache cache size in MB
*/
public static void minimizeUrlDB(String homePath) {
// run with "java -classpath classes yacy -minimizeUrlDB"
try {serverLog.configureLogging(homePath,new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
File indexPrimaryRoot = new File(new File(homePath), "DATA/INDEX");
File indexSecondaryRoot = new File(new File(homePath), "DATA/INDEX");
File indexRoot2 = new File(new File(homePath), "DATA/INDEX2");
serverLog log = new serverLog("URL-CLEANUP");
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
try {
log.logInfo("STARTING URL CLEANUP");
// db containing all currently loades urls
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(indexSecondaryRoot, 10000);
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
// db used to hold all neede urls
plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL(indexRoot2, 10000);
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
Runtime rt = Runtime.getRuntime();
int cacheMem = (int)(serverMemory.max-rt.totalMemory());
if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
plasmaWordIndex wordIndex = new plasmaWordIndex(indexPrimaryRoot, indexSecondaryRoot, 10000, log);
Iterator indexContainerIterator = wordIndex.wordContainers("AAAAAAAAAAAA", false, false);
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
long urlCounter = 0, wordCounter = 0;
long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = 0;
String wordChunkStartHash = "AAAAAAAAAAAA", wordChunkEndHash;
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
while (indexContainerIterator.hasNext()) {
indexContainer wordIdxContainer = null;
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
try {
wordCounter++;
wordIdxContainer = (indexContainer) indexContainerIterator.next();
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
// the combined container will fit, read the container
Iterator wordIdxEntries = wordIdxContainer.entries();
indexRWIEntry iEntry;
while (wordIdxEntries.hasNext()) {
iEntry = (indexRWIEntry) wordIdxEntries.next();
String urlHash = iEntry.urlHash();
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
indexURLEntry urlEntry = currentUrlDB.load(urlHash, null);
urlCounter++;
minimizedUrlDB.store(urlEntry);
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
if (urlCounter % 500 == 0) {
log.logInfo(urlCounter + " URLs found so far.");
}
} catch (IOException e) {}
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
}
if (wordCounter%500 == 0) {
wordChunkEndHash = wordIdxContainer.getWordHash();
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
wordChunkEnd = System.currentTimeMillis();
long duration = wordChunkEnd - wordChunkStart;
log.logInfo(wordCounter + " words scanned " +
"[" + wordChunkStartHash + " .. " + wordChunkEndHash + "]\n" +
"Duration: "+ 500*1000/duration + " words/s" +
" | Free memory: " + rt.freeMemory() +
" | Total memory: " + rt.totalMemory());
wordChunkStart = wordChunkEnd;
wordChunkStartHash = wordChunkEndHash;
}
// we have read all elements, now we can close it
wordIdxContainer = null;
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
} catch (Exception e) {
log.logSevere("Exception", e);
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
} finally {
if (wordIdxContainer != null) try { wordIdxContainer = null; } catch (Exception e) {}
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
}
}
log.logInfo("current LURL DB contains " + currentUrlDB.size() + " entries.");
log.logInfo("mimimized LURL DB contains " + minimizedUrlDB.size() + " entries.");
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
currentUrlDB.close();
minimizedUrlDB.close();
wordIndex.close();
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
// TODO: rename the mimimized UrlDB to the name of the previous UrlDB
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
log.logInfo("FINISHED URL CLEANUP, WAIT FOR DUMP");
log.logInfo("You can now backup your old URL DB and rename minimized/urlHash.db to urlHash.db");
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
log.logInfo("TERMINATED URL CLEANUP");
} catch (Exception e) {
log.logSevere("Exception: " + e.getMessage(), e);
} catch (Error e) {
log.logSevere("Error: " + e.getMessage(), e);
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
}
}
/**
* Reads all words from the given file and creates a hashmap, where key is
* the plasma word hash and value is the word itself.
*
* @param wordlist File where the words are stored.
* @return HashMap with the hash-word - relation.
*/
private static HashMap loadWordMap(File wordlist) {
// returns a hash-word - Relation
HashMap wordmap = new HashMap();
try {
String word;
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(wordlist)));
while ((word = br.readLine()) != null) wordmap.put(plasmaCondenser.word2hash(word),word);
br.close();
} catch (IOException e) {}
return wordmap;
}
/**
* Cleans a wordlist in a file according to the length of the words. The
* file with the given filename is read and then only the words in the given
* length-range are written back to the file.
*
* @param wordlist Name of the file the words are stored in.
* @param minlength Minimal needed length for each word to be stored.
* @param maxlength Maximal allowed length for each word to be stored.
*/
private static void cleanwordlist(String wordlist, int minlength, int maxlength) {
// start up
System.out.println(copyright);
System.out.println(hline);
serverLog.logConfig("CLEAN-WORDLIST", "START");
String word;
TreeSet wordset = new TreeSet();
int count = 0;
try {
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(wordlist)));
String seps = "' .,:/-&";
while ((word = br.readLine()) != null) {
word = word.toLowerCase().trim();
for (int i = 0; i < seps.length(); i++) {
if (word.indexOf(seps.charAt(i)) >= 0) word = word.substring(0, word.indexOf(seps.charAt(i)));
}
if ((word.length() >= minlength) && (word.length() <= maxlength)) wordset.add(word);
count++;
}
br.close();
if (wordset.size() != count) {
count = count - wordset.size();
BufferedWriter bw = new BufferedWriter(new PrintWriter(new FileWriter(wordlist)));
while (wordset.size() > 0) {
word = (String) wordset.first();
bw.write(word + "\n");
wordset.remove(word);
}
bw.close();
serverLog.logInfo("CLEAN-WORDLIST", "shrinked wordlist by " + count + " words.");
} else {
serverLog.logInfo("CLEAN-WORDLIST", "not necessary to change wordlist");
}
} catch (IOException e) {
serverLog.logSevere("CLEAN-WORDLIST", "ERROR: " + e.getMessage());
System.exit(-1);
}
// finished
serverLog.logConfig("CLEAN-WORDLIST", "FINISHED");
}
private static void transferCR(String targetaddress, String crfile) {
File f = new File(crfile);
try {
byte[] b = serverFileUtils.read(f);
String result = yacyClient.transfer(targetaddress, f.getName(), b);
if (result == null)
serverLog.logInfo("TRANSFER-CR", "transmitted file " + crfile + " to " + targetaddress + " successfully");
else
serverLog.logInfo("TRANSFER-CR", "error transmitting file " + crfile + " to " + targetaddress + ": " + result);
} catch (IOException e) {
serverLog.logInfo("TRANSFER-CR", "could not read file " + crfile);
}
}
/**
* Generates a text file containing all domains in this peer's DB.
* This may be useful to calculate the YaCy-Blockrank.
*
* @param format String which determines the format of the file. Possible values: "html", "zip", "gzip" or "plain"
* @see urllist
*/
private static void domlist(String homePath, String source, String format, String targetName) {
File root = new File(homePath);
try {
final plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf");
HashMap doms = new HashMap();
System.out.println("Started domain list extraction from " + sb.wordIndex.loadedURL.size() + " url entries.");
System.out.println("a dump will be written after double-check of all extracted domains.");
System.out.println("This process may fail in case of too less memory. To increase memory, start with");
System.out.println("java -Xmx<megabytes>m -classpath classes yacy -domlist [ -source { nurl | lurl | eurl } ] [ -format { text | zip | gzip | html } ] [ <path to DATA folder> ]");
int c = 0;
long start = System.currentTimeMillis();
if (source.equals("lurl")) {
Iterator eiter = sb.wordIndex.loadedURL.entries(true, null);
indexURLEntry entry;
while (eiter.hasNext()) {
try {
entry = (indexURLEntry) eiter.next();
indexURLEntry.Components comp = entry.comp();
if ((entry != null) && (comp.url() != null)) doms.put(comp.url().getHost(), null);
} catch (Exception e) {
// here a MalformedURLException may occur
// just ignore
}
c++;
if (c % 10000 == 0) System.out.println(
c + " urls checked, " +
doms.size() + " domains collected, " +
((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
}
}
if (source.equals("eurl")) {
Iterator eiter = sb.errorURL.entries(true, null);
redesigned NURL-handling: - the general NURL-index for all crawl stack types was splitted into separate indexes for these stacks - the new NURL-index is managed by the crawl balancer - the crawl balancer does not need an internal index any more, it is replaced by the NURL-index - the NURL.Entry was generalized and is now a new class plasmaCrawlEntry - the new class plasmaCrawlEntry replaces also the preNURL.Entry class, and will also replace the switchboardEntry class in the future - the new class plasmaCrawlEntry is more accurate for date entries (holds milliseconds) and can contain larger 'name' entries (anchor tag names) - the EURL object was replaced by a new ZURL object, which is a container for the plasmaCrawlEntry and some tracking information - the EURL index is now filled with ZURL objects - a new index delegatedURL holds ZURL objects about plasmaCrawlEntry obects to track which url is handed over to other peers - redesigned handling of plasmaCrawlEntry - handover, because there is no need any more to convert one entry object into another - found and fixed numerous bugs in the context of crawl state handling - fixed a serious bug in kelondroCache which caused that entries could not be removed - fixed some bugs in online interface and adopted monitor output to new entry objects - adopted yacy protocol to handle new delegatedURL entries all old crawl queues will disappear after this update! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3483 6c8d7289-2bf4-0310-a012-ef5d649a1542
18 years ago
plasmaCrawlZURL.Entry entry;
while (eiter.hasNext()) {
try {
redesigned NURL-handling: - the general NURL-index for all crawl stack types was splitted into separate indexes for these stacks - the new NURL-index is managed by the crawl balancer - the crawl balancer does not need an internal index any more, it is replaced by the NURL-index - the NURL.Entry was generalized and is now a new class plasmaCrawlEntry - the new class plasmaCrawlEntry replaces also the preNURL.Entry class, and will also replace the switchboardEntry class in the future - the new class plasmaCrawlEntry is more accurate for date entries (holds milliseconds) and can contain larger 'name' entries (anchor tag names) - the EURL object was replaced by a new ZURL object, which is a container for the plasmaCrawlEntry and some tracking information - the EURL index is now filled with ZURL objects - a new index delegatedURL holds ZURL objects about plasmaCrawlEntry obects to track which url is handed over to other peers - redesigned handling of plasmaCrawlEntry - handover, because there is no need any more to convert one entry object into another - found and fixed numerous bugs in the context of crawl state handling - fixed a serious bug in kelondroCache which caused that entries could not be removed - fixed some bugs in online interface and adopted monitor output to new entry objects - adopted yacy protocol to handle new delegatedURL entries all old crawl queues will disappear after this update! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3483 6c8d7289-2bf4-0310-a012-ef5d649a1542
18 years ago
entry = (plasmaCrawlZURL.Entry) eiter.next();
if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), entry.anycause());
} catch (Exception e) {
// here a MalformedURLException may occur
// just ignore
}
c++;
if (c % 10000 == 0) System.out.println(
c + " urls checked, " +
doms.size() + " domains collected, " +
((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
}
}
if (source.equals("nurl")) {
redesigned NURL-handling: - the general NURL-index for all crawl stack types was splitted into separate indexes for these stacks - the new NURL-index is managed by the crawl balancer - the crawl balancer does not need an internal index any more, it is replaced by the NURL-index - the NURL.Entry was generalized and is now a new class plasmaCrawlEntry - the new class plasmaCrawlEntry replaces also the preNURL.Entry class, and will also replace the switchboardEntry class in the future - the new class plasmaCrawlEntry is more accurate for date entries (holds milliseconds) and can contain larger 'name' entries (anchor tag names) - the EURL object was replaced by a new ZURL object, which is a container for the plasmaCrawlEntry and some tracking information - the EURL index is now filled with ZURL objects - a new index delegatedURL holds ZURL objects about plasmaCrawlEntry obects to track which url is handed over to other peers - redesigned handling of plasmaCrawlEntry - handover, because there is no need any more to convert one entry object into another - found and fixed numerous bugs in the context of crawl state handling - fixed a serious bug in kelondroCache which caused that entries could not be removed - fixed some bugs in online interface and adopted monitor output to new entry objects - adopted yacy protocol to handle new delegatedURL entries all old crawl queues will disappear after this update! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3483 6c8d7289-2bf4-0310-a012-ef5d649a1542
18 years ago
Iterator eiter = sb.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
plasmaCrawlEntry entry;
while (eiter.hasNext()) {
try {
redesigned NURL-handling: - the general NURL-index for all crawl stack types was splitted into separate indexes for these stacks - the new NURL-index is managed by the crawl balancer - the crawl balancer does not need an internal index any more, it is replaced by the NURL-index - the NURL.Entry was generalized and is now a new class plasmaCrawlEntry - the new class plasmaCrawlEntry replaces also the preNURL.Entry class, and will also replace the switchboardEntry class in the future - the new class plasmaCrawlEntry is more accurate for date entries (holds milliseconds) and can contain larger 'name' entries (anchor tag names) - the EURL object was replaced by a new ZURL object, which is a container for the plasmaCrawlEntry and some tracking information - the EURL index is now filled with ZURL objects - a new index delegatedURL holds ZURL objects about plasmaCrawlEntry obects to track which url is handed over to other peers - redesigned handling of plasmaCrawlEntry - handover, because there is no need any more to convert one entry object into another - found and fixed numerous bugs in the context of crawl state handling - fixed a serious bug in kelondroCache which caused that entries could not be removed - fixed some bugs in online interface and adopted monitor output to new entry objects - adopted yacy protocol to handle new delegatedURL entries all old crawl queues will disappear after this update! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3483 6c8d7289-2bf4-0310-a012-ef5d649a1542
18 years ago
entry = (plasmaCrawlEntry) eiter.next();
if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), "profile=" + entry.profileHandle() + ", depth=" + entry.depth());
} catch (Exception e) {
// here a MalformedURLException may occur
// just ignore
}
c++;
if (c % 10000 == 0) System.out.println(
c + " urls checked, " +
doms.size() + " domains collected, " +
((Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory() + Runtime.getRuntime().freeMemory()) / 1024 / 1024) + " MB available, " +
((System.currentTimeMillis() - start) * (sb.wordIndex.loadedURL.size() - c) / c / 60000) + " minutes remaining.");
}
}
if (format.equals("html")) {
// output file in HTML format
File file = new File(root, targetName + ".html");
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
System.out.println("Started domain list dump to file " + file);
Iterator i = doms.entrySet().iterator();
Map.Entry entry;
String key;
bos.write(("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">").getBytes());
bos.write(serverCore.crlf);
bos.write(("<html><head><title>YaCy " + source + " domainlist</title></head><body>").getBytes());
bos.write(serverCore.crlf);
while (i.hasNext()) {
entry = (Map.Entry) i.next();
key = (String) entry.getKey();
bos.write(("<a href=\"http://" + key + "\">" + key + "</a>" +
((entry.getValue() == null) ? "" : (" " + ((String) entry.getValue()))) + "<br>"
).getBytes());
bos.write(serverCore.crlf);
}
bos.write(("</body></html>").getBytes());
bos.close();
} else if (format.equals("zip")) {
// output file in plain text but compressed with ZIP
File file = new File(root, targetName + ".zip");
System.out.println("Started domain list dump to file " + file);
serverFileUtils.saveSet(file, "zip", doms.keySet(), new String(serverCore.crlf));
} else if (format.equals("gzip")) {
// output file in plain text but compressed with GZIP
File file = new File(root, targetName + ".txt.gz");
System.out.println("Started domain list dump to file " + file);
serverFileUtils.saveSet(file, "gzip", doms.keySet(), new String(serverCore.crlf));
} else {
// plain text list
File file = new File(root, targetName + ".txt");
System.out.println("Started domain list dump to file " + file);
serverFileUtils.saveSet(file, "plain", doms.keySet(), new String(serverCore.crlf));
}
sb.close();
} catch (IOException e) {
e.printStackTrace();
}
}
private static void urllist(String homePath, String source, boolean html, String targetName) {
File root = new File(homePath);
try {
final plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf");
File file = new File(root, targetName);
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
if (source.equals("lurl")) {
Iterator eiter = sb.wordIndex.loadedURL.entries(true, null);
indexURLEntry entry;
while (eiter.hasNext()) {
entry = (indexURLEntry) eiter.next();
indexURLEntry.Components comp = entry.comp();
if ((entry != null) && (comp.url() != null)) {
if (html) {
bos.write(("<a href=\"" + comp.url().toNormalform() + "\">" + comp.title() + "</a><br>").getBytes("UTF-8"));
bos.write(serverCore.crlf);
} else {
bos.write(comp.url().toNormalform().getBytes());
bos.write(serverCore.crlf);
}
}
}
}
if (source.equals("eurl")) {
Iterator eiter = sb.errorURL.entries(true, null);
redesigned NURL-handling: - the general NURL-index for all crawl stack types was splitted into separate indexes for these stacks - the new NURL-index is managed by the crawl balancer - the crawl balancer does not need an internal index any more, it is replaced by the NURL-index - the NURL.Entry was generalized and is now a new class plasmaCrawlEntry - the new class plasmaCrawlEntry replaces also the preNURL.Entry class, and will also replace the switchboardEntry class in the future - the new class plasmaCrawlEntry is more accurate for date entries (holds milliseconds) and can contain larger 'name' entries (anchor tag names) - the EURL object was replaced by a new ZURL object, which is a container for the plasmaCrawlEntry and some tracking information - the EURL index is now filled with ZURL objects - a new index delegatedURL holds ZURL objects about plasmaCrawlEntry obects to track which url is handed over to other peers - redesigned handling of plasmaCrawlEntry - handover, because there is no need any more to convert one entry object into another - found and fixed numerous bugs in the context of crawl state handling - fixed a serious bug in kelondroCache which caused that entries could not be removed - fixed some bugs in online interface and adopted monitor output to new entry objects - adopted yacy protocol to handle new delegatedURL entries all old crawl queues will disappear after this update! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3483 6c8d7289-2bf4-0310-a012-ef5d649a1542
18 years ago
plasmaCrawlZURL.Entry entry;
while (eiter.hasNext()) {
redesigned NURL-handling: - the general NURL-index for all crawl stack types was splitted into separate indexes for these stacks - the new NURL-index is managed by the crawl balancer - the crawl balancer does not need an internal index any more, it is replaced by the NURL-index - the NURL.Entry was generalized and is now a new class plasmaCrawlEntry - the new class plasmaCrawlEntry replaces also the preNURL.Entry class, and will also replace the switchboardEntry class in the future - the new class plasmaCrawlEntry is more accurate for date entries (holds milliseconds) and can contain larger 'name' entries (anchor tag names) - the EURL object was replaced by a new ZURL object, which is a container for the plasmaCrawlEntry and some tracking information - the EURL index is now filled with ZURL objects - a new index delegatedURL holds ZURL objects about plasmaCrawlEntry obects to track which url is handed over to other peers - redesigned handling of plasmaCrawlEntry - handover, because there is no need any more to convert one entry object into another - found and fixed numerous bugs in the context of crawl state handling - fixed a serious bug in kelondroCache which caused that entries could not be removed - fixed some bugs in online interface and adopted monitor output to new entry objects - adopted yacy protocol to handle new delegatedURL entries all old crawl queues will disappear after this update! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3483 6c8d7289-2bf4-0310-a012-ef5d649a1542
18 years ago
entry = (plasmaCrawlZURL.Entry) eiter.next();
if ((entry != null) && (entry.url() != null)) {
if (html) {
redesigned NURL-handling: - the general NURL-index for all crawl stack types was splitted into separate indexes for these stacks - the new NURL-index is managed by the crawl balancer - the crawl balancer does not need an internal index any more, it is replaced by the NURL-index - the NURL.Entry was generalized and is now a new class plasmaCrawlEntry - the new class plasmaCrawlEntry replaces also the preNURL.Entry class, and will also replace the switchboardEntry class in the future - the new class plasmaCrawlEntry is more accurate for date entries (holds milliseconds) and can contain larger 'name' entries (anchor tag names) - the EURL object was replaced by a new ZURL object, which is a container for the plasmaCrawlEntry and some tracking information - the EURL index is now filled with ZURL objects - a new index delegatedURL holds ZURL objects about plasmaCrawlEntry obects to track which url is handed over to other peers - redesigned handling of plasmaCrawlEntry - handover, because there is no need any more to convert one entry object into another - found and fixed numerous bugs in the context of crawl state handling - fixed a serious bug in kelondroCache which caused that entries could not be removed - fixed some bugs in online interface and adopted monitor output to new entry objects - adopted yacy protocol to handle new delegatedURL entries all old crawl queues will disappear after this update! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3483 6c8d7289-2bf4-0310-a012-ef5d649a1542
18 years ago
bos.write(("<a href=\"" + entry.url() + "\">" + entry.url() + "</a> " + entry.anycause() + "<br>").getBytes("UTF-8"));
bos.write(serverCore.crlf);
} else {
bos.write(entry.url().toString().getBytes());
bos.write(serverCore.crlf);
}
}
}
}
if (source.equals("nurl")) {
redesigned NURL-handling: - the general NURL-index for all crawl stack types was splitted into separate indexes for these stacks - the new NURL-index is managed by the crawl balancer - the crawl balancer does not need an internal index any more, it is replaced by the NURL-index - the NURL.Entry was generalized and is now a new class plasmaCrawlEntry - the new class plasmaCrawlEntry replaces also the preNURL.Entry class, and will also replace the switchboardEntry class in the future - the new class plasmaCrawlEntry is more accurate for date entries (holds milliseconds) and can contain larger 'name' entries (anchor tag names) - the EURL object was replaced by a new ZURL object, which is a container for the plasmaCrawlEntry and some tracking information - the EURL index is now filled with ZURL objects - a new index delegatedURL holds ZURL objects about plasmaCrawlEntry obects to track which url is handed over to other peers - redesigned handling of plasmaCrawlEntry - handover, because there is no need any more to convert one entry object into another - found and fixed numerous bugs in the context of crawl state handling - fixed a serious bug in kelondroCache which caused that entries could not be removed - fixed some bugs in online interface and adopted monitor output to new entry objects - adopted yacy protocol to handle new delegatedURL entries all old crawl queues will disappear after this update! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3483 6c8d7289-2bf4-0310-a012-ef5d649a1542
18 years ago
Iterator eiter = sb.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
plasmaCrawlEntry entry;
while (eiter.hasNext()) {
redesigned NURL-handling: - the general NURL-index for all crawl stack types was splitted into separate indexes for these stacks - the new NURL-index is managed by the crawl balancer - the crawl balancer does not need an internal index any more, it is replaced by the NURL-index - the NURL.Entry was generalized and is now a new class plasmaCrawlEntry - the new class plasmaCrawlEntry replaces also the preNURL.Entry class, and will also replace the switchboardEntry class in the future - the new class plasmaCrawlEntry is more accurate for date entries (holds milliseconds) and can contain larger 'name' entries (anchor tag names) - the EURL object was replaced by a new ZURL object, which is a container for the plasmaCrawlEntry and some tracking information - the EURL index is now filled with ZURL objects - a new index delegatedURL holds ZURL objects about plasmaCrawlEntry obects to track which url is handed over to other peers - redesigned handling of plasmaCrawlEntry - handover, because there is no need any more to convert one entry object into another - found and fixed numerous bugs in the context of crawl state handling - fixed a serious bug in kelondroCache which caused that entries could not be removed - fixed some bugs in online interface and adopted monitor output to new entry objects - adopted yacy protocol to handle new delegatedURL entries all old crawl queues will disappear after this update! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3483 6c8d7289-2bf4-0310-a012-ef5d649a1542
18 years ago
entry = (plasmaCrawlEntry) eiter.next();
if ((entry != null) && (entry.url() != null)) {
if (html) {
bos.write(("<a href=\"" + entry.url() + "\">" + entry.url() + "</a> " + "profile=" + entry.profileHandle() + ", depth=" + entry.depth() + "<br>").getBytes("UTF-8"));
bos.write(serverCore.crlf);
} else {
bos.write(entry.url().toString().getBytes());
bos.write(serverCore.crlf);
}
}
}
}
bos.close();
sb.close();
} catch (IOException e) {
e.printStackTrace();
}
}
private static String[] shift(String[] args, int pos, int count) {
String[] newargs = new String[args.length - count];
System.arraycopy(args, 0, newargs, 0, pos);
System.arraycopy(args, pos + count, newargs, pos, args.length - pos - count);
return newargs;
}
/**
* Uses an Iteration over urlHash.db to detect malformed URL-Entries.
* Damaged URL-Entries will be marked in a HashSet and removed at the end of the function.
*
* @param homePath Root-Path where all information is to be found.
*/
private static void urldbcleanup(String homePath) {
File root = new File(homePath);
File indexroot = new File(root, "DATA/INDEX");
try {serverLog.configureLogging(homePath,new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(indexroot, 10000);
currentUrlDB.urldbcleanup();
currentUrlDB.close();
}
private static void RWIHashList(String homePath, String targetName, String resource, String format) {
plasmaWordIndex WordIndex = null;
serverLog log = new serverLog("HASHLIST");
File indexPrimaryRoot = new File(new File(homePath), "DATA/INDEX");
File indexSecondaryRoot = new File(new File(homePath), "DATA/INDEX");
String wordChunkStartHash = "AAAAAAAAAAAA";
try {serverLog.configureLogging(homePath,new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
log.logInfo("STARTING CREATION OF RWI-HASHLIST");
File root = new File(homePath);
try {
Iterator indexContainerIterator = null;
if (resource.equals("all")) {
WordIndex = new plasmaWordIndex(indexPrimaryRoot, indexSecondaryRoot, 3000, log);
indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, false, false);
}
int counter = 0;
indexContainer container = null;
if (format.equals("zip")) {
log.logInfo("Writing Hashlist to ZIP-file: " + targetName + ".zip");
ZipEntry zipEntry = new ZipEntry(targetName + ".txt");
File file = new File(root, targetName + ".zip");
ZipOutputStream bos = new ZipOutputStream(new FileOutputStream(file));
bos.putNextEntry(zipEntry);
while (indexContainerIterator.hasNext()) {
counter++;
container = (indexContainer) indexContainerIterator.next();
bos.write((container.getWordHash()).getBytes());
bos.write(serverCore.crlf);
if (counter % 500 == 0) {
log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + container.getWordHash());
}
}
bos.flush();
bos.close();
} else {
log.logInfo("Writing Hashlist to TXT-file: " + targetName + ".txt");
File file = new File(root, targetName + ".txt");
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
while (indexContainerIterator.hasNext()) {
counter++;
container = (indexContainer) indexContainerIterator.next();
bos.write((container.getWordHash()).getBytes());
bos.write(serverCore.crlf);
if (counter % 500 == 0) {
log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + container.getWordHash());
}
}
bos.flush();
bos.close();
}
log.logInfo("Total number of Hashs: " + counter + ". Last found Hash: " + container.getWordHash());
} catch (IOException e) {
log.logSevere("IOException", e);
}
if (WordIndex != null) {
WordIndex.close();
WordIndex = null;
}
}
/**
* Searching for peers affected by Bug documented in <a href="http://www.yacy-forum.de/viewtopic.php?p=16056#16056">YaCy-Forum Posting 16056</a>
* @param homePath
* @see <a href="http://www.yacy-forum.de/viewtopic.php?p=16056#16056">YaCy-Forum Posting 16056</a>
*/
public static void testPeerDB(String homePath) {
try {
File yacyDBPath = new File(new File(homePath), "DATA/YACYDB");
String[] dbFileNames = {"seed.new.db","seed.old.db","seed.pot.db"};
for (int i=0; i < dbFileNames.length; i++) {
File dbFile = new File(yacyDBPath,dbFileNames[i]);
kelondroMapObjects db = new kelondroMapObjects(new kelondroDyn(dbFile, true, true, 3000, yacySeedDB.commonHashLength, 480, '#', kelondroBase64Order.enhancedCoder, true, false, true), 500, yacySeedDB.sortFields, yacySeedDB.longaccFields, yacySeedDB.doubleaccFields, null, null);
kelondroMapObjects.mapIterator it;
it = db.maps(true, false);
while (it.hasNext()) {
Map dna = (Map) it.next();
String peerHash = (String) dna.get("key");
if (peerHash.length() < yacySeedDB.commonHashLength) {
String peerName = (String) dna.get("Name");
String peerIP = (String) dna.get("IP");
String peerPort = (String) dna.get("Port");
while (peerHash.length() < yacySeedDB.commonHashLength) { peerHash = peerHash + "_"; }
System.err.println("Invalid Peer-Hash found in '" + dbFileNames[i] + "': " + peerName + ":" + peerHash + ", http://" + peerIP + ":" + peerPort);
}
}
db.close();
}
} catch (Exception e) {
e.printStackTrace();
}
}
/**
* Main-method which is started by java. Checks for special arguments or
* starts up the application.
*
* @param args
* Given arguments from the command line.
*/
public static void main(String args[]) {
// check assertion status
//ClassLoader.getSystemClassLoader().setDefaultAssertionStatus(true);
boolean assertionenabled = false;
assert assertionenabled = true;
if (assertionenabled) System.out.println("Asserts are enabled");
// check memory amount
System.gc();
long startupMemFree = Runtime.getRuntime().freeMemory(); // the amount of free memory in the Java Virtual Machine
long startupMemTotal = Runtime.getRuntime().totalMemory(); // the total amount of memory in the Java virtual machine; may vary over time
serverMemory.available(); // force initialization of class serverMemory
// go into headless awt mode
System.setProperty("java.awt.headless", "true");
//which XML Parser?
// if(System.getProperty("javax.xml.parsers.DocumentBuilderFactory")==null){
// System.setProperty("javax.xml.parsers.DocumentBuilderFactory", "org.apache.crimson.jaxp.DocumentBuilderFactoryImpl");
// }
// if(System.getProperty("javax.xml.parsers.SAXParserFactory")==null){
// System.setProperty("javax.xml.parsers.SAXParserFactory", "org.apache.crimson.jaxp.SAXParserFactoryImpl");
// }
String applicationRoot = System.getProperty("user.dir").replace('\\', '/');
//System.out.println("args.length=" + args.length);
//System.out.print("args=["); for (int i = 0; i < args.length; i++) System.out.print(args[i] + ", "); System.out.println("]");
if ((args.length >= 1) && ((args[0].toLowerCase().equals("-startup")) || (args[0].equals("-start")))) {
// normal start-up of yacy
if (args.length == 2) applicationRoot= args[1];
startup(applicationRoot, startupMemFree, startupMemTotal);
} else if ((args.length >= 1) && ((args[0].toLowerCase().equals("-shutdown")) || (args[0].equals("-stop")))) {
// normal shutdown of yacy
if (args.length == 2) applicationRoot= args[1];
shutdown(applicationRoot);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-minimizeurldb"))) {
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
// migrate words from DATA/PLASMADB/WORDS path to assortment cache, if possible
// attention: this may run long and should not be interrupted!
if (args.length >= 3 && args[1].toLowerCase().equals("-cache")) {
args = shift(args, 1, 2);
}
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
if (args.length == 2) applicationRoot= args[1];
minimizeUrlDB(applicationRoot);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-testpeerdb"))) {
if (args.length == 2) {
applicationRoot= args[1];
} else if (args.length > 2) {
System.err.println("Usage: -testPeerDB [homeDbRoot]");
}
testPeerDB(applicationRoot);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-genwordstat"))) {
// this can help to create a stop-word list
// to use this, you need a 'yacy.words' file in the root path
// start this with "java -classpath classes yacy -genwordstat [<rootdir>]"
if (args.length == 2) applicationRoot= args[1];
genWordstat(applicationRoot);
} else if ((args.length == 4) && (args[0].toLowerCase().equals("-cleanwordlist"))) {
// this can be used to organize and clean a word-list
// start this with "java -classpath classes yacy -cleanwordlist <word-file> <minlength> <maxlength>"
int minlength = Integer.parseInt(args[2]);
int maxlength = Integer.parseInt(args[3]);
cleanwordlist(args[1], minlength, maxlength);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-transfercr"))) {
// transfer a single cr file to a remote peer
String targetaddress = args[1];
String crfile = args[2];
transferCR(targetaddress, crfile);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-domlist"))) {
// generate a url list and save it in a file
String source = "lurl";
if (args.length >= 3 && args[1].toLowerCase().equals("-source")) {
if ((args[2].equals("nurl")) ||
(args[2].equals("lurl")) ||
(args[2].equals("eurl")))
source = args[2];
args = shift(args, 1, 2);
}
String format = "txt";
if (args.length >= 3 && args[1].toLowerCase().equals("-format")) {
if ((args[2].equals("html")) ||
(args[2].equals("zip")) ||
(args[2].equals("gzip")))
format = args[2];
args = shift(args, 1, 2);
}
if (args.length == 2) applicationRoot= args[1];
String outfile = "domlist_" + source + "_" + System.currentTimeMillis();
domlist(applicationRoot, source, format, outfile);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urllist"))) {
// generate a url list and save it in a file
String source = "lurl";
if (args.length >= 3 && args[1].toLowerCase().equals("-source")) {
if ((args[2].equals("nurl")) ||
(args[2].equals("lurl")) ||
(args[2].equals("eurl")))
source = args[2];
args = shift(args, 1, 2);
}
boolean html = false;
if (args.length >= 3 && args[1].toLowerCase().equals("-format")) {
if (args[2].equals("html")) html = true;
args = shift(args, 1, 2);
}
if (args.length == 2) applicationRoot= args[1];
String outfile = "urllist_" + source + "_" + System.currentTimeMillis() + ((html) ? ".html" : ".txt");
urllist(applicationRoot, source, html, outfile);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) {
// generate a url list and save it in a file
if (args.length == 2) applicationRoot= args[1];
urldbcleanup(applicationRoot);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-rwihashlist"))) {
// generate a url list and save it in a file
String domain = "all";
String format = "txt";
if (args.length >= 2) domain= args[1];
if (args.length >= 3) format= args[2];
if (args.length == 4) applicationRoot= args[3];
String outfile = "rwihashlist_" + System.currentTimeMillis();
RWIHashList(applicationRoot, outfile, domain, format);
} else {
if (args.length == 1) applicationRoot= args[0];
startup(applicationRoot, startupMemFree, startupMemTotal);
}
}
}
/**
* This class is a helper class whose instance is started, when the java virtual
* machine shuts down. Signals the plasmaSwitchboard to shut down.
*/
class shutdownHookThread extends Thread {
private plasmaSwitchboard sb = null;
private Thread mainThread = null;
public shutdownHookThread(Thread mainThread, plasmaSwitchboard sb) {
super();
this.sb = sb;
this.mainThread = mainThread;
}
public void run() {
try {
if (!this.sb.isTerminated()) {
serverLog.logConfig("SHUTDOWN","Shutdown via shutdown hook.");
// sending the yacy main thread a shutdown signal
serverLog.logFine("SHUTDOWN","Signaling shutdown to the switchboard.");
this.sb.terminate();
// waiting for the yacy thread to finish execution
serverLog.logFine("SHUTDOWN","Waiting for main thread to finish.");
if (this.mainThread.isAlive() && !this.sb.isTerminated()) {
this.mainThread.join();
}
}
} catch (Exception e) {
serverLog.logSevere("SHUTDOWN","Unexpected error. " + e.getClass().getName(),e);
}
}
}