From be1f324fcadedca7bd72ba4853f0b2963ae8305e Mon Sep 17 00:00:00 2001
From: orbiter
Date: Fri, 22 Jul 2005 13:56:19 +0000
Subject: [PATCH] performance setting for remote indexing configuration and latest changes for 0.39

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@424 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 build.properties                           | 10 ++---
 build.xml                                  |  2 -
 doc/News.html                              | 36 ++++++++++++++++++
 doc/Volunteers.html                        |  1 +
 htroot/IndexCreate_p.html                  | 29 +++++++++++----
 htroot/IndexCreate_p.java                  | 43 ++++++++++++++++++++--
 htroot/Steering.java                       |  8 ++--
 readme.txt                                 | 38 ++++++++++++-------
 source/de/anomic/plasma/plasmaHTCache.java |  7 +++-
 yacy.init                                  | 10 ++---
 yacy.logging                               |  2 +-
 yacy.yellow                                |  3 --
 12 files changed, 145 insertions(+), 44 deletions(-)

diff --git a/build.properties b/build.properties
index 84bcfc09a..2bbb2dcf9 100644
--- a/build.properties
+++ b/build.properties
@@ -3,11 +3,11 @@ javacSource=1.4
 javacTarget=1.4
 
 # Release Configuration
-releaseVersion=0.387
-releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
-#releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
-releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}
-#releaseDir=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}
+releaseVersion=0.39
+#releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
+releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
+#releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}
+releaseDir=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}
 releaseNr=$Revision$
 
 # defining some file/directory access rights
diff --git a/build.xml b/build.xml
index fc4b5699e..f6cd7c7e4 100644
--- a/build.xml
+++ b/build.xml
@@ -322,8 +322,6 @@
-
-
diff --git a/doc/News.html b/doc/News.html
index 88be58d16..a402e91da 100644
--- a/doc/News.html
+++ b/doc/News.html
@@ -43,6 +43,42 @@ globalheader(); -->

v0.39_20050722_424

  • New Features:
    • Added snippets to search results. Snippets are fetched by the searching peer from the original web sites and are also transmitted together with results coming from remote peers (a rough snippet-extraction sketch follows at the end of this v0.39 list).
    • The proxy now shows an error page when an error occurs.
    • Preparation for localization: a German translation has been started (not yet finished).
    • The status page now shows memory usage, transfer volume and indexing speed as PPM (pages per minute). A global PPM (the sum over all peers) is also computed.
    • Restructured the Index Creation menu: added more submenus and queue monitors.
    • Added a feature to start crawling from bookmark files.
    • Added blocking of blacklisted URLs in indexReceive (remote DHT index transmissions).
    • Added port forwarding for remote peer connections (the peer can now be reached at a configurable address).
    • Added bbCode support for profiles.
    • Memory management in the Performance menu: a memory limit can be set as a condition for queue execution.
    • Added an option for performance-limited remote crawls; use this instead of switching off remote indexing if you are worried about performance loss on your machine (see the sketch right after this list).
    • Enhanced logging, configurable via yacy.logging.
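The pages-per-minute option above maps a PPM limit onto the busy-sleep pause of the remote-triggered crawl thread. The following compact sketch only illustrates that arithmetic; the class name is made up, but the formula and the clamps are the ones used in htroot/IndexCreate_p.java further down in this patch.

    // Illustrative sketch of the PPM <-> busy-sleep conversion used by the new
    // remote-crawl performance setting (60000 / PPM, clamped to at least 100 ms).
    public final class RemoteCrawlThrottle {

        // Convert a requested PPM limit into a busy-sleep pause in milliseconds.
        public static long busySleepForPPM(int requestedPPM) {
            int ppm = Math.max(1, requestedPPM);   // minimum is 1 page per minute
            long busySleep = 60000L / ppm;         // pause between two pages
            return Math.max(100L, busySleep);      // never sleep less than 100 ms
        }

        // Convert a busy-sleep value back into the PPM shown on the settings page.
        public static int ppmForBusySleep(long busySleepMillis) {
            int ppm = (int) (60000L / Math.max(1L, busySleepMillis));
            return Math.min(60, ppm);              // the settings page caps the display at 60
        }

        public static void main(String[] args) {
            System.out.println(busySleepForPPM(30));   // 2000
            System.out.println(ppmForBusySleep(2000)); // 30
        }
    }

A PPM limit of 30 corresponds to a 2000 ms pause, which matches the new default of 62_remotetriggeredcrawl_busysleep in yacy.init below.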
    +
  • Performance: enhanced indexing speed
    • Implemented multithreaded indexing and loading.
    • Enhanced database caching (lower memory consumption).
    • Replaced the RAM-based queue used after indexing with a file-based queue; this makes long queues possible (see the sketch after this list).
    • Changed the assortment cache-flush procedure: a word may now appear in any assortment, not only in a single one. This avoids assortment flushes, increases capacity and avoids the creation of files in DATA/PLASMADB/WORDS, which further speeds up indexing.
    • Sped up start-up and shut-down by replacing a stack with an array. The dumped index also takes less space on disk now, and because dumping is faster, the cache can be larger, which further increases indexing speed.
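The file-based queue itself is YaCy-internal and not reproduced here. As a rough, made-up sketch of the idea only, a FIFO queue whose entries live in a file (so its length is limited by disk space rather than by the Java heap) could look like this in Java 1.4 style:

    import java.io.BufferedReader;
    import java.io.FileReader;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.io.PrintWriter;
    import java.util.LinkedList;

    // Illustrative sketch only, not the actual plasma* classes.
    public final class FileBackedQueue {

        private final String path;

        public FileBackedQueue(String path) {
            this.path = path;
        }

        // Append one entry (e.g. a URL hash waiting to be indexed) to the end of the file.
        public synchronized void push(String entry) throws IOException {
            PrintWriter out = new PrintWriter(new FileWriter(path, true));
            try {
                out.println(entry);
            } finally {
                out.close();
            }
        }

        // Load all entries back in FIFO order, e.g. after a restart.
        public synchronized LinkedList loadAll() throws IOException {
            LinkedList queue = new LinkedList();
            BufferedReader in = new BufferedReader(new FileReader(path));
            try {
                String line;
                while ((line = in.readLine()) != null) queue.addLast(line);
            } finally {
                in.close();
            }
            return queue;
        }
    }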
    +
  • Bugfixes:
    • Better shut-down behavior, time-outs on sockets, fewer exceptions (see the sketch after this list).
    • Fixed gzip decoding and Content-Length handling in the HTTP client.
    • Better httpd header validation.
    • Fixed possible memory leaks.
    • Fixed a 100% CPU bug (caused by repeated garbage collection when memory was low).
    • Fixed UTF-8 decoding in the parser.
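The socket time-outs in the first bugfix item keep a dead connection from blocking a worker thread or the shut-down sequence forever. A minimal, generic illustration with plain java.net calls (YaCy's own HTTP client class is not part of this patch):

    import java.io.IOException;
    import java.net.InetSocketAddress;
    import java.net.Socket;

    // Generic example of bounding both the connect and the read phase of a socket.
    public final class TimeoutSocketExample {

        public static Socket openWithTimeouts(String host, int port, int timeoutMillis)
                throws IOException {
            Socket socket = new Socket();
            // fail the connect attempt after timeoutMillis instead of waiting indefinitely
            socket.connect(new InetSocketAddress(host, port), timeoutMillis);
            // make blocking reads throw SocketTimeoutException after timeoutMillis
            socket.setSoTimeout(timeoutMillis);
            return socket;
        }
    }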
    +
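As a rough illustration of the snippet feature listed at the top of the v0.39 New Features (the actual YaCy snippet code is not part of this patch), cutting a snippet out of a fetched page around the first hit of a query word can be as simple as:

    // Made-up sketch: return a short text window around the first occurrence of a query word.
    public final class SnippetExample {

        public static String snippet(String pageText, String queryWord, int contextChars) {
            int hit = pageText.toLowerCase().indexOf(queryWord.toLowerCase());
            if (hit < 0) return "";                      // query word not found on the page
            int from = Math.max(0, hit - contextChars);  // some context before the hit
            int to = Math.min(pageText.length(), hit + queryWord.length() + contextChars);
            return "..." + pageText.substring(from, to).trim() + "...";
        }

        public static void main(String[] args) {
            System.out.println(snippet(
                "YaCy is a P2P-based web search engine and caching proxy.", "search", 20));
        }
    }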

v0.38_20050603_208

  • Enhanced Crawling:
diff --git a/doc/Volunteers.html b/doc/Volunteers.html
index 3d73172c0..766d36621 100644
--- a/doc/Volunteers.html
+++ b/doc/Volunteers.html
@@ -28,6 +28,7 @@ the P2P-based index distribution was designed and implemented by Michael Pete
  • Alexander Schier did much alpha-testing, gave valuable feedback on my ideas and suggested his own. He suggested and implemented large parts of the popular blacklist feature. He supplied the 'Log' menu function, the skin feature, many minor changes, bug fixes and the Windows installer version of YaCy. Alex also provides and maintains the German documentation for YaCy.
  • Martin Thelian made system-wide performance enhancements by introducing thread pools. He provided a plug-in system for external text parsers and integrated many parser libraries, such as PDF and Word format parsers. Martin also extended and enhanced the HTTP and proxy protocol handling towards an RFC-clean implementation.
  • Roland Ramthun owns and administers the German YaCy-Forum. He also takes care of correct English spelling and of a German translation of the YaCy user interface. Roland and other forum participants extended the PHPForum code to make it possible to track development feature requests and bug reports with status codes and editor flags.
  • Marc Nause made enhancements to the Message and User Profile menus and functions.
  • Natali Christen designed the YaCy logo.
  • Thomas Quella designed the Kaskelix mascot.
  • Wolfgang Sander-Beuermann, executive board member of the German search-engine association SuMa-eV
diff --git a/htroot/IndexCreate_p.html b/htroot/IndexCreate_p.html
index e4561449d..b140221b9 100644
--- a/htroot/IndexCreate_p.html
+++ b/htroot/IndexCreate_p.html
@@ -133,11 +133,28 @@ Crawling and indexing can be done by remote peers.
 Your peer can search and index for other peers and they can search for you.
 - - + + + + + + + +
    - - Accept remote crawling requests + + + + Accept remote crawling requests and perform crawl at maximum load +
    + + + Accept remote crawling requests and perform crawl at maximum of + Pages Per Minute (minimum is 1, low system load at PPM <= 30) +
    + + + Do not accept remote crawling requests (please select this only if you cannot tolerate crawling even one page per minute; see the option above)

    @@ -238,9 +255,7 @@ No remote crawl peers availible.
    #(/remoteCrawlPeers)# -

    - -

    +

    #(crawler-paused)# -
diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java
index eddd2d5c4..59c9dce87 100644
--- a/htroot/IndexCreate_p.java
+++ b/htroot/IndexCreate_p.java
@@ -66,6 +66,7 @@ import de.anomic.plasma.plasmaURL;
 import de.anomic.server.serverFileUtils;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
+import de.anomic.server.serverThread;
 import de.anomic.tools.bitfield;
 import de.anomic.yacy.yacyCore;
 import de.anomic.yacy.yacySeed;
@@ -224,9 +225,27 @@ public class IndexCreate_p {
             }
         }
+
+
         if (post.containsKey("distributedcrawling")) {
-            boolean crawlResponse = ((String) post.get("crawlResponse", "")).equals("on");
-            env.setConfig("crawlResponse", (crawlResponse) ? "true" : "false");
+            long newBusySleep = Integer.parseInt(env.getConfig("62_remotetriggeredcrawl_busysleep", "100"));
+            if (((String) post.get("dcr", "")).equals("acceptCrawlMax")) {
+                env.setConfig("crawlResponse", "true");
+                newBusySleep = 100;
+            } else if (((String) post.get("dcr", "")).equals("acceptCrawlLimited")) {
+                env.setConfig("crawlResponse", "true");
+                int newppm = Integer.parseInt(post.get("acceptCrawlLimit", "1"));
+                if (newppm < 1) newppm = 1;
+                newBusySleep = 60000 / newppm;
+                if (newBusySleep < 100) newBusySleep = 100;
+            } else if (((String) post.get("dcr", "")).equals("acceptCrawlDenied")) {
+                env.setConfig("crawlResponse", "false");
+            }
+            serverThread rct = switchboard.getThread("62_remotetriggeredcrawl");
+            rct.setBusySleep(newBusySleep);
+            env.setConfig("62_remotetriggeredcrawl_busysleep", "" + newBusySleep);
+            //boolean crawlResponse = ((String) post.get("acceptCrawlMax", "")).equals("on");
+            //env.setConfig("crawlResponse", (crawlResponse) ? "true" : "false");
         }
 
@@ -249,7 +268,25 @@ public class IndexCreate_p {
         prop.put("storeHTCacheChecked", env.getConfig("storeHTCache", "").equals("true") ? 1 : 0);
         prop.put("localIndexingChecked", env.getConfig("localIndexing", "").equals("true") ? 1 : 0);
         prop.put("crawlOrderChecked", env.getConfig("crawlOrder", "").equals("true") ? 1 : 0);
-        prop.put("crawlResponseChecked", env.getConfig("crawlResponse", "").equals("true") ? 1 : 0);
+        long busySleep = Integer.parseInt(env.getConfig("62_remotetriggeredcrawl_busysleep", "100"));
+        if (env.getConfig("crawlResponse", "").equals("true")) {
+            if (busySleep <= 100) {
+                prop.put("acceptCrawlMaxChecked", 1);
+                prop.put("acceptCrawlLimitedChecked", 0);
+                prop.put("acceptCrawlDeniedChecked", 0);
+            } else {
+                prop.put("acceptCrawlMaxChecked", 0);
+                prop.put("acceptCrawlLimitedChecked", 1);
+                prop.put("acceptCrawlDeniedChecked", 0);
+            }
+        } else {
+            prop.put("acceptCrawlMaxChecked", 0);
+            prop.put("acceptCrawlLimitedChecked", 0);
+            prop.put("acceptCrawlDeniedChecked", 1);
+        }
+        int ppm = (int) ((long) 60000 / busySleep);
+        if (ppm > 60) ppm = 60;
+        prop.put("PPM", ppm);
         prop.put("xsstopwChecked", env.getConfig("xsstopw", "").equals("true") ? 1 : 0);
         prop.put("xdstopwChecked", env.getConfig("xdstopw", "").equals("true") ? 1 : 0);
         prop.put("xpstopwChecked", env.getConfig("xpstopw", "").equals("true") ? 1 : 0);
diff --git a/htroot/Steering.java b/htroot/Steering.java
index 45948513e..95ceea47d 100644
--- a/htroot/Steering.java
+++ b/htroot/Steering.java
@@ -59,13 +59,15 @@ public class Steering {
         // handle access rights
         switch (switchboard.adminAuthenticated(header)) {
         case 0: // wrong password given
-            try {Thread.currentThread().sleep(3000);} catch (InterruptedException e) {}
+            try {Thread.currentThread().sleep(3000);} catch (InterruptedException e) {} // prevent brute-force
+            prop.put("AUTHENTICATE", "admin log-in"); // force log-in
+            return prop;
         case 1: // no password given
             prop.put("AUTHENTICATE", "admin log-in"); // force log-in
             return prop;
         case 2: // no password stored
-            prop.put("info", 1); // actions only with password
-            return prop;
+            //prop.put("info", 1); // actions only with password
+            //return prop;
         case 3: // soft-authenticated for localhost only
         case 4: // hard-authenticated, all ok
         }
diff --git a/readme.txt b/readme.txt
index 291f225a3..73192b6cf 100644
--- a/readme.txt
+++ b/readme.txt
@@ -7,7 +7,7 @@ under certain conditions; see file gpl.txt for details.
 ---------------------------------------------------------------------------
 This is a P2P-based Web Search Engine
-and also a http/https proxy.
+and also a caching http/https proxy.
 
 The complete documentation can be found inside the 'doc' subdirectory
 in this release. Start browsing the manual by opening the index.html
 file with your web browser.
@@ -16,22 +16,34 @@ file with your web browser.
 
 YOU NEED JAVA 1.4.2 OR LATER TO RUN THIS APPLICATION!
 PLEASE DOWNLOAD JAVA FROM http://java.sun.com
 
-Startup of YaCy:
+Startup and Shutdown of YaCy:
 
-- on Linux : start startYACY.sh
-- on Windows : double-click startYACY.bat
-- on Mac OS X : double-click startYACY.command (alias possible!)
-- on any other OS : set your classpath to the 'classes' folder
-  and execute yacy.class, while your current system
-  path must target the release directory to access the
-  configuration files.
+- on Linux:
+to start: execute startYACY.sh
+to stop : execute stopYACY.sh
 
-Then start using YaCy with the applications on-line interface:
+- on Windows:
+to start: double-click startYACY.bat
+to stop : double-click stopYACY.bat
+
+- on Mac OS X:
+to start: double-click startYACY.command (alias possible!)
+to stop : double-click stopYACY.command
+
+- on any other OS:
+to start: execute java as
+  java -classpath classes:htroot:lib/commons-collections.jar:lib/commons-pool-1.2.jar yacy -startup
+to stop : execute java as
+  java -classpath classes:htroot:lib/commons-collections.jar:lib/commons-pool-1.2.jar yacy -shutdown
+
+
+YaCy is a server process that can be administrated and used
+with your web browser: browse to
 http://localhost:8080
 where you can see your personal search, configuration and
 administration interface.
 
-If you want to use the proxy, simply configure your internet connection
-to use YaCy at port 8080. You can also change the default proxy port.
+If you want to use the built-in proxy, simply configure your internet connection
+to use a proxy at port 8080. You can also change this default proxy port.
 
 If you like to use YaCy not as proxy but only as distributed
 crawling/search engine, you can do so.
@@ -47,5 +59,5 @@ feel free to ask the author for a business proposal to customize
 YaCy according to your needs. We also provide integration solutions if
 the software is about to be integrated into your enterprise application.
 
-Germany, Frankfurt a.M., 03.05.2005
+Germany, Frankfurt a.M., 22.07.2005
 Michael Peter Christen
diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java
index 2c2a38b95..35546cc60 100644
--- a/source/de/anomic/plasma/plasmaHTCache.java
+++ b/source/de/anomic/plasma/plasmaHTCache.java
@@ -136,7 +136,10 @@ public final class plasmaHTCache {
     }
 
     public Entry pop() {
-        return (Entry) cacheStack.removeFirst();
+        if (cacheStack.size() > 0)
+            return (Entry) cacheStack.removeFirst();
+        else
+            return null;
     }
 
     public void storeHeader(String urlHash, httpHeader responseHeader) throws IOException {
@@ -243,7 +246,7 @@ public final class plasmaHTCache {
             ageHours = (System.currentTimeMillis() -
                 Long.parseLong(((String) cacheAge.firstKey()).substring(0, 16), 16)) / 3600000;
         } catch (NumberFormatException e) {
-            e.printStackTrace();
+            //e.printStackTrace();
         }
         log.logSystem("CACHE SCANNED, CONTAINS " + c +
             " FILES = " + currCacheSize/1048576 + "MB, OLDEST IS " +
diff --git a/yacy.init b/yacy.init
index 72780b9d0..85b195230 100644
--- a/yacy.init
+++ b/yacy.init
@@ -400,7 +400,7 @@ xpstopw=true
 20_dhtdistribution_memprereq=1000000
 30_peerping_idlesleep=120000
 30_peerping_busysleep=120000
-30_peerping_memprereq=20000
+30_peerping_memprereq=100000
 40_peerseedcycle_idlesleep=1800000
 40_peerseedcycle_busysleep=1200000
 40_peerseedcycle_memprereq=1000000
@@ -411,14 +411,14 @@ xpstopw=true
 61_globalcrawltrigger_busysleep=100
 61_globalcrawltrigger_memprereq=1000000
 62_remotetriggeredcrawl_idlesleep=10000
-62_remotetriggeredcrawl_busysleep=100
+62_remotetriggeredcrawl_busysleep=2000
 62_remotetriggeredcrawl_memprereq=1000000
 70_cachemanager_idlesleep=5000
 70_cachemanager_busysleep=0
-70_cachemanager_memprereq=10000
+70_cachemanager_memprereq=100000
 80_indexing_idlesleep=5000
 80_indexing_busysleep=0
-80_indexing_memprereq=2000000
+80_indexing_memprereq=1000000
 90_cleanup_idlesleep=300000
 90_cleanup_busysleep=300000
 90_cleanup_memprereq=0
@@ -461,7 +461,7 @@ ramCacheWiki = 8192
 # flushed to disc; this may last some minutes.
 # maxWaitingWordFlush gives the number of seconds that the shutdown
 # may last for the word flush
-wordCacheMax = 6000
+wordCacheMax = 10000
 maxWaitingWordFlush = 180
 
 # Specifies if yacy can be used as transparent http proxy.
diff --git a/yacy.logging b/yacy.logging
index 08a96ec74..d6be2159f 100644
--- a/yacy.logging
+++ b/yacy.logging
@@ -12,7 +12,7 @@
 # INFO    regular action information (i.e. any httpd request URL)
 # FINEST  in-function status debug output
 PARSER.level = INFO
-YACY.level = FINEST
+YACY.level = INFO
 HTCACHE.level = INFO
 PLASMA.level = FINEST
 SERVER.level = INFO
diff --git a/yacy.yellow b/yacy.yellow
index c52b9b878..bffc56d29 100644
--- a/yacy.yellow
+++ b/yacy.yellow
@@ -3,6 +3,3 @@
 # then the proxy passes the client's user agent to the domain's server
 google
 yahoo
-heise
-ebay
-stern
\ No newline at end of file
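The yacy.init changes above tune the idlesleep/busysleep/memprereq triples of the background threads, and IndexCreate_p.java now adjusts 62_remotetriggeredcrawl_busysleep at runtime through setBusySleep(). The real de.anomic.server.serverThread implementation is not shown in this patch; the following made-up sketch only illustrates how the three values interact in such a worker loop:

    // Illustrative sketch only: a worker that honours a busy-sleep pause, an idle-sleep
    // pause and a memory prerequisite, in the spirit of the yacy.init settings above.
    public abstract class ThrottledWorker extends Thread {

        private long idleSleep;   // pause when the last cycle found nothing to do
        private long busySleep;   // pause between two jobs while there is work
        private long memPrereq;   // minimum free memory required to run a job

        protected ThrottledWorker(long idleSleep, long busySleep, long memPrereq) {
            this.idleSleep = idleSleep;
            this.busySleep = busySleep;
            this.memPrereq = memPrereq;
        }

        // Called from a settings servlet, like rct.setBusySleep(newBusySleep) above.
        public synchronized void setBusySleep(long busySleep) {
            this.busySleep = busySleep;
        }

        // Returns true if a job was found and processed, false if the queue was empty.
        protected abstract boolean job() throws Exception;

        public void run() {
            try {
                while (!isInterrupted()) {
                    boolean enoughMemory = Runtime.getRuntime().freeMemory() >= memPrereq;
                    boolean didWork = enoughMemory && job();
                    // busy: short pause; idle or low on memory: long pause
                    Thread.sleep(didWork ? busySleep : idleSleep);
                }
            } catch (Exception e) {
                // a real implementation would log the problem instead of exiting silently
            }
        }
    }

With the new default 62_remotetriggeredcrawl_busysleep=2000, a loop like this handles at most 30 remotely triggered pages per minute, and it pauses whenever free memory drops below the configured memprereq value.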