diff --git a/build.xml b/build.xml index 2ea83b30a..de172f643 100644 --- a/build.xml +++ b/build.xml @@ -338,7 +338,9 @@ - + + + diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index c273bb2d8..c96686f41 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -274,25 +274,26 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt private boolean blacklistedURL(String hostlow, String path) { if (blackListURLs == null) return false; - int index = 0; + String pp = ""; // path-pattern + // first try to match the domain with wildcard '*' // [TL] While "." are found within the string - while ((index = hostlow.indexOf(".", index + 1)) != -1) { - if (blackListURLs.get(hostlow.substring(0, index + 1) + "*") != null) { - //System.out.println("Host blocked: " + hostlow.substring(0, index+1) + "*"); - return true; + int index = 0; + while ((index = hostlow.indexOf('.', index + 1)) != -1) { + if ((pp = (String) blackListURLs.get(hostlow.substring(0, index + 1) + "*")) != null) { + return ((pp.equals("*")) || (path.substring(1).matches(pp))); } } - index = hostlow.length(); - while ((index = hostlow.lastIndexOf(".", index - 1)) != -1) { - if (blackListURLs.get("*" + hostlow.substring(index, hostlow.length())) != null) { - //System.out.println("Host blocked: " + "*" + hostlow.substring(index, host.length())); - return true; + // the forward scan above always ends with index == -1, so restart at the end of the host name or the backward scan never runs + index = hostlow.length(); + while ((index = hostlow.lastIndexOf('.', index - 1)) != -1) { + if ((pp = (String) blackListURLs.get("*" + hostlow.substring(index, hostlow.length()))) != null) { + return ((pp.equals("*")) || (path.substring(1).matches(pp))); } } - String pp = ""; // path-pattern + // try to match without wildcard in domain return (((pp = (String) blackListURLs.get(hostlow)) != null) && ((pp.equals("*")) || (path.substring(1).matches(pp)))); } diff --git a/source/de/anomic/plasma/plasmaCrawlWorker.java 
b/source/de/anomic/plasma/plasmaCrawlWorker.java index e8804add4..99249f3f0 100644 --- a/source/de/anomic/plasma/plasmaCrawlWorker.java +++ b/source/de/anomic/plasma/plasmaCrawlWorker.java @@ -210,112 +210,6 @@ public final class plasmaCrawlWorker extends Thread { this.done = true; } } - - /* - private httpc newhttpc(String server, int port, boolean ssl) throws IOException { - // a new httpc connection, combined with possible remote proxy - if (remoteProxyUse) - return httpc.getInstance(server, port, socketTimeout, ssl, remoteProxyHost, remoteProxyPort); - else return httpc.getInstance(server, port, socketTimeout, ssl); - } - - private void load( - URL url, - String referer, - String initiator, - int depth, - plasmaCrawlProfile.entry profile - ) throws IOException { - if (url == null) return; - Date requestDate = new Date(); // remember the time... - String host = url.getHost(); - String path = url.getPath(); - int port = url.getPort(); - boolean ssl = url.getProtocol().equals("https"); - if (port < 0) port = (ssl) ? 
443 : 80; - - // set referrer; in some case advertise a little bit: - referer = referer.trim(); - if (referer.length() == 0) referer = "http://www.yacy.net/yacy/"; - - // take a file from the net - httpc remote = null; - try { - // create a request header - httpHeader requestHeader = new httpHeader(); - requestHeader.put("User-Agent", httpdProxyHandler.userAgent); - requestHeader.put("Referer", referer); - requestHeader.put("Accept-Encoding", "gzip,deflate"); - - //System.out.println("CRAWLER_REQUEST_HEADER=" + requestHeader.toString()); // DEBUG - - // open the connection - remote = newhttpc(host, port, ssl); - - // send request - httpc.response res = remote.GET(path, requestHeader); - - if (res.status.startsWith("200")) { - // the transfer is ok - long contentLength = res.responseHeader.contentLength(); - - // reserve cache entry - plasmaHTCache.Entry htCache = cacheManager.newEntry(requestDate, depth, url, requestHeader, res.status, res.responseHeader, initiator, profile); - - // request has been placed and result has been returned. 
work off response - File cacheFile = cacheManager.getCachePath(url); - try { - if (!(plasmaParser.supportedMimeTypesContains(res.responseHeader.mime()))) { - // if the response has not the right file type then reject file - remote.close(); - log.logInfo("REJECTED WRONG MIME TYPE " + res.responseHeader.mime() + " for url " + url.toString()); - htCache.status = plasmaHTCache.CACHE_UNFILLED; - } else if ((profile.storeHTCache()) && ((error = htCache.shallStoreCache()) == null)) { - // we write the new cache entry to file system directly - cacheFile.getParentFile().mkdirs(); - FileOutputStream fos = new FileOutputStream(cacheFile); - htCache.cacheArray = res.writeContent(fos); // writes in cacheArray and cache file - fos.close(); - htCache.status = plasmaHTCache.CACHE_FILL; - } else { - if (error != null) log.logDebug("CRAWLER NOT STORED RESOURCE " + url.toString() + ": " + error); - // anyway, the content still lives in the content scraper - htCache.cacheArray = res.writeContent(null); // writes only into cacheArray - htCache.status = plasmaHTCache.CACHE_PASSING; - } - // enQueue new entry with response header - if ((initiator == null) || (initiator.length() == 0)) { - // enqueued for proxy writings - cacheManager.stackProcess(htCache); - } else { - // direct processing for crawling - cacheManager.process(htCache); - } - } catch (SocketException e) { - // this may happen if the client suddenly closes its connection - // maybe the user has stopped loading - // in that case, we are not responsible and just forget it - // but we clean the cache also, since it may be only partial - // and most possible corrupted - if (cacheFile.exists()) cacheFile.delete(); - log.logError("CRAWLER LOADER ERROR1: with url=" + url.toString() + ": " + e.toString()); - } - } else { - // if the response has not the right response type then reject file - log.logInfo("REJECTED WRONG STATUS TYPE '" + res.status + "' for url " + url.toString()); - // not processed any further - } - remote.close(); 
- } catch (Exception e) { - // this may happen if the targeted host does not exist or anything with the - // remote server was wrong. - log.logError("CRAWLER LOADER ERROR2 with url=" + url.toString() + ": " + e.toString()); - e.printStackTrace(); - } finally { - if (remote != null) httpc.returnInstance(remote); - } - } - */ public void setStopped(boolean stopped) { this.stopped = stopped; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index d530320e1..1d78f87f4 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -587,7 +587,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // if the server is busy, we do crawling more slowly - if (!(cacheManager.idle())) try {Thread.currentThread().sleep(2000);} catch (InterruptedException e) {} + //if (!(cacheManager.idle())) try {Thread.currentThread().sleep(2000);} catch (InterruptedException e) {} // if crawling was paused we have to wait until we wer notified to continue synchronized(this.crawlingPausedSync) { @@ -793,7 +793,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // put anchors on crawl stack if (((processCase == 4) || (processCase == 5)) && - (entry.depth < entry.profile.generalDepth())) { + (entry.depth < entry.profile.generalDepth())) { Map hl = document.getHyperlinks(); Iterator i = hl.entrySet().iterator(); String nexturlstring; @@ -816,7 +816,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // create index - String descr = document.getMainLongTitle(); URL referrerURL = entry.referrerURL(); String referrerHash = (referrerURL == null) ? 
plasmaURL.dummyHash : plasmaURL.urlHash(referrerURL); diff --git a/source/de/anomic/plasma/plasmaWordIndexEntity.java b/source/de/anomic/plasma/plasmaWordIndexEntity.java index 0ff1e80a1..15cbb6487 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntity.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntity.java @@ -175,11 +175,11 @@ public class plasmaWordIndexEntity { boolean success = theLocation.delete(); // and also the paren directory if that is empty if (success) { - File f = theLocation.getParentFile(); - while ((f.isDirectory()) && (f.list().length == 0)) { - if (!(f.delete())) break; - f = f.getParentFile(); - } + File f = theLocation.getParentFile(); + while ((f.isDirectory()) && (f.list().length == 0)) { + if (!(f.delete())) break; + f = f.getParentFile(); + } } // reset all values theIndex = null; @@ -188,7 +188,7 @@ public class plasmaWordIndexEntity { theTmpMap = new TreeMap(); //theIndex.removeAll(); return success; - } else { + } else { theTmpMap = new TreeMap(); return true; } diff --git a/source/de/anomic/server/logging/serverLog.java b/source/de/anomic/server/logging/serverLog.java index d656be2ad..d571d7b14 100644 --- a/source/de/anomic/server/logging/serverLog.java +++ b/source/de/anomic/server/logging/serverLog.java @@ -73,7 +73,8 @@ public final class serverLog { private final Logger theLogger; public serverLog(String appName) { - this.theLogger = Logger.getLogger(appName); + this.theLogger = Logger.getLogger(appName); + if (this.theLogger.getLevel() == null) this.theLogger.setLevel(Level.FINEST); // default only: keep a level that the logging configuration already assigned to this logger name } public void setLevel(Level newLevel) { @@ -152,11 +153,10 @@ public final class serverLog { } - public static final void configureLogging(String homePath) throws SecurityException, FileNotFoundException, IOException { + public static final void configureLogging(File loggingConfigFile) throws SecurityException, FileNotFoundException, IOException { FileInputStream fileIn = null; try { - File loggingConfigFile = new File(homePath, "yacy.logging"); 
System.out.println("STARTUP: Trying to load logging configuration from file " + loggingConfigFile.toString()); fileIn = new FileInputStream(loggingConfigFile); diff --git a/source/yacy.java b/source/yacy.java index aa0dd357c..d53a51ecd 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -78,6 +78,7 @@ import java.net.URL; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; +import java.util.Enumeration; import java.util.Properties; import java.util.TreeSet; import java.util.regex.Matcher; @@ -88,11 +89,14 @@ import de.anomic.http.httpc; import de.anomic.http.httpd; import de.anomic.http.httpdFileHandler; import de.anomic.http.httpdProxyHandler; +import de.anomic.kelondro.kelondroTree; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaURL; +import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.plasmaWordIndexEntity; import de.anomic.plasma.plasmaWordIndexEntry; +import de.anomic.plasma.plasmaWordIndexEntryContainer; import de.anomic.server.serverCodings; import de.anomic.server.serverCore; import de.anomic.server.serverFileUtils; @@ -100,7 +104,6 @@ import de.anomic.server.serverSystem; import de.anomic.server.logging.serverLog; import de.anomic.tools.enumerateFiles; import de.anomic.yacy.yacyCore; -//import de.anomic.http.*; public final class yacy { @@ -128,7 +131,7 @@ public final class yacy { // setting up logging try { - serverLog.configureLogging(homePath); + serverLog.configureLogging(new File(homePath, "yacy.logging")); } catch (IOException e) { System.out.println("could not find logging properties in homePath=" + homePath); e.printStackTrace(); @@ -464,7 +467,53 @@ public final class yacy { // finished serverLog.logSystem("GEN-WORDSTAT", "FINISHED"); } + + private static void checkMigrate(File dbroot, serverLog log, File file, plasmaWordIndex wordIndex) throws IOException { + kelondroTree db = new kelondroTree(file, 0); + String wordhash = 
file.getName().substring(0, 12); + int size = db.size(); + long length = file.length(); + db.close(); + // db is not used below this point (size and length were captured above), so it is closed exactly once here + if (size <= 50) { + plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordhash); + plasmaWordIndexEntity entity = new plasmaWordIndexEntity(dbroot, wordhash, true); + Enumeration entries = entity.elements(true); + plasmaWordIndexEntry entry; + while (entries.hasMoreElements()) { + entry = (plasmaWordIndexEntry) entries.nextElement(); + container.add(new plasmaWordIndexEntry[]{entry}, System.currentTimeMillis()); + } + wordIndex.addEntries(container); + entity.deleteComplete(); + entity.close(); + if (file.exists()) { + log.logInfo("MIGRATED " + file.toString() + ": " + size + " entries, " + (length / 1024) + "kb, delete fail at end"); + file.delete(); + } else { + log.logInfo("MIGRATED " + file.toString() + ": " + size + " entries, " + (length / 1024) + "kb"); + } + } else { + log.logInfo("SKIPPED " + file.toString() + ": " + size + " entries, " + (length / 1024) + "kb"); + } + } + public static void migrateWords(String homePath) { + // run with "java -classpath classes yacy -migratewords" + try {serverLog.configureLogging(new File(homePath, "yacy.logging"));} catch (Exception e) {System.out.println("could not load logging properties from homePath=" + homePath + ": " + e);} + File dbroot = new File(new File(homePath), "DATA/PLASMADB"); + try { + serverLog log = new serverLog("WORDMIGRATION"); + plasmaWordIndex wordIndex = new plasmaWordIndex(dbroot, 20000, log); + enumerateFiles words = new enumerateFiles(new File(dbroot, "WORDS"), true, false, true, true); + while (words.hasMoreElements()) { + checkMigrate(dbroot, log, (File) words.nextElement(), wordIndex); + } + wordIndex.close(60); + } catch (IOException e) { + e.printStackTrace(); + } + } private static HashMap loadWordMap(File wordlist) { // returns a hash-word - Relation @@ -575,8 +624,8 @@ public final class yacy { // application wrapper public static void main(String args[]) { String applicationRoot = System.getProperty("user.dir"); - 
System.out.println("args.length=" + args.length); - System.out.print("args=["); for (int i = 0; i < args.length; i++) System.out.print(args[i] + ", "); System.out.println("]"); + //System.out.println("args.length=" + args.length); + //System.out.print("args=["); for (int i = 0; i < args.length; i++) System.out.print(args[i] + ", "); System.out.println("]"); if ((args.length >= 1) && ((args[0].equals("-startup")) || (args[0].equals("-start")))) { // normal start-up of yacy if (args.length == 2) applicationRoot= args[1]; @@ -585,6 +634,11 @@ public final class yacy { // normal shutdown of yacy if (args.length == 2) applicationRoot= args[1]; shutdown(applicationRoot); + } else if ((args.length >= 1) && (args[0].equals("-migratewords"))) { + // migrate words from DATA/PLASMADB/WORDS path to assortment cache, if possible + // attention: this may run long and should not be interrupted! + if (args.length == 2) applicationRoot= args[1]; + migrateWords(applicationRoot); } else if ((args.length >= 1) && (args[0].equals("-deletestopwords"))) { // delete those words in the index that are listed in the stopwords file if (args.length == 2) applicationRoot= args[1]; @@ -606,7 +660,6 @@ public final class yacy { startup(applicationRoot); } } - } class shutdownHookThread extends Thread diff --git a/yacy.logging b/yacy.logging index 0101b8827..d6be2159f 100644 --- a/yacy.logging +++ b/yacy.logging @@ -4,17 +4,17 @@ # setting logging levels vor individual classes # possible values are: -# ZERO no output at all -# FAILURE system-level error, internal cause, critical and not fixeable (i.e. inconsistency) +# OFF no output at all +# SEVERE system-level error, internal cause, critical and not fixeable (i.e. inconsistency) # ERROR exceptional error, catcheable and non-critical (i.e. file error) # WARNING uncritical service failure, may require user activity (i.e. input required, wrong authorization) -# SYSTEM regular system status information (i.e. 
start-up messages) +# CONFIG regular system status information (i.e. start-up messages) # INFO regular action information (i.e. any httpd request URL) -# DEBUG in-function status debug output +# FINEST in-function status debug output PARSER.level = INFO YACY.level = INFO HTCACHE.level = INFO -PLASMA.level = INFO +PLASMA.level = FINEST SERVER.level = INFO # List of global handlers