diff --git a/build.xml b/build.xml
index 2ea83b30a..de172f643 100644
--- a/build.xml
+++ b/build.xml
@@ -338,7 +338,9 @@
-
+
+
+
diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java
index c273bb2d8..c96686f41 100644
--- a/source/de/anomic/http/httpdProxyHandler.java
+++ b/source/de/anomic/http/httpdProxyHandler.java
@@ -274,25 +274,24 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
private boolean blacklistedURL(String hostlow, String path) {
if (blackListURLs == null) return false;
- int index = 0;
+ String pp = ""; // path-pattern
+ // first try to match the domain with wildcard '*'
// [TL] While "." are found within the string
- while ((index = hostlow.indexOf(".", index + 1)) != -1) {
- if (blackListURLs.get(hostlow.substring(0, index + 1) + "*") != null) {
- //System.out.println("Host blocked: " + hostlow.substring(0, index+1) + "*");
- return true;
+ int index = 0;
+ while ((index = hostlow.indexOf('.', index + 1)) != -1) {
+ if ((pp = (String) blackListURLs.get(hostlow.substring(0, index + 1) + "*")) != null) { // "prefix.*" blacklist entry matched
+ return ((pp.equals("*")) || (path.substring(1).matches(pp))); // pp "*" blocks all paths; else path minus leading '/' must match regex pp — NOTE(review): assumes path is non-empty and starts with '/'; confirm callers guarantee this
}
}
-
index = hostlow.length();
- while ((index = hostlow.lastIndexOf(".", index - 1)) != -1) {
- if (blackListURLs.get("*" + hostlow.substring(index, hostlow.length())) != null) {
- //System.out.println("Host blocked: " + "*" + hostlow.substring(index, host.length()));
- return true;
+ while ((index = hostlow.lastIndexOf('.', index - 1)) != -1) {
+ if ((pp = (String) blackListURLs.get("*" + hostlow.substring(index, hostlow.length()))) != null) { // "*.suffix" blacklist entry matched
+ return ((pp.equals("*")) || (path.substring(1).matches(pp))); // same path-pattern semantics as above
}
}
- String pp = ""; // path-pattern
+ // try to match without wildcard in domain
return (((pp = (String) blackListURLs.get(hostlow)) != null) &&
((pp.equals("*")) || (path.substring(1).matches(pp))));
}
diff --git a/source/de/anomic/plasma/plasmaCrawlWorker.java b/source/de/anomic/plasma/plasmaCrawlWorker.java
index e8804add4..99249f3f0 100644
--- a/source/de/anomic/plasma/plasmaCrawlWorker.java
+++ b/source/de/anomic/plasma/plasmaCrawlWorker.java
@@ -210,112 +210,6 @@ public final class plasmaCrawlWorker extends Thread {
this.done = true;
}
}
-
- /*
- private httpc newhttpc(String server, int port, boolean ssl) throws IOException {
- // a new httpc connection, combined with possible remote proxy
- if (remoteProxyUse)
- return httpc.getInstance(server, port, socketTimeout, ssl, remoteProxyHost, remoteProxyPort);
- else return httpc.getInstance(server, port, socketTimeout, ssl);
- }
-
- private void load(
- URL url,
- String referer,
- String initiator,
- int depth,
- plasmaCrawlProfile.entry profile
- ) throws IOException {
- if (url == null) return;
- Date requestDate = new Date(); // remember the time...
- String host = url.getHost();
- String path = url.getPath();
- int port = url.getPort();
- boolean ssl = url.getProtocol().equals("https");
- if (port < 0) port = (ssl) ? 443 : 80;
-
- // set referrer; in some case advertise a little bit:
- referer = referer.trim();
- if (referer.length() == 0) referer = "http://www.yacy.net/yacy/";
-
- // take a file from the net
- httpc remote = null;
- try {
- // create a request header
- httpHeader requestHeader = new httpHeader();
- requestHeader.put("User-Agent", httpdProxyHandler.userAgent);
- requestHeader.put("Referer", referer);
- requestHeader.put("Accept-Encoding", "gzip,deflate");
-
- //System.out.println("CRAWLER_REQUEST_HEADER=" + requestHeader.toString()); // DEBUG
-
- // open the connection
- remote = newhttpc(host, port, ssl);
-
- // send request
- httpc.response res = remote.GET(path, requestHeader);
-
- if (res.status.startsWith("200")) {
- // the transfer is ok
- long contentLength = res.responseHeader.contentLength();
-
- // reserve cache entry
- plasmaHTCache.Entry htCache = cacheManager.newEntry(requestDate, depth, url, requestHeader, res.status, res.responseHeader, initiator, profile);
-
- // request has been placed and result has been returned. work off response
- File cacheFile = cacheManager.getCachePath(url);
- try {
- if (!(plasmaParser.supportedMimeTypesContains(res.responseHeader.mime()))) {
- // if the response has not the right file type then reject file
- remote.close();
- log.logInfo("REJECTED WRONG MIME TYPE " + res.responseHeader.mime() + " for url " + url.toString());
- htCache.status = plasmaHTCache.CACHE_UNFILLED;
- } else if ((profile.storeHTCache()) && ((error = htCache.shallStoreCache()) == null)) {
- // we write the new cache entry to file system directly
- cacheFile.getParentFile().mkdirs();
- FileOutputStream fos = new FileOutputStream(cacheFile);
- htCache.cacheArray = res.writeContent(fos); // writes in cacheArray and cache file
- fos.close();
- htCache.status = plasmaHTCache.CACHE_FILL;
- } else {
- if (error != null) log.logDebug("CRAWLER NOT STORED RESOURCE " + url.toString() + ": " + error);
- // anyway, the content still lives in the content scraper
- htCache.cacheArray = res.writeContent(null); // writes only into cacheArray
- htCache.status = plasmaHTCache.CACHE_PASSING;
- }
- // enQueue new entry with response header
- if ((initiator == null) || (initiator.length() == 0)) {
- // enqueued for proxy writings
- cacheManager.stackProcess(htCache);
- } else {
- // direct processing for crawling
- cacheManager.process(htCache);
- }
- } catch (SocketException e) {
- // this may happen if the client suddenly closes its connection
- // maybe the user has stopped loading
- // in that case, we are not responsible and just forget it
- // but we clean the cache also, since it may be only partial
- // and most possible corrupted
- if (cacheFile.exists()) cacheFile.delete();
- log.logError("CRAWLER LOADER ERROR1: with url=" + url.toString() + ": " + e.toString());
- }
- } else {
- // if the response has not the right response type then reject file
- log.logInfo("REJECTED WRONG STATUS TYPE '" + res.status + "' for url " + url.toString());
- // not processed any further
- }
- remote.close();
- } catch (Exception e) {
- // this may happen if the targeted host does not exist or anything with the
- // remote server was wrong.
- log.logError("CRAWLER LOADER ERROR2 with url=" + url.toString() + ": " + e.toString());
- e.printStackTrace();
- } finally {
- if (remote != null) httpc.returnInstance(remote);
- }
- }
- */
public void setStopped(boolean stopped) {
this.stopped = stopped;
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index d530320e1..1d78f87f4 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -587,7 +587,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// if the server is busy, we do crawling more slowly
- if (!(cacheManager.idle())) try {Thread.currentThread().sleep(2000);} catch (InterruptedException e) {}
+ //if (!(cacheManager.idle())) try {Thread.currentThread().sleep(2000);} catch (InterruptedException e) {}
// if crawling was paused we have to wait until we wer notified to continue
synchronized(this.crawlingPausedSync) {
@@ -793,7 +793,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// put anchors on crawl stack
if (((processCase == 4) || (processCase == 5)) &&
- (entry.depth < entry.profile.generalDepth())) {
+ (entry.depth < entry.profile.generalDepth())) {
Map hl = document.getHyperlinks();
Iterator i = hl.entrySet().iterator();
String nexturlstring;
@@ -816,7 +816,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// create index
-
String descr = document.getMainLongTitle();
URL referrerURL = entry.referrerURL();
String referrerHash = (referrerURL == null) ? plasmaURL.dummyHash : plasmaURL.urlHash(referrerURL);
diff --git a/source/de/anomic/plasma/plasmaWordIndexEntity.java b/source/de/anomic/plasma/plasmaWordIndexEntity.java
index 0ff1e80a1..15cbb6487 100644
--- a/source/de/anomic/plasma/plasmaWordIndexEntity.java
+++ b/source/de/anomic/plasma/plasmaWordIndexEntity.java
@@ -175,11 +175,11 @@ public class plasmaWordIndexEntity {
boolean success = theLocation.delete();
// and also the paren directory if that is empty
if (success) {
- File f = theLocation.getParentFile();
- while ((f.isDirectory()) && (f.list().length == 0)) {
- if (!(f.delete())) break;
- f = f.getParentFile();
- }
+ File f = theLocation.getParentFile();
+ while ((f.isDirectory()) && (f.list().length == 0)) {
+ if (!(f.delete())) break;
+ f = f.getParentFile();
+ }
}
// reset all values
theIndex = null;
@@ -188,7 +188,7 @@ public class plasmaWordIndexEntity {
theTmpMap = new TreeMap();
//theIndex.removeAll();
return success;
- } else {
+ } else {
theTmpMap = new TreeMap();
return true;
}
diff --git a/source/de/anomic/server/logging/serverLog.java b/source/de/anomic/server/logging/serverLog.java
index d656be2ad..d571d7b14 100644
--- a/source/de/anomic/server/logging/serverLog.java
+++ b/source/de/anomic/server/logging/serverLog.java
@@ -73,7 +73,8 @@ public final class serverLog {
private final Logger theLogger;
public serverLog(String appName) {
- this.theLogger = Logger.getLogger(appName);
+ this.theLogger = Logger.getLogger(appName);
+ this.theLogger.setLevel(Level.FINEST); // set a default level — NOTE(review): forces FINEST on every new serverLog; this may override per-logger levels loaded from yacy.logging, confirm intended
}
public void setLevel(Level newLevel) {
@@ -152,11 +153,10 @@ public final class serverLog {
}
- public static final void configureLogging(String homePath) throws SecurityException, FileNotFoundException, IOException {
+ public static final void configureLogging(File loggingConfigFile) throws SecurityException, FileNotFoundException, IOException {
FileInputStream fileIn = null;
try {
- File loggingConfigFile = new File(homePath, "yacy.logging");
System.out.println("STARTUP: Trying to load logging configuration from file " + loggingConfigFile.toString());
fileIn = new FileInputStream(loggingConfigFile);
diff --git a/source/yacy.java b/source/yacy.java
index aa0dd357c..d53a51ecd 100644
--- a/source/yacy.java
+++ b/source/yacy.java
@@ -78,6 +78,7 @@ import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
+import java.util.Enumeration;
import java.util.Properties;
import java.util.TreeSet;
import java.util.regex.Matcher;
@@ -88,11 +89,14 @@ import de.anomic.http.httpc;
import de.anomic.http.httpd;
import de.anomic.http.httpdFileHandler;
import de.anomic.http.httpdProxyHandler;
+import de.anomic.kelondro.kelondroTree;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
+import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaWordIndexEntry;
+import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.server.serverCodings;
import de.anomic.server.serverCore;
import de.anomic.server.serverFileUtils;
@@ -100,7 +104,6 @@ import de.anomic.server.serverSystem;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.enumerateFiles;
import de.anomic.yacy.yacyCore;
-//import de.anomic.http.*;
public final class yacy {
@@ -128,7 +131,7 @@ public final class yacy {
// setting up logging
try {
- serverLog.configureLogging(homePath);
+ serverLog.configureLogging(new File(homePath, "yacy.logging"));
} catch (IOException e) {
System.out.println("could not find logging properties in homePath=" + homePath);
e.printStackTrace();
@@ -464,7 +467,53 @@ public final class yacy {
// finished
serverLog.logSystem("GEN-WORDSTAT", "FINISHED");
}
+
+    private static void checkMigrate(File dbroot, serverLog log, File file, plasmaWordIndex wordIndex) throws IOException {
+        // migrate one WORDS db file into the assortment cache when it is small enough (<= 50 entries)
+        kelondroTree db = new kelondroTree(file, 0);
+        String wordhash = file.getName().substring(0, 12);
+        int size = db.size();
+        long length = file.length();
+        db.close(); // close here; the entity below re-opens (and may delete) the same file. Do NOT close again at the end.
+        if (size <= 50) {
+            plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordhash);
+            plasmaWordIndexEntity entity = new plasmaWordIndexEntity(dbroot, wordhash, true);
+            Enumeration entries = entity.elements(true);
+            plasmaWordIndexEntry entry;
+            while (entries.hasMoreElements()) {
+                entry = (plasmaWordIndexEntry) entries.nextElement();
+                container.add(new plasmaWordIndexEntry[]{entry}, System.currentTimeMillis());
+            }
+            wordIndex.addEntries(container);
+            entity.deleteComplete();
+            entity.close();
+            if (file.exists()) {
+                log.logInfo("MIGRATED " + file.toString() + ": " + size + " entries, " + (length / 1024) + "kb, delete fail at end");
+                file.delete();
+            } else {
+                log.logInfo("MIGRATED " + file.toString() + ": " + size + " entries, " + (length / 1024) + "kb");
+            }
+        } else {
+            log.logInfo("SKIPPED " + file.toString() + ": " + size + " entries, " + (length / 1024) + "kb");
+        }
+    }
+    public static void migrateWords(String homePath) {
+        // run with "java -classpath classes yacy -migratewords"; logging setup is best-effort
+        try {serverLog.configureLogging(new File(homePath, "yacy.logging"));} catch (Exception e) {}
+        File dbroot = new File(homePath, "DATA/PLASMADB");
+        try {
+            serverLog log = new serverLog("WORDMIGRATION");
+            plasmaWordIndex wordIndex = new plasmaWordIndex(dbroot, 20000, log);
+            enumerateFiles words = new enumerateFiles(new File(dbroot, "WORDS"), true, false, true, true);
+            while (words.hasMoreElements()) {
+                checkMigrate(dbroot, log, (File) words.nextElement(), wordIndex);
+            }
+            wordIndex.close(60);
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
private static HashMap loadWordMap(File wordlist) {
// returns a hash-word - Relation
@@ -575,8 +624,8 @@ public final class yacy {
// application wrapper
public static void main(String args[]) {
String applicationRoot = System.getProperty("user.dir");
- System.out.println("args.length=" + args.length);
- System.out.print("args=["); for (int i = 0; i < args.length; i++) System.out.print(args[i] + ", "); System.out.println("]");
+ //System.out.println("args.length=" + args.length);
+ //System.out.print("args=["); for (int i = 0; i < args.length; i++) System.out.print(args[i] + ", "); System.out.println("]");
if ((args.length >= 1) && ((args[0].equals("-startup")) || (args[0].equals("-start")))) {
// normal start-up of yacy
if (args.length == 2) applicationRoot= args[1];
@@ -585,6 +634,11 @@ public final class yacy {
// normal shutdown of yacy
if (args.length == 2) applicationRoot= args[1];
shutdown(applicationRoot);
+ } else if ((args.length >= 1) && (args[0].equals("-migratewords"))) {
+ // migrate words from DATA/PLASMADB/WORDS path to assortment cache, if possible
+ // attention: this may run long and should not be interrupted!
+ if (args.length == 2) applicationRoot= args[1];
+ migrateWords(applicationRoot);
} else if ((args.length >= 1) && (args[0].equals("-deletestopwords"))) {
// delete those words in the index that are listed in the stopwords file
if (args.length == 2) applicationRoot= args[1];
@@ -606,7 +660,6 @@ public final class yacy {
startup(applicationRoot);
}
}
-
}
class shutdownHookThread extends Thread
diff --git a/yacy.logging b/yacy.logging
index 0101b8827..d6be2159f 100644
--- a/yacy.logging
+++ b/yacy.logging
@@ -4,17 +4,17 @@
# setting logging levels vor individual classes
# possible values are:
-# ZERO no output at all
-# FAILURE system-level error, internal cause, critical and not fixeable (i.e. inconsistency)
+# OFF no output at all
+# SEVERE system-level error, internal cause, critical and not fixable (i.e. inconsistency)
# ERROR exceptional error, catcheable and non-critical (i.e. file error)
# WARNING uncritical service failure, may require user activity (i.e. input required, wrong authorization)
-# SYSTEM regular system status information (i.e. start-up messages)
+# CONFIG regular system status information (i.e. start-up messages)
# INFO regular action information (i.e. any httpd request URL)
-# DEBUG in-function status debug output
+# FINEST in-function status debug output
PARSER.level = INFO
YACY.level = INFO
HTCACHE.level = INFO
-PLASMA.level = INFO
+PLASMA.level = FINEST
SERVER.level = INFO
# List of global handlers