added word migration to assortments (start with 'java -classpath classes yacy -migratewords')

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@278 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent 33e992cc8b
commit a5b40923b6

@ -338,7 +338,9 @@
<pathelement location="${libx}" />
<fileset dir="${libx}" includes="**/*.jar" />
</classpath>
<arg line="-start ${user.home}"/>
<arg line="-start"/>
<!-- <arg line="-migratewords"/> -->
<!-- <arg line="-start ${user.dir}"/>-->
</java>
</target>

@ -274,25 +274,24 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
private boolean blacklistedURL(String hostlow, String path) {
if (blackListURLs == null) return false;
int index = 0;
String pp = ""; // path-pattern
// first try to match the domain with wildcard '*'
// [TL] While "." are found within the string
while ((index = hostlow.indexOf(".", index + 1)) != -1) {
if (blackListURLs.get(hostlow.substring(0, index + 1) + "*") != null) {
//System.out.println("Host blocked: " + hostlow.substring(0, index+1) + "*");
return true;
int index = 0;
while ((index = hostlow.indexOf('.', index + 1)) != -1) {
if ((pp = (String) blackListURLs.get(hostlow.substring(0, index + 1) + "*")) != null) {
return ((pp.equals("*")) || (path.substring(1).matches(pp)));
}
}
index = hostlow.length();
while ((index = hostlow.lastIndexOf(".", index - 1)) != -1) {
if (blackListURLs.get("*" + hostlow.substring(index, hostlow.length())) != null) {
//System.out.println("Host blocked: " + "*" + hostlow.substring(index, host.length()));
return true;
while ((index = hostlow.lastIndexOf('.', index - 1)) != -1) {
if ((pp = (String) blackListURLs.get("*" + hostlow.substring(index, hostlow.length()))) != null) {
return ((pp.equals("*")) || (path.substring(1).matches(pp)));
}
}
String pp = ""; // path-pattern
// try to match without wildcard in domain
return (((pp = (String) blackListURLs.get(hostlow)) != null) &&
((pp.equals("*")) || (path.substring(1).matches(pp))));
}

@ -210,112 +210,6 @@ public final class plasmaCrawlWorker extends Thread {
this.done = true;
}
}
/*
private httpc newhttpc(String server, int port, boolean ssl) throws IOException {
// a new httpc connection, combined with possible remote proxy
if (remoteProxyUse)
return httpc.getInstance(server, port, socketTimeout, ssl, remoteProxyHost, remoteProxyPort);
else return httpc.getInstance(server, port, socketTimeout, ssl);
}
private void load(
URL url,
String referer,
String initiator,
int depth,
plasmaCrawlProfile.entry profile
) throws IOException {
if (url == null) return;
Date requestDate = new Date(); // remember the time...
String host = url.getHost();
String path = url.getPath();
int port = url.getPort();
boolean ssl = url.getProtocol().equals("https");
if (port < 0) port = (ssl) ? 443 : 80;
// set referrer; in some case advertise a little bit:
referer = referer.trim();
if (referer.length() == 0) referer = "http://www.yacy.net/yacy/";
// take a file from the net
httpc remote = null;
try {
// create a request header
httpHeader requestHeader = new httpHeader();
requestHeader.put("User-Agent", httpdProxyHandler.userAgent);
requestHeader.put("Referer", referer);
requestHeader.put("Accept-Encoding", "gzip,deflate");
//System.out.println("CRAWLER_REQUEST_HEADER=" + requestHeader.toString()); // DEBUG
// open the connection
remote = newhttpc(host, port, ssl);
// send request
httpc.response res = remote.GET(path, requestHeader);
if (res.status.startsWith("200")) {
// the transfer is ok
long contentLength = res.responseHeader.contentLength();
// reserve cache entry
plasmaHTCache.Entry htCache = cacheManager.newEntry(requestDate, depth, url, requestHeader, res.status, res.responseHeader, initiator, profile);
// request has been placed and result has been returned. work off response
File cacheFile = cacheManager.getCachePath(url);
try {
if (!(plasmaParser.supportedMimeTypesContains(res.responseHeader.mime()))) {
// if the response has not the right file type then reject file
remote.close();
log.logInfo("REJECTED WRONG MIME TYPE " + res.responseHeader.mime() + " for url " + url.toString());
htCache.status = plasmaHTCache.CACHE_UNFILLED;
} else if ((profile.storeHTCache()) && ((error = htCache.shallStoreCache()) == null)) {
// we write the new cache entry to file system directly
cacheFile.getParentFile().mkdirs();
FileOutputStream fos = new FileOutputStream(cacheFile);
htCache.cacheArray = res.writeContent(fos); // writes in cacheArray and cache file
fos.close();
htCache.status = plasmaHTCache.CACHE_FILL;
} else {
if (error != null) log.logDebug("CRAWLER NOT STORED RESOURCE " + url.toString() + ": " + error);
// anyway, the content still lives in the content scraper
htCache.cacheArray = res.writeContent(null); // writes only into cacheArray
htCache.status = plasmaHTCache.CACHE_PASSING;
}
// enQueue new entry with response header
if ((initiator == null) || (initiator.length() == 0)) {
// enqueued for proxy writings
cacheManager.stackProcess(htCache);
} else {
// direct processing for crawling
cacheManager.process(htCache);
}
} catch (SocketException e) {
// this may happen if the client suddenly closes its connection
// maybe the user has stopped loading
// in that case, we are not responsible and just forget it
// but we clean the cache also, since it may be only partial
// and most possible corrupted
if (cacheFile.exists()) cacheFile.delete();
log.logError("CRAWLER LOADER ERROR1: with url=" + url.toString() + ": " + e.toString());
}
} else {
// if the response has not the right response type then reject file
log.logInfo("REJECTED WRONG STATUS TYPE '" + res.status + "' for url " + url.toString());
// not processed any further
}
remote.close();
} catch (Exception e) {
// this may happen if the targeted host does not exist or anything with the
// remote server was wrong.
log.logError("CRAWLER LOADER ERROR2 with url=" + url.toString() + ": " + e.toString());
e.printStackTrace();
} finally {
if (remote != null) httpc.returnInstance(remote);
}
}
*/
public void setStopped(boolean stopped) {
this.stopped = stopped;

@ -587,7 +587,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// if the server is busy, we do crawling more slowly
if (!(cacheManager.idle())) try {Thread.currentThread().sleep(2000);} catch (InterruptedException e) {}
//if (!(cacheManager.idle())) try {Thread.currentThread().sleep(2000);} catch (InterruptedException e) {}
// if crawling was paused we have to wait until we are notified to continue
synchronized(this.crawlingPausedSync) {
@ -793,7 +793,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// put anchors on crawl stack
if (((processCase == 4) || (processCase == 5)) &&
(entry.depth < entry.profile.generalDepth())) {
(entry.depth < entry.profile.generalDepth())) {
Map hl = document.getHyperlinks();
Iterator i = hl.entrySet().iterator();
String nexturlstring;
@ -816,7 +816,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// create index
String descr = document.getMainLongTitle();
URL referrerURL = entry.referrerURL();
String referrerHash = (referrerURL == null) ? plasmaURL.dummyHash : plasmaURL.urlHash(referrerURL);

@ -175,11 +175,11 @@ public class plasmaWordIndexEntity {
boolean success = theLocation.delete();
// and also the parent directory if that is empty
if (success) {
File f = theLocation.getParentFile();
while ((f.isDirectory()) && (f.list().length == 0)) {
if (!(f.delete())) break;
f = f.getParentFile();
}
File f = theLocation.getParentFile();
while ((f.isDirectory()) && (f.list().length == 0)) {
if (!(f.delete())) break;
f = f.getParentFile();
}
}
// reset all values
theIndex = null;
@ -188,7 +188,7 @@ public class plasmaWordIndexEntity {
theTmpMap = new TreeMap();
//theIndex.removeAll();
return success;
} else {
} else {
theTmpMap = new TreeMap();
return true;
}

@ -73,7 +73,8 @@ public final class serverLog {
private final Logger theLogger;
public serverLog(String appName) {
this.theLogger = Logger.getLogger(appName);
this.theLogger = Logger.getLogger(appName);
this.theLogger.setLevel(Level.FINEST); // set a default level
}
public void setLevel(Level newLevel) {
@ -152,11 +153,10 @@ public final class serverLog {
}
public static final void configureLogging(String homePath) throws SecurityException, FileNotFoundException, IOException {
public static final void configureLogging(File loggingConfigFile) throws SecurityException, FileNotFoundException, IOException {
FileInputStream fileIn = null;
try {
File loggingConfigFile = new File(homePath, "yacy.logging");
System.out.println("STARTUP: Trying to load logging configuration from file " + loggingConfigFile.toString());
fileIn = new FileInputStream(loggingConfigFile);

@ -78,6 +78,7 @@ import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Enumeration;
import java.util.Properties;
import java.util.TreeSet;
import java.util.regex.Matcher;
@ -88,11 +89,14 @@ import de.anomic.http.httpc;
import de.anomic.http.httpd;
import de.anomic.http.httpdFileHandler;
import de.anomic.http.httpdProxyHandler;
import de.anomic.kelondro.kelondroTree;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.server.serverCodings;
import de.anomic.server.serverCore;
import de.anomic.server.serverFileUtils;
@ -100,7 +104,6 @@ import de.anomic.server.serverSystem;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.enumerateFiles;
import de.anomic.yacy.yacyCore;
//import de.anomic.http.*;
public final class yacy {
@ -128,7 +131,7 @@ public final class yacy {
// setting up logging
try {
serverLog.configureLogging(homePath);
serverLog.configureLogging(new File(homePath, "yacy.logging"));
} catch (IOException e) {
System.out.println("could not find logging properties in homePath=" + homePath);
e.printStackTrace();
@ -464,7 +467,53 @@ public final class yacy {
// finished
serverLog.logSystem("GEN-WORDSTAT", "FINISHED");
}
/**
 * Inspects a single word index file and, if it is small enough, moves its
 * entries into the given word index and removes the file.
 *
 * Files holding more than 50 entries are left untouched and only reported
 * as SKIPPED in the log.
 *
 * @param dbroot    database root directory handed to plasmaWordIndexEntity
 * @param log       logger that receives one MIGRATED/SKIPPED line per file
 * @param file      the word index file; the first 12 characters of its name
 *                  are used as the word hash
 * @param wordIndex target index that receives the migrated entries
 * @throws IOException if reading the file or writing to the index fails
 */
private static void checkMigrate(File dbroot, serverLog log, File file, plasmaWordIndex wordIndex) throws IOException {
    // derive the hash before any handle is opened, so a malformed file name
    // cannot leave an open database behind
    String wordhash = file.getName().substring(0, 12);

    // read the statistics and release the tree exactly once, even on failure
    int size;
    long length;
    kelondroTree db = new kelondroTree(file, 0);
    try {
        size = db.size();
        length = file.length();
    } finally {
        db.close();
    }

    if (size > 50) {
        log.logInfo("SKIPPED " + file.toString() + ": " + size + " entries, " + (length / 1024) + "kb");
        return;
    }

    // copy every entry of the file into a container for the target index
    plasmaWordIndexEntryContainer container = new plasmaWordIndexEntryContainer(wordhash);
    plasmaWordIndexEntity entity = new plasmaWordIndexEntity(dbroot, wordhash, true);
    try {
        Enumeration entries = entity.elements(true);
        while (entries.hasMoreElements()) {
            plasmaWordIndexEntry entry = (plasmaWordIndexEntry) entries.nextElement();
            container.add(new plasmaWordIndexEntry[]{entry}, System.currentTimeMillis());
        }
        wordIndex.addEntries(container);
        // the source file is only removed after the entries were handed over
        entity.deleteComplete();
    } finally {
        entity.close();
    }

    if (file.exists()) {
        // deleteComplete() did not remove the file; try once more directly
        log.logInfo("MIGRATED " + file.toString() + ": " + size + " entries, " + (length / 1024) + "kb, delete fail at end");
        file.delete();
    } else {
        log.logInfo("MIGRATED " + file.toString() + ": " + size + " entries, " + (length / 1024) + "kb");
    }
}
/**
 * Walks over all files below DATA/PLASMADB/WORDS and passes each one to
 * checkMigrate, which moves small word files into the word index.
 *
 * Run with "java -classpath classes yacy -migratewords".
 *
 * @param homePath application root containing yacy.logging and the DATA directory
 */
public static void migrateWords(String homePath) {
    // logging configuration is best-effort: migration continues without it,
    // but the failure is reported the same way startup reports it
    try {
        serverLog.configureLogging(new File(homePath, "yacy.logging"));
    } catch (Exception e) {
        System.out.println("could not find logging properties in homePath=" + homePath);
        e.printStackTrace();
    }

    File dbroot = new File(new File(homePath), "DATA/PLASMADB");
    try {
        serverLog log = new serverLog("WORDMIGRATION");
        plasmaWordIndex wordIndex = new plasmaWordIndex(dbroot, 20000, log);
        try {
            enumerateFiles words = new enumerateFiles(new File(dbroot, "WORDS"), true, false, true, true);
            while (words.hasMoreElements()) {
                checkMigrate(dbroot, log, (File) words.nextElement(), wordIndex);
            }
        } finally {
            // always close the index: files migrated so far are already deleted
            // from disk, so their entries must not be dropped on a later failure
            // NOTE(review): presumably close(60) writes out pending entries - confirm
            wordIndex.close(60);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
private static HashMap loadWordMap(File wordlist) {
// returns a hash-word - Relation
@ -575,8 +624,8 @@ public final class yacy {
// application wrapper
public static void main(String args[]) {
String applicationRoot = System.getProperty("user.dir");
System.out.println("args.length=" + args.length);
System.out.print("args=["); for (int i = 0; i < args.length; i++) System.out.print(args[i] + ", "); System.out.println("]");
//System.out.println("args.length=" + args.length);
//System.out.print("args=["); for (int i = 0; i < args.length; i++) System.out.print(args[i] + ", "); System.out.println("]");
if ((args.length >= 1) && ((args[0].equals("-startup")) || (args[0].equals("-start")))) {
// normal start-up of yacy
if (args.length == 2) applicationRoot= args[1];
@ -585,6 +634,11 @@ public final class yacy {
// normal shutdown of yacy
if (args.length == 2) applicationRoot= args[1];
shutdown(applicationRoot);
} else if ((args.length >= 1) && (args[0].equals("-migratewords"))) {
// migrate words from DATA/PLASMADB/WORDS path to assortment cache, if possible
// attention: this may run long and should not be interrupted!
if (args.length == 2) applicationRoot= args[1];
migrateWords(applicationRoot);
} else if ((args.length >= 1) && (args[0].equals("-deletestopwords"))) {
// delete those words in the index that are listed in the stopwords file
if (args.length == 2) applicationRoot= args[1];
@ -606,7 +660,6 @@ public final class yacy {
startup(applicationRoot);
}
}
}
class shutdownHookThread extends Thread

@ -4,17 +4,17 @@
# setting logging levels for individual classes
# possible values are:
# ZERO no output at all
# FAILURE system-level error, internal cause, critical and not fixable (i.e. inconsistency)
# OFF no output at all
# SEVERE system-level error, internal cause, critical and not fixable (i.e. inconsistency)
# ERROR exceptional error, catchable and non-critical (i.e. file error)
# WARNING uncritical service failure, may require user activity (i.e. input required, wrong authorization)
# SYSTEM regular system status information (i.e. start-up messages)
# CONFIG regular system status information (i.e. start-up messages)
# INFO regular action information (i.e. any httpd request URL)
# DEBUG in-function status debug output
# FINEST in-function status debug output
PARSER.level = INFO
YACY.level = INFO
HTCACHE.level = INFO
PLASMA.level = INFO
PLASMA.level = FINEST
SERVER.level = INFO
# List of global handlers

Loading…
Cancel
Save