|
|
|
@ -38,31 +38,6 @@
|
|
|
|
|
//the intact and unchanged copyright notice.
|
|
|
|
|
//Contributions and changes to the program code must be marked as such.
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
This is the main class of the proxy.
|
|
|
|
|
From here, several threads are started:
|
|
|
|
|
|
|
|
|
|
- one single instance of the plasmaSwitchboard is generated,
|
|
|
|
|
which itself starts a thread with a plasmaHTMLCache object. This object simply counts
|
|
|
|
|
files sizes in the cache and terminates then.
|
|
|
|
|
It also generates a plasmaCrawlerLoader object, which may itself start
|
|
|
|
|
some more httpc-calling threads to load web pages. They terminate automatically when a page has loaded
|
|
|
|
|
- one serverCore - thread is started, which implements a multi-threaded server.
|
|
|
|
|
The process may start itself many more processes that handle connections.
|
|
|
|
|
- finally, all idle-dependent processes are written in a queue in plasmaSwitchboard
|
|
|
|
|
which are worked off inside an idle-sensitive loop of the main process. (here)
|
|
|
|
|
|
|
|
|
|
On termination, the following must be done:
|
|
|
|
|
- stop feeding of the crawling process because it otherwise fills the indexing queue.
|
|
|
|
|
- say goodbye to connected peers and disable new connections. Don't wait for success.
|
|
|
|
|
- first terminate the serverCore thread. This prevents that new cache objects are queued
|
|
|
|
|
- wait that the plasmaHTMLCache terminates (it should be normal that this process already has terminated)
|
|
|
|
|
- then wait for termination of all loader process of the plasmaCrawlerLoader
|
|
|
|
|
- work off the indexing and cache storage queue. These values are inside a RAM cache and would be lost otherwise
|
|
|
|
|
- write all settings
|
|
|
|
|
- terminate
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import java.io.BufferedReader;
|
|
|
|
|
import java.io.BufferedWriter;
|
|
|
|
@ -108,6 +83,41 @@ import de.anomic.server.logging.serverLog;
|
|
|
|
|
import de.anomic.tools.enumerateFiles;
|
|
|
|
|
import de.anomic.yacy.yacyCore;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* This is the main class of the proxy. Several threads are started from here:
|
|
|
|
|
* <ul>
|
|
|
|
|
* <li>one single instance of the plasmaSwitchboard is generated, which itself
|
|
|
|
|
* starts a thread with a plasmaHTMLCache object. This object simply counts
|
|
|
|
|
* file sizes in the cache and then terminates. It also generates a
|
|
|
|
|
* plasmaCrawlerLoader object, which may itself start some more httpc-calling
|
|
|
|
|
* threads to load web pages. They terminate automatically when a page has
|
|
|
|
|
* loaded.
|
|
|
|
|
* <li>one serverCore - thread is started, which implements a multi-threaded
|
|
|
|
|
* server. The process may start itself many more processes that handle
|
|
|
|
|
* connections.
|
|
|
|
|
* <li>finally, all idle-dependent processes are written in a queue in
|
|
|
|
|
* plasmaSwitchboard which are worked off inside an idle-sensitive loop of the
|
|
|
|
|
* main process. (here)
|
|
|
|
|
* </ul>
|
|
|
|
|
*
|
|
|
|
|
* On termination, the following must be done:
|
|
|
|
|
* <ul>
|
|
|
|
|
* <li>stop feeding of the crawling process because it otherwise fills the
|
|
|
|
|
* indexing queue.
|
|
|
|
|
* <li>say goodbye to connected peers and disable new connections. Don't wait for
|
|
|
|
|
* success.
|
|
|
|
|
* <li>first terminate the serverCore thread. This prevents that new cache
|
|
|
|
|
* objects are queued.
|
|
|
|
|
* <li>wait that the plasmaHTMLCache terminates (it should be normal that this
|
|
|
|
|
* process already has terminated).
|
|
|
|
|
* <li>then wait for termination of all loader processes of the
|
|
|
|
|
* plasmaCrawlerLoader.
|
|
|
|
|
* <li>work off the indexing and cache storage queue. These values are inside a
|
|
|
|
|
* RAM cache and would be lost otherwise.
|
|
|
|
|
* <li>write all settings.
|
|
|
|
|
* <li>terminate.
|
|
|
|
|
* </ul>
|
|
|
|
|
*/
|
|
|
|
|
public final class yacy {
|
|
|
|
|
|
|
|
|
|
// static objects
|
|
|
|
@ -118,6 +128,14 @@ public final class yacy {
|
|
|
|
|
private static final String copyright = "[ YACY Proxy v" + vString + ", build " + vDATE + " by Michael Christen / www.yacy.net ]";
|
|
|
|
|
private static final String hline = "-------------------------------------------------------------------------------";
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Convert the combined versionstring into a pretty string.
|
|
|
|
|
* FIXME: Why is this so complicated?
|
|
|
|
|
*
|
|
|
|
|
* @param s Combined version string
|
|
|
|
|
* @return Pretty string where version and svn-Version are separated by an
|
|
|
|
|
* slash
|
|
|
|
|
*/
|
|
|
|
|
public static String combinedVersionString2PrettyString(String s) {
|
|
|
|
|
long svn;
|
|
|
|
|
try {svn = (long) (100000000.0 * Double.parseDouble(s));} catch (NumberFormatException ee) {svn = 0;}
|
|
|
|
@ -131,10 +149,24 @@ public final class yacy {
|
|
|
|
|
return vStr + "/" + svnStr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Combines the version of the proxy with the versionnumber from svn to a
|
|
|
|
|
* combined Version
|
|
|
|
|
*
|
|
|
|
|
* @param version Current given version for this proxy.
|
|
|
|
|
* @param svn Current version given from svn.
|
|
|
|
|
* @return float with the combined version (version + svn / 100000000)
|
|
|
|
|
*/
|
|
|
|
|
public static float versvn2combinedVersion(float version, int svn) {
|
|
|
|
|
// Scale the version up by 1e8, add the svn number, then scale back down:
// the result equals version + svn / 1e8, computed in double and then
// narrowed to float.
// NOTE(review): float carries only about 7 significant decimal digits, so
// the low-order svn digits are presumably not preserved exactly after the
// cast - confirm that callers tolerate this.
return (float) (((double) version * 100000000.0 + ((double) svn)) / 100000000.0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Starts up the whole application. Sets up all datastructures and starts
|
|
|
|
|
* the main threads.
|
|
|
|
|
*
|
|
|
|
|
* @param homePath Root-path where all information is to be found.
|
|
|
|
|
*/
|
|
|
|
|
private static void startup(String homePath) {
|
|
|
|
|
long startup = yacyCore.universalTime();
|
|
|
|
|
try {
|
|
|
|
@ -422,8 +454,16 @@ public final class yacy {
|
|
|
|
|
} catch (Exception e) {} // was once stopped by de.anomic.net.ftpc$sm.checkExit(ftpc.java:1790)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Loads the configuration from the data-folder.
|
|
|
|
|
* FIXME: Why is this called over and over again from every method, instead
|
|
|
|
|
* of setting the configurationdata once for this class in main?
|
|
|
|
|
*
|
|
|
|
|
* @param mes Where are we called from, so that the errormessages can be
|
|
|
|
|
* more descriptive.
|
|
|
|
|
* @param homePath Root-path where all the information is to be found.
|
|
|
|
|
* @return Properties read from the configurationfile.
|
|
|
|
|
*/
|
|
|
|
|
private static Properties configuration(String mes, String homePath) {
|
|
|
|
|
serverLog.logSystem(mes, "Application Root Path: " + homePath.toString());
|
|
|
|
|
|
|
|
|
@ -448,6 +488,12 @@ public final class yacy {
|
|
|
|
|
return config;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Call the shutdown-page from yacy to tell it to shut down. This method is
|
|
|
|
|
* called if you start yacy with the argument -shutdown.
|
|
|
|
|
*
|
|
|
|
|
* @param homePath Root-path where all the information is to be found.
|
|
|
|
|
*/
|
|
|
|
|
static void shutdown(String homePath) {
|
|
|
|
|
// start up
|
|
|
|
|
System.out.println(copyright);
|
|
|
|
@ -490,6 +536,16 @@ public final class yacy {
|
|
|
|
|
serverLog.logSystem("REMOTE-SHUTDOWN", "YACY will terminate after working off all enqueued tasks.");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* This method gets all found words and outputs a statistic about the score
|
|
|
|
|
* of the words. The output of this method can be used to create stop-word
|
|
|
|
|
* lists. This method will be called if you start yacy with the argument
|
|
|
|
|
* -genwordstat.
|
|
|
|
|
* FIXME: How can stop-word list be created from this output? What type of
|
|
|
|
|
* score is output?
|
|
|
|
|
*
|
|
|
|
|
* @param homePath Root-Path where all the information is to be found.
|
|
|
|
|
*/
|
|
|
|
|
private static void genWordstat(String homePath) {
|
|
|
|
|
// start up
|
|
|
|
|
System.out.println(copyright);
|
|
|
|
@ -529,6 +585,15 @@ public final class yacy {
|
|
|
|
|
serverLog.logSystem("GEN-WORDSTAT", "FINISHED");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Migrates the PLASMA WORDS structure to the assortment cache if possible.
|
|
|
|
|
* This method will be called if you start yacy with the argument
|
|
|
|
|
* -migratewords.
|
|
|
|
|
* Caution: This might take a long time to finish. Don't interrupt it!
|
|
|
|
|
* FIXME: Shouldn't this method be private?
|
|
|
|
|
*
|
|
|
|
|
* @param homePath Root-path where all the information is to be found.
|
|
|
|
|
*/
|
|
|
|
|
public static void migrateWords(String homePath) {
|
|
|
|
|
// run with "java -classpath classes yacy -migratewords"
|
|
|
|
|
try {serverLog.configureLogging(new File(homePath, "yacy.logging"));} catch (Exception e) {}
|
|
|
|
@ -562,6 +627,13 @@ public final class yacy {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Reads all words from the given file and creates a hashmap, where key is
|
|
|
|
|
* the plasma word hash and value is the word itself.
|
|
|
|
|
*
|
|
|
|
|
* @param wordlist File where the words are stored.
|
|
|
|
|
* @return HashMap with the hash-word - relation.
|
|
|
|
|
*/
|
|
|
|
|
private static HashMap loadWordMap(File wordlist) {
|
|
|
|
|
// returns a hash-word - Relation
|
|
|
|
|
HashMap wordmap = new HashMap();
|
|
|
|
@ -574,6 +646,13 @@ public final class yacy {
|
|
|
|
|
return wordmap;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Reads all words from the given file and creates a HashSet, which contains
|
|
|
|
|
* all found words.
|
|
|
|
|
*
|
|
|
|
|
* @param wordlist File where the words are stored.
|
|
|
|
|
* @return HashSet with the words
|
|
|
|
|
*/
|
|
|
|
|
private static HashSet loadWordSet(File wordlist) {
|
|
|
|
|
// returns a set of words
|
|
|
|
|
HashSet wordset = new HashSet();
|
|
|
|
@ -586,6 +665,15 @@ public final class yacy {
|
|
|
|
|
return wordset;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Cleans a wordlist in a file according to the length of the words. The
|
|
|
|
|
* file with the given filename is read and then only the words in the given
|
|
|
|
|
* length-range are written back to the file.
|
|
|
|
|
*
|
|
|
|
|
* @param wordlist Name of the file the words are stored in.
|
|
|
|
|
* @param minlength Minimal needed length for each word to be stored.
|
|
|
|
|
* @param maxlength Maximal allowed length for each word to be stored.
|
|
|
|
|
*/
|
|
|
|
|
private static void cleanwordlist(String wordlist, int minlength, int maxlength) {
|
|
|
|
|
// start up
|
|
|
|
|
System.out.println(copyright);
|
|
|
|
@ -630,6 +718,12 @@ public final class yacy {
|
|
|
|
|
serverLog.logSystem("CLEAN-WORDLIST", "FINISHED");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Gets all words from the stopword-list and removes them in the databases.
|
|
|
|
|
* FIXME: Really? Don't know if I read this correctly.
|
|
|
|
|
*
|
|
|
|
|
* @param homePath Root-Path where all information is to be found.
|
|
|
|
|
*/
|
|
|
|
|
private static void deleteStopwords(String homePath) {
|
|
|
|
|
// start up
|
|
|
|
|
System.out.println(copyright);
|
|
|
|
@ -668,7 +762,12 @@ public final class yacy {
|
|
|
|
|
serverLog.logSystem("DELETE-STOPWORDS", "FINISHED");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// application wrapper
|
|
|
|
|
/**
|
|
|
|
|
* Main-method which is started by java. Checks for special arguments or
|
|
|
|
|
* starts up the application.
|
|
|
|
|
*
|
|
|
|
|
* @param args Given arguments from the command line.
|
|
|
|
|
*/
|
|
|
|
|
public static void main(String args[]) {
|
|
|
|
|
String applicationRoot = System.getProperty("user.dir");
|
|
|
|
|
//System.out.println("args.length=" + args.length);
|
|
|
|
@ -709,6 +808,10 @@ public final class yacy {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* This class is a helper class whose instance is started, when the java virtual
|
|
|
|
|
* machine shuts down. Signals the plasmaSwitchboard to shut down.
|
|
|
|
|
*/
|
|
|
|
|
class shutdownHookThread extends Thread {
|
|
|
|
|
private plasmaSwitchboard sb = null;
|
|
|
|
|
private Thread mainThread = null;
|
|
|
|
|