You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
yacy_search_server/source/de/anomic/data/listManager.java

403 lines
14 KiB

// listManager.java
// -------------------------------------
// part of YACY
//
// (C) 2005, 2006 by Alexander Schier
// (C) 2007 by Bjoern 'Fuchs' Krombholz; fox.box@gmail.com
//
// last change: $LastChangedDate$ by $LastChangedBy$
// $LastChangedRevision$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.data;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
// *) Asynchronous queuing of crawl job URLs (stackCrawl)
//    Various checks like the blacklist check or the robots.txt disallow check are now
//    done by a separate thread to unburden the indexer thread(s).
//    TODO: maybe we have to introduce a thread pool here if it turns out that this single
//    thread is a bottleneck because of the time-consuming robots.txt downloads.
// *) Improved index transfer
//    The index selection and transmission is done in parallel now to improve index
//    transfer performance.
//    TODO: maybe we could speed up performance by using multiple transmission threads in
//    parallel instead of only a single one.
// *) gzip-encoded post requests
//    It is now configurable whether a gzip-encoded post request should be sent on index
//    transfer/distribution.
// *) Storage peer (very experimental and not optimized yet)
//    Now it's possible to send the result of the yacy indexer thread to a remote peer
//    instead of storing the indexed words locally. This can be done by setting the
//    property "storagePeerHash" in the yacy config file.
//    - Please note that if the index transfer fails, the index is stored locally.
//    - TODO: currently this index transfer is done by the indexer thread. To speed up the
//      indexer a) this transmission should be done in parallel and b) multiple chunks
//      should be bundled and transferred together.
// *) General performance improvements
//    - better memory cleanup after http request processing has finished
//    - replacing some string concatenations with stringBuffers
//    - replacing BufferedInputStreams with serverByteBuffer
//    - replacing vectors with arraylists wherever possible
//    - replacing hashtables with hashmaps wherever possible
//    This was done because function calls to vector or hashtable functions take 3 times
//    longer than calls to functions of arraylists or hashmaps.
//    TODO: we should take a look at the class serverObject, which is inherited from
//    hashmap. Do we really need synchronization for this class?
//    TODO: replace arraylists with linkedLists if random access to the list elements is
//    not needed.
// *) Robots parser supports if-modified-since downloads now
//    If the downloaded robots.txt file is older than 7 days, the robots parser tries to
//    download the robots.txt with the if-modified-since header to avoid unnecessary
//    downloads if the file was not changed. Additionally, the ETag header is used to
//    detect changes.
// *) Crawler: better handling of unsupported mimeTypes + FileExtension
// *) Bugfix: plasmaWordIndexEntity was not closed correctly in
//    - query.java
//    - plasmaswitchboard.java
// *) Function minimizeUrlDB added to yacy.java
//    This function tests the current urlHashDB for unused urls.
//    ATTENTION: please don't use this function at the moment because it causes the
//    wordIndexDB to flush all words into the word directory!
// git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
// 19 years ago
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.Vector;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.urlPattern.abstractURLPattern;
import de.anomic.plasma.urlPattern.plasmaURLPattern.blacklistFile;
import de.anomic.server.serverCore;
// The Naming of the functions is a bit strange...
public class listManager {
// Switchboard whose configuration is read and written by the ListSet helpers and by
// reloadBlacklists(). This class never assigns it, so it has to be set from outside
// before any of those methods is called.
public static plasmaSwitchboard switchboard;
// Base directory against which getListString(String, boolean) resolves list file names.
// Not assigned anywhere in this class either.
public static File listsPath;
/**
 * Reads a ListSet from the configuration and returns it as a Set.
 *
 * <b>Meaning of ListSet</b>: YaCy has various "lists", and there can be more
 * than one list of a given type (e.g. several blacklists of one type). A ListSet
 * is the set of all "lists" of one type; it is stored in the configuration as a
 * comma separated string under a single key.
 *
 * @param setName configuration key of the ListSet
 * @return the parsed value stored under <code>setName</code>; a missing key is
 *         read as the empty string before parsing
 */
public static Set getListSet(String setName) {
    final String configuredValue = switchboard.getConfig(setName, "");
    return string2set(configuredValue);
}
/**
 * Removes an element from a ListSet and updates the configuration accordingly.
 * If the element is not part of the ListSet, the configuration is left untouched.
 *
 * @param setName name of the ListSet.
 * @param listName name of the element to remove from the ListSet.
 */
public static void removeFromListSet(String setName, String listName) {
    final Set listSet = getListSet(setName);
    // Set.remove reports whether the set actually changed. Only then is there
    // something to persist; a size check cannot tell that apart.
    if (listSet.remove(listName)) {
        switchboard.setConfig(setName, collection2string(listSet));
    }
}
/**
 * Adds an element to a ListSet and writes the new ListSet to the configuration.
 * If the ListSet does not exist yet, it is created with the new element as its
 * only member. If the ListSet already contains an identical element, nothing
 * happens and the configuration is not rewritten.
 *
 * @param setName name of the ListSet.
 * @param newListName name of the element to add to the ListSet.
 */
public static void updateListSet(String setName, String newListName) {
    final Set listSet = getListSet(setName);
    // Splitting an empty configuration value yields a single empty string. Drop it
    // so that the value written back does not start or end with a stray comma.
    listSet.remove("");
    // Set.add returns false when the element was already present.
    if (listSet.add(newListName)) {
        switchboard.setConfig(setName, collection2string(listSet));
    }
}
/**
 * Tests whether a ListSet has a given member.
 *
 * @param setName ListSet in which to search for an element.
 * @param listName the element to search for.
 * @return <code>true</code> if the ListSet stored under "setName" contains
 *         "listName", <code>false</code> otherwise.
 */
public static boolean listSetContains(String setName, String listName) {
    return getListSet(setName).contains(listName);
}
//================general Lists==================
/**
* Read lines of a file into an ArrayList.
*
* @param listFile the file
* @return the resulting array as an ArrayList
*/
public static ArrayList getListArray(File listFile){
String line;
*) Asynchronous queuing of crawl job URLs (stackCrawl) various checks like the blacklist check or the robots.txt disallow check are now done by a separate thread to unburden the indexer thread(s) TODO: maybe we have to introduce a threadpool here if it turn out that this single thread is a bottleneck because of the time consuming robots.txt downloads *) improved index transfer The index selection and transmission is done in parallel now to improve index transfer performance. TODO: maybe we could speed up performance by unsing multiple transmission threads in parallel instead of only a single one. *) gzip encoded post requests it is now configureable if a gzip encoded post request should be send on intex transfer/distribution *) storage Peer (very experimentell and not optimized yet) Now it's possible to send the result of the yacy indexer thread to a remote peer istead of storing the indexed words locally. This could be done by setting the property "storagePeerHash" in the yacy config file - Please note that if the index transfer fails, the index ist stored locally. - TODO: currently this index transfer is done by the indexer thread. To seedup the indexer a) this transmission should be done in parallel and b) multiple chunks should be bundled and transfered together *) general performance improvements - better memory cleanup after http request processing has finished - replacing some string concatenations with stringBuffers - replacing BufferedInputStreams with serverByteBuffer - replacing vectors with arraylists wherever possible - replacing hashtables with hashmaps wherever possible This was done because function calls to verctor or hashtable functions take 3 time longer than calls to functions of arraylists or hashmaps. TODO: we should take a look on the class serverObject which is inherited from hashmap Do we realy need a synchronization for this class? 
TODO: replace arraylists with linkedLists if random access to the list elements is not needed *) Robots Parser supports if-modified-since downloads now If the downloaded robots.txt file is older than 7 days the robots parser tries to download the robots.txt with the if-modified-since header to avoid unnecessary downloads if the file was not changed. Additionally the ETag header is used to detect changes. *) Crawler: better handling of unsupported mimeTypes + FileExtension *) Bugfix: plasmaWordIndexEntity was not closed correctly in - query.java - plasmaswitchboard.java *) function minimizeUrlDB added to yacy.java this function tests the current urlHashDB for unused urls ATTENTION: please don't use this function at the moment because it causes the wordIndexDB to flush all words into the word directory! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@853 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago
ArrayList list = new ArrayList();
int count = 0;
BufferedReader br = null;
try {
br = new BufferedReader(new InputStreamReader(new FileInputStream(listFile),"UTF-8"));
while((line = br.readLine()) != null){
list.add(line);
count++;
}
br.close();
} catch(IOException e) {
// list is empty
} finally {
if (br!=null) try { br.close(); } catch (Exception e) {}
}
return list;
}
/**
 * Writes a String to a file (used for string representation of lists),
 * replacing any previous content of the file.
 *
 * The writer chain deliberately contains no PrintWriter: PrintWriter swallows
 * I/O errors, which would make this method report success for failed writes.
 *
 * NOTE(review): FileWriter encodes with the JVM's default charset, whereas
 * getListArray decodes list files as UTF-8 - confirm which encoding list files
 * are supposed to use.
 *
 * @param listFile the file to write to
 * @param out the String to write
 * @return returns <code>true</code> if the content was written and the file
 *         closed without an I/O error, <code>false</code> otherwise
 */
public static boolean writeList(File listFile, String out) {
    BufferedWriter bw = null;
    try {
        bw = new BufferedWriter(new FileWriter(listFile));
        bw.write(out);
        // close() flushes the buffer; it stays inside the try block so that a
        // failing flush is reported as an unsuccessful write
        bw.close();
        bw = null;
        return true;
    } catch (IOException e) {
        return false;
    } finally {
        if (bw != null) {
            try {
                bw.close();
            } catch (IOException e) {
                // the failure has already been reported through the return value
            }
        }
    }
}
/**
 * Writes the elements of a String array to a file, one element per line.
 * Every element, including the last one, is followed by
 * <code>serverCore.crlfString</code>.
 *
 * @param listFile the file to write to
 * @param list the array whose elements are written
 * @return returns <code>true</code> if successful, <code>false</code> otherwise
 */
public static boolean writeList(File listFile, String[] list){
    final StringBuffer content = new StringBuffer();
    int idx = 0;
    while (idx < list.length) {
        content.append(list[idx]);
        content.append(serverCore.crlfString);
        idx++;
    }
    // delegates to the (File, String) variant
    return writeList(listFile, content.toString());
}
/**
 * Convenience variant of {@link #getListString(File, boolean)} that resolves
 * the file name against <code>listsPath</code>.
 *
 * @param filename name of the list file, relative to <code>listsPath</code>
 * @param withcomments passed through unchanged to the File variant
 * @return String representation of the file content
 */
public static String getListString(String filename, boolean withcomments) {
    return getListString(new File(listsPath, filename), withcomments);
}
/**
 * Reads the lines of a text file into a String, skipping empty lines and
 * optionally skipping comment lines. Every line that is kept is followed by
 * <code>serverCore.crlfString</code> in the result.
 *
 * Reading is best effort: if the file cannot be opened or an I/O error occurs,
 * whatever has been read so far (possibly nothing) is returned.
 *
 * NOTE(review): the file is decoded with the JVM's default charset, whereas
 * getListArray decodes list files as UTF-8 - confirm which encoding is intended.
 *
 * @param listFile the File to read from.
 * @param withcomments If <code>false</code> ignore lines starting with '#'.
 * @return String representation of the file content.
 */
public static String getListString(File listFile, boolean withcomments){
    final StringBuffer temp = new StringBuffer();
    BufferedReader br = null;
    try {
        br = new BufferedReader(new InputStreamReader(new FileInputStream(listFile)));
        temp.ensureCapacity((int) listFile.length());
        String line;
        while ((line = br.readLine()) != null) {
            // Both filters must pass. Joining them with "||" made the test true
            // for every line, so comments and empty lines were never skipped.
            if (line.equals("")) {
                continue;
            }
            if (!withcomments && line.startsWith("#")) {
                continue;
            }
            temp.append(line).append(serverCore.crlfString);
        }
    } catch (IOException e) {
        // deliberate: an unreadable file yields an empty (or truncated) result
    } finally {
        if (br != null) {
            try {
                br.close();
            } catch (IOException e) {
                // nothing sensible can be done if closing a reader fails
            }
        }
    }
    return temp.toString();
}
/**
 * Returns the directory listing for a path given as a String by delegating to
 * {@link #getDirListing(File)}.
 *
 * @param dirname path of the directory
 * @return array of file names
 */
public static String[] getDirListing(String dirname){
    return getDirListing(new File(dirname));
}
/**
 * Reads the content of a directory into a String array of file names.
 *
 * @param dir The directory to get the file listing from. If it doesn't exist yet,
 *        an attempt is made to create it (only the last path element is created).
 * @return array of the names of all entries in <code>dir</code>; an empty array
 *         if <code>dir</code> cannot be listed (it is not a directory, could not
 *         be created, or an I/O error occurred); <code>null</code> if
 *         <code>dir</code> is <code>null</code>
 */
public static String[] getDirListing(File dir){
    if (dir == null) {
        return null;
    }
    if (!dir.exists()) {
        dir.mkdir();
    }
    final File[] fileList = dir.listFiles();
    if (fileList == null) {
        // File.listFiles() returns null instead of throwing when the path is not
        // a listable directory
        return new String[0];
    }
    final String[] fileListString = new String[fileList.length];
    for (int i = 0; i < fileList.length; i++) {
        fileListString[i] = fileList[i].getName();
    }
    return fileListString;
}
/**
 * Variant of {@link #getDirsRecursive(File, String, boolean)} that always
 * excludes directories whose name starts with a dot.
 *
 * @param dir the directory to start from
 * @param notdir directory name to leave out
 * @return list of directories as File objects
 */
public static ArrayList getDirsRecursive(File dir, String notdir){
    final boolean excludeDotfiles = true;
    return getDirsRecursive(dir, notdir, excludeDotfiles);
}
/**
 * Returns a list of all directories and subdirectories below <code>dir</code>
 * as File objects. <code>dir</code> itself is not part of the result. Each
 * directory is immediately followed by its own subdirectories.
 *
 * @param dir the directory to start from
 * @param notdir directories with exactly this name are left out, together with
 *        everything below them
 * @param excludeDotfiles if <code>true</code>, directories whose name starts
 *        with "." are left out, together with everything below them
 * @return the directories found; empty if <code>dir</code> is <code>null</code>
 *         or cannot be listed (not a directory, or an I/O error occurred)
 */
public static ArrayList getDirsRecursive(File dir, String notdir, boolean excludeDotfiles){
    final ArrayList resultList = new ArrayList();
    final File[] dirList = (dir == null) ? null : dir.listFiles();
    if (dirList == null) {
        // File.listFiles() returns null instead of throwing when the path is not
        // a listable directory
        return resultList;
    }
    for (int i = 0; i < dirList.length; i++) {
        final File candidate = dirList[i];
        if (!candidate.isDirectory()) {
            continue;
        }
        final String name = candidate.getName();
        if (excludeDotfiles && name.startsWith(".")) {
            continue;
        }
        if (name.equals(notdir)) {
            continue;
        }
        resultList.add(candidate);
        resultList.addAll(getDirsRecursive(candidate, notdir, excludeDotfiles));
    }
    return resultList;
}
//================Helper functions for collection conversion==================
/**
 * Joins the elements of a Collection of Strings into one comma separated String.
 * The elements appear in the iteration order of the Collection, so an ordered
 * Collection yields an equally ordered result.
 *
 * @param col a Collection of Strings; may be <code>null</code>.
 * @return the elements separated by ","; the empty String if <code>col</code>
 *         is <code>null</code> or empty.
 */
public static String collection2string(Collection col){
    final StringBuffer joined = new StringBuffer();
    if (col == null || col.size() == 0) {
        return joined.toString();
    }
    boolean first = true;
    for (Iterator elements = col.iterator(); elements.hasNext(); ) {
        final String element = (String) elements.next();
        if (!first) {
            joined.append(",");
        }
        joined.append(element);
        first = false;
    }
    return joined.toString();
}
/**
 * Simple conversion of a comma separated list to an ArrayList that keeps the
 * order of the substrings.
 *
 * @param string list of comma separated Strings
 * @return resulting modifiable ArrayList, or an empty ArrayList if string is
 *         <code>null</code> or empty
 * @see listManager#string2vector(String)
 */
public static ArrayList string2arraylist(String string){
    // "".split(",") yields one empty element, which is not a list entry
    if (string == null || string.length() == 0) {
        return new ArrayList();
    }
    return new ArrayList(Arrays.asList(string.split(",")));
}
/**
 * Simple conversion of a comma separated list to a unified Set. Duplicate
 * substrings collapse into one element and empty substrings are dropped.
 *
 * @param string list of comma separated Strings
 * @return resulting modifiable Set, or an empty Set if string is
 *         <code>null</code> or empty
 */
public static Set string2set(String string){
    final HashSet set = new HashSet();
    if (string != null && string.length() > 0) {
        set.addAll(Arrays.asList(string.split(",")));
        // "".split(",") and inputs like ",a" or "a,,b" produce empty substrings,
        // which are not members of the list
        set.remove("");
    }
    return set;
}
/**
 * Simple conversion of a comma separated list to a Vector that keeps the order
 * of the substrings.
 *
 * @param string list of comma separated Strings
 * @return resulting Vector, or an empty Vector if string is <code>null</code>
 *         or empty
 */
public static Vector string2vector(String string){
    // "".split(",") yields one empty element, which is not a list entry
    if (string == null || string.length() == 0) {
        return new Vector();
    }
    return new Vector(Arrays.asList(string.split(",")));
}
//=============Blacklist specific================
/**
 * Loads or reloads all active blacklists.
 *
 * For every type listed in <code>abstractURLPattern.BLACKLIST_TYPES_STRING</code>
 * the configuration value "&lt;type&gt;.BlackLists" is read; if it is not set, the
 * value of "BlackLists.DefaultList" (default "url.default.black") is used instead.
 * The shared <code>plasmaSwitchboard.urlBlacklist</code> is then cleared and
 * loaded with one blacklistFile per type, using "/" as separator argument.
 */
public static void reloadBlacklists(){
    final String[] types = abstractURLPattern.BLACKLIST_TYPES_STRING.split(",");
    final blacklistFile[] files = new blacklistFile[types.length];

    for (int idx = 0; idx < types.length; idx++) {
        final String type = types[idx];
        // looked up once per type, exactly like the nested call it replaces
        final String fallback = switchboard.getConfig("BlackLists.DefaultList", "url.default.black");
        final String listNames = switchboard.getConfig(type + ".BlackLists", fallback);
        files[idx] = new blacklistFile(listNames, type);
    }

    plasmaSwitchboard.urlBlacklist.clear();
    plasmaSwitchboard.urlBlacklist.loadList(files, "/");
}
}