many bugfixes, memory leak fixes, performance enhancements; new kelondroHashtable; activated snippets

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@313 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 20 years ago
parent dc1d707eb2
commit 1e7f062350

@@ -81,7 +81,7 @@ public class index {
referrerprop.put("clientip", header.get("CLIENTIP"));
referrerprop.put("useragent", header.get("User-Agent"));
referrerprop.put("date", (new serverDate()).toShortString(false));
try { sb.facilityDB.update("backlinks", referer, referrerprop); } catch (IOException e) {}
if (sb.facilityDB != null) try { sb.facilityDB.update("backlinks", referer, referrerprop); } catch (IOException e) {}
}
}
@@ -114,7 +114,7 @@ public class index {
// process search words
String querystring = (String) post.get("search", "");
try { sb.facilityDB.update("zeitgeist", querystring, post); } catch (Exception e) {}
if (sb.facilityDB != null) try { sb.facilityDB.update("zeitgeist", querystring, post); } catch (Exception e) {}
TreeSet query = cleanQuery(querystring);
// filter out stopwords
TreeSet filtered = kelondroMSetTools.joinConstructive(query, plasmaSwitchboard.stopwords);
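Both index.java hunks add the same null guard. The reason sits further down in this commit: the facilityDB initialization in plasmaSwitchboard is commented out (and its close() call is guarded), so the facility may legitimately be absent at runtime and every caller has to tolerate that. A standalone sketch of the pattern, with FacilityDB as a hypothetical stand-in for the real kelondroTables facility:

import java.io.IOException;
import java.util.Map;

// Sketch of the optional-facility pattern from this commit: the facility may be
// null while its initialization is disabled, so access is guarded and failures
// are best-effort. FacilityDB is a hypothetical stand-in, not the real class.
class OptionalFacilitySketch {
    interface FacilityDB { void update(String table, String key, Map row) throws IOException; }

    static FacilityDB facilityDB = null; // initialization disabled elsewhere

    static void recordBacklink(String referer, Map props) {
        if (facilityDB != null) {
            try { facilityDB.update("backlinks", referer, props); }
            catch (IOException e) { /* bookkeeping is best-effort */ }
        }
    }
}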

@@ -45,13 +45,9 @@
package de.anomic.kelondro;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.Iterator;
import java.util.StringTokenizer;
public class kelondroArray extends kelondroRecords {
@@ -60,7 +56,7 @@ public class kelondroArray extends kelondroRecords {
private static short thisOHHandles = 0; // and two handles overhead for a double-chained list
public kelondroArray(File file, int[] columns, int intprops) throws IOException {
// this creates a new tree
// this creates a new array
super(file, 0, thisOHBytes, thisOHHandles, columns, intprops, columns.length /*txtProps*/, 80 /*txtPropWidth*/);
for (int i = 0; i < intprops; i++) setHandle(i, new Handle(0));
}
@@ -90,6 +86,17 @@ public class kelondroArray extends kelondroRecords {
return getNode(new Handle(index)).getValues();
}
public synchronized int seti(int index, int value) throws IOException {
int before = getHandle(index).hashCode();
setHandle(index, new Handle(index));
return before;
}
public synchronized int geti(int index) throws IOException {
return getHandle(index).hashCode();
}
public void print() throws IOException {
System.out.println("PRINTOUT of table, length=" + size());
byte[][] row;
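The new seti/geti pair reuses the record file's int-property handle slots as a small persistent int array; Handle.hashCode() yields the stored number. Note that seti ignores its value argument and writes new Handle(index), which looks like a slip: new Handle(value) was presumably intended, since geti would otherwise only ever return the slot number. A standalone sketch of the presumed intent, with Handle as a minimal stand-in for kelondroRecords.Handle:

// Standalone sketch: integer properties stored as boxed handles whose
// hashCode() is the raw value.
class IntSlotSketch {
    static class Handle {
        final int index;
        Handle(int index) { this.index = index; }
        public int hashCode() { return index; }
    }

    private final Handle[] handles = new Handle[4];

    synchronized int seti(int slot, int value) {
        int before = (handles[slot] == null) ? 0 : handles[slot].hashCode();
        handles[slot] = new Handle(value); // the diff writes new Handle(index) here
        return before;
    }

    synchronized int geti(int slot) {
        return (handles[slot] == null) ? 0 : handles[slot].hashCode();
    }

    public static void main(String[] args) {
        IntSlotSketch a = new IntSlotSketch();
        a.seti(0, 42);
        System.out.println(a.geti(0)); // 42
    }
}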

@@ -146,6 +146,7 @@ public class kelondroMScoreCluster {
// set new value
c = scoreKey(en, ec);
cs = new Long(c);
Object oldcs = refkeyDB.remove(obj); if (oldcs != null) keyrefDB.remove(oldcs); // avoid memory leak
refkeyDB.put(obj, cs);
keyrefDB.put(cs, obj);
@@ -174,6 +175,7 @@ public class kelondroMScoreCluster {
// set new value
c = scoreKey(en, ec);
cs = new Long(c);
Object oldcs = refkeyDB.remove(obj); if (oldcs != null) keyrefDB.remove(oldcs); // avoid memory leak
refkeyDB.put(obj, cs);
keyrefDB.put(cs, obj);
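Both score-setting paths get the same one-line fix. refkeyDB maps object to score key and keyrefDB mirrors it back; before this change, re-scoring an object overwrote its refkeyDB entry but left the stale score key behind in keyrefDB, which therefore grew without bound. A standalone sketch of the leak and the fix, with plain HashMaps standing in for the cluster's internal maps:

import java.util.HashMap;
import java.util.Map;

// Standalone sketch: two mirrored maps must be updated together, or the
// key->object map accumulates one stale entry per re-scored object.
class BidiMapLeakSketch {
    static Map refkeyDB = new HashMap(); // object -> score key
    static Map keyrefDB = new HashMap(); // score key -> object

    static void setScore(Object obj, Long scoreKey) {
        Object oldcs = refkeyDB.remove(obj);
        if (oldcs != null) keyrefDB.remove(oldcs); // the fix: drop the stale mirror entry
        refkeyDB.put(obj, scoreKey);
        keyrefDB.put(scoreKey, obj);
    }

    public static void main(String[] args) {
        setScore("url1", new Long(7));
        setScore("url1", new Long(9)); // without the remove, Long(7) would linger
        System.out.println(keyrefDB.size()); // 1, not 2
    }
}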

@@ -59,9 +59,11 @@ public class kelondroMSetTools {
throw new ClassCastException();
}
private static int log2(int x) {
public static int log2a(int x) {
// this computes 1 + log2
// it is the number of bits in x, not the logarithm base 2
int l = 0;
while (x > 0) {x = x >> 1; l++;}
while (x > 0) {x = x >>> 1; l++;}
return l;
}
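The renamed log2a returns the bit length of x: 1 + floor(log2 x) for positive x, and 0 for x <= 0 (the guard fails immediately). With that x > 0 guard, >> and >>> actually behave identically; the unsigned shift documents that x is treated as a bit pattern and keeps the loop terminating even if the guard were ever relaxed to x != 0, where >> would replicate the sign bit forever. A quick standalone check:

// Standalone check of log2a: the bit length of x.
class Log2aSketch {
    static int log2a(int x) {
        int l = 0;
        while (x > 0) { x = x >>> 1; l++; }
        return l;
    }

    public static void main(String[] args) {
        System.out.println(log2a(1)); // 1
        System.out.println(log2a(7)); // 3
        System.out.println(log2a(8)); // 4
    }
}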
@@ -84,7 +86,7 @@ public class kelondroMSetTools {
int high = ((map.size() > set.size()) ? map.size() : set.size());
int low = ((map.size() > set.size()) ? set.size() : map.size());
int stepsEnum = 10 * (high + low - 1);
int stepsTest = 12 * log2(high) * low;
int stepsTest = 12 * log2a(high) * low;
// start most efficient method
if (stepsEnum > stepsTest) {
@@ -156,7 +158,7 @@ public class kelondroMSetTools {
int high = ((set1.size() > set2.size()) ? set1.size() : set2.size());
int low = ((set1.size() > set2.size()) ? set2.size() : set1.size());
int stepsEnum = 10 * (high + low - 1);
int stepsTest = 12 * log2(high) * low;
int stepsTest = 12 * log2a(high) * low;
// start most efficient method
if (stepsEnum > stepsTest) {
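Both call sites share the cost model behind this branch: walking two sorted collections in lockstep costs roughly 10 * (high + low - 1) steps, while probing each element of the smaller collection against the larger costs roughly 12 * log2a(high) * low. Worked through for high = 10000 and low = 100: stepsEnum = 10 * 10099 = 100990 versus stepsTest = 12 * 14 * 100 = 16800 (log2a(10000) = 14), so the branch presumably selects the probing method; for two similar-sized sets the linear walk wins. Switching from log2 to log2a keeps the estimate one bit above the true logarithm, which only nudges the heuristic's crossover point, not its asymptotics.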

@@ -744,6 +744,10 @@ public class kelondroRecords {
}
// additional properties
public synchronized int handles() {
return this.HANDLES.length;
}
protected void setHandle(int pos, Handle handle) throws IOException {
if (pos >= HANDLES.length) throw new IllegalArgumentException("setHandle: handle array exceeded");
if (handle == null) handle = new Handle(NUL);

@@ -119,18 +119,16 @@ public class kelondroTree extends kelondroRecords implements Comparator {
super(ra, buffersize);
}
private static byte abs(byte b) {
// for height computation
if (b < 0) return (byte) -b; else return b;
}
// Returns the value to which this map maps the specified key.
public synchronized byte[][] get(byte[] key) throws IOException {
//System.out.println("kelondroTree.get " + new String(key) + " in " + filename);
Search search = new Search(key);
if (search.found()) {
return search.getMatcher().getValues();
byte[][] result = search.getMatcher().getValues();
search = null;
return result;
} else {
search = null;
return null;
}
}
@@ -306,6 +304,7 @@ public class kelondroTree extends kelondroRecords implements Comparator {
// a node with this key exist. simply overwrite the content and return old content
Node e = searchResult.getMatcher();
byte[][] result = e.setValues(newrow);
searchResult = null;
return result;
} else if (searchResult.isRoot()) {
// a node with this key does not exist and there is no node at all
@@ -320,6 +319,7 @@ public class kelondroTree extends kelondroRecords implements Comparator {
e.setOHHandle(new Handle[] {null, null, null}); // {parent, leftchild, rightchild}
// do updates
setHandle(root, e.handle());
searchResult = null;
return null;
} else {
// a node with this key does not exist
@@ -375,7 +375,7 @@ public class kelondroTree extends kelondroRecords implements Comparator {
parentOHByte[balance]--;
path = "R" + path;
}
increasedHight = ((abs(parentOHByte[balance]) - abs(prevHight)) > 0);
increasedHight = ((java.lang.Math.abs((int) parentOHByte[balance]) - java.lang.Math.abs((int) prevHight)) > 0);
parentNode.setOHByte(parentOHByte);
// here we either stop because we had no increased height,
@@ -384,7 +384,7 @@ public class kelondroTree extends kelondroRecords implements Comparator {
if (!(increasedHight)) break; // finished
// check rotation need
if (abs(parentOHByte[balance]) > 1) {
if (java.lang.Math.abs((int) parentOHByte[balance]) > 1) {
// rotate and stop then
//System.out.println("* DB DEBUG: " + path.substring(0,2) + " ROTATION AT NODE " + parentNode.handle().toString() + ": BALANCE=" + parentOHByte[balance]);
if (path.startsWith("LL")) {
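Replacing the private abs(byte) helper with java.lang.Math.abs on a widened int removes an overflow trap: for b = (byte) -128 the expression (byte) -b wraps back to -128, so the result stays negative and a test like abs(...) > 1 would misfire. A valid AVL balance byte should never reach -128, but the widened form is safe by construction. A standalone check:

// Standalone check: negating the most negative byte overflows back to itself;
// widening to int before Math.abs gives the true magnitude.
class ByteAbsSketch {
    static byte oldAbs(byte b) { if (b < 0) return (byte) -b; else return b; }

    public static void main(String[] args) {
        byte b = -128;
        System.out.println(oldAbs(b));                   // -128 (overflow)
        System.out.println(java.lang.Math.abs((int) b)); // 128
    }
}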
@@ -561,6 +561,7 @@ public class kelondroTree extends kelondroRecords implements Comparator {
Node result = search.getMatcher();
byte[][] values = result.getValues();
remove(result, search.getParent());
search = null;
return values;
} else {
return null;
@@ -722,9 +723,12 @@ public class kelondroTree extends kelondroRecords implements Comparator {
try {
Search s = new Search(firstKey);
if (s.found()) {
return new nodeIterator(up, rotating, s.getMatcher());
Node matcher = s.getMatcher();
s = null;
return new nodeIterator(up, rotating, matcher);
} else {
Node nn = s.getParent();
s = null;
if (nn == null) {
return (new HashSet()).iterator(); // an empty iterator
} else {
@@ -862,9 +866,12 @@ public class kelondroTree extends kelondroRecords implements Comparator {
public synchronized Iterator rows(boolean up, boolean rotating, byte[] firstKey) throws IOException {
Search s = new Search(firstKey);
if (s.found()) {
return new rowIterator(new nodeIterator(up, rotating, s.getMatcher()));
Node matcher = s.getMatcher();
s = null;
return new rowIterator(new nodeIterator(up, rotating, matcher));
} else {
Node nn = s.getParent();
s = null;
if (nn == null) {
return (Iterator) (new HashSet()).iterator();
} else {
@@ -910,9 +917,12 @@ public class kelondroTree extends kelondroRecords implements Comparator {
public synchronized Iterator keys(boolean up, boolean rotating, byte[] firstKey) throws IOException {
Search s = new Search(firstKey);
if (s.found()) {
return new keyIterator(new nodeIterator(up, rotating, s.getMatcher()));
Node matcher = s.getMatcher();
s = null;
return new keyIterator(new nodeIterator(up, rotating, matcher));
} else {
Node nn = s.getParent();
s = null;
if (nn == null) {
return (Iterator) (new HashSet()).iterator();
} else {

@@ -134,13 +134,21 @@ public class plasmaCrawlNURL extends plasmaURL {
public void run() {
Iterator i;
try {
//System.out.println("init coreStack index");
i = coreStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey()));
//System.out.println("init limitStack index");
i = limitStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey()));
//System.out.println("init overhangStack index");
i = overhangStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey()));
//System.out.println("init remoteStack index");
i = remoteStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey()));
//System.out.println("init imageStack index");
i = imageStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey()));
//System.out.println("init movieStack index");
i = movieStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey()));
//System.out.println("init musicStack index");
i = musicStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey()));
//System.out.println("finished index init");
} catch (IOException e) {}
}
}

@@ -298,12 +298,14 @@ public final class plasmaCrawlWorker extends Thread {
htCache.status = plasmaHTCache.CACHE_PASSING;
}
// enQueue new entry with response header
if ((initiator == null) || (initiator.length() == 0)) {
// enqueued for proxy writings
cacheManager.stackProcess(htCache);
} else {
// direct processing for crawling
cacheManager.process(htCache);
if (profile != null) {
if ((initiator == null) || (initiator.length() == 0)) {
// enqueued for proxy writings
cacheManager.stackProcess(htCache);
} else {
// direct processing for crawling
cacheManager.process(htCache);
}
}
} catch (SocketException e) {
// this may happen if the client suddenly closes its connection

@@ -52,10 +52,12 @@ package de.anomic.plasma;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.Map;
import java.util.LinkedList;
import java.util.TreeMap;
@@ -206,9 +208,9 @@ public final class plasmaHTCache {
}
public httpHeader getCachedResponse(String urlHash) throws IOException {
httpHeader header = new httpHeader(null, responseHeaderDB.get(urlHash));
//System.out.println("DEBUG: getCachedResponse hash=" + urlHash + ", header=" + header.toString());
return header;
Map hdb = responseHeaderDB.get(urlHash);
if (hdb == null) return null;
return new httpHeader(null, hdb);
}
public boolean idle() {
@@ -245,76 +247,76 @@ public final class plasmaHTCache {
}
synchronized public void process(Entry entry) throws IOException {
if (entry == null) return;
// store response header
if ((entry.status == CACHE_FILL) ||
(entry.status == CACHE_STALE_RELOAD_GOOD) ||
(entry.status == CACHE_STALE_RELOAD_BAD)) {
responseHeaderDB.set(entry.nomalizedURLHash, entry.responseHeader);
}
// work off unwritten files and undone parsing
String storeError = null;
if (((entry.status == CACHE_FILL) || (entry.status == CACHE_STALE_RELOAD_GOOD)) &&
((storeError = entry.shallStoreCache()) == null)) {
// write file if not written yet
if (entry.cacheArray != null) try {
if (entry.cacheFile.exists()) {
currCacheSize -= entry.cacheFile.length();
entry.cacheFile.delete();
// store response header
if ((entry.status == CACHE_FILL) ||
(entry.status == CACHE_STALE_RELOAD_GOOD) ||
(entry.status == CACHE_STALE_RELOAD_BAD)) {
responseHeaderDB.set(entry.nomalizedURLHash, entry.responseHeader);
}
// work off unwritten files and undone parsing
String storeError = null;
if (((entry.status == CACHE_FILL) || (entry.status == CACHE_STALE_RELOAD_GOOD)) &&
((storeError = entry.shallStoreCache()) == null)) {
// write file if not written yet
if (entry.cacheArray != null) try {
if (entry.cacheFile.exists()) {
currCacheSize -= entry.cacheFile.length();
entry.cacheFile.delete();
}
entry.cacheFile.getParentFile().mkdirs();
log.logInfo("WRITE FILE (" + entry.cacheArray.length + " bytes) " + entry.cacheFile);
serverFileUtils.write(entry.cacheArray, entry.cacheFile);
log.logDebug("AFTER WRITE cacheArray = " + entry.cacheFile + ": " + ((entry.cacheArray == null) ? "empty" : "full"));
//entry.cacheArray = null;
} catch (FileNotFoundException e) {
// this is the case of a "(Not a directory)" error, which should be prohibited
// by the shallStoreCache() property. However, sometimes the error still occurs
// In this case do nothing.
log.logError("File storage failed: " + e.getMessage());
}
entry.cacheFile.getParentFile().mkdirs();
log.logInfo("WRITE FILE (" + entry.cacheArray.length + " bytes) " + entry.cacheFile);
serverFileUtils.write(entry.cacheArray, entry.cacheFile);
log.logDebug("AFTER WRITE cacheArray = " + entry.cacheFile + ": " + ((entry.cacheArray == null) ? "empty" : "full"));
//entry.cacheArray = null;
} catch (FileNotFoundException e) {
// this is the case of a "(Not a directory)" error, which should be prohibited
// by the shallStoreCache() property. However, sometimes the error still occurs
// In this case do nothing.
log.logError("File storage failed: " + e.getMessage());
// update statistics
currCacheSize += entry.cacheFile.length();
cacheAge.put(ageString(entry.cacheFile.lastModified(), entry.cacheFile), entry.cacheFile);
// enqueue in switchboard
switchboard.enQueue(entry);
} else if (entry.status == CACHE_PASSING) {
// even if the file should not be stored in the cache, it can still be indexed
if (storeError != null) log.logDebug("NOT STORED " + entry.cacheFile + ":" + storeError);
// enqueue in switchboard
switchboard.enQueue(entry);
}
// write log
switch (entry.status) {
case CACHE_UNFILLED:
log.logInfo("CACHE UNFILLED: " + entry.cacheFile); break;
case CACHE_FILL:
log.logInfo("CACHE FILL: " + entry.cacheFile +
((entry.cacheArray == null) ? "" : " (cacheArray is filled)") +
((entry.scraper == null) ? "" : " (scraper is filled)"));
break;
case CACHE_HIT:
log.logInfo("CACHE HIT: " + entry.cacheFile); break;
case CACHE_STALE_NO_RELOAD:
log.logInfo("CACHE STALE, NO RELOAD: " + entry.cacheFile); break;
case CACHE_STALE_RELOAD_GOOD:
log.logInfo("CACHE STALE, NECESSARY RELOAD: " + entry.cacheFile); break;
case CACHE_STALE_RELOAD_BAD:
log.logInfo("CACHE STALE, SUPERFLUOUS RELOAD: " + entry.cacheFile); break;
case CACHE_PASSING:
log.logInfo("PASSING: " + entry.cacheFile); break;
default:
log.logInfo("CACHE STATE UNKNOWN: " + entry.cacheFile); break;
}
// update statistics
currCacheSize += entry.cacheFile.length();
cacheAge.put(ageString(entry.cacheFile.lastModified(), entry.cacheFile), entry.cacheFile);
// enqueue in switchboard
switchboard.enQueue(entry);
} else if (entry.status == CACHE_PASSING) {
// even if the file should not be stored in the cache, it can still be indexed
if (storeError != null) log.logDebug("NOT STORED " + entry.cacheFile + ":" + storeError);
// enqueue in switchboard
switchboard.enQueue(entry);
}
// write log
switch (entry.status) {
case CACHE_UNFILLED:
log.logInfo("CACHE UNFILLED: " + entry.cacheFile); break;
case CACHE_FILL:
log.logInfo("CACHE FILL: " + entry.cacheFile +
((entry.cacheArray == null) ? "" : " (cacheArray is filled)") +
((entry.scraper == null) ? "" : " (scraper is filled)"));
break;
case CACHE_HIT:
log.logInfo("CACHE HIT: " + entry.cacheFile); break;
case CACHE_STALE_NO_RELOAD:
log.logInfo("CACHE STALE, NO RELOAD: " + entry.cacheFile); break;
case CACHE_STALE_RELOAD_GOOD:
log.logInfo("CACHE STALE, NECESSARY RELOAD: " + entry.cacheFile); break;
case CACHE_STALE_RELOAD_BAD:
log.logInfo("CACHE STALE, SUPERFLUOUS RELOAD: " + entry.cacheFile); break;
case CACHE_PASSING:
log.logInfo("PASSING: " + entry.cacheFile); break;
default:
log.logInfo("CACHE STATE UNKNOWN: " + entry.cacheFile); break;
}
}
@@ -453,6 +455,32 @@ public final class plasmaHTCache {
return null;
}
public byte[] loadResource(URL url) {
// load the url as resource from the cache
File f = getCachePath(url);
if (f.exists()) try {
return serverFileUtils.read(f);
} catch (IOException e) {
return null;
} else {
return null;
}
}
/*
public void saveResource(URL url, byte[] resource) {
File f = getCachePath(url);
f.getParentFile().mkdirs();
FileOutputStream fos = null;
try {
fos = new FileOutputStream(f);
htCache.cacheArray = res.writeContent(fos); // writes in cacheArray and cache file
} finally {
if (fos!=null)try{fos.close();}catch(Exception e){}
}
}
*/
public static boolean isPOST(String urlString) {
return ((urlString.indexOf("?") >= 0) ||
(urlString.indexOf("&") >= 0));

@@ -143,8 +143,8 @@ public final class plasmaParser {
* @see #initMediaExt(String)
*/
static {
initMediaExt("swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," +
"sit,hqx,img,dmg,tar,gz,ps,xls,ppt,ram,bz2,arj");
initMediaExt(extString2extList("swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," +
"sit,hqx,img,dmg,tar,gz,ps,xls,ppt,ram,bz2,arj"));
/* ===================================================
* initializing the parser object pool
@@ -200,25 +200,31 @@ public final class plasmaParser {
setEnabledParserList(mimeTypes);
}
public static void initMediaExt(String mediaExtString) {
public static List extString2extList(String extString) {
LinkedList extensions = new LinkedList();
if ((mediaExtString == null) || (mediaExtString.length() == 0)) {
if ((extString == null) || (extString.length() == 0)) {
return extensions;
} else {
String[] xs = mediaExtString.split(",");
String[] xs = extString.split(",");
for (int i = 0; i < xs.length; i++) extensions.add(xs[i].toLowerCase().trim());
}
initMediaExt(extensions);
return extensions;
}
public static void initMediaExt(List mediaExtList) {
synchronized (mediaExtSet) {
mediaExtSet.clear();
mediaExtSet.addAll(mediaExtList);
}
mediaExtSet.addAll(mediaExtList);
}
}
public static void initSupportedFileExt(List supportedFileExtList) {
synchronized (mediaExtSet) {
supportedFileExt.clear();
supportedFileExt.addAll(supportedFileExtList);
}
}
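The refactor separates parsing from installation: extString2extList turns a comma-separated config string into a lowercased, trimmed List, and initMediaExt/initSupportedFileExt each install such a list under synchronization. plasmaSwitchboard (further down) feeds both the mediaExt and the new parseableExt settings through the same helper. A usage sketch, assuming exactly the signatures shown in this hunk:

// Usage sketch (API as shown above): parse once, install per target set.
java.util.List media = plasmaParser.extString2extList("jpg,GIF, png ,zip");
// -> [jpg, gif, png, zip]: lowercased and trimmed
plasmaParser.initMediaExt(media);
plasmaParser.initSupportedFileExt(plasmaParser.extString2extList("html,htm,txt"));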
public static boolean realtimeParsableMimeTypesContains(String mimeType) {
mimeType = getRealMimeType(mimeType);
synchronized (realtimeParsableMimeTypes) {
@@ -238,6 +244,12 @@ public final class plasmaParser {
}
}
public static boolean supportedFileExtContains(String mediaExt) {
if (supportedFileExt == null) return false;
//System.out.println("supported ext: " + supportedFileExt.toString());
return (supportedFileExt.contains(mediaExt));
}
public static boolean mediaExtContains(String mediaExt) {
if (mediaExt == null) return false;
@@ -316,16 +328,16 @@ public final class plasmaParser {
}
synchronized (enabledParserList) {
enabledParserList.clear();
enabledParserList.putAll(newEnabledParsers);
}
//enabledParserList.clear();
enabledParserList.putAll(newEnabledParsers);
}
synchronized (supportedFileExt) {
supportedFileExt.clear();
//supportedFileExt.clear();
supportedFileExt.addAll(newSupportedFileExt);
}
}
return (String[])newEnabledParsers.keySet().toArray(new String[newEnabledParsers.size()]);
}

@@ -114,26 +114,38 @@ public class plasmaSnippetCache {
return (String) snippetsCache.get(key);
}
public String retrieve(java.net.URL url, boolean fetchOnline, Set query, boolean queryAreHashes) {
if (query.size() == 0) return null;
if (!(queryAreHashes)) query = plasmaSearch.words2hashes(query);
public String retrieve(java.net.URL url, boolean fetchOnline, Set queryhashes) {
if (queryhashes.size() == 0) {
//System.out.println("found no queryhashes for url retrieve " + url);
return null;
}
String urlhash = plasmaURL.urlHash(url);
// try to get snippet from snippetCache
String wordhashes = yacySearch.set2string(query);
String wordhashes = yacySearch.set2string(queryhashes);
String snippet = retrieve(wordhashes, urlhash);
if (snippet != null) return snippet;
if (snippet != null) {
//System.out.println("found snippet for url " + url + " in cache: " + snippet);
return snippet;
}
// if the snippet is not in the cache, we can try to get it from the htcache
plasmaParserDocument document = getDocument(url, fetchOnline);
if (document == null) return null;
if (document == null) {
//System.out.println("cannot load document for url " + url);
return null;
}
//System.out.println("loaded document for url " + url);
String[] sentences = document.getSentences();
//System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
if ((sentences == null) || (sentences.length == 0)) return null;
if ((sentences == null) || (sentences.length == 0)) {
//System.out.println("found no sentences in url " + url);
return null;
}
// we have found a parseable non-empty file: use the lines
TreeMap sentencematrix = hashMatrix(sentences);
Iterator i = query.iterator();
Iterator i = queryhashes.iterator();
String hash;
kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
Iterator j;
@@ -151,8 +163,9 @@ public class plasmaSnippetCache {
Integer maxLine = (Integer) hitTable.getMaxObject();
if (maxLine == null) return null;
snippet = sentences[maxLine.intValue()];
if (snippet.length() > 140) return null;
//System.out.println("loaded snippet for url " + url + ": " + snippet);
if (snippet.length() > 120) snippet = snippet.substring(0, 120);
// finally store this snippet in our own cache
store(wordhashes, urlhash, snippet);
return snippet;
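Taken together, the retrieve() changes settle the snippet pipeline: the caller now passes ready-made word hashes; the cache is consulted by (wordhashes, urlhash); on a miss the document is loaded from the HTCache (optionally fetched online), split into sentences, and each sentence is scored by how many query hashes it contains, with the top-scoring sentence becoming the snippet. One behavioral fix hides in the last hunk: a best sentence longer than 140 characters used to disqualify the snippet entirely (return null), whereas it is now clipped to 120 characters and cached. A standalone sketch of the scoring idea, with plain substring matching standing in for the hash matrix:

// Standalone sketch: score sentences by query-term hits and keep the argmax;
// kelondroMScoreCluster plays this role above, over word hashes rather than text.
class SnippetScoreSketch {
    public static void main(String[] args) {
        String[] sentences = { "the quick brown fox", "lazy dogs sleep", "quick dogs" };
        String[] queryWords = { "quick", "dogs" };
        int best = -1, bestScore = 0;
        for (int s = 0; s < sentences.length; s++) {
            int hits = 0;
            for (int q = 0; q < queryWords.length; q++)
                if (sentences[s].indexOf(queryWords[q]) >= 0) hits++;
            if (hits > bestScore) { bestScore = hits; best = s; }
        }
        String snippet = (best < 0) ? null : sentences[best];
        if ((snippet != null) && (snippet.length() > 120)) snippet = snippet.substring(0, 120);
        System.out.println(snippet); // quick dogs
    }
}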
@@ -175,10 +188,10 @@ public class plasmaSnippetCache {
// load the url as resource from the web
try {
//return httpc.singleGET(url, 5000, null, null, remoteProxyHost, remoteProxyPort);
byte[] resource = getResourceFromCache(url);
byte[] resource = cacheManager.loadResource(url);
if ((fetchOnline) && (resource == null)) {
loadResourceFromWeb(url, 5000);
resource = getResourceFromCache(url);
resource = cacheManager.loadResource(url);
}
return resource;
} catch (IOException e) {
@@ -186,20 +199,6 @@ public class plasmaSnippetCache {
}
}
private byte[] getResourceFromCache(URL url) {
// load the url as resource from the cache
String path = htmlFilterContentScraper.urlNormalform(url).substring(6);
File cache = cacheManager.cachePath;
File f = new File(cache, path);
if (f.exists()) try {
return serverFileUtils.read(f);
} catch (IOException e) {
return null;
} else {
return null;
}
}
private void loadResourceFromWeb(URL url, int socketTimeout) throws IOException {
plasmaCrawlWorker.load(
url,
@@ -221,14 +220,23 @@ public class plasmaSnippetCache {
httpHeader header = null;
try {
header = cacheManager.getCachedResponse(plasmaURL.urlHash(url));
} catch (IOException e) {
return null;
}
if (header == null) return null;
if (plasmaParser.supportedMimeTypesContains(header.mime())) {
return parser.parseSource(url, header.mime(), resource);
} catch (IOException e) {}
if (header == null) {
String filename = url.getFile();
int p = filename.lastIndexOf('.');
if ((p < 0) ||
((p >= 0) && (plasmaParser.supportedFileExtContains(filename.substring(p + 1))))) {
return parser.parseSource(url, "text/html", resource);
} else {
return null;
}
} else {
return null;
if (plasmaParser.supportedMimeTypesContains(header.mime())) {
return parser.parseSource(url, header.mime(), resource);
} else {
return null;
}
}
}
}

@@ -263,9 +263,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
this.parser = new plasmaParser();
// define an extension-blacklist
log.logSystem("Parser: Initializing Media Extensions");
plasmaParser.initMediaExt(getConfig("mediaExt",null));
log.logSystem("Parser: Initializing Extension Mappings for Media/Parser");
plasmaParser.initMediaExt(plasmaParser.extString2extList(getConfig("mediaExt","")));
plasmaParser.initSupportedFileExt(plasmaParser.extString2extList(getConfig("parseableExt","")));
// define a realtime parsable mimetype list
log.logSystem("Parser: Initializing Mime Types");
plasmaParser.initRealtimeParsableMimeTypes(getConfig("parseableRealtimeMimeTypes","application/xhtml+xml,text/html,text/plain"));
@@ -300,6 +301,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
cleanProfiles();
// init facility DB
/*
log.logSystem("Starting Facility Database");
File facilityDBpath = new File(getRootPath(), "DATA/SETTINGS/");
facilityDB = new kelondroTables(facilityDBpath);
@@ -312,7 +314,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
facilityDB.update("statistik", (new serverDate()).toShortString(false).substring(0, 11), new long[]{1,2,3,4,5,6});
long[] testresult = facilityDB.selectLong("statistik", "yyyyMMddHHm");
testresult = facilityDB.selectLong("statistik", (new serverDate()).toShortString(false).substring(0, 11));
*/
// generate snippets cache
log.logSystem("Initializing Snippet Cache");
@@ -322,17 +324,22 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// start yacy core
log.logSystem("Starting YaCy Protocol Core");
//try{Thread.currentThread().sleep(5000);} catch (InterruptedException e) {} // for profiler
yacyCore yc = new yacyCore(this);
//log.logSystem("Started YaCy Protocol Core");
//System.gc(); try{Thread.currentThread().sleep(5000);} catch (InterruptedException e) {} // for profiler
serverInstantThread.oneTimeJob(yc, "loadSeeds", yc.log, 3000);
// deploy threads
log.logSystem("Starting Threads");
System.gc(); // help for profiler
int indexing_cluster = Integer.parseInt(getConfig("80_indexing_cluster", "1"));
if (indexing_cluster < 1) indexing_cluster = 1;
deployThread("90_cleanup", "Cleanup", "simple cleaning process for monitoring information" ,
new serverInstantThread(this, "cleanupJob", "cleanupJobSize"), 10000); // all 5 Minutes
deployThread("80_indexing", "Parsing/Indexing", "thread that performes document parsing and indexing" ,
new serverInstantThread(this, "deQueue", "queueSize"), 10000);
for (int i = 1; i < indexing_cluster; i++) {
setConfig((i + 80) + "_indexing_idlesleep", getConfig("80_indexing_idlesleep", ""));
setConfig((i + 80) + "_indexing_busysleep", getConfig("80_indexing_busysleep", ""));
@@ -344,7 +351,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
deployThread("62_remotetriggeredcrawl", "Remote Crawl Job", "thread that performes a single crawl/indexing step triggered by a remote peer",
new serverInstantThread(this, "remoteTriggeredCrawlJob", "remoteTriggeredCrawlJobSize"), 30000);
deployThread("61_globalcrawltrigger", "Global Crawl Trigger", "thread that triggeres remote peers for crawling",
new serverInstantThread(this, "limitCrawlTriggerJob", "limitCrawlTriggerJobSize"), 30000);
new serverInstantThread(this, "limitCrawlTriggerJob", "limitCrawlTriggerJobSize"), 30000); // error here?
deployThread("50_localcrawl", "Local Crawl", "thread that performes a single crawl step from the local crawl queue",
new serverInstantThread(this, "coreCrawlJob", "coreCrawlJobSize"), 10000);
deployThread("40_peerseedcycle", "Seed-List Upload", "task that a principal peer performes to generate and upload a seed-list to a ftp account",
@@ -357,6 +364,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// init migratiion from 0.37 -> 0.38
classicCache = new plasmaWordIndexClassicCacheMigration(plasmaPath, wordIndex);
if (classicCache.size() > 0) {
setConfig("99_indexcachemigration_idlesleep" , 10000);
setConfig("99_indexcachemigration_busysleep" , 40);
@@ -451,7 +459,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
cacheLoader.close();
wikiDB.close();
messageDB.close();
facilityDB.close();
if (facilityDB != null) facilityDB.close();
urlPool.close();
profiles.close();
parser.close();
@@ -577,6 +585,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
public boolean coreCrawlJob() {
System.gc(); // debug
if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) {
//log.logDebug("CoreCrawl: queue is empty");
return false;
@@ -1128,35 +1137,43 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
char[] order;
String urlmask;
long time;
public presearch(Set queryhashes, char[] order, long time /*milliseconds*/, String urlmask) {
int fetchcount;
public presearch(Set queryhashes, char[] order, long time /*milliseconds*/, String urlmask, int fetchcount) {
this.queryhashes = queryhashes;
this.order = order;
this.urlmask = urlmask;
this.time = time;
this.fetchcount = fetchcount;
}
public void run() {
try {
// search the database locally
log.logDebug("presearch: started job");
plasmaWordIndexEntity idx = searchManager.searchHashes(queryhashes, time);
plasmaSearch.result acc = searchManager.order(idx, queryhashes, stopwords, order, time, 3);
log.logDebug("presearch: found " + idx.size() + " results");
plasmaSearch.result acc = searchManager.order(idx, queryhashes, stopwords, order, time, fetchcount);
if (acc == null) return;
log.logDebug("presearch: ordered results, now " + acc.sizeOrdered() + " URLs ready for fetch");
// take some elements and fetch the snippets
int i = 0;
plasmaCrawlLURL.entry urlentry;
String urlstring;
while ((acc.hasMoreElements()) && (i < 3)) {
String urlstring, snippet;
while ((acc.hasMoreElements()) && (i < fetchcount)) {
urlentry = acc.nextElement();
if (urlentry.url().getHost().endsWith(".yacyh")) continue;
urlstring = htmlFilterContentScraper.urlNormalform(urlentry.url());
if (urlstring.matches(urlmask)) { //.* is default
snippetCache.retrieve(urlentry.url(), true, queryhashes, true);
log.logDebug("presearch: fetching URL " + urlstring);
snippet = snippetCache.retrieve(urlentry.url(), true, queryhashes);
if (snippet != null) log.logDebug("found snippet for URL " + urlstring + ": '" + snippet + "'");
i++;
}
}
} catch (IOException e) {
e.printStackTrace();
}
log.logDebug("presearch: job terminated");
}
}
@@ -1169,7 +1186,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (order2.equals("quality")) order[1] = plasmaSearch.O_QUALITY; else order[1] = plasmaSearch.O_AGE;
// filter out words that appear in bluelist
Set queryhashes = plasmaSearch.words2hashes(querywords);
Iterator it = querywords.iterator();
String word, gs = "";
while (it.hasNext()) {
@@ -1177,13 +1193,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (blueList.contains(word)) it.remove(); else gs += "+" + word;
}
if (gs.length() > 0) gs = gs.substring(1);
Set queryhashes = plasmaSearch.words2hashes(querywords);
// log
log.logInfo("INIT WORD SEARCH: " + gs + " - " + count + " links, " + (time / 1000) + " seconds");
long timestamp = System.currentTimeMillis();
//Thread preselect = new presearch(querywords, order, time / 10, urlmask);
//preselect.start();
Thread preselect = new presearch(queryhashes, order, time / 10, urlmask, 5);
preselect.start();
// do global fetching
int globalresults = 0;
@@ -1266,7 +1283,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
prop.put("results_" + i + "_urlname", urlname);
prop.put("results_" + i + "_date", dateString(urlentry.moddate()));
prop.put("results_" + i + "_size", Long.toString(urlentry.size()));
snippet = snippetCache.retrieve(url, false, querywords, false);
snippet = snippetCache.retrieve(url, false, queryhashes);
if ((snippet == null) || (snippet.length() < 10)) {
prop.put("results_" + i + "_snippet", 0);
prop.put("results_" + i + "_snippet_text", "");
@@ -1343,7 +1360,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String snippet;
while ((acc.hasMoreElements()) && (i < count)) {
urlentry = acc.nextElement();
snippet = snippetCache.retrieve(urlentry.url(), false, hashes, true);
snippet = snippetCache.retrieve(urlentry.url(), false, hashes);
if ((snippet == null) || (snippet.length() < 10)) {
resource = urlentry.toString();
} else {

@@ -170,7 +170,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
if (System.currentTimeMillis() > messageTime) {
System.gc(); // for better statistic
wordsPerSecond = wordcount * 1000 / (1 + System.currentTimeMillis() - startTime);
log.logInfo("dumping status: " + wordcount + " words done, " + (cache.size() / wordsPerSecond) + " seconds remaining, free mem = " + (Runtime.getRuntime().freeMemory() / 1024 / 1024) + "MB");
log.logInfo("dumping status: " + wordcount + " words done, " + (cache.size() / (wordsPerSecond + 1)) + " seconds remaining, free mem = " + (Runtime.getRuntime().freeMemory() / 1024 / 1024) + "MB");
messageTime = System.currentTimeMillis() + 5000;
}
}
@@ -552,11 +552,9 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
flushThread.pause();
//serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem: cache.size=" + cache.size() + "; hashScore.size=" + hashScore.size());
while (cache.size() >= this.maxWords) flushFromMem();
if ((cache.size() > 10000) && (Runtime.getRuntime().freeMemory() < 11000000)) flushFromMem();
while ((cache.size() > 0) && (Runtime.getRuntime().freeMemory() < 1000000)) {
flushFromMem();
System.gc();
}
if ((cache.size() > 10000) && (Runtime.getRuntime().freeMemory() < 5000000)) flushFromMem();
if ((cache.size() > 0) && (Runtime.getRuntime().freeMemory() < 1000000)) flushFromMem();
//if (flushc > 0) serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem - flushed " + flushc + " entries");
// put new words into cache

@@ -79,6 +79,7 @@ public class plasmaWordIndexEntity {
}
private kelondroTree indexFile(File databaseRoot, String wordHash) throws IOException {
if (wordHash.length() < 12) throw new IOException("word hash wrong: '" + wordHash + "'");
theLocation = wordHash2path(databaseRoot, wordHash);
File fp = theLocation.getParentFile();
if (fp != null) fp.mkdirs();
@@ -97,7 +98,8 @@ public class plasmaWordIndexEntity {
public static File wordHash2path(File databaseRoot, String hash) {
// creates a path that constructs hashing on a file system
return new File (databaseRoot, "WORDS/" +
return new File (databaseRoot, "WORDS/" +
hash.substring(0,1) + "/" + hash.substring(1,2) + "/" + hash.substring(2,4) + "/" +
hash.substring(4,6) + "/" + hash + ".db");
}
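The whitespace-only change above leaves the hashing scheme intact: characters 0, 1, 2-3 and 4-5 of the word hash become directory levels, spreading the per-word .db files across nested directories so no single directory collects millions of entries. Worked through for a hypothetical 12-character hash:

wordHash2path(databaseRoot, "abcdefghijkl")
  -> databaseRoot/WORDS/a/b/cd/ef/abcdefghijkl.db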

@@ -236,23 +236,26 @@ public final class serverByteBuffer extends OutputStream {
length = length - start;
return this;
}
private serverByteBuffer trim(int start, int end) {
if (end > length) throw new IndexOutOfBoundsException("trim: end > length");
trim(start);
// the end index is exclusive: it points one position past the wanted range
if (start > length) throw new IndexOutOfBoundsException("trim: start > length");
if (end > length) throw new IndexOutOfBoundsException("trim: end > length");
if (start > end) throw new IndexOutOfBoundsException("trim: start > end");
offset = offset + start;
length = end - start;
return this;
}
public serverByteBuffer trim() {
int l = 0; while ((l < length) && (buffer[l] <= 32)) l++;
int r = length; while ((r > 0) && (buffer[r - 1] <= 32)) r--;
if ((l <= r) && (l < length)) return trim(l, r);
return this;
int l = 0; while ((l < length) && (buffer[offset + l] <= 32)) l++;
int r = length; while ((r > 0) && (buffer[offset + r - 1] <= 32)) r--;
if (l > r) r = l;
return trim(l, r);
}
public String toString() {
return new String(getBytes(), offset, length);
return new String(buffer, offset, length);
}
public Properties propParser() {
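The trim fixes all address the same view bookkeeping: serverByteBuffer is a window (offset, length) into a larger array, and the old whitespace trim() scanned from buffer[0] instead of buffer[offset], i.e. against bytes outside the window once offset had advanced. trim(start, end) now also validates start and start <= end before moving the window, and toString() reads the raw buffer at offset directly instead of re-applying offset to the (presumably already offset-adjusted) copy from getBytes(). A standalone illustration of the off-by-offset scan:

// Standalone sketch: a view (offset, length) into a byte array. Scanning from
// buffer[0] instead of buffer[offset] trims against bytes outside the view.
class OffsetTrimSketch {
    public static void main(String[] args) {
        byte[] buffer = "XX  hi ".getBytes();
        int offset = 2, length = 5; // the view is "  hi "

        int l = 0;  // old scan: inspects 'X','X',... and finds nothing to trim
        while ((l < length) && (buffer[l] <= 32)) l++;

        int l2 = 0; // fixed scan: skips the two leading blanks of the view
        while ((l2 < length) && (buffer[offset + l2] <= 32)) l2++;

        System.out.println(l);  // 0
        System.out.println(l2); // 2
    }
}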

@@ -450,8 +450,8 @@ public final class serverCore extends serverAbstractThread implements serverThre
*/
public SessionPool(SessionFactory objFactory) {
super(objFactory);
this.setMaxIdle(75); // Maximum idle threads.
this.setMaxActive(150); // Maximum active threads.
this.setMaxIdle(50); // Maximum idle threads.
this.setMaxActive(100); // Maximum active threads.
this.setMinEvictableIdleTimeMillis(30000); //Evictor runs every 30 secs.
//this.setMaxWait(1000); // Wait 1 second till a thread is available
}

@@ -100,6 +100,7 @@ parseableMimeTypes=
# this is important for recognizing <a href> tags as non-HTML references
# These files will be excluded from indexing _(Please keep extensions in alphabetical order)_
mediaExt=ace,arj,asf,avi,bin,bz2,css,deb,doc,dmg,gif,gz,hqx,img,iso,jar,jpe,jpg,jpeg,mpeg,mov,mp3,mpg,ogg,png,pdf,ppt,ps,ram,rar,rm,rpm,sit,swf,sxc,sxd,sxi,sxw,tar,tgz,torrent,wmv,xcf,xls,zip
parseableExt=html,htm,txt
# Promotion Strings
# These strings appear in the Web Mask of the YACY search client
