From 1e7f062350fb092a9f07e50443b6e1f7edb08fe4 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 23 Jun 2005 02:07:45 +0000 Subject: [PATCH] many bugfixes, memory leak fixes, performance enhancements; new kelondroHashtable; activated snippets git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@313 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/index.java | 4 +- source/de/anomic/kelondro/kelondroArray.java | 19 +- .../kelondro/kelondroMScoreCluster.java | 2 + .../de/anomic/kelondro/kelondroMSetTools.java | 10 +- .../de/anomic/kelondro/kelondroRecords.java | 4 + source/de/anomic/kelondro/kelondroTree.java | 32 ++-- source/de/anomic/plasma/plasmaCrawlNURL.java | 8 + .../de/anomic/plasma/plasmaCrawlWorker.java | 14 +- source/de/anomic/plasma/plasmaHTCache.java | 166 ++++++++++-------- source/de/anomic/plasma/plasmaParser.java | 44 +++-- .../de/anomic/plasma/plasmaSnippetCache.java | 74 ++++---- .../de/anomic/plasma/plasmaSwitchboard.java | 49 ++++-- .../anomic/plasma/plasmaWordIndexCache.java | 10 +- .../anomic/plasma/plasmaWordIndexEntity.java | 4 +- source/de/anomic/server/serverByteBuffer.java | 19 +- source/de/anomic/server/serverCore.java | 4 +- yacy.init | 1 + 17 files changed, 284 insertions(+), 180 deletions(-) diff --git a/htroot/index.java b/htroot/index.java index 8be4b9fb3..d00a1e3c2 100644 --- a/htroot/index.java +++ b/htroot/index.java @@ -81,7 +81,7 @@ public class index { referrerprop.put("clientip", header.get("CLIENTIP")); referrerprop.put("useragent", header.get("User-Agent")); referrerprop.put("date", (new serverDate()).toShortString(false)); - try { sb.facilityDB.update("backlinks", referer, referrerprop); } catch (IOException e) {} + if (sb.facilityDB != null) try { sb.facilityDB.update("backlinks", referer, referrerprop); } catch (IOException e) {} } } @@ -114,7 +114,7 @@ public class index { // process search words String querystring = (String) post.get("search", ""); - try { sb.facilityDB.update("zeitgeist", querystring, post); } catch (Exception e) {} + if (sb.facilityDB != null) try { sb.facilityDB.update("zeitgeist", querystring, post); } catch (Exception e) {} TreeSet query = cleanQuery(querystring); // filter out stopwords TreeSet filtered = kelondroMSetTools.joinConstructive(query, plasmaSwitchboard.stopwords); diff --git a/source/de/anomic/kelondro/kelondroArray.java b/source/de/anomic/kelondro/kelondroArray.java index 0be88ee49..8aad33c92 100644 --- a/source/de/anomic/kelondro/kelondroArray.java +++ b/source/de/anomic/kelondro/kelondroArray.java @@ -45,13 +45,9 @@ package de.anomic.kelondro; -import java.io.BufferedReader; + import java.io.File; -import java.io.FileReader; import java.io.IOException; -import java.io.RandomAccessFile; -import java.util.Iterator; -import java.util.StringTokenizer; public class kelondroArray extends kelondroRecords { @@ -60,7 +56,7 @@ public class kelondroArray extends kelondroRecords { private static short thisOHHandles = 0; // and two handles overhead for a double-chained list public kelondroArray(File file, int[] columns, int intprops) throws IOException { - // this creates a new tree + // this creates a new array super(file, 0, thisOHBytes, thisOHHandles, columns, intprops, columns.length /*txtProps*/, 80 /*txtPropWidth*/); for (int i = 0; i < intprops; i++) setHandle(i, new Handle(0)); } @@ -90,6 +86,17 @@ public class kelondroArray extends kelondroRecords { return getNode(new Handle(index)).getValues(); } + + public synchronized int seti(int index, int value) throws IOException { + int before = 
getHandle(index).hashCode();
+ setHandle(index, new Handle(value));
+ return before;
+ }
+
+ public synchronized int geti(int index) throws IOException {
+ return getHandle(index).hashCode();
+ }
+
 public void print() throws IOException {
 System.out.println("PRINTOUT of table, length=" + size());
 byte[][] row;
diff --git a/source/de/anomic/kelondro/kelondroMScoreCluster.java b/source/de/anomic/kelondro/kelondroMScoreCluster.java
index 2502b1690..f2b1adb3f 100644
--- a/source/de/anomic/kelondro/kelondroMScoreCluster.java
+++ b/source/de/anomic/kelondro/kelondroMScoreCluster.java
@@ -146,6 +146,7 @@ public class kelondroMScoreCluster {
 // set new value
 c = scoreKey(en, ec);
 cs = new Long(c);
+ Object oldcs = refkeyDB.remove(obj); if (oldcs != null) keyrefDB.remove(oldcs); // avoid memory leak
 refkeyDB.put(obj, cs);
 keyrefDB.put(cs, obj);
@@ -174,6 +175,7 @@ public class kelondroMScoreCluster {
 // set new value
 c = scoreKey(en, ec);
 cs = new Long(c);
+ Object oldcs = refkeyDB.remove(obj); if (oldcs != null) keyrefDB.remove(oldcs); // avoid memory leak
 refkeyDB.put(obj, cs);
 keyrefDB.put(cs, obj);
diff --git a/source/de/anomic/kelondro/kelondroMSetTools.java b/source/de/anomic/kelondro/kelondroMSetTools.java
index 1e98cfa64..50ce7c04a 100644
--- a/source/de/anomic/kelondro/kelondroMSetTools.java
+++ b/source/de/anomic/kelondro/kelondroMSetTools.java
@@ -59,9 +59,11 @@ public class kelondroMSetTools {
 throw new ClassCastException();
 }
- private static int log2(int x) {
+ public static int log2a(int x) {
+ // this computes 1 + floor(log2(x)):
+ // it is the number of bits in x, not the logarithm to base 2
 int l = 0;
- while (x > 0) {x = x >> 1; l++;}
+ while (x > 0) {x = x >>> 1; l++;}
 return l;
 }
@@ -84,7 +86,7 @@ public class kelondroMSetTools {
 int high = ((map.size() > set.size()) ? map.size() : set.size());
 int low = ((map.size() > set.size()) ? set.size() : map.size());
 int stepsEnum = 10 * (high + low - 1);
- int stepsTest = 12 * log2(high) * low;
+ int stepsTest = 12 * log2a(high) * low;
 // start most efficient method
 if (stepsEnum > stepsTest) {
@@ -156,7 +158,7 @@ public class kelondroMSetTools {
 int high = ((set1.size() > set2.size()) ? set1.size() : set2.size());
 int low = ((set1.size() > set2.size()) ?
set2.size() : set1.size()); int stepsEnum = 10 * (high + low - 1); - int stepsTest = 12 * log2(high) * low; + int stepsTest = 12 * log2a(high) * low; // start most efficient method if (stepsEnum > stepsTest) { diff --git a/source/de/anomic/kelondro/kelondroRecords.java b/source/de/anomic/kelondro/kelondroRecords.java index 98d4c7118..7f4719737 100644 --- a/source/de/anomic/kelondro/kelondroRecords.java +++ b/source/de/anomic/kelondro/kelondroRecords.java @@ -744,6 +744,10 @@ public class kelondroRecords { } // additional properties + public synchronized int handles() { + return this.HANDLES.length; + } + protected void setHandle(int pos, Handle handle) throws IOException { if (pos >= HANDLES.length) throw new IllegalArgumentException("setHandle: handle array exceeded"); if (handle == null) handle = new Handle(NUL); diff --git a/source/de/anomic/kelondro/kelondroTree.java b/source/de/anomic/kelondro/kelondroTree.java index 5e53e67b7..fdb4c1719 100644 --- a/source/de/anomic/kelondro/kelondroTree.java +++ b/source/de/anomic/kelondro/kelondroTree.java @@ -119,18 +119,16 @@ public class kelondroTree extends kelondroRecords implements Comparator { super(ra, buffersize); } - private static byte abs(byte b) { - // for height computation - if (b < 0) return (byte) -b; else return b; - } - // Returns the value to which this map maps the specified key. public synchronized byte[][] get(byte[] key) throws IOException { //System.out.println("kelondroTree.get " + new String(key) + " in " + filename); Search search = new Search(key); if (search.found()) { - return search.getMatcher().getValues(); + byte[][] result = search.getMatcher().getValues(); + search = null; + return result; } else { + search = null; return null; } } @@ -306,6 +304,7 @@ public class kelondroTree extends kelondroRecords implements Comparator { // a node with this key exist. 
simply overwrite the content and return old content Node e = searchResult.getMatcher(); byte[][] result = e.setValues(newrow); + searchResult = null; return result; } else if (searchResult.isRoot()) { // a node with this key does not exist and there is no node at all @@ -320,6 +319,7 @@ public class kelondroTree extends kelondroRecords implements Comparator { e.setOHHandle(new Handle[] {null, null, null}); // {parent, leftchild, rightchild} // do updates setHandle(root, e.handle()); + searchResult = null; return null; } else { // a node with this key does not exist @@ -375,7 +375,7 @@ public class kelondroTree extends kelondroRecords implements Comparator { parentOHByte[balance]--; path = "R" + path; } - increasedHight = ((abs(parentOHByte[balance]) - abs(prevHight)) > 0); + increasedHight = ((java.lang.Math.abs((int) parentOHByte[balance]) - java.lang.Math.abs((int) prevHight)) > 0); parentNode.setOHByte(parentOHByte); // here we either stop because we had no increased hight, @@ -384,7 +384,7 @@ public class kelondroTree extends kelondroRecords implements Comparator { if (!(increasedHight)) break; // finished // check rotation need - if (abs(parentOHByte[balance]) > 1) { + if (java.lang.Math.abs((int) parentOHByte[balance]) > 1) { // rotate and stop then //System.out.println("* DB DEBUG: " + path.substring(0,2) + " ROTATION AT NODE " + parentNode.handle().toString() + ": BALANCE=" + parentOHByte[balance]); if (path.startsWith("LL")) { @@ -561,6 +561,7 @@ public class kelondroTree extends kelondroRecords implements Comparator { Node result = search.getMatcher(); byte[][] values = result.getValues(); remove(result, search.getParent()); + search = null; return values; } else { return null; @@ -722,9 +723,12 @@ public class kelondroTree extends kelondroRecords implements Comparator { try { Search s = new Search(firstKey); if (s.found()) { - return new nodeIterator(up, rotating, s.getMatcher()); + Node matcher = s.getMatcher(); + s = null; + return new nodeIterator(up, rotating, matcher); } else { Node nn = s.getParent(); + s = null; if (nn == null) { return (new HashSet()).iterator(); // an empty iterator } else { @@ -862,9 +866,12 @@ public class kelondroTree extends kelondroRecords implements Comparator { public synchronized Iterator rows(boolean up, boolean rotating, byte[] firstKey) throws IOException { Search s = new Search(firstKey); if (s.found()) { - return new rowIterator(new nodeIterator(up, rotating, s.getMatcher())); + Node matcher = s.getMatcher(); + s = null; + return new rowIterator(new nodeIterator(up, rotating, matcher)); } else { Node nn = s.getParent(); + s = null; if (nn == null) { return (Iterator) (new HashSet()).iterator(); } else { @@ -910,9 +917,12 @@ public class kelondroTree extends kelondroRecords implements Comparator { public synchronized Iterator keys(boolean up, boolean rotating, byte[] firstKey) throws IOException { Search s = new Search(firstKey); if (s.found()) { - return new keyIterator(new nodeIterator(up, rotating, s.getMatcher())); + Node matcher = s.getMatcher(); + s = null; + return new keyIterator(new nodeIterator(up, rotating, matcher)); } else { Node nn = s.getParent(); + s = null; if (nn == null) { return (Iterator) (new HashSet()).iterator(); } else { diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java index 144c71938..d5423836b 100644 --- a/source/de/anomic/plasma/plasmaCrawlNURL.java +++ b/source/de/anomic/plasma/plasmaCrawlNURL.java @@ -134,13 +134,21 @@ public class plasmaCrawlNURL extends 
plasmaURL { public void run() { Iterator i; try { + //System.out.println("init coreStack index"); i = coreStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey())); + //System.out.println("init limitStack index"); i = limitStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey())); + //System.out.println("init overhangStack index"); i = overhangStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey())); + //System.out.println("init remoteStack index"); i = remoteStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey())); + //System.out.println("init imageStack index"); i = imageStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey())); + //System.out.println("init movieStack index"); i = movieStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey())); + //System.out.println("init musicStack index"); i = musicStack.iterator(); while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey())); + //System.out.println("finished index init"); } catch (IOException e) {} } } diff --git a/source/de/anomic/plasma/plasmaCrawlWorker.java b/source/de/anomic/plasma/plasmaCrawlWorker.java index a28194e4e..14e5c1a44 100644 --- a/source/de/anomic/plasma/plasmaCrawlWorker.java +++ b/source/de/anomic/plasma/plasmaCrawlWorker.java @@ -298,12 +298,14 @@ public final class plasmaCrawlWorker extends Thread { htCache.status = plasmaHTCache.CACHE_PASSING; } // enQueue new entry with response header - if ((initiator == null) || (initiator.length() == 0)) { - // enqueued for proxy writings - cacheManager.stackProcess(htCache); - } else { - // direct processing for crawling - cacheManager.process(htCache); + if (profile != null) { + if ((initiator == null) || (initiator.length() == 0)) { + // enqueued for proxy writings + cacheManager.stackProcess(htCache); + } else { + // direct processing for crawling + cacheManager.process(htCache); + } } } catch (SocketException e) { // this may happen if the client suddenly closes its connection diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 3326d2dd1..eafb4df75 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -52,10 +52,12 @@ package de.anomic.plasma; import java.io.File; import java.io.FileNotFoundException; +import java.io.FileOutputStream; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.Date; +import java.util.Map; import java.util.LinkedList; import java.util.TreeMap; @@ -206,9 +208,9 @@ public final class plasmaHTCache { } public httpHeader getCachedResponse(String urlHash) throws IOException { - httpHeader header = new httpHeader(null, responseHeaderDB.get(urlHash)); - //System.out.println("DEBUG: getCachedResponse hash=" + urlHash + ", header=" + header.toString()); - return header; + Map hdb = responseHeaderDB.get(urlHash); + if (hdb == null) return null; + return new httpHeader(null, hdb); } public boolean idle() { @@ -245,76 +247,76 @@ public final class plasmaHTCache { } synchronized public void process(Entry entry) throws IOException { - + if (entry == null) return; - // store response header - if ((entry.status == CACHE_FILL) || - (entry.status == 
CACHE_STALE_RELOAD_GOOD) || - (entry.status == CACHE_STALE_RELOAD_BAD)) { - responseHeaderDB.set(entry.nomalizedURLHash, entry.responseHeader); - } - - // work off unwritten files and undone parsing - String storeError = null; - if (((entry.status == CACHE_FILL) || (entry.status == CACHE_STALE_RELOAD_GOOD)) && - ((storeError = entry.shallStoreCache()) == null)) { - - // write file if not written yet - if (entry.cacheArray != null) try { - if (entry.cacheFile.exists()) { - currCacheSize -= entry.cacheFile.length(); - entry.cacheFile.delete(); + // store response header + if ((entry.status == CACHE_FILL) || + (entry.status == CACHE_STALE_RELOAD_GOOD) || + (entry.status == CACHE_STALE_RELOAD_BAD)) { + responseHeaderDB.set(entry.nomalizedURLHash, entry.responseHeader); + } + + // work off unwritten files and undone parsing + String storeError = null; + if (((entry.status == CACHE_FILL) || (entry.status == CACHE_STALE_RELOAD_GOOD)) && + ((storeError = entry.shallStoreCache()) == null)) { + + // write file if not written yet + if (entry.cacheArray != null) try { + if (entry.cacheFile.exists()) { + currCacheSize -= entry.cacheFile.length(); + entry.cacheFile.delete(); + } + entry.cacheFile.getParentFile().mkdirs(); + log.logInfo("WRITE FILE (" + entry.cacheArray.length + " bytes) " + entry.cacheFile); + serverFileUtils.write(entry.cacheArray, entry.cacheFile); + log.logDebug("AFTER WRITE cacheArray = " + entry.cacheFile + ": " + ((entry.cacheArray == null) ? "empty" : "full")); + //entry.cacheArray = null; + } catch (FileNotFoundException e) { + // this is the case of a "(Not a directory)" error, which should be prohibited + // by the shallStoreCache() property. However, sometimes the error still occurs + // In this case do nothing. + log.logError("File storage failed: " + e.getMessage()); } - entry.cacheFile.getParentFile().mkdirs(); - log.logInfo("WRITE FILE (" + entry.cacheArray.length + " bytes) " + entry.cacheFile); - serverFileUtils.write(entry.cacheArray, entry.cacheFile); - log.logDebug("AFTER WRITE cacheArray = " + entry.cacheFile + ": " + ((entry.cacheArray == null) ? "empty" : "full")); - //entry.cacheArray = null; - } catch (FileNotFoundException e) { - // this is the case of a "(Not a directory)" error, which should be prohibited - // by the shallStoreCache() property. However, sometimes the error still occurs - // In this case do nothing. - log.logError("File storage failed: " + e.getMessage()); + + // update statistics + currCacheSize += entry.cacheFile.length(); + cacheAge.put(ageString(entry.cacheFile.lastModified(), entry.cacheFile), entry.cacheFile); + + // enqueue in switchboard + switchboard.enQueue(entry); + } else if (entry.status == CACHE_PASSING) { + // even if the file should not be stored in the cache, it can be used to be indexed + if (storeError != null) log.logDebug("NOT STORED " + entry.cacheFile + ":" + storeError); + + // enqueue in switchboard + switchboard.enQueue(entry); + } + + // write log + + switch (entry.status) { + case CACHE_UNFILLED: + log.logInfo("CACHE UNFILLED: " + entry.cacheFile); break; + case CACHE_FILL: + log.logInfo("CACHE FILL: " + entry.cacheFile + + ((entry.cacheArray == null) ? "" : " (cacheArray is filled)") + + ((entry.scraper == null) ? 
"" : " (scraper is filled)")); + break; + case CACHE_HIT: + log.logInfo("CACHE HIT: " + entry.cacheFile); break; + case CACHE_STALE_NO_RELOAD: + log.logInfo("CACHE STALE, NO RELOAD: " + entry.cacheFile); break; + case CACHE_STALE_RELOAD_GOOD: + log.logInfo("CACHE STALE, NECESSARY RELOAD: " + entry.cacheFile); break; + case CACHE_STALE_RELOAD_BAD: + log.logInfo("CACHE STALE, SUPERFLUOUS RELOAD: " + entry.cacheFile); break; + case CACHE_PASSING: + log.logInfo("PASSING: " + entry.cacheFile); break; + default: + log.logInfo("CACHE STATE UNKNOWN: " + entry.cacheFile); break; } - - // update statistics - currCacheSize += entry.cacheFile.length(); - cacheAge.put(ageString(entry.cacheFile.lastModified(), entry.cacheFile), entry.cacheFile); - - // enqueue in switchboard - switchboard.enQueue(entry); - } else if (entry.status == CACHE_PASSING) { - // even if the file should not be stored in the cache, it can be used to be indexed - if (storeError != null) log.logDebug("NOT STORED " + entry.cacheFile + ":" + storeError); - - // enqueue in switchboard - switchboard.enQueue(entry); - } - - // write log - - switch (entry.status) { - case CACHE_UNFILLED: - log.logInfo("CACHE UNFILLED: " + entry.cacheFile); break; - case CACHE_FILL: - log.logInfo("CACHE FILL: " + entry.cacheFile + - ((entry.cacheArray == null) ? "" : " (cacheArray is filled)") + - ((entry.scraper == null) ? "" : " (scraper is filled)")); - break; - case CACHE_HIT: - log.logInfo("CACHE HIT: " + entry.cacheFile); break; - case CACHE_STALE_NO_RELOAD: - log.logInfo("CACHE STALE, NO RELOAD: " + entry.cacheFile); break; - case CACHE_STALE_RELOAD_GOOD: - log.logInfo("CACHE STALE, NECESSARY RELOAD: " + entry.cacheFile); break; - case CACHE_STALE_RELOAD_BAD: - log.logInfo("CACHE STALE, SUPERFLUOUS RELOAD: " + entry.cacheFile); break; - case CACHE_PASSING: - log.logInfo("PASSING: " + entry.cacheFile); break; - default: - log.logInfo("CACHE STATE UNKNOWN: " + entry.cacheFile); break; - } } @@ -453,6 +455,32 @@ public final class plasmaHTCache { return null; } + public byte[] loadResource(URL url) { + // load the url as resource from the cache + File f = getCachePath(url); + if (f.exists()) try { + return serverFileUtils.read(f); + } catch (IOException e) { + return null; + } else { + return null; + } + } + + /* + public void saveResource(URL url, byte[] resource) { + File f = getCachePath(url); + f.getParentFile().mkdirs(); + FileOutputStream fos = null; + try { + fos = new FileOutputStream(f); + htCache.cacheArray = res.writeContent(fos); // writes in cacheArray and cache file + } finally { + if (fos!=null)try{fos.close();}catch(Exception e){} + } + } + */ + public static boolean isPOST(String urlString) { return ((urlString.indexOf("?") >= 0) || (urlString.indexOf("&") >= 0)); diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index a4b18928a..7b468d4e6 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -143,8 +143,8 @@ public final class plasmaParser { * @see #initMediaExt(String) */ static { - initMediaExt("swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," + - "sit,hqx,img,dmg,tar,gz,ps,xls,ppt,ram,bz2,arj"); + initMediaExt(extString2extList("swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," + + "sit,hqx,img,dmg,tar,gz,ps,xls,ppt,ram,bz2,arj")); /* =================================================== * initializing the parser object pool @@ -200,25 +200,31 @@ public final class plasmaParser { 
setEnabledParserList(mimeTypes); } - public static void initMediaExt(String mediaExtString) { + public static List extString2extList(String extString) { LinkedList extensions = new LinkedList(); - if ((mediaExtString == null) || (mediaExtString.length() == 0)) { - + if ((extString == null) || (extString.length() == 0)) { + return extensions; } else { - - String[] xs = mediaExtString.split(","); + String[] xs = extString.split(","); for (int i = 0; i < xs.length; i++) extensions.add(xs[i].toLowerCase().trim()); } - initMediaExt(extensions); + return extensions; } public static void initMediaExt(List mediaExtList) { synchronized (mediaExtSet) { mediaExtSet.clear(); - mediaExtSet.addAll(mediaExtList); - } + mediaExtSet.addAll(mediaExtList); + } } + public static void initSupportedFileExt(List supportedFileExtList) { + synchronized (mediaExtSet) { + supportedFileExt.clear(); + supportedFileExt.addAll(supportedFileExtList); + } + } + public static boolean realtimeParsableMimeTypesContains(String mimeType) { mimeType = getRealMimeType(mimeType); synchronized (realtimeParsableMimeTypes) { @@ -238,6 +244,12 @@ public final class plasmaParser { } } + public static boolean supportedFileExtContains(String mediaExt) { + if (supportedFileExt == null) return false; + //System.out.println("supported ext: " + supportedFileExt.toString()); + return (supportedFileExt.contains(mediaExt)); + } + public static boolean mediaExtContains(String mediaExt) { if (mediaExt == null) return false; @@ -316,16 +328,16 @@ public final class plasmaParser { } synchronized (enabledParserList) { - enabledParserList.clear(); - enabledParserList.putAll(newEnabledParsers); - } + //enabledParserList.clear(); + enabledParserList.putAll(newEnabledParsers); + } synchronized (supportedFileExt) { - supportedFileExt.clear(); + //supportedFileExt.clear(); supportedFileExt.addAll(newSupportedFileExt); - } - + } + return (String[])newEnabledParsers.keySet().toArray(new String[newEnabledParsers.size()]); } diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index b39487e16..708927b3f 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -114,26 +114,38 @@ public class plasmaSnippetCache { return (String) snippetsCache.get(key); } - public String retrieve(java.net.URL url, boolean fetchOnline, Set query, boolean queryAreHashes) { - if (query.size() == 0) return null; - if (!(queryAreHashes)) query = plasmaSearch.words2hashes(query); + public String retrieve(java.net.URL url, boolean fetchOnline, Set queryhashes) { + if (queryhashes.size() == 0) { + //System.out.println("found no queryhashes for url retrieve " + url); + return null; + } String urlhash = plasmaURL.urlHash(url); // try to get snippet from snippetCache - String wordhashes = yacySearch.set2string(query); + String wordhashes = yacySearch.set2string(queryhashes); String snippet = retrieve(wordhashes, urlhash); - if (snippet != null) return snippet; + if (snippet != null) { + //System.out.println("found snippet for url " + url + " in cache: " + snippet); + return snippet; + } // if the snippet is not in the cache, we can try to get it from the htcache plasmaParserDocument document = getDocument(url, fetchOnline); - if (document == null) return null; + if (document == null) { + //System.out.println("cannot load document for url " + url); + return null; + } + //System.out.println("loaded document for url " + url); String[] sentences = document.getSentences(); 
//System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]); - if ((sentences == null) || (sentences.length == 0)) return null; + if ((sentences == null) || (sentences.length == 0)) { + //System.out.println("found no sentences in url " + url); + return null; + } // we have found a parseable non-empty file: use the lines TreeMap sentencematrix = hashMatrix(sentences); - Iterator i = query.iterator(); + Iterator i = queryhashes.iterator(); String hash; kelondroMScoreCluster hitTable = new kelondroMScoreCluster(); Iterator j; @@ -151,8 +163,9 @@ public class plasmaSnippetCache { Integer maxLine = (Integer) hitTable.getMaxObject(); if (maxLine == null) return null; snippet = sentences[maxLine.intValue()]; - if (snippet.length() > 140) return null; - + //System.out.println("loaded snippet for url " + url + ": " + snippet); + if (snippet.length() > 120) snippet = snippet.substring(0, 120); + // finally store this snippet in our own cache store(wordhashes, urlhash, snippet); return snippet; @@ -175,10 +188,10 @@ public class plasmaSnippetCache { // load the url as resource from the web try { //return httpc.singleGET(url, 5000, null, null, remoteProxyHost, remoteProxyPort); - byte[] resource = getResourceFromCache(url); + byte[] resource = cacheManager.loadResource(url); if ((fetchOnline) && (resource == null)) { loadResourceFromWeb(url, 5000); - resource = getResourceFromCache(url); + resource = cacheManager.loadResource(url); } return resource; } catch (IOException e) { @@ -186,20 +199,6 @@ public class plasmaSnippetCache { } } - private byte[] getResourceFromCache(URL url) { - // load the url as resource from the cache - String path = htmlFilterContentScraper.urlNormalform(url).substring(6); - File cache = cacheManager.cachePath; - File f = new File(cache, path); - if (f.exists()) try { - return serverFileUtils.read(f); - } catch (IOException e) { - return null; - } else { - return null; - } - } - private void loadResourceFromWeb(URL url, int socketTimeout) throws IOException { plasmaCrawlWorker.load( url, @@ -221,14 +220,23 @@ public class plasmaSnippetCache { httpHeader header = null; try { header = cacheManager.getCachedResponse(plasmaURL.urlHash(url)); - } catch (IOException e) { - return null; - } - if (header == null) return null; - if (plasmaParser.supportedMimeTypesContains(header.mime())) { - return parser.parseSource(url, header.mime(), resource); + } catch (IOException e) {} + + if (header == null) { + String filename = url.getFile(); + int p = filename.lastIndexOf('.'); + if ((p < 0) || + ((p >= 0) && (plasmaParser.supportedFileExtContains(filename.substring(p + 1))))) { + return parser.parseSource(url, "text/html", resource); + } else { + return null; + } } else { - return null; + if (plasmaParser.supportedMimeTypesContains(header.mime())) { + return parser.parseSource(url, header.mime(), resource); + } else { + return null; + } } } } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 3ad0ee37f..82a36abd7 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -263,9 +263,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser this.parser = new plasmaParser(); // define an extension-blacklist - log.logSystem("Parser: Initializing Media Extensions"); - plasmaParser.initMediaExt(getConfig("mediaExt",null)); - + log.logSystem("Parser: Initializing Extension Mappings for 
Media/Parser"); + plasmaParser.initMediaExt(plasmaParser.extString2extList(getConfig("mediaExt",""))); + plasmaParser.initSupportedFileExt(plasmaParser.extString2extList(getConfig("parseableExt",""))); + // define a realtime parsable mimetype list log.logSystem("Parser: Initializing Mime Types"); plasmaParser.initRealtimeParsableMimeTypes(getConfig("parseableRealtimeMimeTypes","application/xhtml+xml,text/html,text/plain")); @@ -300,6 +301,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser cleanProfiles(); // init facility DB + /* log.logSystem("Starting Facility Database"); File facilityDBpath = new File(getRootPath(), "DATA/SETTINGS/"); facilityDB = new kelondroTables(facilityDBpath); @@ -312,7 +314,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser facilityDB.update("statistik", (new serverDate()).toShortString(false).substring(0, 11), new long[]{1,2,3,4,5,6}); long[] testresult = facilityDB.selectLong("statistik", "yyyyMMddHHm"); testresult = facilityDB.selectLong("statistik", (new serverDate()).toShortString(false).substring(0, 11)); - + */ // generate snippets cache log.logSystem("Initializing Snippet Cache"); @@ -322,17 +324,22 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // start yacy core log.logSystem("Starting YaCy Protocol Core"); + //try{Thread.currentThread().sleep(5000);} catch (InterruptedException e) {} // for profiler yacyCore yc = new yacyCore(this); + //log.logSystem("Started YaCy Protocol Core"); + //System.gc(); try{Thread.currentThread().sleep(5000);} catch (InterruptedException e) {} // for profiler serverInstantThread.oneTimeJob(yc, "loadSeeds", yc.log, 3000); // deploy threads log.logSystem("Starting Threads"); + System.gc(); // help for profiler int indexing_cluster = Integer.parseInt(getConfig("80_indexing_cluster", "1")); if (indexing_cluster < 1) indexing_cluster = 1; deployThread("90_cleanup", "Cleanup", "simple cleaning process for monitoring information" , new serverInstantThread(this, "cleanupJob", "cleanupJobSize"), 10000); // all 5 Minutes deployThread("80_indexing", "Parsing/Indexing", "thread that performes document parsing and indexing" , new serverInstantThread(this, "deQueue", "queueSize"), 10000); + for (int i = 1; i < indexing_cluster; i++) { setConfig((i + 80) + "_indexing_idlesleep", getConfig("80_indexing_idlesleep", "")); setConfig((i + 80) + "_indexing_busysleep", getConfig("80_indexing_busysleep", "")); @@ -344,7 +351,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser deployThread("62_remotetriggeredcrawl", "Remote Crawl Job", "thread that performes a single crawl/indexing step triggered by a remote peer", new serverInstantThread(this, "remoteTriggeredCrawlJob", "remoteTriggeredCrawlJobSize"), 30000); deployThread("61_globalcrawltrigger", "Global Crawl Trigger", "thread that triggeres remote peers for crawling", - new serverInstantThread(this, "limitCrawlTriggerJob", "limitCrawlTriggerJobSize"), 30000); + new serverInstantThread(this, "limitCrawlTriggerJob", "limitCrawlTriggerJobSize"), 30000); // error here? 
deployThread("50_localcrawl", "Local Crawl", "thread that performes a single crawl step from the local crawl queue", new serverInstantThread(this, "coreCrawlJob", "coreCrawlJobSize"), 10000); deployThread("40_peerseedcycle", "Seed-List Upload", "task that a principal peer performes to generate and upload a seed-list to a ftp account", @@ -357,6 +364,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // init migratiion from 0.37 -> 0.38 classicCache = new plasmaWordIndexClassicCacheMigration(plasmaPath, wordIndex); + if (classicCache.size() > 0) { setConfig("99_indexcachemigration_idlesleep" , 10000); setConfig("99_indexcachemigration_busysleep" , 40); @@ -451,7 +459,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser cacheLoader.close(); wikiDB.close(); messageDB.close(); - facilityDB.close(); + if (facilityDB != null) facilityDB.close(); urlPool.close(); profiles.close(); parser.close(); @@ -577,6 +585,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } public boolean coreCrawlJob() { + System.gc(); // debug if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) { //log.logDebug("CoreCrawl: queue is empty"); return false; @@ -1128,35 +1137,43 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser char[] order; String urlmask; long time; - public presearch(Set queryhashes, char[] order, long time /*milliseconds*/, String urlmask) { + int fetchcount; + public presearch(Set queryhashes, char[] order, long time /*milliseconds*/, String urlmask, int fetchcount) { this.queryhashes = queryhashes; this.order = order; this.urlmask = urlmask; this.time = time; + this.fetchcount = fetchcount; } public void run() { try { // search the database locally + log.logDebug("presearch: started job"); plasmaWordIndexEntity idx = searchManager.searchHashes(queryhashes, time); - plasmaSearch.result acc = searchManager.order(idx, queryhashes, stopwords, order, time, 3); + log.logDebug("presearch: found " + idx.size() + " results"); + plasmaSearch.result acc = searchManager.order(idx, queryhashes, stopwords, order, time, fetchcount); if (acc == null) return; + log.logDebug("presearch: ordered results, now " + acc.sizeOrdered() + " URLs ready for fetch"); // take some elements and fetch the snippets int i = 0; plasmaCrawlLURL.entry urlentry; - String urlstring; - while ((acc.hasMoreElements()) && (i < 3)) { + String urlstring, snippet; + while ((acc.hasMoreElements()) && (i < fetchcount)) { urlentry = acc.nextElement(); if (urlentry.url().getHost().endsWith(".yacyh")) continue; urlstring = htmlFilterContentScraper.urlNormalform(urlentry.url()); if (urlstring.matches(urlmask)) { //.* is default - snippetCache.retrieve(urlentry.url(), true, queryhashes, true); + log.logDebug("presearch: fetching URL " + urlstring); + snippet = snippetCache.retrieve(urlentry.url(), true, queryhashes); + if (snippet != null) log.logDebug("found snippet for URL " + urlstring + ": '" + snippet + "'"); i++; } } } catch (IOException e) { e.printStackTrace(); } + log.logDebug("presearch: job terminated"); } } @@ -1169,7 +1186,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (order2.equals("quality")) order[1] = plasmaSearch.O_QUALITY; else order[1] = plasmaSearch.O_AGE; // filter out words that appear in bluelist - Set queryhashes = plasmaSearch.words2hashes(querywords); Iterator it = querywords.iterator(); String word, gs = ""; while (it.hasNext()) { 
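
The presearch thread defined above warms the snippet cache: it runs the local search in the background, takes up to fetchcount of the top-ranked URLs, and calls snippetCache.retrieve() for each of them, so the snippets are already cached when the result page is rendered. retrieve(), changed earlier in this patch, selects a snippet by scoring every sentence of the parsed document against the query-word hashes and keeping the sentence with the most hits, clipped to 120 characters. A minimal sketch of that selection step, with plain JDK collections standing in for kelondroMScoreCluster and a placeholder wordHash() instead of YaCy's real word hashing:

import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

public class SnippetSelectSketch {

    // placeholder for YaCy's word hashing; any stable word -> hash mapping works for this sketch
    private static String wordHash(String word) {
        return Integer.toHexString(word.toLowerCase().hashCode());
    }

    // return the sentence containing the most query-word hashes, clipped to 120 characters,
    // or null if no sentence matches at all (retrieve() then stores nothing in the cache)
    public static String bestSnippet(String[] sentences, Set queryhashes) {
        int bestScore = 0;
        String best = null;
        for (int i = 0; i < sentences.length; i++) {
            // hash every word of the sentence once
            Set hashes = new HashSet();
            String[] words = sentences[i].split("\\W+");
            for (int j = 0; j < words.length; j++) {
                if (words[j].length() > 0) hashes.add(wordHash(words[j]));
            }
            // score = number of query hashes that occur in this sentence
            int score = 0;
            Iterator it = queryhashes.iterator();
            while (it.hasNext()) {
                if (hashes.contains(it.next())) score++;
            }
            if (score > bestScore) { bestScore = score; best = sentences[i]; }
        }
        if ((best != null) && (best.length() > 120)) best = best.substring(0, 120);
        return best;
    }

    public static void main(String[] args) {
        String[] sentences = {
            "YaCy is a peer-to-peer search engine.",
            "Snippets are short excerpts that accompany each search result."
        };
        Set queryhashes = new HashSet();
        queryhashes.add(wordHash("search"));
        queryhashes.add(wordHash("snippets"));
        System.out.println(bestSnippet(sentences, queryhashes)); // prints the second sentence
    }
}

The real code builds a TreeMap of sentence hashes once (hashMatrix) and accumulates hits in a kelondroMScoreCluster, but the selection rule is the same: the sentence with the most query hits wins.
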
@@ -1177,13 +1193,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (blueList.contains(word)) it.remove(); else gs += "+" + word; } if (gs.length() > 0) gs = gs.substring(1); + Set queryhashes = plasmaSearch.words2hashes(querywords); // log log.logInfo("INIT WORD SEARCH: " + gs + " - " + count + " links, " + (time / 1000) + " seconds"); long timestamp = System.currentTimeMillis(); - //Thread preselect = new presearch(querywords, order, time / 10, urlmask); - //preselect.start(); + Thread preselect = new presearch(queryhashes, order, time / 10, urlmask, 5); + preselect.start(); // do global fetching int globalresults = 0; @@ -1266,7 +1283,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser prop.put("results_" + i + "_urlname", urlname); prop.put("results_" + i + "_date", dateString(urlentry.moddate())); prop.put("results_" + i + "_size", Long.toString(urlentry.size())); - snippet = snippetCache.retrieve(url, false, querywords, false); + snippet = snippetCache.retrieve(url, false, queryhashes); if ((snippet == null) || (snippet.length() < 10)) { prop.put("results_" + i + "_snippet", 0); prop.put("results_" + i + "_snippet_text", ""); @@ -1343,7 +1360,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String snippet; while ((acc.hasMoreElements()) && (i < count)) { urlentry = acc.nextElement(); - snippet = snippetCache.retrieve(urlentry.url(), false, hashes, true); + snippet = snippetCache.retrieve(urlentry.url(), false, hashes); if ((snippet == null) || (snippet.length() < 10)) { resource = urlentry.toString(); } else { diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/plasma/plasmaWordIndexCache.java index 2efeb70e0..e68b01cdf 100644 --- a/source/de/anomic/plasma/plasmaWordIndexCache.java +++ b/source/de/anomic/plasma/plasmaWordIndexCache.java @@ -170,7 +170,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { if (System.currentTimeMillis() > messageTime) { System.gc(); // for better statistic wordsPerSecond = wordcount * 1000 / (1 + System.currentTimeMillis() - startTime); - log.logInfo("dumping status: " + wordcount + " words done, " + (cache.size() / wordsPerSecond) + " seconds remaining, free mem = " + (Runtime.getRuntime().freeMemory() / 1024 / 1024) + "MB"); + log.logInfo("dumping status: " + wordcount + " words done, " + (cache.size() / (wordsPerSecond + 1)) + " seconds remaining, free mem = " + (Runtime.getRuntime().freeMemory() / 1024 / 1024) + "MB"); messageTime = System.currentTimeMillis() + 5000; } } @@ -552,11 +552,9 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { flushThread.pause(); //serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem: cache.size=" + cache.size() + "; hashScore.size=" + hashScore.size()); while (cache.size() >= this.maxWords) flushFromMem(); - if ((cache.size() > 10000) && (Runtime.getRuntime().freeMemory() < 11000000)) flushFromMem(); - while ((cache.size() > 0) && (Runtime.getRuntime().freeMemory() < 1000000)) { - flushFromMem(); - System.gc(); - } + if ((cache.size() > 10000) && (Runtime.getRuntime().freeMemory() < 5000000)) flushFromMem(); + if ((cache.size() > 0) && (Runtime.getRuntime().freeMemory() < 1000000)) flushFromMem(); + //if (flushc > 0) serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem - flushed " + flushc + " entries"); // put new words into cache diff --git a/source/de/anomic/plasma/plasmaWordIndexEntity.java 
b/source/de/anomic/plasma/plasmaWordIndexEntity.java
index 15cbb6487..b29e955ca 100644
--- a/source/de/anomic/plasma/plasmaWordIndexEntity.java
+++ b/source/de/anomic/plasma/plasmaWordIndexEntity.java
@@ -79,6 +79,7 @@ public class plasmaWordIndexEntity {
 }
 private kelondroTree indexFile(File databaseRoot, String wordHash) throws IOException {
+ if (wordHash.length() < 12) throw new IOException("word hash wrong: '" + wordHash + "'");
 theLocation = wordHash2path(databaseRoot, wordHash);
 File fp = theLocation.getParentFile();
 if (fp != null) fp.mkdirs();
@@ -97,7 +98,8 @@ public class plasmaWordIndexEntity {
 public static File wordHash2path(File databaseRoot, String hash) {
 // creates a path that constructs hashing on a file system
- return new File (databaseRoot, "WORDS/" +
+
+ return new File (databaseRoot, "WORDS/" +
 hash.substring(0,1) + "/" + hash.substring(1,2) + "/" + hash.substring(2,4) + "/" + hash.substring(4,6) + "/" + hash + ".db");
 }
diff --git a/source/de/anomic/server/serverByteBuffer.java b/source/de/anomic/server/serverByteBuffer.java
index 70cd5693e..1a213aa90 100644
--- a/source/de/anomic/server/serverByteBuffer.java
+++ b/source/de/anomic/server/serverByteBuffer.java
@@ -236,23 +236,26 @@ public final class serverByteBuffer extends OutputStream {
 length = length - start;
 return this;
 }
-
+
 private serverByteBuffer trim(int start, int end) {
- if (end > length) throw new IndexOutOfBoundsException("trim: end > length");
- trim(start);
+ // the end index is exclusive: it points one position (+1) past the wanted target array
+ if (start > length) throw new IndexOutOfBoundsException("trim: start > length");
+ if (end > length) throw new IndexOutOfBoundsException("trim: end > length");
+ if (start > end) throw new IndexOutOfBoundsException("trim: start > end");
+ offset = offset + start;
 length = end - start;
 return this;
 }
 public serverByteBuffer trim() {
- int l = 0; while ((l < length) && (buffer[l] <= 32)) l++;
- int r = length; while ((r > 0) && (buffer[r - 1] <= 32)) r--;
- if ((l <= r) && (l < length)) return trim(l, r);
- return this;
+ int l = 0; while ((l < length) && (buffer[offset + l] <= 32)) l++;
+ int r = length; while ((r > 0) && (buffer[offset + r - 1] <= 32)) r--;
+ if (l > r) r = l;
+ return trim(l, r);
 }
 public String toString() {
- return new String(getBytes(), offset, length);
+ return new String(buffer, offset, length);
 }
 public Properties propParser() {
diff --git a/source/de/anomic/server/serverCore.java b/source/de/anomic/server/serverCore.java
index 601095e34..1069393d8 100644
--- a/source/de/anomic/server/serverCore.java
+++ b/source/de/anomic/server/serverCore.java
@@ -450,8 +450,8 @@ public final class serverCore extends serverAbstractThread implements serverThre
 */
 public SessionPool(SessionFactory objFactory) {
 super(objFactory);
- this.setMaxIdle(75); // Maximum idle threads.
- this.setMaxActive(150); // Maximum active threads.
+ this.setMaxIdle(50); // Maximum idle threads.
+ this.setMaxActive(100); // Maximum active threads.
 this.setMinEvictableIdleTimeMillis(30000); //Evictor runs every 30 secs.
//this.setMaxWait(1000); // Wait 1 second till a thread is available
 }
diff --git a/yacy.init b/yacy.init
index 7b4d2b46d..2859e357d 100644
--- a/yacy.init
+++ b/yacy.init
@@ -100,6 +100,7 @@ parseableMimeTypes=
 # this is important to recognize <a href> - tags as not-html reference
 # These files will be excluded from indexing _(Please keep extensions in alphabetical order)_
 mediaExt=ace,arj,asf,avi,bin,bz2,css,deb,doc,dmg,gif,gz,hqx,img,iso,jar,jpe,jpg,jpeg,mpeg,mov,mp3,mpg,ogg,png,pdf,ppt,ps,ram,rar,rm,rpm,sit,swf,sxc,sxd,sxi,sxw,tar,tgz,torrent,wmv,xcf,xls,zip
+parseableExt=html,htm,txt
 # Promotion Strings
 # These strings appear in the Web Mask of the YACY search client
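
A closing note on the kelondroMSetTools change near the top of this patch: log2a(x) is not the base-2 logarithm but the bit length of x, that is 1 + floor(log2(x)) for x > 0. It feeds the heuristic that selects a join strategy: enumerating two sorted collections in parallel costs on the order of high + low steps, while testing each element of the smaller collection against the larger one costs about low * log2(high) steps. A self-contained sketch of that decision, with the weighting constants 10 and 12 taken from the patch:

public class JoinCostSketch {

    // bit length of x: 1 + floor(log2(x)) for x > 0, and 0 for x <= 0;
    // for positive arguments the unsigned shift (>>>) behaves exactly like x >> 1
    public static int log2a(int x) {
        int l = 0;
        while (x > 0) { x = x >>> 1; l++; }
        return l;
    }

    // true:  test every element of the small collection against the large one (about low * log2(high) probes)
    // false: enumerate both sorted collections in parallel (about high + low steps)
    public static boolean preferTestJoin(int size1, int size2) {
        int high = (size1 > size2) ? size1 : size2;
        int low = (size1 > size2) ? size2 : size1;
        int stepsEnum = 10 * (high + low - 1);
        int stepsTest = 12 * log2a(high) * low;
        return stepsEnum > stepsTest;
    }

    public static void main(String[] args) {
        System.out.println(log2a(1));    // 1
        System.out.println(log2a(1024)); // 11
        System.out.println(preferTestJoin(1000000, 10)); // true: ten probes beat a million-step walk
        System.out.println(preferTestJoin(1000, 900));   // false: the parallel walk is cheaper
    }
}

The patch's switch from >> to >>> does not change the result for the positive collection sizes passed in here; for positive x the two shifts are identical.
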