From 1e6d12f1469622ad71062d6c5b2f7780108248a8 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 10 Jul 2008 00:47:37 +0000 Subject: [PATCH] Major update to BLOB data structures: - introduced a new BLOB file format: kelondroBLOBHeap. This is a flat file with an index in RAM. very similar to the eco-tables, but with flexible value sizes. It will replace the kelondroBLOBTree, which is based on a kelondroTree, a file-AVL-based index data structure. - the HTCACHE header file was replaced by the new blob heap file structure - the robots.txt file was replaced by the new blob heap file structure - the robots parser was enhanced (bugfixing for double-loading of the same robots.txt) - other BLOB-dependent data structures were prepared to use also the new BLOB heap - fixed a bug in the snippet fetch process: the file header was not written to the header index There should now be less IO during snippet fetch and during crawling git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4978 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Wiki.java | 8 +- source/de/anomic/crawler/CrawlProfile.java | 4 +- source/de/anomic/crawler/HTTPLoader.java | 1 + source/de/anomic/crawler/RobotsTxt.java | 126 +++--- source/de/anomic/crawler/robotsParser.java | 172 ++++---- source/de/anomic/data/blogBoard.java | 6 +- source/de/anomic/data/blogBoardComments.java | 2 +- source/de/anomic/data/bookmarksDB.java | 8 +- source/de/anomic/data/messageBoard.java | 4 +- source/de/anomic/data/userDB.java | 4 +- source/de/anomic/data/wikiBoard.java | 4 +- source/de/anomic/kelondro/kelondroBLOB.java | 35 +- .../de/anomic/kelondro/kelondroBLOBHeap.java | 412 ++++++++++++++++++ .../de/anomic/kelondro/kelondroBLOBTree.java | 65 ++- .../anomic/kelondro/kelondroBytesLongMap.java | 2 +- source/de/anomic/kelondro/kelondroHeap.java | 191 -------- .../kelondro/kelondroMScoreCluster.java | 3 +- .../anomic/kelondro/kelondroMapObjects.java | 40 +- .../de/anomic/kelondro/kelondroObjects.java | 39 +- source/de/anomic/plasma/plasmaHTCache.java | 25 +- .../de/anomic/plasma/plasmaSwitchboard.java | 8 +- 21 files changed, 721 insertions(+), 438 deletions(-) create mode 100755 source/de/anomic/kelondro/kelondroBLOBHeap.java delete mode 100755 source/de/anomic/kelondro/kelondroHeap.java diff --git a/htroot/Wiki.java b/htroot/Wiki.java index 807dc2cff..29def50c7 100644 --- a/htroot/Wiki.java +++ b/htroot/Wiki.java @@ -173,11 +173,11 @@ public class Wiki { prop.put("mode", "3"); //Index String subject; try { - Iterator i = sb.wikiDB.keys(true); + Iterator i = sb.wikiDB.keys(true); wikiBoard.entry entry; int count=0; while (i.hasNext()) { - subject = i.next(); + subject = new String(i.next()); entry = sb.wikiDB.read(subject); prop.putHTML("mode_pages_"+count+"_name",wikiBoard.webalize(subject)); prop.putHTML("mode_pages_"+count+"_subject", subject); @@ -200,14 +200,14 @@ public class Wiki { prop.putHTML("mode_error_page", pagename); try { - Iterator it = sb.wikiDB.keysBkp(true); + Iterator it = sb.wikiDB.keysBkp(true); wikiBoard.entry entry; wikiBoard.entry oentry = null; wikiBoard.entry nentry = null; int count = 0; boolean oldselected = false, newselected = false; while (it.hasNext()) { - entry = sb.wikiDB.readBkp(it.next()); + entry = sb.wikiDB.readBkp(new String(it.next())); prop.put("mode_error_versions_" + count + "_date", wikiBoard.dateString(entry.date())); prop.put("mode_error_versions_" + count + "_fdate", dateString(entry.date())); if (wikiBoard.dateString(entry.date()).equals(post.get("old", null))) { diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index e24ee1a4e..9878fdf4b 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ b/source/de/anomic/crawler/CrawlProfile.java @@ -101,7 +101,7 @@ public class CrawlProfile { public class profileIterator implements Iterator { // the iterator iterates all keys, which are byte[] objects - kelondroCloneableIterator handleIterator; + kelondroCloneableIterator handleIterator; String lastkey; public profileIterator(boolean up) throws IOException { handleIterator = profileTable.keys(up, false); @@ -117,7 +117,7 @@ public class CrawlProfile { } public entry next() { try { - lastkey = handleIterator.next(); + lastkey = new String(handleIterator.next()); return getEntry(lastkey); } catch (kelondroException e) { clear(); diff --git a/source/de/anomic/crawler/HTTPLoader.java b/source/de/anomic/crawler/HTTPLoader.java index 01bbac639..903d44324 100644 --- a/source/de/anomic/crawler/HTTPLoader.java +++ b/source/de/anomic/crawler/HTTPLoader.java @@ -240,6 +240,7 @@ public final class HTTPLoader { fos.write(responseBody); htCache.setCacheArray(responseBody); plasmaHTCache.writeFileAnnouncement(cacheFile); + //htCache.writeResourceInfo(); // write header to header BLOB-database } finally { if (fos!=null)try{fos.close();}catch(Exception e){/* ignore this */} } diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java index d15bb8e4b..cbf815514 100644 --- a/source/de/anomic/crawler/RobotsTxt.java +++ b/source/de/anomic/crawler/RobotsTxt.java @@ -43,7 +43,6 @@ //the intact and unchanged copyright notice. //Contributions and changes to the program code must be marked as such. - package de.anomic.crawler; import java.io.BufferedInputStream; @@ -62,6 +61,8 @@ import de.anomic.http.HttpClient; import de.anomic.http.JakartaCommonsHttpClient; import de.anomic.http.JakartaCommonsHttpResponse; import de.anomic.http.httpHeader; +import de.anomic.kelondro.kelondroBLOB; +import de.anomic.kelondro.kelondroBLOBHeap; import de.anomic.kelondro.kelondroBLOBTree; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroMapObjects; @@ -81,7 +82,17 @@ public class RobotsTxt { public RobotsTxt(File robotsTableFile) { this.robotsTableFile = robotsTableFile; robotsTableFile.getParentFile().mkdirs(); - robotsTable = new kelondroMapObjects(new kelondroBLOBTree(robotsTableFile, true, true, 256, 512, '_', kelondroNaturalOrder.naturalOrder, false, false, true), 100); + kelondroBLOB blob = null; + if (robotsTableFile.getName().endsWith(".heap")) { + try { + blob = new kelondroBLOBHeap(robotsTableFile, 64, kelondroNaturalOrder.naturalOrder); + } catch (IOException e) { + e.printStackTrace(); + } + } else { + blob = new kelondroBLOBTree(robotsTableFile, true, true, 256, 512, '_', kelondroNaturalOrder.naturalOrder, false, false, true); + } + robotsTable = new kelondroMapObjects(blob, 100); } private void resetDatabase() { @@ -352,24 +363,27 @@ public class RobotsTxt { return crawlDelay; } + //private static final HashSet loadedRobots = new HashSet(); // only for debugging + @SuppressWarnings("unchecked") public boolean isDisallowed(yacyURL nexturl) { if (nexturl == null) throw new IllegalArgumentException(); // generating the hostname:poart string needed to do a DB lookup String urlHostPort = getHostPort(nexturl); - - // do a DB lookup to determine if the robots data is already available - RobotsTxt.Entry robotsTxt4Host = getEntry(urlHostPort); - - // if we have not found any data or the data is older than 7 days, we need to load it from the remote server - if ( - (robotsTxt4Host == null) || - (robotsTxt4Host.getLoadedDate() == null) || - (System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() > 7*24*60*60*1000) - ) { - synchronized(this) { - + RobotsTxt.Entry robotsTxt4Host = null; + synchronized(this) { + + // do a DB lookup to determine if the robots data is already available + robotsTxt4Host = getEntry(urlHostPort); + + // if we have not found any data or the data is older than 7 days, we need to load it from the remote server + if ( + (robotsTxt4Host == null) || + (robotsTxt4Host.getLoadedDate() == null) || + (System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() > 7*24*60*60*1000) + ) { + // generating the proper url to download the robots txt yacyURL robotsURL = null; try { @@ -380,56 +394,60 @@ public class RobotsTxt { } Object[] result = null; - boolean accessCompletelyRestricted = false; - byte[] robotsTxt = null; - String eTag = null; - Date modDate = null; - try { - serverLog.logFine("ROBOTS","Trying to download the robots.txt file from URL '" + robotsURL + "'."); - result = downloadRobotsTxt(robotsURL,5,robotsTxt4Host); - - if (result != null) { - accessCompletelyRestricted = ((Boolean)result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue(); - robotsTxt = (byte[])result[DOWNLOAD_ROBOTS_TXT]; - eTag = (String) result[DOWNLOAD_ETAG]; - modDate = (Date) result[DOWNLOAD_MODDATE]; - } else if (robotsTxt4Host != null) { - robotsTxt4Host.setLoadedDate(new Date()); - addEntry(robotsTxt4Host); - } + serverLog.logFine("ROBOTS","Trying to download the robots.txt file from URL '" + robotsURL + "'."); + try { + result = downloadRobotsTxt(robotsURL, 5, robotsTxt4Host); } catch (Exception e) { - serverLog.logSevere("ROBOTS","Unable to download the robots.txt file from URL '" + robotsURL + "'. " + e.getMessage()); + result = null; } + /* + assert !loadedRobots.contains(robotsURL.toNormalform(false, false)) : + "robots-url=" + robotsURL.toString() + + ", robots=" + ((result == null || result[DOWNLOAD_ROBOTS_TXT] == null) ? "NULL" : new String((byte[]) result[DOWNLOAD_ROBOTS_TXT])) + + ", robotsTxt4Host=" + ((robotsTxt4Host == null) ? "NULL" : robotsTxt4Host.getLoadedDate().toString()); + loadedRobots.add(robotsURL.toNormalform(false, false)); + */ - if ((robotsTxt4Host==null)||((robotsTxt4Host!=null)&&(result!=null))) { - ArrayList denyPath = null; - String sitemap = null; - Integer crawlDelay = null; - if (accessCompletelyRestricted) { + if (result == null) { + // no robots.txt available, make an entry to prevent that the robots loading is done twice + if (robotsTxt4Host == null) { + // generate artificial entry + robotsTxt4Host = new Entry( + urlHostPort, + new ArrayList(), + new Date(), + new Date(), + null, + null, + new Integer(0)); + } else { + robotsTxt4Host.setLoadedDate(new Date()); + } + + // store the data into the robots DB + addEntry(robotsTxt4Host); + } else { + Object[] parserResult = robotsParser.parse((byte[]) result[DOWNLOAD_ROBOTS_TXT]); + ArrayList denyPath = (ArrayList) parserResult[0]; + if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) { denyPath = new ArrayList(); denyPath.add("/"); - } else { - // parsing the robots.txt Data and converting it into an arraylist - try { - Object[] parserResult = robotsParser.parse(robotsTxt); - denyPath = (ArrayList) parserResult[0]; - sitemap = (String) parserResult[1]; - crawlDelay = (Integer) parserResult[2]; - } catch (IOException e) { - serverLog.logSevere("ROBOTS","Unable to parse the robots.txt file from URL '" + robotsURL + "'."); - } - } + } - // storing the data into the robots DB - robotsTxt4Host = addEntry(urlHostPort,denyPath,new Date(),modDate,eTag,sitemap,crawlDelay); + // store the data into the robots DB + robotsTxt4Host = addEntry( + urlHostPort, + denyPath, + new Date(), + (Date) result[DOWNLOAD_MODDATE], + (String) result[DOWNLOAD_ETAG], + (String) parserResult[1], + (Integer) parserResult[2]); } } } - if (robotsTxt4Host != null && robotsTxt4Host.isDisallowed(nexturl.getFile())) { - return true; - } - return false; + return robotsTxt4Host.isDisallowed(nexturl.getFile()); } private static Object[] downloadRobotsTxt(yacyURL robotsURL, int redirectionCount, RobotsTxt.Entry entry) throws Exception { diff --git a/source/de/anomic/crawler/robotsParser.java b/source/de/anomic/crawler/robotsParser.java index a727cfaee..fbe0951e6 100644 --- a/source/de/anomic/crawler/robotsParser.java +++ b/source/de/anomic/crawler/robotsParser.java @@ -46,6 +46,7 @@ package de.anomic.crawler; import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.File; +import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.InputStreamReader; @@ -85,25 +86,26 @@ public final class robotsParser{ * at the Moment it only creates a list of Deny Paths */ - public static Object[] parse(File robotsFile) throws IOException { + public static Object[] parse(File robotsFile) { BufferedReader reader = null; try { reader = new BufferedReader(new FileReader(robotsFile)); - return parse(reader); - } finally { if (reader != null) try{reader.close();}catch(Exception e){/* ignore this */} + return parse(reader); + } catch (FileNotFoundException e1) { } + return new Object[]{new ArrayList(), "", new Integer(0)}; } @SuppressWarnings("unchecked") - public static Object[] parse(byte[] robotsTxt) throws IOException { + public static Object[] parse(byte[] robotsTxt) { if ((robotsTxt == null)||(robotsTxt.length == 0)) return new Object[]{new ArrayList(0),null,null}; ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt); BufferedReader reader = new BufferedReader(new InputStreamReader(bin)); return parse(reader); } - public static Object[] parse(BufferedReader reader) throws IOException{ + public static Object[] parse(BufferedReader reader) { ArrayList deny4AllAgents = new ArrayList(); ArrayList deny4YaCyAgent = new ArrayList(); @@ -115,102 +117,104 @@ public final class robotsParser{ rule4YaCyFound = false, inBlock = false; - while ((line = reader.readLine()) != null) { - line = line.trim(); - lineUpper = line.toUpperCase(); - - if (line.length() == 0) { - // OLD: we have reached the end of the rule block - // rule4Yacy = false; inBlock = false; - - // NEW: just ignore it - } else if (line.startsWith(ROBOTS_COMMENT)) { - // we can ignore this. Just a comment line - } else if (lineUpper.startsWith(ROBOTS_SITEMAP)) { - pos = line.indexOf(" "); - if (pos != -1) { - sitemap = line.substring(pos).trim(); - } - } else if (lineUpper.startsWith(ROBOTS_USER_AGENT)) { - - if (inBlock) { - // we have detected the start of a new block - inBlock = false; - isRuleBlock4AllAgents = false; - isRuleBlock4YaCyAgent = false; - crawlDelay = null; // each block has a separate delay - } - - // cutting off comments at the line end - pos = line.indexOf(ROBOTS_COMMENT); - if (pos != -1) line = line.substring(0,pos).trim(); - - // replacing all tabs with spaces - line = line.replaceAll("\t"," "); - - // getting out the robots name - pos = line.indexOf(" "); - if (pos != -1) { - String userAgent = line.substring(pos).trim(); - isRuleBlock4AllAgents |= userAgent.equals("*"); - isRuleBlock4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0; - if (isRuleBlock4YaCyAgent) rule4YaCyFound = true; - } - } else if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) { - pos = line.indexOf(" "); - if (pos != -1) { - try { - crawlDelay = Integer.valueOf(line.substring(pos).trim()); - } catch (NumberFormatException e) { - // invalid crawling delay - } - } - } else if (lineUpper.startsWith(ROBOTS_DISALLOW) || - lineUpper.startsWith(ROBOTS_ALLOW)) { - inBlock = true; - boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW); + try { + while ((line = reader.readLine()) != null) { + line = line.trim(); + lineUpper = line.toUpperCase(); - if (isRuleBlock4YaCyAgent || isRuleBlock4AllAgents) { + if (line.length() == 0) { + // OLD: we have reached the end of the rule block + // rule4Yacy = false; inBlock = false; + + // NEW: just ignore it + } else if (line.startsWith(ROBOTS_COMMENT)) { + // we can ignore this. Just a comment line + } else if (lineUpper.startsWith(ROBOTS_SITEMAP)) { + pos = line.indexOf(" "); + if (pos != -1) { + sitemap = line.substring(pos).trim(); + } + } else if (lineUpper.startsWith(ROBOTS_USER_AGENT)) { + + if (inBlock) { + // we have detected the start of a new block + inBlock = false; + isRuleBlock4AllAgents = false; + isRuleBlock4YaCyAgent = false; + crawlDelay = null; // each block has a separate delay + } + // cutting off comments at the line end pos = line.indexOf(ROBOTS_COMMENT); if (pos != -1) line = line.substring(0,pos).trim(); - - // cutting of tailing * - if (line.endsWith("*")) line = line.substring(0,line.length()-1); // replacing all tabs with spaces line = line.replaceAll("\t"," "); - // getting the path + // getting out the robots name pos = line.indexOf(" "); if (pos != -1) { - // getting the path - String path = line.substring(pos).trim(); - - // unencoding all special charsx - try { - path = URLDecoder.decode(path,"UTF-8"); - } catch (Exception e) { - /* - * url decoding failed. E.g. because of - * "Incomplete trailing escape (%) pattern" - */ - } + String userAgent = line.substring(pos).trim(); + isRuleBlock4AllAgents |= userAgent.equals("*"); + isRuleBlock4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0; + if (isRuleBlock4YaCyAgent) rule4YaCyFound = true; + } + } else if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) { + pos = line.indexOf(" "); + if (pos != -1) { + try { + crawlDelay = Integer.valueOf(line.substring(pos).trim()); + } catch (NumberFormatException e) { + // invalid crawling delay + } + } + } else if (lineUpper.startsWith(ROBOTS_DISALLOW) || + lineUpper.startsWith(ROBOTS_ALLOW)) { + inBlock = true; + boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW); + + if (isRuleBlock4YaCyAgent || isRuleBlock4AllAgents) { + // cutting off comments at the line end + pos = line.indexOf(ROBOTS_COMMENT); + if (pos != -1) line = line.substring(0,pos).trim(); + + // cutting of tailing * + if (line.endsWith("*")) line = line.substring(0,line.length()-1); - // escaping all occurences of ; because this char is used as special char in the Robots DB - path = path.replaceAll(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR,"%3B"); + // replacing all tabs with spaces + line = line.replaceAll("\t"," "); - // adding it to the pathlist - if (!isDisallowRule) path = "!" + path; - if (isRuleBlock4AllAgents) deny4AllAgents.add(path); - if (isRuleBlock4YaCyAgent) deny4YaCyAgent.add(path); + // getting the path + pos = line.indexOf(" "); + if (pos != -1) { + // getting the path + String path = line.substring(pos).trim(); + + // unencoding all special charsx + try { + path = URLDecoder.decode(path,"UTF-8"); + } catch (Exception e) { + /* + * url decoding failed. E.g. because of + * "Incomplete trailing escape (%) pattern" + */ + } + + // escaping all occurences of ; because this char is used as special char in the Robots DB + path = path.replaceAll(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR,"%3B"); + + // adding it to the pathlist + if (!isDisallowRule) path = "!" + path; + if (isRuleBlock4AllAgents) deny4AllAgents.add(path); + if (isRuleBlock4YaCyAgent) deny4YaCyAgent.add(path); + } } } } - } + } catch (IOException e) {} ArrayList denyList = (rule4YaCyFound) ? deny4YaCyAgent : deny4AllAgents; - return new Object[]{denyList,sitemap,crawlDelay}; + return new Object[]{denyList, sitemap, crawlDelay}; } } diff --git a/source/de/anomic/data/blogBoard.java b/source/de/anomic/data/blogBoard.java index 46e1c9257..5643e5cd6 100644 --- a/source/de/anomic/data/blogBoard.java +++ b/source/de/anomic/data/blogBoard.java @@ -227,7 +227,7 @@ public class blogBoard { database.remove(key); } catch (IOException e) { } } - public Iterator keys(boolean up) throws IOException { + public Iterator keys(boolean up) throws IOException { return database.keys(up, false); } /** @@ -282,7 +282,7 @@ public class blogBoard { * Subclass of blogBoard, which provides the blogIterator object-type */ public class BlogIterator implements Iterator { - Iterator blogIter; + Iterator blogIter; blogBoard.BlogEntry nextEntry; public BlogIterator(boolean up) throws IOException { this.blogIter = blogBoard.this.database.keys(up, false); @@ -300,7 +300,7 @@ public class blogBoard { public BlogEntry next() { try { - return readBlogEntry(this.blogIter.next()); + return readBlogEntry(new String(this.blogIter.next())); } catch (kelondroException e) { //resetDatabase(); return null; diff --git a/source/de/anomic/data/blogBoardComments.java b/source/de/anomic/data/blogBoardComments.java index e51902a5c..d7c97e479 100644 --- a/source/de/anomic/data/blogBoardComments.java +++ b/source/de/anomic/data/blogBoardComments.java @@ -218,7 +218,7 @@ public class blogBoardComments { database.remove(key); } catch (IOException e) { } } - public Iterator keys(boolean up) throws IOException { + public Iterator keys(boolean up) throws IOException { return database.keys(up, false); } diff --git a/source/de/anomic/data/bookmarksDB.java b/source/de/anomic/data/bookmarksDB.java index fc9106ea2..bfc7a5d07 100644 --- a/source/de/anomic/data/bookmarksDB.java +++ b/source/de/anomic/data/bookmarksDB.java @@ -1108,7 +1108,7 @@ public class bookmarksDB { * Subclass of bookmarksDB, which provides the tagIterator object-type */ public class tagIterator implements Iterator { - kelondroCloneableIterator tagIter; + kelondroCloneableIterator tagIter; bookmarksDB.Tag nextEntry; public tagIterator(boolean up) throws IOException { @@ -1128,7 +1128,7 @@ public class bookmarksDB { public Tag next() { try { - return getTag(this.tagIter.next()); + return getTag(new String(this.tagIter.next())); } catch (kelondroException e) { //resetDatabase(); return null; @@ -1151,7 +1151,7 @@ public class bookmarksDB { * Subclass of bookmarksDB, which provides the bookmarkIterator object-type */ public class bookmarkIterator implements Iterator { - Iterator bookmarkIter; + Iterator bookmarkIter; bookmarksDB.Bookmark nextEntry; public bookmarkIterator(boolean up) throws IOException { //flushBookmarkCache(); //XXX: this will cost performance @@ -1170,7 +1170,7 @@ public class bookmarksDB { public Bookmark next() { try { - return getBookmark(this.bookmarkIter.next()); + return getBookmark(new String(this.bookmarkIter.next())); } catch (kelondroException e) { //resetDatabase(); return null; diff --git a/source/de/anomic/data/messageBoard.java b/source/de/anomic/data/messageBoard.java index 3d9d796d9..c4a9b6097 100644 --- a/source/de/anomic/data/messageBoard.java +++ b/source/de/anomic/data/messageBoard.java @@ -234,7 +234,7 @@ public class messageBoard { public class catIter implements Iterator { - Iterator allIter = null; + Iterator allIter = null; String nextKey = null; String category = ""; @@ -246,7 +246,7 @@ public class messageBoard { public void findNext() { while (allIter.hasNext()) { - nextKey = allIter.next(); + nextKey = new String(allIter.next()); if (this.category==null || nextKey.startsWith(this.category)) return; } nextKey = null; diff --git a/source/de/anomic/data/userDB.java b/source/de/anomic/data/userDB.java index aa3eeb67f..c81da4fde 100644 --- a/source/de/anomic/data/userDB.java +++ b/source/de/anomic/data/userDB.java @@ -588,7 +588,7 @@ public final class userDB { public class userIterator implements Iterator { // the iterator iterates all userNames - kelondroCloneableIterator userIter; + kelondroCloneableIterator userIter; userDB.Entry nextEntry; public userIterator(boolean up) throws IOException { @@ -605,7 +605,7 @@ public final class userDB { } public Entry next() { try { - return getEntry(this.userIter.next()); + return getEntry(new String(this.userIter.next())); } catch (kelondroException e) { resetDatabase(); return null; diff --git a/source/de/anomic/data/wikiBoard.java b/source/de/anomic/data/wikiBoard.java index e2b20b3ea..5c406c498 100644 --- a/source/de/anomic/data/wikiBoard.java +++ b/source/de/anomic/data/wikiBoard.java @@ -316,11 +316,11 @@ public class wikiBoard { } */ - public Iterator keys(boolean up) throws IOException { + public Iterator keys(boolean up) throws IOException { return datbase.keys(up, false); } - public Iterator keysBkp(boolean up) throws IOException { + public Iterator keysBkp(boolean up) throws IOException { return bkpbase.keys(up, false); } } diff --git a/source/de/anomic/kelondro/kelondroBLOB.java b/source/de/anomic/kelondro/kelondroBLOB.java index ba7c831c1..9ec431e9c 100644 --- a/source/de/anomic/kelondro/kelondroBLOB.java +++ b/source/de/anomic/kelondro/kelondroBLOB.java @@ -28,8 +28,6 @@ package de.anomic.kelondro; import java.io.IOException; -import de.anomic.kelondro.kelondroBLOBTree.keyIterator; - public interface kelondroBLOB { /** @@ -57,7 +55,7 @@ public interface kelondroBLOB { * @return * @throws IOException */ - public kelondroCloneableIterator keys(boolean up, boolean rotating) throws IOException; + public kelondroCloneableIterator keys(boolean up, boolean rotating) throws IOException; /** * iterate over all keys @@ -66,7 +64,7 @@ public interface kelondroBLOB { * @return * @throws IOException */ - public keyIterator keys(boolean up, byte[] firstKey) throws IOException; + public kelondroCloneableIterator keys(boolean up, byte[] firstKey) throws IOException; /** * check if a specific key is in the database @@ -74,7 +72,7 @@ public interface kelondroBLOB { * @return * @throws IOException */ - public boolean has(String key) throws IOException; + public boolean has(byte[] key) throws IOException; /** * retrieve the whole BLOB from the table @@ -82,17 +80,7 @@ public interface kelondroBLOB { * @return * @throws IOException */ - public byte[] get(String key) throws IOException; - - /** - * retrieve a fragment of a BLOB from the table - * @param key the primary key - * @param pos the position within the BLOB fragment - * @param len the length of the fragment - * @return - * @throws IOException - */ - public byte[] get(String key, int pos, int len) throws IOException; + public byte[] get(byte[] key) throws IOException; /** * write a whole byte array as BLOB to the table @@ -100,25 +88,14 @@ public interface kelondroBLOB { * @param b * @throws IOException */ - public void put(String key, byte[] b) throws IOException; - - /** - * write a fragment of a BLOB to the table - * @param key the primary key - * @param pos the position of the BLOB fragment - * @param b a byte array - * @param off the offset within the array where the BLOB fragment starts - * @param len the length of the fragment - * @throws IOException - */ - public void put(String key, int pos, byte[] b, int off, int len) throws IOException; + public void put(byte[] key, byte[] b) throws IOException; /** * remove a BLOB * @param key the primary key * @throws IOException */ - public void remove(String key) throws IOException; + public void remove(byte[] key) throws IOException; /** * close the BLOB table diff --git a/source/de/anomic/kelondro/kelondroBLOBHeap.java b/source/de/anomic/kelondro/kelondroBLOBHeap.java new file mode 100755 index 000000000..9c58f73be --- /dev/null +++ b/source/de/anomic/kelondro/kelondroBLOBHeap.java @@ -0,0 +1,412 @@ +// kelondroBLOBHeap.java +// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 09.07.2008 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2008-03-14 01:16:04 +0100 (Fr, 14 Mrz 2008) $ +// $LastChangedRevision: 4558 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.kelondro; + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.util.ArrayList; +import java.util.Iterator; + +import de.anomic.server.serverMemory; +import de.anomic.server.logging.serverLog; + +public final class kelondroBLOBHeap implements kelondroBLOB { + + private kelondroBytesLongMap index; // key/seek relation for used records + private ArrayList free; // list of {size, seek} pairs denoting space and position of free records + private File heapFile; // the file of the heap + private kelondroByteOrder ordering; // the ordering on keys + private RandomAccessFile file; // a random access to the file + + /* + * This class implements a BLOB management based on a sequence of records in a random access file + * The data structure is: + * file :== record* + * record :== reclen key blob + * reclen :== <4 byte integer == length of key and blob> + * key :== + * blob :== + * that means that each record has the size reclen+4 + * + * The elements are organized in two data structures: + * index : key/seek relation for used records + * free> : list of {size, seek} pairs denoting space and position of free records + * + * Because the blob sizes are stored with integers, one entry may not exceed 2GB + * + * If a record is removed, it becomes a free record. + * New records are either appended to the end of the file or filled into a free record. + * A free record must either fit exactly to the size of the new record, or an old record is splitted + * into a filled and a new, smaller empty record. + */ + + /** + * create a heap file: a arbitrary number of BLOBs, indexed by an access key + * The heap file will be indexed upon initialization. + * @param heapFile + * @param keylength + * @param ordering + * @throws IOException + */ + public kelondroBLOBHeap(File heapFile, int keylength, kelondroByteOrder ordering) throws IOException { + this.ordering = ordering; + this.heapFile = heapFile; + + this.index = new kelondroBytesLongMap(keylength, this.ordering, 0); + this.free = new ArrayList(); + this.file = new RandomAccessFile(heapFile, "rw"); + byte[] key = new byte[keylength]; + int reclen; + long seek = 0; + + loop: while (true) { // don't test available() here because this does not work for files > 2GB + + try { + // go to seek position + file.seek(seek); + + // read length of the following record without the length of the record size bytes + reclen = file.readInt(); + + // read key + file.readFully(key); + + } catch (IOException e) { + // EOF reached + break loop; // terminate loop + } + + // check if this record is empty + if (key == null || key[0] == 0) { + // it is an empty record, store to free list + free.add(new Long[]{new Long(seek), new Long(reclen)}); + } else { + // store key and access address of entry in index + try { + if (this.ordering.wellformed(key)) { + index.addl(key, seek); + } else { + serverLog.logWarning("kelondroBLOBHeap", "BLOB " + heapFile.getName() + ": skiped not wellformed key " + new String(key) + " at seek pos " + seek); + } + } catch (IOException e) { + e.printStackTrace(); + break loop; + } + } + // new seek position + seek += 4L + reclen; + } + + // DEBUG + /* + Iterator i = index.keys(true, null); + byte[] b; + int c = 0; + while (i.hasNext()) { + key = i.next(); + System.out.println("KEY=" + new String(key)); + b = get(key); + System.out.println("BLOB=" + new String(b)); + System.out.println(); + c++; + } + System.out.println("*** DEBUG - counted " + c + " BLOBs"); + */ + } + + /** + * the number of BLOBs in the heap + * @return the number of BLOBs in the heap + */ + public int size() { + return this.index.size(); + } + + /** + * test if a key is in the heap file. This does not need any IO, because it uses only the ram index + * @param key + * @return true if the key exists, false othervise + */ + public boolean has(byte[] key) { + assert index != null; + assert index.row().primaryKeyLength == key.length; + + // check if the index contains the key + try { + return index.getl(key) >= 0; + } catch (IOException e) { + e.printStackTrace(); + return false; + } + } + + /** + * add a BLOB to the heap: this adds the blob always to the end of the file + * @param key + * @param blob + * @throws IOException + */ + private void add(byte[] key, byte[] blob) throws IOException { + add(key, blob, 0, blob.length); + } + + /** + * add a BLOB to the heap: this adds the blob always to the end of the file + * @param key + * @param blob + * @throws IOException + */ + private void add(byte[] key, byte[] blob, int offset, int len) throws IOException { + assert len > 0; + assert index.row().primaryKeyLength == key.length; + assert blob == null || blob.length - offset >= len; + if ((blob == null) || (blob.length == 0)) return; + int pos = (int) file.length(); + file.seek(file.length()); + file.writeInt(len + key.length); + file.write(key); + file.write(blob, offset, len); + index.putl(key, pos); + } + + /** + * read a blob from the heap + * @param key + * @return + * @throws IOException + */ + public synchronized byte[] get(byte[] key) throws IOException { + assert index.row().primaryKeyLength == key.length; + + // check if the index contains the key + long pos = index.getl(key); + if (pos < 0) return null; + + // access the file and read the container + file.seek(pos); + int len = file.readInt() - index.row().primaryKeyLength; + if (serverMemory.available() < len) { + if (!serverMemory.request(len, false)) return null; // not enough memory available for this blob + } + byte[] blob = new byte[len]; + + // read the key + byte[] keyf = new byte[index.row().primaryKeyLength]; + file.readFully(keyf); + assert this.ordering.compare(key, keyf) == 0; + + // read the blob + file.readFully(blob); + + return blob; + } + + /** + * clears the content of the database + * @throws IOException + */ + public synchronized void clear() throws IOException { + index.clear(); + free.clear(); + try { + file.close(); + } catch (IOException e) { + e.printStackTrace(); + } + this.heapFile.delete(); + this.file = new RandomAccessFile(heapFile, "rw"); + } + + /** + * close the BLOB table + */ + public synchronized void close() { + index.close(); + free.clear(); + try { + file.close(); + } catch (IOException e) { + e.printStackTrace(); + } + index = null; + free = null; + file = null; + } + + /** + * ask for the length of the primary key + * @return the length of the key + */ + public int keylength() { + return this.index.row().primaryKeyLength; + } + + /** + * write a whole byte array as BLOB to the table + * @param key the primary key + * @param b + * @throws IOException + */ + public synchronized void put(byte[] key, byte[] b) throws IOException { + assert key.length == index.row().primaryKeyLength; + + // first remove the old entry + this.remove(key); + + // then look if we can use a free entry + if (this.free.size() > 0) { + // find the largest entry + long lseek = -1; + int lsize = 0; + int reclen = b.length + index.row().primaryKeyLength; + Long[] entry; + Iterator i = this.free.iterator(); + while (i.hasNext()) { + entry = i.next(); + if (entry[0].longValue() == (long) reclen) { + // we found an entry that has exactly the size that we need! + // we use that entry and stop looking for a larger entry + file.seek(entry[1].longValue()); + int reclenf = file.readInt(); + assert reclenf == reclen; + file.write(key); + file.write(b); + + // remove the entry from the free list + i.remove(); + + // add the entry to the index + this.index.putl(key, entry[1].longValue()); + + System.out.println("*** DEBUG BLOB: replaced-fit record at " + entry[1].longValue() + ", reclen=" + reclen + ", key=" + new String(key)); + + // finished! + return; + } + // look for the biggest size + if (entry[0].longValue() > lsize) { + lsize = (int) entry[0].longValue(); + lseek = entry[1].longValue(); + } + } + + // check if the found entry is large enough + if (lsize > reclen + 4) { + // split the free entry into two new entries + // if would be sufficient if lsize = reclen + 4, but this would mean to create + // an empty entry with zero next bytes for BLOB and key, which is not very good for the + // data structure in the file + + // write the new entry + file.seek(lseek); + file.writeInt(reclen); + file.write(key); + file.write(b); + + // add the index to the new entry + index.putl(key, lseek); + + // define the new empty entry + int newfreereclen = lsize - reclen - 4; + assert newfreereclen > 0; + file.writeInt(newfreereclen); + + // remove the old free entry + i = this.free.iterator(); + while (i.hasNext()) { + entry = i.next(); + if (entry[0].longValue() == (long) lsize && entry[1].longValue() == lseek) { + // remove the entry from the free list + i.remove(); + break; + } + } + + // add a new free entry + free.add(new Long[]{new Long(newfreereclen), new Long(lseek + 4 + reclen)}); + + System.out.println("*** DEBUG BLOB: replaced-split record at " + lseek + ", reclen=" + reclen + ", new reclen=" + newfreereclen + ", key=" + new String(key)); + + // finished! + return; + } + } + + // if there is no free entry or no free entry is large enough, append the entry at the end of the file + this.add(key, b); + } + + /** + * remove a BLOB + * @param key the primary key + * @throws IOException + */ + public synchronized void remove(byte[] key) throws IOException { + assert index.row().primaryKeyLength == key.length; + + // check if the index contains the key + long pos = index.getl(key); + if (pos < 0) return; + + // access the file and read the container + file.seek(pos); + int len = file.readInt(); + + // add entry to free array + this.free.add(new Long[]{new Long(len), new Long(pos)}); + + // fill zeros to the content + while (len-- > 0) file.write(0); + + // remove entry from index + this.index.removel(key); + } + + /** + * iterator over all keys + * @param up + * @param rotating + * @return + * @throws IOException + */ + public synchronized kelondroCloneableIterator keys(boolean up, boolean rotating) throws IOException { + return new kelondroRotateIterator(this.index.keys(up, null), null, 1); + } + + /** + * iterate over all keys + * @param up + * @param firstKey + * @return + * @throws IOException + */ + public synchronized kelondroCloneableIterator keys(boolean up, byte[] firstKey) throws IOException { + return this.index.keys(up, firstKey); + } + +} diff --git a/source/de/anomic/kelondro/kelondroBLOBTree.java b/source/de/anomic/kelondro/kelondroBLOBTree.java index 7f9bd9149..1dd421f9d 100644 --- a/source/de/anomic/kelondro/kelondroBLOBTree.java +++ b/source/de/anomic/kelondro/kelondroBLOBTree.java @@ -113,7 +113,7 @@ public class kelondroBLOBTree implements kelondroBLOB { } public int keylength() { - return this.rowdef.primaryKeyLength; + return this.keylen; } public synchronized int size() { @@ -140,7 +140,7 @@ public class kelondroBLOBTree implements kelondroBLOB { return new String(rawKey, 0, n + 1); } - public class keyIterator implements kelondroCloneableIterator { + public class keyIterator implements kelondroCloneableIterator { // the iterator iterates all keys kelondroCloneableIterator ri; String nextKey; @@ -158,10 +158,10 @@ public class kelondroBLOBTree implements kelondroBLOB { return nextKey != null; } - public String next() { + public byte[] next() { String result = nextKey; nextKey = n(); - return origKey(result.getBytes()); + return origKey(result.getBytes()).getBytes(); } public void remove() { @@ -190,16 +190,17 @@ public class kelondroBLOBTree implements kelondroBLOB { } return null; } + } - public synchronized kelondroCloneableIterator keys(boolean up, boolean rotating) throws IOException { + public synchronized kelondroCloneableIterator keys(boolean up, boolean rotating) throws IOException { // iterates only the keys of the Nodes // enumerated objects are of type String keyIterator i = new keyIterator(index.rows(up, null)); - if (rotating) return new kelondroRotateIterator(i, null, index.size()); else return i; + if (rotating) return new kelondroRotateIterator(i, null, index.size()); else return i; } - public synchronized keyIterator keys(boolean up, byte[] firstKey) throws IOException { + public synchronized kelondroCloneableIterator keys(boolean up, byte[] firstKey) throws IOException { return new keyIterator(index.rows(up, firstKey)); } @@ -235,13 +236,13 @@ public class kelondroBLOBTree implements kelondroBLOB { return buf[recpos] & 0xFF; } - public synchronized byte[] get(String key) throws IOException { - kelondroRA ra = getRA(key); + public synchronized byte[] get(byte[] key) throws IOException { + kelondroRA ra = getRA(new String(key)); if (ra == null) return null; return ra.readFully(); } - public synchronized byte[] get(String key, int pos, int len) throws IOException { + private synchronized byte[] get(String key, int pos, int len) throws IOException { int recpos = pos % reclen; int reccnt = pos / reclen; byte[] segment1; @@ -285,11 +286,11 @@ public class kelondroBLOBTree implements kelondroBLOB { return result; } - public synchronized void put(String key, byte[] b) throws IOException { - put(key, 0, b, 0, b.length); + public synchronized void put(byte[] key, byte[] b) throws IOException { + put(new String(key), 0, b, 0, b.length); } - public synchronized void put(String key, int pos, byte[] b, int off, int len) throws IOException { + private synchronized void put(String key, int pos, byte[] b, int off, int len) throws IOException { int recpos = pos % reclen; int reccnt = pos / reclen; byte[] buf; @@ -326,12 +327,30 @@ public class kelondroBLOBTree implements kelondroBLOB { } } - public synchronized void remove(String key) throws IOException { + private synchronized void put(String key, int pos, int b) throws IOException { + int recpos = pos % reclen; + int reccnt = pos / reclen; + byte[] buf; + // first write current record + buf = getValueCached(elementKey(key, reccnt)); + if (buf == null) { + buf = new byte[reclen]; + } else if (buf.length < reclen) { + byte[] buff = new byte[reclen]; + System.arraycopy(buf, 0, buff, 0, buf.length); + buf = buff; + buff = null; + } + buf[recpos] = (byte) b; + setValueCached(elementKey(key, reccnt), buf); + } + + public synchronized void remove(byte[] key) throws IOException { // remove value in cache and tree if (key == null) return; int recpos = 0; byte[] k; - while (index.get(k = elementKey(key, recpos)) != null) { + while (index.get(k = elementKey(new String(key), recpos)) != null) { index.remove(k); buffer.remove(k); recpos++; @@ -339,8 +358,8 @@ public class kelondroBLOBTree implements kelondroBLOB { //segmentCount--; writeSegmentCount(); } - public synchronized boolean has(String key) throws IOException { - return (key != null) && (getValueCached(elementKey(key, 0)) != null); + public synchronized boolean has(byte[] key) throws IOException { + return (key != null) && (getValueCached(elementKey(new String(key), 0)) != null); } public synchronized kelondroRA getRA(String filekey) { @@ -372,9 +391,7 @@ public class kelondroBLOBTree implements kelondroBLOB { } public void write(int i) throws IOException { - byte[] b = new byte[1]; - b[0] = (byte) i; - put(filekey, seekpos++, b, 0, 1); + put(filekey, seekpos++, i); } public int read(byte[] b, int off, int len) throws IOException { @@ -415,11 +432,11 @@ public class kelondroBLOBTree implements kelondroBLOB { if (args.length == 1) { // open a db and list keys try { - kelondroBLOBTree kd = new kelondroBLOBTree(new File(args[0]), true, true, 4 ,100, '_', kelondroNaturalOrder.naturalOrder, false, false, true); + kelondroBLOB kd = new kelondroBLOBTree(new File(args[0]), true, true, 4 ,100, '_', kelondroNaturalOrder.naturalOrder, false, false, true); System.out.println(kd.size() + " elements in DB"); - Iterator i = kd.keys(true, false); + Iterator i = kd.keys(true, false); while (i.hasNext()) - System.out.println(i.next()); + System.out.println(new String(i.next())); kd.close(); } catch (IOException e) { e.printStackTrace(); @@ -430,7 +447,7 @@ public class kelondroBLOBTree implements kelondroBLOB { public static int countElements(kelondroBLOBTree t) { int count = 0; try { - Iterator iter = t.keys(true, false); + Iterator iter = t.keys(true, false); while (iter.hasNext()) {count++; if (iter.next() == null) System.out.println("ERROR! null element found");} return count; } catch (IOException e) { diff --git a/source/de/anomic/kelondro/kelondroBytesLongMap.java b/source/de/anomic/kelondro/kelondroBytesLongMap.java index eafda567d..34ea874a4 100644 --- a/source/de/anomic/kelondro/kelondroBytesLongMap.java +++ b/source/de/anomic/kelondro/kelondroBytesLongMap.java @@ -41,7 +41,7 @@ public class kelondroBytesLongMap { } public kelondroBytesLongMap(int keylength, kelondroByteOrder objectOrder, int space) { - this.rowdef = new kelondroRow(new kelondroColumn[]{new kelondroColumn("key", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, keylength, "key"), new kelondroColumn("int c-8 {b256}")}, objectOrder, 0); + this.rowdef = new kelondroRow(new kelondroColumn[]{new kelondroColumn("key", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, keylength, "key"), new kelondroColumn("long c-8 {b256}")}, objectOrder, 0); this.index = new kelondroRAMIndex(rowdef, space); } diff --git a/source/de/anomic/kelondro/kelondroHeap.java b/source/de/anomic/kelondro/kelondroHeap.java deleted file mode 100755 index 18e204efb..000000000 --- a/source/de/anomic/kelondro/kelondroHeap.java +++ /dev/null @@ -1,191 +0,0 @@ -// kelondroHeap.java -// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 30.04.2008 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate: 2008-03-14 01:16:04 +0100 (Fr, 14 Mrz 2008) $ -// $LastChangedRevision: 4558 $ -// $LastChangedBy: orbiter $ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package de.anomic.kelondro; - -import java.io.BufferedInputStream; -import java.io.BufferedOutputStream; -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.RandomAccessFile; - -public final class kelondroHeap { - - private kelondroBytesLongMap index; - private File heapFile; - private kelondroByteOrder ordering; - - /** - * create a heap file: a arbitrary number of BLOBs, indexed by an access key - * The heap file will be opened at initialization time, indexed and closed again. - * Heap files are only opened when BLOBs are read from it or new one are appended - * @param heapFile - * @param keylength - * @param ordering - * @throws IOException - */ - public kelondroHeap(File heapFile, int keylength, kelondroByteOrder ordering) throws IOException { - this.index = null; - this.ordering = ordering; - this.heapFile = heapFile; - if (!(heapFile.exists())) throw new IOException("file " + heapFile + " does not exist"); - if (heapFile.length() >= Integer.MAX_VALUE) throw new IOException("file " + heapFile + " too large, index can only be crated for files less than 2GB"); - - this.index = new kelondroBytesLongMap(keylength, this.ordering, 0); - DataInputStream is = null; - String keystring; - byte[] key = new byte[keylength]; - int reclen; - long seek = 0, seek0; - is = new DataInputStream(new BufferedInputStream(new FileInputStream(heapFile), 64*1024)); - - // don't test available() here because this does not work for files > 2GB - loop: while (true) { - // remember seek position - seek0 = seek; - - // read length of the following record without the length of the record size bytes - try { - reclen = is.readInt(); - } catch (IOException e) { - break loop; // terminate loop - } - seek += 4L; - - // read key - try { - is.readFully(key); - } catch (IOException e) { - break loop; // terminate loop - } - keystring = new String(key); - seek += keystring.length(); - - // skip content - seek += reclen; - while (reclen > 0) reclen -= is.skip(reclen); - - // store access address to entry - try { - index.addl(key, seek0); - } catch (IOException e) { - e.printStackTrace(); - break loop; - } - } - is.close(); - } - - /** - * the number of BLOBs in the heap - * @return the number of BLOBs in the heap - */ - public int size() { - return this.index.size(); - } - - /** - * test if a key is in the heap file - * @param key - * @return true if the key exists, false othervise - */ - public boolean has(String key) { - assert index != null; - assert index.row().primaryKeyLength == key.length(); - - // check if the index contains the key - try { - return index.getl(key.getBytes()) >= 0; - } catch (IOException e) { - e.printStackTrace(); - return false; - } - } - - /** - * add a BLOB to the heap - * @param key - * @param blob - * @throws IOException - */ - public synchronized void add(String key, byte[] blob) throws IOException { - add(key, blob, 0, blob.length); - } - - /** - * add a BLOB to the heap - * @param key - * @param blob - * @throws IOException - */ - public synchronized void add(String key, byte[] blob, int offset, int len) throws IOException { - assert index.row().primaryKeyLength == key.length(); - if ((blob == null) || (blob.length == 0)) return; - DataOutputStream os = null; - try { - os = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(heapFile))); - } catch (FileNotFoundException e) { - throw new IOException(e.getMessage()); - } - int pos = os.size(); - os.writeInt(len); - os.write(key.getBytes()); - os.write(blob, offset, len); - os.close(); - index.putl(key.getBytes(), pos); - } - - /** - * read a blob from the heap - * @param key - * @return - * @throws IOException - */ - public byte[] get(String key) throws IOException { - assert index.row().primaryKeyLength == key.length(); - - // check if the index contains the key - long pos = index.getl(key.getBytes()); - if (pos < 0) return null; - - // access the file and read the container - RandomAccessFile raf = new RandomAccessFile(heapFile, "r"); - int len = raf.readInt(); - byte[] record = new byte[len]; - - raf.seek(pos + 4 + index.row().primaryKeyLength); - raf.readFully(record); - - raf.close(); - return record; - } - -} diff --git a/source/de/anomic/kelondro/kelondroMScoreCluster.java b/source/de/anomic/kelondro/kelondroMScoreCluster.java index c2ff95cbe..d70b17278 100644 --- a/source/de/anomic/kelondro/kelondroMScoreCluster.java +++ b/source/de/anomic/kelondro/kelondroMScoreCluster.java @@ -91,8 +91,9 @@ public final class kelondroMScoreCluster { double d = 1000d * ((Double) o).doubleValue(); return (int) Math.round(d); } - String s = ""; + String s = null; if (o instanceof String) s = (String) o; + if (o instanceof byte[]) s = new String((byte[]) o); // this can be used to calculate a score from a string if ((s == null) || (s.length() == 0) || (s.charAt(0) == '-')) return 0; diff --git a/source/de/anomic/kelondro/kelondroMapObjects.java b/source/de/anomic/kelondro/kelondroMapObjects.java index 36d167936..5af583f2e 100644 --- a/source/de/anomic/kelondro/kelondroMapObjects.java +++ b/source/de/anomic/kelondro/kelondroMapObjects.java @@ -85,7 +85,7 @@ public class kelondroMapObjects extends kelondroObjects { // fill cluster and accumulator with values if ((sortfields != null) || (longaccfields != null) || (doubleaccfields != null)) try { - kelondroCloneableIterator it = dyn.keys(true, false); + kelondroCloneableIterator it = dyn.keys(true, false); String mapname; Object cell; long valuel; @@ -93,8 +93,8 @@ public class kelondroMapObjects extends kelondroObjects { Map map; this.elementCount = 0; while (it.hasNext()) { - mapname = it.next(); - map = getMap(mapname); + mapname = new String(it.next()); + map = getMap(new String(mapname)); if (map == null) break; if (sortfields != null) for (int i = 0; i < sortfields.length; i++) { @@ -299,13 +299,37 @@ public class kelondroMapObjects extends kelondroObjects { } } - public synchronized Iterator keys(final boolean up, /* sorted by */ String field) { + public synchronized Iterator keys(final boolean up, /* sorted by */ String field) { // sorted iteration using the sortClusters if (sortClusterMap == null) return null; final kelondroMScoreCluster cluster = sortClusterMap.get(field); if (cluster == null) return null; // sort field does not exist //System.out.println("DEBUG: cluster for field " + field + ": " + cluster.toString()); - return cluster.scores(up); + return new string2bytearrayIterator(cluster.scores(up)); + } + + public class string2bytearrayIterator implements Iterator { + + Iterator s; + + public string2bytearrayIterator(Iterator s) { + this.s = s; + } + + public boolean hasNext() { + return s.hasNext(); + } + + public byte[] next() { + String r = s.next(); + if (r == null) return null; + return r.getBytes(); + } + + public void remove() { + s.remove(); + } + } public synchronized mapIterator maps(final boolean up, final String field) { @@ -351,11 +375,11 @@ public class kelondroMapObjects extends kelondroObjects { // enumerates Map-Type elements // the key is also included in every map that is returned; it's key is 'key' - Iterator keyIterator; + Iterator keyIterator; boolean finish; HashMap n; - public mapIterator(Iterator keyIterator) { + public mapIterator(Iterator keyIterator) { this.keyIterator = keyIterator; this.finish = false; this.n = next0(); @@ -377,7 +401,7 @@ public class kelondroMapObjects extends kelondroObjects { String nextKey; HashMap map; while (keyIterator.hasNext()) { - nextKey = keyIterator.next(); + nextKey = new String(keyIterator.next()); if (nextKey == null) { finish = true; return null; diff --git a/source/de/anomic/kelondro/kelondroObjects.java b/source/de/anomic/kelondro/kelondroObjects.java index eff683197..07fe5bbbe 100644 --- a/source/de/anomic/kelondro/kelondroObjects.java +++ b/source/de/anomic/kelondro/kelondroObjects.java @@ -35,6 +35,8 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Map; +import de.anomic.server.serverDate; + public class kelondroObjects { private kelondroBLOB blob; @@ -58,11 +60,6 @@ public class kelondroObjects { this.cacheScore = new kelondroMScoreCluster(); } - public int keySize() { - return blob.keylength(); - } - - private static String map2string(final Map map, final String comment) throws IOException { final Iterator> iter = map.entrySet().iterator(); Map.Entry entry; @@ -99,9 +96,10 @@ public class kelondroObjects { assert (key.length() > 0); assert (newMap != null); if (cacheScore == null) return; // may appear during shutdown - + while (key.length() < blob.keylength()) key += "_"; + // write entry - blob.put(key, map2string(newMap, "").getBytes()); + blob.put(key.getBytes(), map2string(newMap, "W" + serverDate.formatShortSecond() + " ").getBytes()); // check for space in cache checkCacheSpace(); @@ -114,13 +112,14 @@ public class kelondroObjects { public synchronized void remove(String key) throws IOException { // update elementCount if (key == null) return; + while (key.length() < blob.keylength()) key += "_"; // remove from cache cacheScore.deleteScore(key); cache.remove(key); // remove from file - blob.remove(key); + blob.remove(key.getBytes()); } public synchronized HashMap get(final String key) throws IOException { @@ -128,18 +127,20 @@ public class kelondroObjects { return get(key, true); } - protected synchronized HashMap get(final String key, final boolean storeCache) throws IOException { + protected synchronized HashMap get(String key, final boolean storeCache) throws IOException { // load map from cache assert key != null; if (cache == null) return null; // case may appear during shutdown + while (key.length() < blob.keylength()) key += "_"; + HashMap map = cache.get(key); if (map != null) return map; // load map from kra - if (!(blob.has(key))) return null; + if (!(blob.has(key.getBytes()))) return null; // read object - byte[] b = blob.get(key); + byte[] b = blob.get(key.getBytes()); if (b == null) return null; map = string2map(new String(b)); @@ -166,15 +167,15 @@ public class kelondroObjects { } } - public synchronized kelondroCloneableIterator keys(final boolean up, final boolean rotating) throws IOException { + public synchronized kelondroCloneableIterator keys(final boolean up, final boolean rotating) throws IOException { // simple enumeration of key names without special ordering return blob.keys(up, rotating); } - public synchronized kelondroCloneableIterator keys(final boolean up, final boolean rotating, final byte[] firstKey, final byte[] secondKey) throws IOException { + public synchronized kelondroCloneableIterator keys(final boolean up, final boolean rotating, final byte[] firstKey, final byte[] secondKey) throws IOException { // simple enumeration of key names without special ordering - kelondroCloneableIterator i = blob.keys(up, firstKey); - if (rotating) return new kelondroRotateIterator(i, secondKey, blob.size()); else return i; + kelondroCloneableIterator i = blob.keys(up, firstKey); + if (rotating) return new kelondroRotateIterator(i, secondKey, blob.size()); else return i; } @@ -205,10 +206,10 @@ public class kelondroObjects { // enumerates Map-Type elements // the key is also included in every map that is returned; it's key is 'key' - Iterator keyIterator; + Iterator keyIterator; boolean finish; - public objectIterator(Iterator keyIterator) { + public objectIterator(Iterator keyIterator) { this.keyIterator = keyIterator; this.finish = false; } @@ -218,13 +219,13 @@ public class kelondroObjects { } public HashMap next() { - final String nextKey = keyIterator.next(); + final byte[] nextKey = keyIterator.next(); if (nextKey == null) { finish = true; return null; } try { - final HashMap obj = get(nextKey); + final HashMap obj = get(new String(nextKey)); if (obj == null) throw new kelondroException("no more elements available"); return obj; } catch (IOException e) { diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 0841864fa..51937f358 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -71,6 +71,8 @@ import java.util.regex.Pattern; import de.anomic.crawler.CrawlProfile; import de.anomic.http.httpHeader; +import de.anomic.kelondro.kelondroBLOB; +import de.anomic.kelondro.kelondroBLOBHeap; import de.anomic.kelondro.kelondroBLOBTree; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroMScoreCluster; @@ -92,7 +94,7 @@ import de.anomic.yacy.yacyURL; public final class plasmaHTCache { - public static final String DB_NAME = "responseHeader2.db"; + public static final String DB_NAME = "responseHeader.heap"; private static final int stackLimit = 150; // if we exceed that limit, we do not check idle public static final long oneday = 1000 * 60 * 60 * 24; // milliseconds of a day @@ -279,7 +281,17 @@ public final class plasmaHTCache { private static void openResponseHeaderDB() { // open the response header database File dbfile = new File(cachePath, DB_NAME); - responseHeaderDB = new kelondroMapObjects(new kelondroBLOBTree(dbfile, true, true, yacySeedDB.commonHashLength, 150, '#', kelondroBase64Order.enhancedCoder, false, false, true), 500); + kelondroBLOB blob = null; + if (DB_NAME.endsWith("heap")) { + try { + blob = new kelondroBLOBHeap(dbfile, yacySeedDB.commonHashLength, kelondroBase64Order.enhancedCoder); + } catch (IOException e) { + e.printStackTrace(); + } + } else { + blob = new kelondroBLOBTree(dbfile, true, true, yacySeedDB.commonHashLength, 150, '#', kelondroBase64Order.enhancedCoder, false, false, true); + } + responseHeaderDB = new kelondroMapObjects(blob, 500); } private static void deleteOldHTCache(File directory) { @@ -895,7 +907,7 @@ public final class plasmaHTCache { String initiator, CrawlProfile.entry profile ) { - return new Entry( + Entry entry = new Entry( initDate, depth, url, @@ -905,6 +917,8 @@ public final class plasmaHTCache { initiator, profile ); + entry.writeResourceInfo(); + return entry; } public final static class Entry { @@ -1039,11 +1053,14 @@ public final class plasmaHTCache { return this.resInfo; } - public boolean writeResourceInfo() { + private boolean writeResourceInfo() { if (this.resInfo == null) return false; try { HashMap hm = new HashMap(); hm.putAll(this.resInfo.getMap()); + hm.put("@@URL", this.url.toNormalform(false, false)); + hm.put("@@DEPTH", Integer.toString(this.depth)); + if (this.initiator != null) hm.put("@@INITIATOR", this.initiator); responseHeaderDB.set(this.url.hash(), hm); } catch (Exception e) { resetResponseHeaderDB(); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 2c3d28e94..57601e875 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -809,7 +809,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitchpublic static final String DBFILE_USER = "DATA/SETTINGS/user.db"

*

Path to the user-DB, beginning from the YaCy-installation's top-folder. It holds all rights the created @@ -1556,10 +1556,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch