From 1e6d12f1469622ad71062d6c5b2f7780108248a8 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Thu, 10 Jul 2008 00:47:37 +0000
Subject: [PATCH] Major update to BLOB data structures: - introduced a new BLOB
file format: kelondroBLOBHeap. This is a flat file with an index in RAM.
very similar to the eco-tables, but with flexible value sizes. It will
replace the kelondroBLOBTree, which is based on a kelondroTree, a
file-AVL-based index data structure. - the HTCACHE header file was replaced
by the new blob heap file structure - the robots.txt file was replaced by the
new blob heap file structure - the robots parser was enhanced (bugfixing for
double-loading of the same robots.txt) - other BLOB-dependent data structures
were also prepared to use the new BLOB heap - fixed a bug in the snippet
fetch process: the file header was not written to the header index. There
should now be less IO during snippet fetch and during crawling.
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4978 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
htroot/Wiki.java | 8 +-
source/de/anomic/crawler/CrawlProfile.java | 4 +-
source/de/anomic/crawler/HTTPLoader.java | 1 +
source/de/anomic/crawler/RobotsTxt.java | 126 +++---
source/de/anomic/crawler/robotsParser.java | 172 ++++----
source/de/anomic/data/blogBoard.java | 6 +-
source/de/anomic/data/blogBoardComments.java | 2 +-
source/de/anomic/data/bookmarksDB.java | 8 +-
source/de/anomic/data/messageBoard.java | 4 +-
source/de/anomic/data/userDB.java | 4 +-
source/de/anomic/data/wikiBoard.java | 4 +-
source/de/anomic/kelondro/kelondroBLOB.java | 35 +-
.../de/anomic/kelondro/kelondroBLOBHeap.java | 412 ++++++++++++++++++
.../de/anomic/kelondro/kelondroBLOBTree.java | 65 ++-
.../anomic/kelondro/kelondroBytesLongMap.java | 2 +-
source/de/anomic/kelondro/kelondroHeap.java | 191 --------
.../kelondro/kelondroMScoreCluster.java | 3 +-
.../anomic/kelondro/kelondroMapObjects.java | 40 +-
.../de/anomic/kelondro/kelondroObjects.java | 39 +-
source/de/anomic/plasma/plasmaHTCache.java | 25 +-
.../de/anomic/plasma/plasmaSwitchboard.java | 8 +-
21 files changed, 721 insertions(+), 438 deletions(-)
create mode 100755 source/de/anomic/kelondro/kelondroBLOBHeap.java
delete mode 100755 source/de/anomic/kelondro/kelondroHeap.java
diff --git a/htroot/Wiki.java b/htroot/Wiki.java
index 807dc2cff..29def50c7 100644
--- a/htroot/Wiki.java
+++ b/htroot/Wiki.java
@@ -173,11 +173,11 @@ public class Wiki {
prop.put("mode", "3"); //Index
String subject;
try {
- Iterator i = sb.wikiDB.keys(true);
+ Iterator i = sb.wikiDB.keys(true);
wikiBoard.entry entry;
int count=0;
while (i.hasNext()) {
- subject = i.next();
+ subject = new String(i.next());
entry = sb.wikiDB.read(subject);
prop.putHTML("mode_pages_"+count+"_name",wikiBoard.webalize(subject));
prop.putHTML("mode_pages_"+count+"_subject", subject);
@@ -200,14 +200,14 @@ public class Wiki {
prop.putHTML("mode_error_page", pagename);
try {
- Iterator it = sb.wikiDB.keysBkp(true);
+ Iterator it = sb.wikiDB.keysBkp(true);
wikiBoard.entry entry;
wikiBoard.entry oentry = null;
wikiBoard.entry nentry = null;
int count = 0;
boolean oldselected = false, newselected = false;
while (it.hasNext()) {
- entry = sb.wikiDB.readBkp(it.next());
+ entry = sb.wikiDB.readBkp(new String(it.next()));
prop.put("mode_error_versions_" + count + "_date", wikiBoard.dateString(entry.date()));
prop.put("mode_error_versions_" + count + "_fdate", dateString(entry.date()));
if (wikiBoard.dateString(entry.date()).equals(post.get("old", null))) {
diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java
index e24ee1a4e..9878fdf4b 100644
--- a/source/de/anomic/crawler/CrawlProfile.java
+++ b/source/de/anomic/crawler/CrawlProfile.java
@@ -101,7 +101,7 @@ public class CrawlProfile {
public class profileIterator implements Iterator {
// the iterator iterates all keys, which are byte[] objects
- kelondroCloneableIterator handleIterator;
+ kelondroCloneableIterator handleIterator;
String lastkey;
public profileIterator(boolean up) throws IOException {
handleIterator = profileTable.keys(up, false);
@@ -117,7 +117,7 @@ public class CrawlProfile {
}
public entry next() {
try {
- lastkey = handleIterator.next();
+ lastkey = new String(handleIterator.next());
return getEntry(lastkey);
} catch (kelondroException e) {
clear();
diff --git a/source/de/anomic/crawler/HTTPLoader.java b/source/de/anomic/crawler/HTTPLoader.java
index 01bbac639..903d44324 100644
--- a/source/de/anomic/crawler/HTTPLoader.java
+++ b/source/de/anomic/crawler/HTTPLoader.java
@@ -240,6 +240,7 @@ public final class HTTPLoader {
fos.write(responseBody);
htCache.setCacheArray(responseBody);
plasmaHTCache.writeFileAnnouncement(cacheFile);
+ //htCache.writeResourceInfo(); // write header to header BLOB-database
} finally {
if (fos!=null)try{fos.close();}catch(Exception e){/* ignore this */}
}
diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java
index d15bb8e4b..cbf815514 100644
--- a/source/de/anomic/crawler/RobotsTxt.java
+++ b/source/de/anomic/crawler/RobotsTxt.java
@@ -43,7 +43,6 @@
//the intact and unchanged copyright notice.
//Contributions and changes to the program code must be marked as such.
-
package de.anomic.crawler;
import java.io.BufferedInputStream;
@@ -62,6 +61,8 @@ import de.anomic.http.HttpClient;
import de.anomic.http.JakartaCommonsHttpClient;
import de.anomic.http.JakartaCommonsHttpResponse;
import de.anomic.http.httpHeader;
+import de.anomic.kelondro.kelondroBLOB;
+import de.anomic.kelondro.kelondroBLOBHeap;
import de.anomic.kelondro.kelondroBLOBTree;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMapObjects;
@@ -81,7 +82,17 @@ public class RobotsTxt {
public RobotsTxt(File robotsTableFile) {
this.robotsTableFile = robotsTableFile;
robotsTableFile.getParentFile().mkdirs();
- robotsTable = new kelondroMapObjects(new kelondroBLOBTree(robotsTableFile, true, true, 256, 512, '_', kelondroNaturalOrder.naturalOrder, false, false, true), 100);
+ kelondroBLOB blob = null;
+ if (robotsTableFile.getName().endsWith(".heap")) {
+ try {
+ blob = new kelondroBLOBHeap(robotsTableFile, 64, kelondroNaturalOrder.naturalOrder);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ } else {
+ blob = new kelondroBLOBTree(robotsTableFile, true, true, 256, 512, '_', kelondroNaturalOrder.naturalOrder, false, false, true);
+ }
+ robotsTable = new kelondroMapObjects(blob, 100);
}
private void resetDatabase() {
@@ -352,24 +363,27 @@ public class RobotsTxt {
return crawlDelay;
}
+ //private static final HashSet loadedRobots = new HashSet(); // only for debugging
+
@SuppressWarnings("unchecked")
public boolean isDisallowed(yacyURL nexturl) {
if (nexturl == null) throw new IllegalArgumentException();
// generating the hostname:poart string needed to do a DB lookup
String urlHostPort = getHostPort(nexturl);
-
- // do a DB lookup to determine if the robots data is already available
- RobotsTxt.Entry robotsTxt4Host = getEntry(urlHostPort);
-
- // if we have not found any data or the data is older than 7 days, we need to load it from the remote server
- if (
- (robotsTxt4Host == null) ||
- (robotsTxt4Host.getLoadedDate() == null) ||
- (System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() > 7*24*60*60*1000)
- ) {
- synchronized(this) {
-
+ RobotsTxt.Entry robotsTxt4Host = null;
+ synchronized(this) {
+
+ // do a DB lookup to determine if the robots data is already available
+ robotsTxt4Host = getEntry(urlHostPort);
+
+ // if we have not found any data or the data is older than 7 days, we need to load it from the remote server
+ if (
+ (robotsTxt4Host == null) ||
+ (robotsTxt4Host.getLoadedDate() == null) ||
+ (System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() > 7*24*60*60*1000)
+ ) {
+
// generating the proper url to download the robots txt
yacyURL robotsURL = null;
try {
@@ -380,56 +394,60 @@ public class RobotsTxt {
}
Object[] result = null;
- boolean accessCompletelyRestricted = false;
- byte[] robotsTxt = null;
- String eTag = null;
- Date modDate = null;
- try {
- serverLog.logFine("ROBOTS","Trying to download the robots.txt file from URL '" + robotsURL + "'.");
- result = downloadRobotsTxt(robotsURL,5,robotsTxt4Host);
-
- if (result != null) {
- accessCompletelyRestricted = ((Boolean)result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue();
- robotsTxt = (byte[])result[DOWNLOAD_ROBOTS_TXT];
- eTag = (String) result[DOWNLOAD_ETAG];
- modDate = (Date) result[DOWNLOAD_MODDATE];
- } else if (robotsTxt4Host != null) {
- robotsTxt4Host.setLoadedDate(new Date());
- addEntry(robotsTxt4Host);
- }
+ serverLog.logFine("ROBOTS","Trying to download the robots.txt file from URL '" + robotsURL + "'.");
+ try {
+ result = downloadRobotsTxt(robotsURL, 5, robotsTxt4Host);
} catch (Exception e) {
- serverLog.logSevere("ROBOTS","Unable to download the robots.txt file from URL '" + robotsURL + "'. " + e.getMessage());
+ result = null;
}
+ /*
+ assert !loadedRobots.contains(robotsURL.toNormalform(false, false)) :
+ "robots-url=" + robotsURL.toString() +
+ ", robots=" + ((result == null || result[DOWNLOAD_ROBOTS_TXT] == null) ? "NULL" : new String((byte[]) result[DOWNLOAD_ROBOTS_TXT])) +
+ ", robotsTxt4Host=" + ((robotsTxt4Host == null) ? "NULL" : robotsTxt4Host.getLoadedDate().toString());
+ loadedRobots.add(robotsURL.toNormalform(false, false));
+ */
- if ((robotsTxt4Host==null)||((robotsTxt4Host!=null)&&(result!=null))) {
- ArrayList denyPath = null;
- String sitemap = null;
- Integer crawlDelay = null;
- if (accessCompletelyRestricted) {
+ if (result == null) {
+ // no robots.txt available, make an entry to prevent that the robots loading is done twice
+ if (robotsTxt4Host == null) {
+ // generate artificial entry
+ robotsTxt4Host = new Entry(
+ urlHostPort,
+ new ArrayList(),
+ new Date(),
+ new Date(),
+ null,
+ null,
+ new Integer(0));
+ } else {
+ robotsTxt4Host.setLoadedDate(new Date());
+ }
+
+ // store the data into the robots DB
+ addEntry(robotsTxt4Host);
+ } else {
+ Object[] parserResult = robotsParser.parse((byte[]) result[DOWNLOAD_ROBOTS_TXT]);
+ ArrayList denyPath = (ArrayList) parserResult[0];
+ if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
denyPath = new ArrayList();
denyPath.add("/");
- } else {
- // parsing the robots.txt Data and converting it into an arraylist
- try {
- Object[] parserResult = robotsParser.parse(robotsTxt);
- denyPath = (ArrayList) parserResult[0];
- sitemap = (String) parserResult[1];
- crawlDelay = (Integer) parserResult[2];
- } catch (IOException e) {
- serverLog.logSevere("ROBOTS","Unable to parse the robots.txt file from URL '" + robotsURL + "'.");
- }
- }
+ }
- // storing the data into the robots DB
- robotsTxt4Host = addEntry(urlHostPort,denyPath,new Date(),modDate,eTag,sitemap,crawlDelay);
+ // store the data into the robots DB
+ robotsTxt4Host = addEntry(
+ urlHostPort,
+ denyPath,
+ new Date(),
+ (Date) result[DOWNLOAD_MODDATE],
+ (String) result[DOWNLOAD_ETAG],
+ (String) parserResult[1],
+ (Integer) parserResult[2]);
}
}
}
- if (robotsTxt4Host != null && robotsTxt4Host.isDisallowed(nexturl.getFile())) {
- return true;
- }
- return false;
+ return robotsTxt4Host.isDisallowed(nexturl.getFile());
}
private static Object[] downloadRobotsTxt(yacyURL robotsURL, int redirectionCount, RobotsTxt.Entry entry) throws Exception {
diff --git a/source/de/anomic/crawler/robotsParser.java b/source/de/anomic/crawler/robotsParser.java
index a727cfaee..fbe0951e6 100644
--- a/source/de/anomic/crawler/robotsParser.java
+++ b/source/de/anomic/crawler/robotsParser.java
@@ -46,6 +46,7 @@ package de.anomic.crawler;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
+import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
@@ -85,25 +86,29 @@ public final class robotsParser{
* at the Moment it only creates a list of Deny Paths
*/
- public static Object[] parse(File robotsFile) throws IOException {
+ public static Object[] parse(File robotsFile) {
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(robotsFile));
return parse(reader);
+ } catch (FileNotFoundException e1) {
+ // robots file does not exist: fall through to the empty default result below
} finally {
+ // close only after parse(reader) has consumed the stream; closing it earlier makes every readLine() fail
if (reader != null) try{reader.close();}catch(Exception e){/* ignore this */}
}
+ return new Object[]{new ArrayList(), "", new Integer(0)};
}
@SuppressWarnings("unchecked")
- public static Object[] parse(byte[] robotsTxt) throws IOException {
+ public static Object[] parse(byte[] robotsTxt) {
if ((robotsTxt == null)||(robotsTxt.length == 0)) return new Object[]{new ArrayList(0),null,null};
ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt);
BufferedReader reader = new BufferedReader(new InputStreamReader(bin));
return parse(reader);
}
- public static Object[] parse(BufferedReader reader) throws IOException{
+ public static Object[] parse(BufferedReader reader) {
ArrayList deny4AllAgents = new ArrayList();
ArrayList deny4YaCyAgent = new ArrayList();
@@ -115,102 +117,104 @@ public final class robotsParser{
rule4YaCyFound = false,
inBlock = false;
- while ((line = reader.readLine()) != null) {
- line = line.trim();
- lineUpper = line.toUpperCase();
-
- if (line.length() == 0) {
- // OLD: we have reached the end of the rule block
- // rule4Yacy = false; inBlock = false;
-
- // NEW: just ignore it
- } else if (line.startsWith(ROBOTS_COMMENT)) {
- // we can ignore this. Just a comment line
- } else if (lineUpper.startsWith(ROBOTS_SITEMAP)) {
- pos = line.indexOf(" ");
- if (pos != -1) {
- sitemap = line.substring(pos).trim();
- }
- } else if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
-
- if (inBlock) {
- // we have detected the start of a new block
- inBlock = false;
- isRuleBlock4AllAgents = false;
- isRuleBlock4YaCyAgent = false;
- crawlDelay = null; // each block has a separate delay
- }
-
- // cutting off comments at the line end
- pos = line.indexOf(ROBOTS_COMMENT);
- if (pos != -1) line = line.substring(0,pos).trim();
-
- // replacing all tabs with spaces
- line = line.replaceAll("\t"," ");
-
- // getting out the robots name
- pos = line.indexOf(" ");
- if (pos != -1) {
- String userAgent = line.substring(pos).trim();
- isRuleBlock4AllAgents |= userAgent.equals("*");
- isRuleBlock4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0;
- if (isRuleBlock4YaCyAgent) rule4YaCyFound = true;
- }
- } else if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
- pos = line.indexOf(" ");
- if (pos != -1) {
- try {
- crawlDelay = Integer.valueOf(line.substring(pos).trim());
- } catch (NumberFormatException e) {
- // invalid crawling delay
- }
- }
- } else if (lineUpper.startsWith(ROBOTS_DISALLOW) ||
- lineUpper.startsWith(ROBOTS_ALLOW)) {
- inBlock = true;
- boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW);
+ try {
+ while ((line = reader.readLine()) != null) {
+ line = line.trim();
+ lineUpper = line.toUpperCase();
- if (isRuleBlock4YaCyAgent || isRuleBlock4AllAgents) {
+ if (line.length() == 0) {
+ // OLD: we have reached the end of the rule block
+ // rule4Yacy = false; inBlock = false;
+
+ // NEW: just ignore it
+ } else if (line.startsWith(ROBOTS_COMMENT)) {
+ // we can ignore this. Just a comment line
+ } else if (lineUpper.startsWith(ROBOTS_SITEMAP)) {
+ pos = line.indexOf(" ");
+ if (pos != -1) {
+ sitemap = line.substring(pos).trim();
+ }
+ } else if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
+
+ if (inBlock) {
+ // we have detected the start of a new block
+ inBlock = false;
+ isRuleBlock4AllAgents = false;
+ isRuleBlock4YaCyAgent = false;
+ crawlDelay = null; // each block has a separate delay
+ }
+
// cutting off comments at the line end
pos = line.indexOf(ROBOTS_COMMENT);
if (pos != -1) line = line.substring(0,pos).trim();
-
- // cutting of tailing *
- if (line.endsWith("*")) line = line.substring(0,line.length()-1);
// replacing all tabs with spaces
line = line.replaceAll("\t"," ");
- // getting the path
+ // getting out the robots name
pos = line.indexOf(" ");
if (pos != -1) {
- // getting the path
- String path = line.substring(pos).trim();
-
- // unencoding all special charsx
- try {
- path = URLDecoder.decode(path,"UTF-8");
- } catch (Exception e) {
- /*
- * url decoding failed. E.g. because of
- * "Incomplete trailing escape (%) pattern"
- */
- }
+ String userAgent = line.substring(pos).trim();
+ isRuleBlock4AllAgents |= userAgent.equals("*");
+ isRuleBlock4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0;
+ if (isRuleBlock4YaCyAgent) rule4YaCyFound = true;
+ }
+ } else if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
+ pos = line.indexOf(" ");
+ if (pos != -1) {
+ try {
+ crawlDelay = Integer.valueOf(line.substring(pos).trim());
+ } catch (NumberFormatException e) {
+ // invalid crawling delay
+ }
+ }
+ } else if (lineUpper.startsWith(ROBOTS_DISALLOW) ||
+ lineUpper.startsWith(ROBOTS_ALLOW)) {
+ inBlock = true;
+ boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW);
+
+ if (isRuleBlock4YaCyAgent || isRuleBlock4AllAgents) {
+ // cutting off comments at the line end
+ pos = line.indexOf(ROBOTS_COMMENT);
+ if (pos != -1) line = line.substring(0,pos).trim();
+
+ // cutting of tailing *
+ if (line.endsWith("*")) line = line.substring(0,line.length()-1);
- // escaping all occurences of ; because this char is used as special char in the Robots DB
- path = path.replaceAll(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR,"%3B");
+ // replacing all tabs with spaces
+ line = line.replaceAll("\t"," ");
- // adding it to the pathlist
- if (!isDisallowRule) path = "!" + path;
- if (isRuleBlock4AllAgents) deny4AllAgents.add(path);
- if (isRuleBlock4YaCyAgent) deny4YaCyAgent.add(path);
+ // getting the path
+ pos = line.indexOf(" ");
+ if (pos != -1) {
+ // getting the path
+ String path = line.substring(pos).trim();
+
+ // unencoding all special charsx
+ try {
+ path = URLDecoder.decode(path,"UTF-8");
+ } catch (Exception e) {
+ /*
+ * url decoding failed. E.g. because of
+ * "Incomplete trailing escape (%) pattern"
+ */
+ }
+
+ // escaping all occurences of ; because this char is used as special char in the Robots DB
+ path = path.replaceAll(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR,"%3B");
+
+ // adding it to the pathlist
+ if (!isDisallowRule) path = "!" + path;
+ if (isRuleBlock4AllAgents) deny4AllAgents.add(path);
+ if (isRuleBlock4YaCyAgent) deny4YaCyAgent.add(path);
+ }
}
}
}
- }
+ } catch (IOException e) {}
ArrayList denyList = (rule4YaCyFound) ? deny4YaCyAgent : deny4AllAgents;
- return new Object[]{denyList,sitemap,crawlDelay};
+ return new Object[]{denyList, sitemap, crawlDelay};
}
}
diff --git a/source/de/anomic/data/blogBoard.java b/source/de/anomic/data/blogBoard.java
index 46e1c9257..5643e5cd6 100644
--- a/source/de/anomic/data/blogBoard.java
+++ b/source/de/anomic/data/blogBoard.java
@@ -227,7 +227,7 @@ public class blogBoard {
database.remove(key);
} catch (IOException e) { }
}
- public Iterator keys(boolean up) throws IOException {
+ public Iterator keys(boolean up) throws IOException {
return database.keys(up, false);
}
/**
@@ -282,7 +282,7 @@ public class blogBoard {
* Subclass of blogBoard, which provides the blogIterator object-type
*/
public class BlogIterator implements Iterator {
- Iterator blogIter;
+ Iterator blogIter;
blogBoard.BlogEntry nextEntry;
public BlogIterator(boolean up) throws IOException {
this.blogIter = blogBoard.this.database.keys(up, false);
@@ -300,7 +300,7 @@ public class blogBoard {
public BlogEntry next() {
try {
- return readBlogEntry(this.blogIter.next());
+ return readBlogEntry(new String(this.blogIter.next()));
} catch (kelondroException e) {
//resetDatabase();
return null;
diff --git a/source/de/anomic/data/blogBoardComments.java b/source/de/anomic/data/blogBoardComments.java
index e51902a5c..d7c97e479 100644
--- a/source/de/anomic/data/blogBoardComments.java
+++ b/source/de/anomic/data/blogBoardComments.java
@@ -218,7 +218,7 @@ public class blogBoardComments {
database.remove(key);
} catch (IOException e) { }
}
- public Iterator keys(boolean up) throws IOException {
+ public Iterator keys(boolean up) throws IOException {
return database.keys(up, false);
}
diff --git a/source/de/anomic/data/bookmarksDB.java b/source/de/anomic/data/bookmarksDB.java
index fc9106ea2..bfc7a5d07 100644
--- a/source/de/anomic/data/bookmarksDB.java
+++ b/source/de/anomic/data/bookmarksDB.java
@@ -1108,7 +1108,7 @@ public class bookmarksDB {
* Subclass of bookmarksDB, which provides the tagIterator object-type
*/
public class tagIterator implements Iterator {
- kelondroCloneableIterator tagIter;
+ kelondroCloneableIterator tagIter;
bookmarksDB.Tag nextEntry;
public tagIterator(boolean up) throws IOException {
@@ -1128,7 +1128,7 @@ public class bookmarksDB {
public Tag next() {
try {
- return getTag(this.tagIter.next());
+ return getTag(new String(this.tagIter.next()));
} catch (kelondroException e) {
//resetDatabase();
return null;
@@ -1151,7 +1151,7 @@ public class bookmarksDB {
* Subclass of bookmarksDB, which provides the bookmarkIterator object-type
*/
public class bookmarkIterator implements Iterator {
- Iterator bookmarkIter;
+ Iterator bookmarkIter;
bookmarksDB.Bookmark nextEntry;
public bookmarkIterator(boolean up) throws IOException {
//flushBookmarkCache(); //XXX: this will cost performance
@@ -1170,7 +1170,7 @@ public class bookmarksDB {
public Bookmark next() {
try {
- return getBookmark(this.bookmarkIter.next());
+ return getBookmark(new String(this.bookmarkIter.next()));
} catch (kelondroException e) {
//resetDatabase();
return null;
diff --git a/source/de/anomic/data/messageBoard.java b/source/de/anomic/data/messageBoard.java
index 3d9d796d9..c4a9b6097 100644
--- a/source/de/anomic/data/messageBoard.java
+++ b/source/de/anomic/data/messageBoard.java
@@ -234,7 +234,7 @@ public class messageBoard {
public class catIter implements Iterator {
- Iterator allIter = null;
+ Iterator allIter = null;
String nextKey = null;
String category = "";
@@ -246,7 +246,7 @@ public class messageBoard {
public void findNext() {
while (allIter.hasNext()) {
- nextKey = allIter.next();
+ nextKey = new String(allIter.next());
if (this.category==null || nextKey.startsWith(this.category)) return;
}
nextKey = null;
diff --git a/source/de/anomic/data/userDB.java b/source/de/anomic/data/userDB.java
index aa3eeb67f..c81da4fde 100644
--- a/source/de/anomic/data/userDB.java
+++ b/source/de/anomic/data/userDB.java
@@ -588,7 +588,7 @@ public final class userDB {
public class userIterator implements Iterator {
// the iterator iterates all userNames
- kelondroCloneableIterator userIter;
+ kelondroCloneableIterator userIter;
userDB.Entry nextEntry;
public userIterator(boolean up) throws IOException {
@@ -605,7 +605,7 @@ public final class userDB {
}
public Entry next() {
try {
- return getEntry(this.userIter.next());
+ return getEntry(new String(this.userIter.next()));
} catch (kelondroException e) {
resetDatabase();
return null;
diff --git a/source/de/anomic/data/wikiBoard.java b/source/de/anomic/data/wikiBoard.java
index e2b20b3ea..5c406c498 100644
--- a/source/de/anomic/data/wikiBoard.java
+++ b/source/de/anomic/data/wikiBoard.java
@@ -316,11 +316,11 @@ public class wikiBoard {
}
*/
- public Iterator keys(boolean up) throws IOException {
+ public Iterator keys(boolean up) throws IOException {
return datbase.keys(up, false);
}
- public Iterator keysBkp(boolean up) throws IOException {
+ public Iterator keysBkp(boolean up) throws IOException {
return bkpbase.keys(up, false);
}
}
diff --git a/source/de/anomic/kelondro/kelondroBLOB.java b/source/de/anomic/kelondro/kelondroBLOB.java
index ba7c831c1..9ec431e9c 100644
--- a/source/de/anomic/kelondro/kelondroBLOB.java
+++ b/source/de/anomic/kelondro/kelondroBLOB.java
@@ -28,8 +28,6 @@ package de.anomic.kelondro;
import java.io.IOException;
-import de.anomic.kelondro.kelondroBLOBTree.keyIterator;
-
public interface kelondroBLOB {
/**
@@ -57,7 +55,7 @@ public interface kelondroBLOB {
* @return
* @throws IOException
*/
- public kelondroCloneableIterator keys(boolean up, boolean rotating) throws IOException;
+ public kelondroCloneableIterator keys(boolean up, boolean rotating) throws IOException;
/**
* iterate over all keys
@@ -66,7 +64,7 @@ public interface kelondroBLOB {
* @return
* @throws IOException
*/
- public keyIterator keys(boolean up, byte[] firstKey) throws IOException;
+ public kelondroCloneableIterator keys(boolean up, byte[] firstKey) throws IOException;
/**
* check if a specific key is in the database
@@ -74,7 +72,7 @@ public interface kelondroBLOB {
* @return
* @throws IOException
*/
- public boolean has(String key) throws IOException;
+ public boolean has(byte[] key) throws IOException;
/**
* retrieve the whole BLOB from the table
@@ -82,17 +80,7 @@ public interface kelondroBLOB {
* @return
* @throws IOException
*/
- public byte[] get(String key) throws IOException;
-
- /**
- * retrieve a fragment of a BLOB from the table
- * @param key the primary key
- * @param pos the position within the BLOB fragment
- * @param len the length of the fragment
- * @return
- * @throws IOException
- */
- public byte[] get(String key, int pos, int len) throws IOException;
+ public byte[] get(byte[] key) throws IOException;
/**
* write a whole byte array as BLOB to the table
@@ -100,25 +88,14 @@ public interface kelondroBLOB {
* @param b
* @throws IOException
*/
- public void put(String key, byte[] b) throws IOException;
-
- /**
- * write a fragment of a BLOB to the table
- * @param key the primary key
- * @param pos the position of the BLOB fragment
- * @param b a byte array
- * @param off the offset within the array where the BLOB fragment starts
- * @param len the length of the fragment
- * @throws IOException
- */
- public void put(String key, int pos, byte[] b, int off, int len) throws IOException;
+ public void put(byte[] key, byte[] b) throws IOException;
/**
* remove a BLOB
* @param key the primary key
* @throws IOException
*/
- public void remove(String key) throws IOException;
+ public void remove(byte[] key) throws IOException;
/**
* close the BLOB table
diff --git a/source/de/anomic/kelondro/kelondroBLOBHeap.java b/source/de/anomic/kelondro/kelondroBLOBHeap.java
new file mode 100755
index 000000000..9c58f73be
--- /dev/null
+++ b/source/de/anomic/kelondro/kelondroBLOBHeap.java
@@ -0,0 +1,412 @@
+// kelondroBLOBHeap.java
+// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 09.07.2008 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate: 2008-03-14 01:16:04 +0100 (Fr, 14 Mrz 2008) $
+// $LastChangedRevision: 4558 $
+// $LastChangedBy: orbiter $
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package de.anomic.kelondro;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.util.ArrayList;
+import java.util.Iterator;
+
+import de.anomic.server.serverMemory;
+import de.anomic.server.logging.serverLog;
+
+public final class kelondroBLOBHeap implements kelondroBLOB {
+
+ private kelondroBytesLongMap index; // key/seek relation for used records
+ // each element of the free list is a Long[2]: entry[0] = size of the free record (its reclen),
+ // entry[1] = seek position of the free record in the heap file
+ private ArrayList free; // list of {size, seek} pairs denoting space and position of free records
+ private File heapFile; // the file of the heap
+ private kelondroByteOrder ordering; // the ordering on keys
+ private RandomAccessFile file; // a random access to the file
+
+ /*
+ * This class implements a BLOB management based on a sequence of records in a random access file
+ * The data structure is:
+ * file :== record*
+ * record :== reclen key blob
+ * reclen :== <4 byte integer == length of key and blob>
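+ * key :== <byte sequence of fixed length, the keylength given to the constructor>
+ * blob :== <byte sequence of variable length, reclen minus the key length>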
+ * that means that each record has the size reclen+4
+ *
+ * The elements are organized in two data structures:
+ * index<kelondroBytesLongMap> : key/seek relation for used records
+ * free<ArrayList<Long[]>> : list of {size, seek} pairs denoting space and position of free records
+ *
+ * Because the blob sizes are stored with integers, one entry may not exceed 2GB
+ *
+ * If a record is removed, it becomes a free record.
+ * New records are either appended to the end of the file or filled into a free record.
+ * A free record must either fit exactly to the size of the new record, or an old record is split
+ * into a filled and a new, smaller empty record.
+ */
+
+    /**
+     * Opens a heap file: an arbitrary number of BLOBs, indexed by an access key.
+     * The file is opened in "rw" mode and scanned once, record by record, to fill
+     * the RAM index of used records and the list of free records.
+     * A record whose first key byte is 0 is treated as a free record.
+     *
+     * @param heapFile the file that holds the records
+     * @param keylength the fixed length of every key in bytes
+     * @param ordering the byte order used for the key index and the wellformed-check of keys
+     * @throws IOException if the heap file cannot be opened
+     */
+    public kelondroBLOBHeap(File heapFile, int keylength, kelondroByteOrder ordering) throws IOException {
+        this.ordering = ordering;
+        this.heapFile = heapFile;
+
+        this.index = new kelondroBytesLongMap(keylength, this.ordering, 0);
+        this.free = new ArrayList();
+        this.file = new RandomAccessFile(heapFile, "rw");
+        byte[] key = new byte[keylength];
+        int reclen;
+        long seek = 0;
+
+        loop: while (true) { // don't test available() here because this does not work for files > 2GB
+
+            try {
+                // go to seek position
+                file.seek(seek);
+
+                // read length of the following record without the length of the record size bytes
+                reclen = file.readInt();
+
+                // read key
+                file.readFully(key);
+
+            } catch (IOException e) {
+                // EOF reached
+                break loop; // terminate loop
+            }
+
+            // a record length that is not positive cannot be written by this class; continuing
+            // would not advance the seek position (or move it backwards) and could loop forever
+            if (reclen <= 0) {
+                serverLog.logWarning("kelondroBLOBHeap", "BLOB " + heapFile.getName() + ": stopped indexing at seek pos " + seek + " because of invalid record length " + reclen);
+                break loop;
+            }
+
+            // check if this record is empty
+            if (key[0] == 0) {
+                // it is an empty record, store to free list.
+                // the pair order must be {size, seek}, because put() and remove() read
+                // entry[0] as the record size and entry[1] as the seek position
+                free.add(new Long[]{Long.valueOf(reclen), Long.valueOf(seek)});
+            } else {
+                // store key and access address of entry in index
+                try {
+                    if (this.ordering.wellformed(key)) {
+                        index.addl(key, seek);
+                    } else {
+                        serverLog.logWarning("kelondroBLOBHeap", "BLOB " + heapFile.getName() + ": skiped not wellformed key " + new String(key) + " at seek pos " + seek);
+                    }
+                } catch (IOException e) {
+                    e.printStackTrace();
+                    break loop;
+                }
+            }
+            // new seek position: 4 bytes for the length field plus the record content
+            seek += 4L + reclen;
+        }
+    }
+
+    /**
+     * Reports how many BLOBs are stored, as counted by the RAM index.
+     * @return the number of entries in the index
+     */
+    public int size() {
+        return index.size();
+    }
+
+    /**
+     * Checks whether a key is known to the heap. Only the RAM index is asked;
+     * the heap file itself is not accessed by this method.
+     * @param key the key to look up; must have the key length of the index
+     * @return true if the index yields a non-negative seek position for the key, false otherwise
+     *         (also false if the index lookup fails with an IOException)
+     */
+    public boolean has(byte[] key) {
+        assert index != null;
+        assert index.row().primaryKeyLength == key.length;
+
+        boolean found = false;
+        try {
+            // the key counts as present when the index holds a seek position >= 0 for it
+            found = index.getl(key) >= 0;
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+        return found;
+    }
+
+    /**
+     * Appends a complete byte array as BLOB; delegates to the ranged add
+     * with offset 0 and the full array length.
+     * @param key the key of the new record
+     * @param blob the content of the new record
+     * @throws IOException
+     */
+    private void add(byte[] key, byte[] blob) throws IOException {
+        final int wholeLength = blob.length;
+        add(key, blob, 0, wholeLength);
+    }
+
+    /**
+     * add a BLOB to the heap: this adds the blob always to the end of the file.
+     * The record is written as a 4-byte length (key length + len), the key and
+     * then len bytes of the blob; the start position of the record is stored in the index.
+     * Nothing is written if blob is null or empty.
+     * @param key the key of the new record; must have the key length of the index
+     * @param blob the array that contains the content
+     * @param offset the position within blob where the content starts
+     * @param len the number of bytes to write
+     * @throws IOException
+     */
+    private void add(byte[] key, byte[] blob, int offset, int len) throws IOException {
+        assert len > 0;
+        assert index.row().primaryKeyLength == key.length;
+        assert blob == null || blob.length - offset >= len;
+        if ((blob == null) || (blob.length == 0)) return;
+        // keep the position as long: casting the file length to int would store a
+        // wrong seek position in the index as soon as the heap file exceeds 2GB
+        final long pos = file.length();
+        file.seek(pos);
+        file.writeInt(len + key.length);
+        file.write(key);
+        file.write(blob, offset, len);
+        index.putl(key, pos);
+    }
+
+    /**
+     * Reads the BLOB that is stored for a key.
+     * @param key the key of the record; must have the key length of the index
+     * @return the content of the record, or null if the index has no seek position
+     *         for the key or if the memory request for the content is refused
+     * @throws IOException
+     */
+    public synchronized byte[] get(byte[] key) throws IOException {
+        final int keylen = index.row().primaryKeyLength;
+        assert keylen == key.length;
+
+        // ask the index for the position of the record
+        final long seek = index.getl(key);
+        if (seek < 0) return null;
+
+        // the length field counts key and blob, so the key length is subtracted
+        file.seek(seek);
+        final int bloblen = file.readInt() - keylen;
+        if (serverMemory.available() < bloblen && !serverMemory.request(bloblen, false)) {
+            return null; // not enough memory available for this blob
+        }
+        byte[] blob = new byte[bloblen];
+
+        // the key is stored in front of the content; it must be the requested one
+        byte[] storedKey = new byte[keylen];
+        file.readFully(storedKey);
+        assert this.ordering.compare(key, storedKey) == 0;
+
+        // the content follows directly after the key
+        file.readFully(blob);
+        return blob;
+    }
+
+    /**
+     * clears the content of the database: the index and the free list are emptied,
+     * the heap file is closed, deleted and opened again as an empty file.
+     * @throws IOException if the index cannot be cleared or the heap file cannot be re-opened
+     */
+    public synchronized void clear() throws IOException {
+        index.clear();
+        free.clear();
+        try {
+            file.close();
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+        this.heapFile.delete();
+        this.file = new RandomAccessFile(heapFile, "rw");
+        // File.delete() reports a failure only by its return value; if the file could not
+        // be deleted, the old records would still be there while the index is empty.
+        // Truncating the re-opened file guarantees an empty heap in both cases.
+        if (this.file.length() > 0) this.file.setLength(0);
+    }
+
+    /**
+     * Closes the BLOB table: the index and the heap file are closed and all
+     * references to them are dropped, so the object cannot be used afterwards.
+     * A failure while closing the file is printed and otherwise ignored.
+     */
+    public synchronized void close() {
+        this.index.close();
+        this.free.clear();
+        try {
+            this.file.close();
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+
+        // release the references
+        this.index = null;
+        this.free = null;
+        this.file = null;
+    }
+
+    /**
+     * Reports the length of the primary key as defined in the row definition of the index.
+     * @return the key length in bytes
+     */
+    public int keylength() {
+        return index.row().primaryKeyLength;
+    }
+
+ /**
+ * write a whole byte array as BLOB to the table.
+ * An existing record for the key is removed first. The new record is then written
+ * - into a free record whose size matches exactly, or
+ * - into the front part of the largest free record, which is split into the new record
+ *   and a smaller free record, or
+ * - to the end of the file if no free record can be used.
+ * NOTE(review): if b is empty, the old record is removed but add() returns without
+ * writing anything, so the key is gone afterwards - confirm that this is intended.
+ * @param key the primary key; must have the key length of the index
+ * @param b the complete content of the BLOB
+ * @throws IOException
+ */
+ public synchronized void put(byte[] key, byte[] b) throws IOException {
+ assert key.length == index.row().primaryKeyLength;
+
+ // first remove the old entry
+ this.remove(key);
+
+ // then look if we can use a free entry
+ if (this.free.size() > 0) {
+ // find the largest entry; lseek/lsize track the position and size of the largest one seen so far
+ long lseek = -1;
+ int lsize = 0;
+ // the record length that is needed: content plus key, without the 4 length bytes
+ int reclen = b.length + index.row().primaryKeyLength;
+ Long[] entry;
+ Iterator i = this.free.iterator();
+ while (i.hasNext()) {
+ entry = i.next();
+ // entry[0] is the size of the free record, entry[1] its seek position
+ if (entry[0].longValue() == (long) reclen) {
+ // we found an entry that has exactly the size that we need!
+ // we use that entry and stop looking for a larger entry
+ file.seek(entry[1].longValue());
+ // the length field of the free record is kept; reading it moves the file pointer to the key position
+ int reclenf = file.readInt();
+ assert reclenf == reclen;
+ file.write(key);
+ file.write(b);
+
+ // remove the entry from the free list
+ i.remove();
+
+ // add the entry to the index
+ this.index.putl(key, entry[1].longValue());
+
+ // NOTE(review): debug output goes to stdout on every replaced record
+ System.out.println("*** DEBUG BLOB: replaced-fit record at " + entry[1].longValue() + ", reclen=" + reclen + ", key=" + new String(key));
+
+ // finished!
+ return;
+ }
+ // look for the biggest size
+ if (entry[0].longValue() > lsize) {
+ lsize = (int) entry[0].longValue();
+ lseek = entry[1].longValue();
+ }
+ }
+
+ // check if the found entry is large enough
+ if (lsize > reclen + 4) {
+ // split the free entry into two new entries
+ // it would be sufficient if lsize = reclen + 4, but this would mean to create
+ // an empty entry with zero next bytes for BLOB and key, which is not very good for the
+ // data structure in the file
+
+ // write the new entry
+ file.seek(lseek);
+ file.writeInt(reclen);
+ file.write(key);
+ file.write(b);
+
+ // add the index to the new entry
+ index.putl(key, lseek);
+
+ // define the new empty entry; its length field is written directly behind the new record,
+ // because the file pointer is at lseek + 4 + reclen after the writes above
+ int newfreereclen = lsize - reclen - 4;
+ assert newfreereclen > 0;
+ file.writeInt(newfreereclen);
+
+ // remove the old free entry
+ i = this.free.iterator();
+ while (i.hasNext()) {
+ entry = i.next();
+ if (entry[0].longValue() == (long) lsize && entry[1].longValue() == lseek) {
+ // remove the entry from the free list
+ i.remove();
+ break;
+ }
+ }
+
+ // add a new free entry as {size, seek} pair
+ free.add(new Long[]{new Long(newfreereclen), new Long(lseek + 4 + reclen)});
+
+ // NOTE(review): debug output goes to stdout on every split record
+ System.out.println("*** DEBUG BLOB: replaced-split record at " + lseek + ", reclen=" + reclen + ", new reclen=" + newfreereclen + ", key=" + new String(key));
+
+ // finished!
+ return;
+ }
+ }
+
+ // if there is no free entry or no free entry is large enough, append the entry at the end of the file
+ this.add(key, b);
+ }
+
+    /**
+     * Removes a BLOB: the record is registered in the free list, its key and
+     * content are overwritten with zeros and the key is taken out of the index.
+     * Nothing happens if the index has no seek position for the key.
+     * @param key the primary key; must have the key length of the index
+     * @throws IOException
+     */
+    public synchronized void remove(byte[] key) throws IOException {
+        assert index.row().primaryKeyLength == key.length;
+
+        // look up the record position
+        final long seek = index.getl(key);
+        if (seek < 0) return;
+
+        // read the record length; afterwards the file pointer is at the first key byte
+        file.seek(seek);
+        final int reclen = file.readInt();
+
+        // remember the record as free space, as {size, seek} pair
+        this.free.add(new Long[]{new Long(reclen), new Long(seek)});
+
+        // overwrite key and content with zeros; the length field stays as it is
+        for (int written = 0; written < reclen; written++) file.write(0);
+
+        // finally forget the key
+        this.index.removel(key);
+    }
+
+    /**
+     * iterator over all keys
+     * @param up the direction flag that is passed to the key iterator of the index
+     * @param rotating if true, the key iterator is wrapped into a kelondroRotateIterator;
+     *        if false, the plain key iterator of the index is returned
+     * @return an iterator over the keys of the index
+     * @throws IOException
+     */
+    public synchronized kelondroCloneableIterator keys(boolean up, boolean rotating) throws IOException {
+        kelondroCloneableIterator plain = this.index.keys(up, null);
+        if (!rotating) return plain;
+        // the number of entries is passed as third argument, the same way as
+        // kelondroBLOBTree.keys(up, rotating) builds its rotating iterator
+        return new kelondroRotateIterator(plain, null, this.index.size());
+    }
+
+    /**
+     * Iterates over the keys of the index; both arguments are handed
+     * unchanged to the key iterator of the index.
+     * @param up the direction flag for the index iterator
+     * @param firstKey the start key for the index iterator
+     * @return the key iterator of the index
+     * @throws IOException
+     */
+    public synchronized kelondroCloneableIterator keys(boolean up, byte[] firstKey) throws IOException {
+        kelondroCloneableIterator fromFirstKey = this.index.keys(up, firstKey);
+        return fromFirstKey;
+    }
+
+}
diff --git a/source/de/anomic/kelondro/kelondroBLOBTree.java b/source/de/anomic/kelondro/kelondroBLOBTree.java
index 7f9bd9149..1dd421f9d 100644
--- a/source/de/anomic/kelondro/kelondroBLOBTree.java
+++ b/source/de/anomic/kelondro/kelondroBLOBTree.java
@@ -113,7 +113,7 @@ public class kelondroBLOBTree implements kelondroBLOB {
}
public int keylength() {
- return this.rowdef.primaryKeyLength;
+ return this.keylen;
}
public synchronized int size() {
@@ -140,7 +140,7 @@ public class kelondroBLOBTree implements kelondroBLOB {
return new String(rawKey, 0, n + 1);
}
- public class keyIterator implements kelondroCloneableIterator {
+ public class keyIterator implements kelondroCloneableIterator {
// the iterator iterates all keys
kelondroCloneableIterator ri;
String nextKey;
@@ -158,10 +158,10 @@ public class kelondroBLOBTree implements kelondroBLOB {
return nextKey != null;
}
- public String next() {
+ public byte[] next() {
String result = nextKey;
nextKey = n();
- return origKey(result.getBytes());
+ return origKey(result.getBytes()).getBytes();
}
public void remove() {
@@ -190,16 +190,17 @@ public class kelondroBLOBTree implements kelondroBLOB {
}
return null;
}
+
}
- public synchronized kelondroCloneableIterator keys(boolean up, boolean rotating) throws IOException {
+ public synchronized kelondroCloneableIterator keys(boolean up, boolean rotating) throws IOException {
// iterates only the keys of the Nodes
// enumerated objects are of type String
keyIterator i = new keyIterator(index.rows(up, null));
- if (rotating) return new kelondroRotateIterator(i, null, index.size()); else return i;
+ if (rotating) return new kelondroRotateIterator(i, null, index.size()); else return i;
}
- public synchronized keyIterator keys(boolean up, byte[] firstKey) throws IOException {
+ public synchronized kelondroCloneableIterator keys(boolean up, byte[] firstKey) throws IOException {
return new keyIterator(index.rows(up, firstKey));
}
@@ -235,13 +236,13 @@ public class kelondroBLOBTree implements kelondroBLOB {
return buf[recpos] & 0xFF;
}
- public synchronized byte[] get(String key) throws IOException {
- kelondroRA ra = getRA(key);
+ public synchronized byte[] get(byte[] key) throws IOException {
+ kelondroRA ra = getRA(new String(key));
if (ra == null) return null;
return ra.readFully();
}
- public synchronized byte[] get(String key, int pos, int len) throws IOException {
+ private synchronized byte[] get(String key, int pos, int len) throws IOException {
int recpos = pos % reclen;
int reccnt = pos / reclen;
byte[] segment1;
@@ -285,11 +286,11 @@ public class kelondroBLOBTree implements kelondroBLOB {
return result;
}
- public synchronized void put(String key, byte[] b) throws IOException {
- put(key, 0, b, 0, b.length);
+ public synchronized void put(byte[] key, byte[] b) throws IOException {
+ put(new String(key), 0, b, 0, b.length);
}
- public synchronized void put(String key, int pos, byte[] b, int off, int len) throws IOException {
+ private synchronized void put(String key, int pos, byte[] b, int off, int len) throws IOException {
int recpos = pos % reclen;
int reccnt = pos / reclen;
byte[] buf;
@@ -326,12 +327,30 @@ public class kelondroBLOBTree implements kelondroBLOB {
}
}
- public synchronized void remove(String key) throws IOException {
+    // writes a single byte: the record that contains position pos is loaded
+    // (or created / enlarged to reclen bytes), the byte is set and the record is stored again
+    private synchronized void put(String key, int pos, int b) throws IOException {
+        final int segment = pos / reclen;
+        final int offsetInSegment = pos % reclen;
+
+        // fetch the record that holds the position
+        byte[] record = getValueCached(elementKey(key, segment));
+        if (record == null) {
+            // no such record yet: start with an empty one
+            record = new byte[reclen];
+        } else if (record.length < reclen) {
+            // the stored record is shorter than a full record: copy it into a full-sized array
+            byte[] grown = new byte[reclen];
+            System.arraycopy(record, 0, grown, 0, record.length);
+            record = grown;
+        }
+        record[offsetInSegment] = (byte) b;
+        setValueCached(elementKey(key, segment), record);
+    }
+
+ public synchronized void remove(byte[] key) throws IOException {
// remove value in cache and tree
if (key == null) return;
int recpos = 0;
byte[] k;
- while (index.get(k = elementKey(key, recpos)) != null) {
+ while (index.get(k = elementKey(new String(key), recpos)) != null) {
index.remove(k);
buffer.remove(k);
recpos++;
@@ -339,8 +358,8 @@ public class kelondroBLOBTree implements kelondroBLOB {
//segmentCount--; writeSegmentCount();
}
- public synchronized boolean has(String key) throws IOException {
- return (key != null) && (getValueCached(elementKey(key, 0)) != null);
+ public synchronized boolean has(byte[] key) throws IOException {
+ return (key != null) && (getValueCached(elementKey(new String(key), 0)) != null);
}
public synchronized kelondroRA getRA(String filekey) {
@@ -372,9 +391,7 @@ public class kelondroBLOBTree implements kelondroBLOB {
}
public void write(int i) throws IOException {
- byte[] b = new byte[1];
- b[0] = (byte) i;
- put(filekey, seekpos++, b, 0, 1);
+ put(filekey, seekpos++, i);
}
public int read(byte[] b, int off, int len) throws IOException {
@@ -415,11 +432,11 @@ public class kelondroBLOBTree implements kelondroBLOB {
if (args.length == 1) {
// open a db and list keys
try {
- kelondroBLOBTree kd = new kelondroBLOBTree(new File(args[0]), true, true, 4 ,100, '_', kelondroNaturalOrder.naturalOrder, false, false, true);
+ kelondroBLOB kd = new kelondroBLOBTree(new File(args[0]), true, true, 4 ,100, '_', kelondroNaturalOrder.naturalOrder, false, false, true);
System.out.println(kd.size() + " elements in DB");
- Iterator i = kd.keys(true, false);
+ Iterator i = kd.keys(true, false);
while (i.hasNext())
- System.out.println(i.next());
+ System.out.println(new String(i.next()));
kd.close();
} catch (IOException e) {
e.printStackTrace();
@@ -430,7 +447,7 @@ public class kelondroBLOBTree implements kelondroBLOB {
public static int countElements(kelondroBLOBTree t) {
int count = 0;
try {
- Iterator iter = t.keys(true, false);
+ Iterator iter = t.keys(true, false);
while (iter.hasNext()) {count++; if (iter.next() == null) System.out.println("ERROR! null element found");}
return count;
} catch (IOException e) {
diff --git a/source/de/anomic/kelondro/kelondroBytesLongMap.java b/source/de/anomic/kelondro/kelondroBytesLongMap.java
index eafda567d..34ea874a4 100644
--- a/source/de/anomic/kelondro/kelondroBytesLongMap.java
+++ b/source/de/anomic/kelondro/kelondroBytesLongMap.java
@@ -41,7 +41,7 @@ public class kelondroBytesLongMap {
}
public kelondroBytesLongMap(int keylength, kelondroByteOrder objectOrder, int space) {
- this.rowdef = new kelondroRow(new kelondroColumn[]{new kelondroColumn("key", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, keylength, "key"), new kelondroColumn("int c-8 {b256}")}, objectOrder, 0);
+ this.rowdef = new kelondroRow(new kelondroColumn[]{new kelondroColumn("key", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, keylength, "key"), new kelondroColumn("long c-8 {b256}")}, objectOrder, 0);
this.index = new kelondroRAMIndex(rowdef, space);
}
diff --git a/source/de/anomic/kelondro/kelondroHeap.java b/source/de/anomic/kelondro/kelondroHeap.java
deleted file mode 100755
index 18e204efb..000000000
--- a/source/de/anomic/kelondro/kelondroHeap.java
+++ /dev/null
@@ -1,191 +0,0 @@
-// kelondroHeap.java
-// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
-// first published 30.04.2008 on http://yacy.net
-//
-// This is a part of YaCy, a peer-to-peer based web search engine
-//
-// $LastChangedDate: 2008-03-14 01:16:04 +0100 (Fr, 14 Mrz 2008) $
-// $LastChangedRevision: 4558 $
-// $LastChangedBy: orbiter $
-//
-// LICENSE
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-package de.anomic.kelondro;
-
-import java.io.BufferedInputStream;
-import java.io.BufferedOutputStream;
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.RandomAccessFile;
-
-public final class kelondroHeap {
-
- private kelondroBytesLongMap index;
- private File heapFile;
- private kelondroByteOrder ordering;
-
- /**
- * create a heap file: a arbitrary number of BLOBs, indexed by an access key
- * The heap file will be opened at initialization time, indexed and closed again.
- * Heap files are only opened when BLOBs are read from it or new one are appended
- * @param heapFile
- * @param keylength
- * @param ordering
- * @throws IOException
- */
- public kelondroHeap(File heapFile, int keylength, kelondroByteOrder ordering) throws IOException {
- this.index = null;
- this.ordering = ordering;
- this.heapFile = heapFile;
- if (!(heapFile.exists())) throw new IOException("file " + heapFile + " does not exist");
- if (heapFile.length() >= Integer.MAX_VALUE) throw new IOException("file " + heapFile + " too large, index can only be crated for files less than 2GB");
-
- this.index = new kelondroBytesLongMap(keylength, this.ordering, 0);
- DataInputStream is = null;
- String keystring;
- byte[] key = new byte[keylength];
- int reclen;
- long seek = 0, seek0;
- is = new DataInputStream(new BufferedInputStream(new FileInputStream(heapFile), 64*1024));
-
- // don't test available() here because this does not work for files > 2GB
- loop: while (true) {
- // remember seek position
- seek0 = seek;
-
- // read length of the following record without the length of the record size bytes
- try {
- reclen = is.readInt();
- } catch (IOException e) {
- break loop; // terminate loop
- }
- seek += 4L;
-
- // read key
- try {
- is.readFully(key);
- } catch (IOException e) {
- break loop; // terminate loop
- }
- keystring = new String(key);
- seek += keystring.length();
-
- // skip content
- seek += reclen;
- while (reclen > 0) reclen -= is.skip(reclen);
-
- // store access address to entry
- try {
- index.addl(key, seek0);
- } catch (IOException e) {
- e.printStackTrace();
- break loop;
- }
- }
- is.close();
- }
-
- /**
- * the number of BLOBs in the heap
- * @return the number of BLOBs in the heap
- */
- public int size() {
- return this.index.size();
- }
-
- /**
- * test if a key is in the heap file
- * @param key
- * @return true if the key exists, false othervise
- */
- public boolean has(String key) {
- assert index != null;
- assert index.row().primaryKeyLength == key.length();
-
- // check if the index contains the key
- try {
- return index.getl(key.getBytes()) >= 0;
- } catch (IOException e) {
- e.printStackTrace();
- return false;
- }
- }
-
- /**
- * add a BLOB to the heap
- * @param key
- * @param blob
- * @throws IOException
- */
- public synchronized void add(String key, byte[] blob) throws IOException {
- add(key, blob, 0, blob.length);
- }
-
- /**
- * add a BLOB to the heap
- * @param key
- * @param blob
- * @throws IOException
- */
- public synchronized void add(String key, byte[] blob, int offset, int len) throws IOException {
- assert index.row().primaryKeyLength == key.length();
- if ((blob == null) || (blob.length == 0)) return;
- DataOutputStream os = null;
- try {
- os = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(heapFile)));
- } catch (FileNotFoundException e) {
- throw new IOException(e.getMessage());
- }
- int pos = os.size();
- os.writeInt(len);
- os.write(key.getBytes());
- os.write(blob, offset, len);
- os.close();
- index.putl(key.getBytes(), pos);
- }
-
- /**
- * read a blob from the heap
- * @param key
- * @return
- * @throws IOException
- */
- public byte[] get(String key) throws IOException {
- assert index.row().primaryKeyLength == key.length();
-
- // check if the index contains the key
- long pos = index.getl(key.getBytes());
- if (pos < 0) return null;
-
- // access the file and read the container
- RandomAccessFile raf = new RandomAccessFile(heapFile, "r");
- int len = raf.readInt();
- byte[] record = new byte[len];
-
- raf.seek(pos + 4 + index.row().primaryKeyLength);
- raf.readFully(record);
-
- raf.close();
- return record;
- }
-
-}
diff --git a/source/de/anomic/kelondro/kelondroMScoreCluster.java b/source/de/anomic/kelondro/kelondroMScoreCluster.java
index c2ff95cbe..d70b17278 100644
--- a/source/de/anomic/kelondro/kelondroMScoreCluster.java
+++ b/source/de/anomic/kelondro/kelondroMScoreCluster.java
@@ -91,8 +91,9 @@ public final class kelondroMScoreCluster {
double d = 1000d * ((Double) o).doubleValue();
return (int) Math.round(d);
}
- String s = "";
+ String s = null;
if (o instanceof String) s = (String) o;
+ if (o instanceof byte[]) s = new String((byte[]) o);
// this can be used to calculate a score from a string
if ((s == null) || (s.length() == 0) || (s.charAt(0) == '-')) return 0;
diff --git a/source/de/anomic/kelondro/kelondroMapObjects.java b/source/de/anomic/kelondro/kelondroMapObjects.java
index 36d167936..5af583f2e 100644
--- a/source/de/anomic/kelondro/kelondroMapObjects.java
+++ b/source/de/anomic/kelondro/kelondroMapObjects.java
@@ -85,7 +85,7 @@ public class kelondroMapObjects extends kelondroObjects {
// fill cluster and accumulator with values
if ((sortfields != null) || (longaccfields != null) || (doubleaccfields != null)) try {
- kelondroCloneableIterator it = dyn.keys(true, false);
+ kelondroCloneableIterator it = dyn.keys(true, false);
String mapname;
Object cell;
long valuel;
@@ -93,8 +93,8 @@ public class kelondroMapObjects extends kelondroObjects {
Map map;
this.elementCount = 0;
while (it.hasNext()) {
- mapname = it.next();
- map = getMap(mapname);
+ mapname = new String(it.next());
+ map = getMap(new String(mapname));
if (map == null) break;
if (sortfields != null) for (int i = 0; i < sortfields.length; i++) {
@@ -299,13 +299,37 @@ public class kelondroMapObjects extends kelondroObjects {
}
}
- public synchronized Iterator keys(final boolean up, /* sorted by */ String field) {
+ public synchronized Iterator keys(final boolean up, /* sorted by */ String field) {
// sorted iteration using the sortClusters
if (sortClusterMap == null) return null;
final kelondroMScoreCluster cluster = sortClusterMap.get(field);
if (cluster == null) return null; // sort field does not exist
//System.out.println("DEBUG: cluster for field " + field + ": " + cluster.toString());
- return cluster.scores(up);
+ return new string2bytearrayIterator(cluster.scores(up));
+ }
+
+ // adapter that wraps an iterator over Strings and hands out every element as byte array
+ public class string2bytearrayIterator implements Iterator {
+
+ // the wrapped String iterator
+ Iterator s;
+
+ public string2bytearrayIterator(Iterator s) {
+ this.s = s;
+ }
+
+ public boolean hasNext() {
+ return s.hasNext();
+ }
+
+ // returns the next element converted with String.getBytes(), i.e. encoded with the
+ // default charset of the platform; a null element is passed through as null
+ public byte[] next() {
+ String r = s.next();
+ if (r == null) return null;
+ return r.getBytes();
+ }
+
+ // removal is delegated to the wrapped iterator
+ public void remove() {
+ s.remove();
+ }
+
}
public synchronized mapIterator maps(final boolean up, final String field) {
@@ -351,11 +375,11 @@ public class kelondroMapObjects extends kelondroObjects {
// enumerates Map-Type elements
// the key is also included in every map that is returned; it's key is 'key'
- Iterator keyIterator;
+ Iterator keyIterator;
boolean finish;
HashMap n;
- public mapIterator(Iterator keyIterator) {
+ public mapIterator(Iterator keyIterator) {
this.keyIterator = keyIterator;
this.finish = false;
this.n = next0();
@@ -377,7 +401,7 @@ public class kelondroMapObjects extends kelondroObjects {
String nextKey;
HashMap map;
while (keyIterator.hasNext()) {
- nextKey = keyIterator.next();
+ nextKey = new String(keyIterator.next());
if (nextKey == null) {
finish = true;
return null;
diff --git a/source/de/anomic/kelondro/kelondroObjects.java b/source/de/anomic/kelondro/kelondroObjects.java
index eff683197..07fe5bbbe 100644
--- a/source/de/anomic/kelondro/kelondroObjects.java
+++ b/source/de/anomic/kelondro/kelondroObjects.java
@@ -35,6 +35,8 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
+import de.anomic.server.serverDate;
+
public class kelondroObjects {
private kelondroBLOB blob;
@@ -58,11 +60,6 @@ public class kelondroObjects {
this.cacheScore = new kelondroMScoreCluster();
}
- public int keySize() {
- return blob.keylength();
- }
-
-
private static String map2string(final Map map, final String comment) throws IOException {
final Iterator> iter = map.entrySet().iterator();
Map.Entry entry;
@@ -99,9 +96,10 @@ public class kelondroObjects {
assert (key.length() > 0);
assert (newMap != null);
if (cacheScore == null) return; // may appear during shutdown
-
+ while (key.length() < blob.keylength()) key += "_";
+
// write entry
- blob.put(key, map2string(newMap, "").getBytes());
+ blob.put(key.getBytes(), map2string(newMap, "W" + serverDate.formatShortSecond() + " ").getBytes());
// check for space in cache
checkCacheSpace();
@@ -114,13 +112,14 @@ public class kelondroObjects {
public synchronized void remove(String key) throws IOException {
// update elementCount
if (key == null) return;
+ while (key.length() < blob.keylength()) key += "_";
// remove from cache
cacheScore.deleteScore(key);
cache.remove(key);
// remove from file
- blob.remove(key);
+ blob.remove(key.getBytes());
}
public synchronized HashMap get(final String key) throws IOException {
@@ -128,18 +127,20 @@ public class kelondroObjects {
return get(key, true);
}
- protected synchronized HashMap get(final String key, final boolean storeCache) throws IOException {
+ protected synchronized HashMap get(String key, final boolean storeCache) throws IOException {
// load map from cache
assert key != null;
if (cache == null) return null; // case may appear during shutdown
+ while (key.length() < blob.keylength()) key += "_";
+
HashMap map = cache.get(key);
if (map != null) return map;
// load map from kra
- if (!(blob.has(key))) return null;
+ if (!(blob.has(key.getBytes()))) return null;
// read object
- byte[] b = blob.get(key);
+ byte[] b = blob.get(key.getBytes());
if (b == null) return null;
map = string2map(new String(b));
@@ -166,15 +167,15 @@ public class kelondroObjects {
}
}
- public synchronized kelondroCloneableIterator keys(final boolean up, final boolean rotating) throws IOException {
+ public synchronized kelondroCloneableIterator keys(final boolean up, final boolean rotating) throws IOException {
// simple enumeration of key names without special ordering
return blob.keys(up, rotating);
}
- public synchronized kelondroCloneableIterator keys(final boolean up, final boolean rotating, final byte[] firstKey, final byte[] secondKey) throws IOException {
+ public synchronized kelondroCloneableIterator keys(final boolean up, final boolean rotating, final byte[] firstKey, final byte[] secondKey) throws IOException {
// simple enumeration of key names without special ordering
- kelondroCloneableIterator i = blob.keys(up, firstKey);
- if (rotating) return new kelondroRotateIterator(i, secondKey, blob.size()); else return i;
+ kelondroCloneableIterator i = blob.keys(up, firstKey);
+ if (rotating) return new kelondroRotateIterator(i, secondKey, blob.size()); else return i;
}
@@ -205,10 +206,10 @@ public class kelondroObjects {
// enumerates Map-Type elements
// the key is also included in every map that is returned; it's key is 'key'
- Iterator keyIterator;
+ Iterator keyIterator;
boolean finish;
- public objectIterator(Iterator keyIterator) {
+ public objectIterator(Iterator keyIterator) {
this.keyIterator = keyIterator;
this.finish = false;
}
@@ -218,13 +219,13 @@ public class kelondroObjects {
}
public HashMap next() {
- final String nextKey = keyIterator.next();
+ final byte[] nextKey = keyIterator.next();
if (nextKey == null) {
finish = true;
return null;
}
try {
- final HashMap obj = get(nextKey);
+ final HashMap obj = get(new String(nextKey));
if (obj == null) throw new kelondroException("no more elements available");
return obj;
} catch (IOException e) {
diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java
index 0841864fa..51937f358 100644
--- a/source/de/anomic/plasma/plasmaHTCache.java
+++ b/source/de/anomic/plasma/plasmaHTCache.java
@@ -71,6 +71,8 @@ import java.util.regex.Pattern;
import de.anomic.crawler.CrawlProfile;
import de.anomic.http.httpHeader;
+import de.anomic.kelondro.kelondroBLOB;
+import de.anomic.kelondro.kelondroBLOBHeap;
import de.anomic.kelondro.kelondroBLOBTree;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroMScoreCluster;
@@ -92,7 +94,7 @@ import de.anomic.yacy.yacyURL;
public final class plasmaHTCache {
- public static final String DB_NAME = "responseHeader2.db";
+ public static final String DB_NAME = "responseHeader.heap";
private static final int stackLimit = 150; // if we exceed that limit, we do not check idle
public static final long oneday = 1000 * 60 * 60 * 24; // milliseconds of a day
@@ -279,7 +281,17 @@ public final class plasmaHTCache {
private static void openResponseHeaderDB() {
// open the response header database
File dbfile = new File(cachePath, DB_NAME);
- responseHeaderDB = new kelondroMapObjects(new kelondroBLOBTree(dbfile, true, true, yacySeedDB.commonHashLength, 150, '#', kelondroBase64Order.enhancedCoder, false, false, true), 500);
+ kelondroBLOB blob = null;
+ if (DB_NAME.endsWith("heap")) {
+ try {
+ blob = new kelondroBLOBHeap(dbfile, yacySeedDB.commonHashLength, kelondroBase64Order.enhancedCoder);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ } else {
+ blob = new kelondroBLOBTree(dbfile, true, true, yacySeedDB.commonHashLength, 150, '#', kelondroBase64Order.enhancedCoder, false, false, true);
+ }
+ responseHeaderDB = new kelondroMapObjects(blob, 500);
}
private static void deleteOldHTCache(File directory) {
@@ -895,7 +907,7 @@ public final class plasmaHTCache {
String initiator,
CrawlProfile.entry profile
) {
- return new Entry(
+ Entry entry = new Entry(
initDate,
depth,
url,
@@ -905,6 +917,8 @@ public final class plasmaHTCache {
initiator,
profile
);
+ entry.writeResourceInfo();
+ return entry;
}
public final static class Entry {
@@ -1039,11 +1053,14 @@ public final class plasmaHTCache {
return this.resInfo;
}
- public boolean writeResourceInfo() {
+ private boolean writeResourceInfo() {
if (this.resInfo == null) return false;
try {
HashMap hm = new HashMap();
hm.putAll(this.resInfo.getMap());
+ hm.put("@@URL", this.url.toNormalform(false, false));
+ hm.put("@@DEPTH", Integer.toString(this.depth));
+ if (this.initiator != null) hm.put("@@INITIATOR", this.initiator);
responseHeaderDB.set(this.url.hash(), hm);
} catch (Exception e) {
resetResponseHeaderDB();
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 2c3d28e94..57601e875 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -809,7 +809,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitchpublic static final String DBFILE_USER = "DATA/SETTINGS/user.db"
* Path to the user-DB, beginning from the YaCy-installation's top-folder. It holds all rights the created
@@ -1556,10 +1556,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch