Major update to BLOB data structures:

- introduced a new BLOB file format: kelondroBLOBHeap. This is a flat file with an index in RAM,
  very similar to the eco-tables, but with flexible value sizes. It will replace the kelondroBLOBTree,
  which is based on a kelondroTree, a file-AVL-based index data structure.
- the HTCACHE header file was replaced by the new blob heap file structure
- the robots.txt file was replaced by the new blob heap file structure
- the robots parser was enhanced (bugfixing for double-loading of the same robots.txt)
- other BLOB-dependent data structures were prepared to also use the new BLOB heap
- fixed a bug in the snippet fetch process: the file header was not written to the header index
There should now be less IO during snippet fetch and during crawling


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4978 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent 81f75f5056
commit 1e6d12f146

@ -173,11 +173,11 @@ public class Wiki {
prop.put("mode", "3"); //Index
String subject;
try {
Iterator<String> i = sb.wikiDB.keys(true);
Iterator<byte[]> i = sb.wikiDB.keys(true);
wikiBoard.entry entry;
int count=0;
while (i.hasNext()) {
subject = i.next();
subject = new String(i.next());
entry = sb.wikiDB.read(subject);
prop.putHTML("mode_pages_"+count+"_name",wikiBoard.webalize(subject));
prop.putHTML("mode_pages_"+count+"_subject", subject);
@ -200,14 +200,14 @@ public class Wiki {
prop.putHTML("mode_error_page", pagename);
try {
Iterator<String> it = sb.wikiDB.keysBkp(true);
Iterator<byte[]> it = sb.wikiDB.keysBkp(true);
wikiBoard.entry entry;
wikiBoard.entry oentry = null;
wikiBoard.entry nentry = null;
int count = 0;
boolean oldselected = false, newselected = false;
while (it.hasNext()) {
entry = sb.wikiDB.readBkp(it.next());
entry = sb.wikiDB.readBkp(new String(it.next()));
prop.put("mode_error_versions_" + count + "_date", wikiBoard.dateString(entry.date()));
prop.put("mode_error_versions_" + count + "_fdate", dateString(entry.date()));
if (wikiBoard.dateString(entry.date()).equals(post.get("old", null))) {

@ -101,7 +101,7 @@ public class CrawlProfile {
public class profileIterator implements Iterator<entry> {
// the iterator iterates all keys, which are byte[] objects
kelondroCloneableIterator<String> handleIterator;
kelondroCloneableIterator<byte[]> handleIterator;
String lastkey;
public profileIterator(boolean up) throws IOException {
handleIterator = profileTable.keys(up, false);
@ -117,7 +117,7 @@ public class CrawlProfile {
}
public entry next() {
try {
lastkey = handleIterator.next();
lastkey = new String(handleIterator.next());
return getEntry(lastkey);
} catch (kelondroException e) {
clear();

@ -240,6 +240,7 @@ public final class HTTPLoader {
fos.write(responseBody);
htCache.setCacheArray(responseBody);
plasmaHTCache.writeFileAnnouncement(cacheFile);
//htCache.writeResourceInfo(); // write header to header BLOB-database
} finally {
if (fos!=null)try{fos.close();}catch(Exception e){/* ignore this */}
}

@ -43,7 +43,6 @@
//the intact and unchanged copyright notice.
//Contributions and changes to the program code must be marked as such.
package de.anomic.crawler;
import java.io.BufferedInputStream;
@ -62,6 +61,8 @@ import de.anomic.http.HttpClient;
import de.anomic.http.JakartaCommonsHttpClient;
import de.anomic.http.JakartaCommonsHttpResponse;
import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroBLOB;
import de.anomic.kelondro.kelondroBLOBHeap;
import de.anomic.kelondro.kelondroBLOBTree;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMapObjects;
@ -81,7 +82,17 @@ public class RobotsTxt {
public RobotsTxt(File robotsTableFile) {
this.robotsTableFile = robotsTableFile;
robotsTableFile.getParentFile().mkdirs();
robotsTable = new kelondroMapObjects(new kelondroBLOBTree(robotsTableFile, true, true, 256, 512, '_', kelondroNaturalOrder.naturalOrder, false, false, true), 100);
kelondroBLOB blob = null;
if (robotsTableFile.getName().endsWith(".heap")) {
try {
blob = new kelondroBLOBHeap(robotsTableFile, 64, kelondroNaturalOrder.naturalOrder);
} catch (IOException e) {
e.printStackTrace();
}
} else {
blob = new kelondroBLOBTree(robotsTableFile, true, true, 256, 512, '_', kelondroNaturalOrder.naturalOrder, false, false, true);
}
robotsTable = new kelondroMapObjects(blob, 100);
}
private void resetDatabase() {
@ -352,24 +363,27 @@ public class RobotsTxt {
return crawlDelay;
}
//private static final HashSet<String> loadedRobots = new HashSet<String>(); // only for debugging
@SuppressWarnings("unchecked")
public boolean isDisallowed(yacyURL nexturl) {
if (nexturl == null) throw new IllegalArgumentException();
// generating the hostname:poart string needed to do a DB lookup
String urlHostPort = getHostPort(nexturl);
// do a DB lookup to determine if the robots data is already available
RobotsTxt.Entry robotsTxt4Host = getEntry(urlHostPort);
// if we have not found any data or the data is older than 7 days, we need to load it from the remote server
if (
(robotsTxt4Host == null) ||
(robotsTxt4Host.getLoadedDate() == null) ||
(System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() > 7*24*60*60*1000)
) {
synchronized(this) {
RobotsTxt.Entry robotsTxt4Host = null;
synchronized(this) {
// do a DB lookup to determine if the robots data is already available
robotsTxt4Host = getEntry(urlHostPort);
// if we have not found any data or the data is older than 7 days, we need to load it from the remote server
if (
(robotsTxt4Host == null) ||
(robotsTxt4Host.getLoadedDate() == null) ||
(System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() > 7*24*60*60*1000)
) {
// generating the proper url to download the robots txt
yacyURL robotsURL = null;
try {
@ -380,56 +394,60 @@ public class RobotsTxt {
}
Object[] result = null;
boolean accessCompletelyRestricted = false;
byte[] robotsTxt = null;
String eTag = null;
Date modDate = null;
try {
serverLog.logFine("ROBOTS","Trying to download the robots.txt file from URL '" + robotsURL + "'.");
result = downloadRobotsTxt(robotsURL,5,robotsTxt4Host);
if (result != null) {
accessCompletelyRestricted = ((Boolean)result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue();
robotsTxt = (byte[])result[DOWNLOAD_ROBOTS_TXT];
eTag = (String) result[DOWNLOAD_ETAG];
modDate = (Date) result[DOWNLOAD_MODDATE];
} else if (robotsTxt4Host != null) {
robotsTxt4Host.setLoadedDate(new Date());
addEntry(robotsTxt4Host);
}
serverLog.logFine("ROBOTS","Trying to download the robots.txt file from URL '" + robotsURL + "'.");
try {
result = downloadRobotsTxt(robotsURL, 5, robotsTxt4Host);
} catch (Exception e) {
serverLog.logSevere("ROBOTS","Unable to download the robots.txt file from URL '" + robotsURL + "'. " + e.getMessage());
result = null;
}
/*
assert !loadedRobots.contains(robotsURL.toNormalform(false, false)) :
"robots-url=" + robotsURL.toString() +
", robots=" + ((result == null || result[DOWNLOAD_ROBOTS_TXT] == null) ? "NULL" : new String((byte[]) result[DOWNLOAD_ROBOTS_TXT])) +
", robotsTxt4Host=" + ((robotsTxt4Host == null) ? "NULL" : robotsTxt4Host.getLoadedDate().toString());
loadedRobots.add(robotsURL.toNormalform(false, false));
*/
if ((robotsTxt4Host==null)||((robotsTxt4Host!=null)&&(result!=null))) {
ArrayList<String> denyPath = null;
String sitemap = null;
Integer crawlDelay = null;
if (accessCompletelyRestricted) {
if (result == null) {
// no robots.txt available, make an entry to prevent that the robots loading is done twice
if (robotsTxt4Host == null) {
// generate artificial entry
robotsTxt4Host = new Entry(
urlHostPort,
new ArrayList<String>(),
new Date(),
new Date(),
null,
null,
new Integer(0));
} else {
robotsTxt4Host.setLoadedDate(new Date());
}
// store the data into the robots DB
addEntry(robotsTxt4Host);
} else {
Object[] parserResult = robotsParser.parse((byte[]) result[DOWNLOAD_ROBOTS_TXT]);
ArrayList<String> denyPath = (ArrayList<String>) parserResult[0];
if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
denyPath = new ArrayList<String>();
denyPath.add("/");
} else {
// parsing the robots.txt Data and converting it into an arraylist
try {
Object[] parserResult = robotsParser.parse(robotsTxt);
denyPath = (ArrayList<String>) parserResult[0];
sitemap = (String) parserResult[1];
crawlDelay = (Integer) parserResult[2];
} catch (IOException e) {
serverLog.logSevere("ROBOTS","Unable to parse the robots.txt file from URL '" + robotsURL + "'.");
}
}
}
// storing the data into the robots DB
robotsTxt4Host = addEntry(urlHostPort,denyPath,new Date(),modDate,eTag,sitemap,crawlDelay);
// store the data into the robots DB
robotsTxt4Host = addEntry(
urlHostPort,
denyPath,
new Date(),
(Date) result[DOWNLOAD_MODDATE],
(String) result[DOWNLOAD_ETAG],
(String) parserResult[1],
(Integer) parserResult[2]);
}
}
}
if (robotsTxt4Host != null && robotsTxt4Host.isDisallowed(nexturl.getFile())) {
return true;
}
return false;
return robotsTxt4Host.isDisallowed(nexturl.getFile());
}
private static Object[] downloadRobotsTxt(yacyURL robotsURL, int redirectionCount, RobotsTxt.Entry entry) throws Exception {

@ -46,6 +46,7 @@ package de.anomic.crawler;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
@ -85,25 +86,26 @@ public final class robotsParser{
* at the Moment it only creates a list of Deny Paths
*/
public static Object[] parse(File robotsFile) throws IOException {
public static Object[] parse(File robotsFile) {
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(robotsFile));
return parse(reader);
} finally {
if (reader != null) try{reader.close();}catch(Exception e){/* ignore this */}
return parse(reader);
} catch (FileNotFoundException e1) {
}
return new Object[]{new ArrayList<String>(), "", new Integer(0)};
}
@SuppressWarnings("unchecked")
public static Object[] parse(byte[] robotsTxt) throws IOException {
public static Object[] parse(byte[] robotsTxt) {
if ((robotsTxt == null)||(robotsTxt.length == 0)) return new Object[]{new ArrayList(0),null,null};
ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt);
BufferedReader reader = new BufferedReader(new InputStreamReader(bin));
return parse(reader);
}
public static Object[] parse(BufferedReader reader) throws IOException{
public static Object[] parse(BufferedReader reader) {
ArrayList<String> deny4AllAgents = new ArrayList<String>();
ArrayList<String> deny4YaCyAgent = new ArrayList<String>();
@ -115,102 +117,104 @@ public final class robotsParser{
rule4YaCyFound = false,
inBlock = false;
while ((line = reader.readLine()) != null) {
line = line.trim();
lineUpper = line.toUpperCase();
if (line.length() == 0) {
// OLD: we have reached the end of the rule block
// rule4Yacy = false; inBlock = false;
// NEW: just ignore it
} else if (line.startsWith(ROBOTS_COMMENT)) {
// we can ignore this. Just a comment line
} else if (lineUpper.startsWith(ROBOTS_SITEMAP)) {
pos = line.indexOf(" ");
if (pos != -1) {
sitemap = line.substring(pos).trim();
}
} else if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
if (inBlock) {
// we have detected the start of a new block
inBlock = false;
isRuleBlock4AllAgents = false;
isRuleBlock4YaCyAgent = false;
crawlDelay = null; // each block has a separate delay
}
// cutting off comments at the line end
pos = line.indexOf(ROBOTS_COMMENT);
if (pos != -1) line = line.substring(0,pos).trim();
// replacing all tabs with spaces
line = line.replaceAll("\t"," ");
// getting out the robots name
pos = line.indexOf(" ");
if (pos != -1) {
String userAgent = line.substring(pos).trim();
isRuleBlock4AllAgents |= userAgent.equals("*");
isRuleBlock4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0;
if (isRuleBlock4YaCyAgent) rule4YaCyFound = true;
}
} else if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
pos = line.indexOf(" ");
if (pos != -1) {
try {
crawlDelay = Integer.valueOf(line.substring(pos).trim());
} catch (NumberFormatException e) {
// invalid crawling delay
}
}
} else if (lineUpper.startsWith(ROBOTS_DISALLOW) ||
lineUpper.startsWith(ROBOTS_ALLOW)) {
inBlock = true;
boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW);
try {
while ((line = reader.readLine()) != null) {
line = line.trim();
lineUpper = line.toUpperCase();
if (isRuleBlock4YaCyAgent || isRuleBlock4AllAgents) {
if (line.length() == 0) {
// OLD: we have reached the end of the rule block
// rule4Yacy = false; inBlock = false;
// NEW: just ignore it
} else if (line.startsWith(ROBOTS_COMMENT)) {
// we can ignore this. Just a comment line
} else if (lineUpper.startsWith(ROBOTS_SITEMAP)) {
pos = line.indexOf(" ");
if (pos != -1) {
sitemap = line.substring(pos).trim();
}
} else if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
if (inBlock) {
// we have detected the start of a new block
inBlock = false;
isRuleBlock4AllAgents = false;
isRuleBlock4YaCyAgent = false;
crawlDelay = null; // each block has a separate delay
}
// cutting off comments at the line end
pos = line.indexOf(ROBOTS_COMMENT);
if (pos != -1) line = line.substring(0,pos).trim();
// cutting of tailing *
if (line.endsWith("*")) line = line.substring(0,line.length()-1);
// replacing all tabs with spaces
line = line.replaceAll("\t"," ");
// getting the path
// getting out the robots name
pos = line.indexOf(" ");
if (pos != -1) {
// getting the path
String path = line.substring(pos).trim();
// unencoding all special charsx
try {
path = URLDecoder.decode(path,"UTF-8");
} catch (Exception e) {
/*
* url decoding failed. E.g. because of
* "Incomplete trailing escape (%) pattern"
*/
}
String userAgent = line.substring(pos).trim();
isRuleBlock4AllAgents |= userAgent.equals("*");
isRuleBlock4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0;
if (isRuleBlock4YaCyAgent) rule4YaCyFound = true;
}
} else if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
pos = line.indexOf(" ");
if (pos != -1) {
try {
crawlDelay = Integer.valueOf(line.substring(pos).trim());
} catch (NumberFormatException e) {
// invalid crawling delay
}
}
} else if (lineUpper.startsWith(ROBOTS_DISALLOW) ||
lineUpper.startsWith(ROBOTS_ALLOW)) {
inBlock = true;
boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW);
if (isRuleBlock4YaCyAgent || isRuleBlock4AllAgents) {
// cutting off comments at the line end
pos = line.indexOf(ROBOTS_COMMENT);
if (pos != -1) line = line.substring(0,pos).trim();
// cutting of tailing *
if (line.endsWith("*")) line = line.substring(0,line.length()-1);
// escaping all occurences of ; because this char is used as special char in the Robots DB
path = path.replaceAll(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR,"%3B");
// replacing all tabs with spaces
line = line.replaceAll("\t"," ");
// adding it to the pathlist
if (!isDisallowRule) path = "!" + path;
if (isRuleBlock4AllAgents) deny4AllAgents.add(path);
if (isRuleBlock4YaCyAgent) deny4YaCyAgent.add(path);
// getting the path
pos = line.indexOf(" ");
if (pos != -1) {
// getting the path
String path = line.substring(pos).trim();
// unencoding all special charsx
try {
path = URLDecoder.decode(path,"UTF-8");
} catch (Exception e) {
/*
* url decoding failed. E.g. because of
* "Incomplete trailing escape (%) pattern"
*/
}
// escaping all occurences of ; because this char is used as special char in the Robots DB
path = path.replaceAll(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR,"%3B");
// adding it to the pathlist
if (!isDisallowRule) path = "!" + path;
if (isRuleBlock4AllAgents) deny4AllAgents.add(path);
if (isRuleBlock4YaCyAgent) deny4YaCyAgent.add(path);
}
}
}
}
}
} catch (IOException e) {}
ArrayList<String> denyList = (rule4YaCyFound) ? deny4YaCyAgent : deny4AllAgents;
return new Object[]{denyList,sitemap,crawlDelay};
return new Object[]{denyList, sitemap, crawlDelay};
}
}

@ -227,7 +227,7 @@ public class blogBoard {
database.remove(key);
} catch (IOException e) { }
}
public Iterator<String> keys(boolean up) throws IOException {
public Iterator<byte[]> keys(boolean up) throws IOException {
return database.keys(up, false);
}
/**
@ -282,7 +282,7 @@ public class blogBoard {
* Subclass of blogBoard, which provides the blogIterator object-type
*/
public class BlogIterator implements Iterator<BlogEntry> {
Iterator<String> blogIter;
Iterator<byte[]> blogIter;
blogBoard.BlogEntry nextEntry;
public BlogIterator(boolean up) throws IOException {
this.blogIter = blogBoard.this.database.keys(up, false);
@ -300,7 +300,7 @@ public class blogBoard {
public BlogEntry next() {
try {
return readBlogEntry(this.blogIter.next());
return readBlogEntry(new String(this.blogIter.next()));
} catch (kelondroException e) {
//resetDatabase();
return null;

@ -218,7 +218,7 @@ public class blogBoardComments {
database.remove(key);
} catch (IOException e) { }
}
public Iterator<String> keys(boolean up) throws IOException {
public Iterator<byte[]> keys(boolean up) throws IOException {
return database.keys(up, false);
}

@ -1108,7 +1108,7 @@ public class bookmarksDB {
* Subclass of bookmarksDB, which provides the tagIterator object-type
*/
public class tagIterator implements Iterator<Tag> {
kelondroCloneableIterator<String> tagIter;
kelondroCloneableIterator<byte[]> tagIter;
bookmarksDB.Tag nextEntry;
public tagIterator(boolean up) throws IOException {
@ -1128,7 +1128,7 @@ public class bookmarksDB {
public Tag next() {
try {
return getTag(this.tagIter.next());
return getTag(new String(this.tagIter.next()));
} catch (kelondroException e) {
//resetDatabase();
return null;
@ -1151,7 +1151,7 @@ public class bookmarksDB {
* Subclass of bookmarksDB, which provides the bookmarkIterator object-type
*/
public class bookmarkIterator implements Iterator<Bookmark> {
Iterator<String> bookmarkIter;
Iterator<byte[]> bookmarkIter;
bookmarksDB.Bookmark nextEntry;
public bookmarkIterator(boolean up) throws IOException {
//flushBookmarkCache(); //XXX: this will cost performance
@ -1170,7 +1170,7 @@ public class bookmarksDB {
public Bookmark next() {
try {
return getBookmark(this.bookmarkIter.next());
return getBookmark(new String(this.bookmarkIter.next()));
} catch (kelondroException e) {
//resetDatabase();
return null;

@ -234,7 +234,7 @@ public class messageBoard {
public class catIter implements Iterator<String> {
Iterator<String> allIter = null;
Iterator<byte[]> allIter = null;
String nextKey = null;
String category = "";
@ -246,7 +246,7 @@ public class messageBoard {
public void findNext() {
while (allIter.hasNext()) {
nextKey = allIter.next();
nextKey = new String(allIter.next());
if (this.category==null || nextKey.startsWith(this.category)) return;
}
nextKey = null;

@ -588,7 +588,7 @@ public final class userDB {
public class userIterator implements Iterator<Entry> {
// the iterator iterates all userNames
kelondroCloneableIterator<String> userIter;
kelondroCloneableIterator<byte[]> userIter;
userDB.Entry nextEntry;
public userIterator(boolean up) throws IOException {
@ -605,7 +605,7 @@ public final class userDB {
}
public Entry next() {
try {
return getEntry(this.userIter.next());
return getEntry(new String(this.userIter.next()));
} catch (kelondroException e) {
resetDatabase();
return null;

@ -316,11 +316,11 @@ public class wikiBoard {
}
*/
public Iterator<String> keys(boolean up) throws IOException {
public Iterator<byte[]> keys(boolean up) throws IOException {
return datbase.keys(up, false);
}
public Iterator<String> keysBkp(boolean up) throws IOException {
public Iterator<byte[]> keysBkp(boolean up) throws IOException {
return bkpbase.keys(up, false);
}
}

@ -28,8 +28,6 @@ package de.anomic.kelondro;
import java.io.IOException;
import de.anomic.kelondro.kelondroBLOBTree.keyIterator;
public interface kelondroBLOB {
/**
@ -57,7 +55,7 @@ public interface kelondroBLOB {
* @return
* @throws IOException
*/
public kelondroCloneableIterator<String> keys(boolean up, boolean rotating) throws IOException;
public kelondroCloneableIterator<byte[]> keys(boolean up, boolean rotating) throws IOException;
/**
* iterate over all keys
@ -66,7 +64,7 @@ public interface kelondroBLOB {
* @return
* @throws IOException
*/
public keyIterator keys(boolean up, byte[] firstKey) throws IOException;
public kelondroCloneableIterator<byte[]> keys(boolean up, byte[] firstKey) throws IOException;
/**
* check if a specific key is in the database
@ -74,7 +72,7 @@ public interface kelondroBLOB {
* @return
* @throws IOException
*/
public boolean has(String key) throws IOException;
public boolean has(byte[] key) throws IOException;
/**
* retrieve the whole BLOB from the table
@ -82,17 +80,7 @@ public interface kelondroBLOB {
* @return
* @throws IOException
*/
public byte[] get(String key) throws IOException;
/**
* retrieve a fragment of a BLOB from the table
* @param key the primary key
* @param pos the position within the BLOB fragment
* @param len the length of the fragment
* @return
* @throws IOException
*/
public byte[] get(String key, int pos, int len) throws IOException;
public byte[] get(byte[] key) throws IOException;
/**
* write a whole byte array as BLOB to the table
@ -100,25 +88,14 @@ public interface kelondroBLOB {
* @param b
* @throws IOException
*/
public void put(String key, byte[] b) throws IOException;
/**
* write a fragment of a BLOB to the table
* @param key the primary key
* @param pos the position of the BLOB fragment
* @param b a byte array
* @param off the offset within the array where the BLOB fragment starts
* @param len the length of the fragment
* @throws IOException
*/
public void put(String key, int pos, byte[] b, int off, int len) throws IOException;
public void put(byte[] key, byte[] b) throws IOException;
/**
* remove a BLOB
* @param key the primary key
* @throws IOException
*/
public void remove(String key) throws IOException;
public void remove(byte[] key) throws IOException;
/**
* close the BLOB table

@ -0,0 +1,412 @@
// kelondroBLOBHeap.java
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 09.07.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2008-03-14 01:16:04 +0100 (Fr, 14 Mrz 2008) $
// $LastChangedRevision: 4558 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.Iterator;
import de.anomic.server.serverMemory;
import de.anomic.server.logging.serverLog;
public final class kelondroBLOBHeap implements kelondroBLOB {
private kelondroBytesLongMap index; // key/seek relation for used records
private ArrayList<Long[]> free; // list of {size, seek} pairs denoting space and position of free records
private File heapFile; // the file of the heap
private kelondroByteOrder ordering; // the ordering on keys
private RandomAccessFile file; // a random access to the file
/*
* This class implements BLOB management based on a sequence of records in a random access file.
* The data structure is:
* file :== record*
* record :== reclen key blob
* reclen :== <4 byte integer == length of key and blob>
* key :== <bytes as defined with keylen, if first byte is zero then record is empty>
* blob :== <bytes of length reclen - keylen>
* that means that each record has the size reclen+4
*
* The elements are organized in two data structures:
* index<kelondroBytesLongMap> : key/seek relation for used records
* free<ArrayList<Long[]>> : list of {size, seek} pairs denoting space and position of free records
*
* Because the blob sizes are stored with integers, one entry may not exceed 2GB
*
* If a record is removed, it becomes a free record.
* New records are either appended to the end of the file or filled into a free record.
* A free record must either fit exactly to the size of the new record, or an old record is split
* into a filled and a new, smaller empty record.
*/
/**
 * Opens (or creates) a heap file and builds the in-memory structures by
 * scanning every record of the file once.
 * <p>
 * A record whose key starts with a zero byte is registered in the free list
 * as a {size, seek} pair, the same layout that {@code put} and {@code remove}
 * use. Every other record is added to the key index, provided its key is
 * well-formed according to {@code ordering}. The scan stops at the first read
 * failure, which is treated as the end of the file.
 *
 * @param heapFile the file holding the records; it is opened in "rw" mode
 * @param keylength the fixed length of every key, in bytes
 * @param ordering the ordering on keys, also used to test keys for well-formedness
 * @throws IOException if the heap file cannot be opened
 */
public kelondroBLOBHeap(File heapFile, int keylength, kelondroByteOrder ordering) throws IOException {
    this.ordering = ordering;
    this.heapFile = heapFile;
    this.index = new kelondroBytesLongMap(keylength, this.ordering, 0);
    this.free = new ArrayList<Long[]>();
    this.file = new RandomAccessFile(heapFile, "rw");

    byte[] key = new byte[keylength];
    int reclen;
    long seek = 0;
    while (true) { // don't test available() here because this does not work for files > 2GB
        try {
            // go to seek position
            file.seek(seek);
            // read length of the following record without the length of the record size bytes
            reclen = file.readInt();
            // read key
            file.readFully(key);
        } catch (IOException e) {
            // EOF reached
            break;
        }

        // a negative length cannot be written by this class; following it would
        // move the scan position backwards (or not at all) and never terminate
        if (reclen < 0) {
            serverLog.logWarning("kelondroBLOBHeap", "BLOB " + heapFile.getName() + ": negative record length " + reclen + " at seek pos " + seek + ", stopped indexing");
            break;
        }

        if (key[0] == 0) {
            // empty record: remember it as {size, seek}, matching the order used by put() and remove()
            free.add(new Long[]{Long.valueOf((long) reclen), Long.valueOf(seek)});
        } else {
            // store key and access address of entry in index
            try {
                if (this.ordering.wellformed(key)) {
                    index.addl(key, seek);
                } else {
                    serverLog.logWarning("kelondroBLOBHeap", "BLOB " + heapFile.getName() + ": skiped not wellformed key " + new String(key) + " at seek pos " + seek);
                }
            } catch (IOException e) {
                e.printStackTrace();
                break;
            }
        }

        // new seek position: 4 bytes for the length field plus the record content
        seek += 4L + reclen;
    }
}
/**
 * Reports how many BLOBs are currently stored.
 *
 * @return the number of entries in the key index
 */
public int size() {
    final int entries = index.size();
    return entries;
}
/**
 * Tests whether a key is stored in the heap. Only the key index is consulted;
 * the heap file itself is not read by this method.
 *
 * @param key the primary key; its length must equal the key length of the index
 * @return true if the index holds a non-negative position for the key,
 *         false otherwise or if the index lookup fails with an IOException
 */
public boolean has(byte[] key) {
    assert index != null;
    assert index.row().primaryKeyLength == key.length;

    boolean found;
    try {
        // a negative position means "not in the index"
        found = index.getl(key) >= 0;
    } catch (IOException e) {
        e.printStackTrace();
        found = false;
    }
    return found;
}
/**
 * Appends a complete byte array as a new BLOB; delegates to the
 * offset/length variant with the full range of the array.
 *
 * @param key the primary key
 * @param blob the content to store
 * @throws IOException if writing to the heap file fails
 */
private void add(byte[] key, byte[] blob) throws IOException {
    final int wholeLength = blob.length;
    this.add(key, blob, 0, wholeLength);
}
/**
 * Appends a BLOB to the end of the heap file and registers its position in
 * the key index. A null or empty {@code blob} is ignored and nothing is written.
 *
 * @param key the primary key; its length must equal the key length of the index
 * @param blob the array holding the content
 * @param offset the position in {@code blob} where the content starts
 * @param len the number of bytes of {@code blob} to store
 * @throws IOException if writing to the heap file or updating the index fails
 */
private void add(byte[] key, byte[] blob, int offset, int len) throws IOException {
    // nothing to store; checked before the assertions so that an empty blob
    // is skipped silently instead of tripping 'len > 0' when assertions are on
    if ((blob == null) || (blob.length == 0)) return;
    assert len > 0;
    assert index.row().primaryKeyLength == key.length;
    assert blob.length - offset >= len;

    // keep the position as a long: an int cast would wrap for files > 2GB
    // and store a wrong seek position in the index
    final long pos = file.length();
    file.seek(pos);
    file.writeInt(len + key.length);
    file.write(key);
    file.write(blob, offset, len);
    index.putl(key, pos);
}
/**
 * Reads the BLOB stored under a key.
 *
 * @param key the primary key; its length must equal the key length of the index
 * @return the content, or null if the key is not in the index or the memory
 *         request for the content was refused
 * @throws IOException if the index lookup or reading the heap file fails
 */
public synchronized byte[] get(byte[] key) throws IOException {
    assert index.row().primaryKeyLength == key.length;

    // a negative position means the key is unknown
    final long seek = index.getl(key);
    if (seek < 0) return null;

    // the stored record length covers key and content; subtract the key part
    file.seek(seek);
    final int contentLength = file.readInt() - index.row().primaryKeyLength;
    if (serverMemory.available() < contentLength && !serverMemory.request(contentLength, false)) {
        return null; // not enough memory available for this blob
    }
    final byte[] content = new byte[contentLength];

    // the key precedes the content in the record; read it and compare
    final byte[] storedKey = new byte[index.row().primaryKeyLength];
    file.readFully(storedKey);
    assert this.ordering.compare(key, storedKey) == 0;

    file.readFully(content);
    return content;
}
/**
 * Removes all content: empties the key index and the free list, then closes,
 * deletes and reopens the heap file.
 *
 * @throws IOException if the index cannot be cleared or the heap file cannot
 *         be reopened or truncated
 */
public synchronized void clear() throws IOException {
    index.clear();
    free.clear();
    try {
        file.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
    this.heapFile.delete();
    this.file = new RandomAccessFile(heapFile, "rw");
    // delete() reports failure only through its return value; if the old file
    // survived, truncate it so that no stale records remain behind an empty index
    if (this.file.length() > 0) {
        this.file.setLength(0);
    }
}
/**
 * Closes the BLOB table: closes the index, empties the free list, closes the
 * heap file and drops the references to all three. A failure to close the
 * file is printed and otherwise ignored.
 */
public synchronized void close() {
    this.index.close();
    this.free.clear();
    try {
        this.file.close();
    } catch (IOException e) {
        e.printStackTrace();
    }

    // release the references; the instance holds no structures afterwards
    this.file = null;
    this.free = null;
    this.index = null;
}
/**
 * Reports the length of the primary key.
 *
 * @return the primary key length taken from the row definition of the index
 */
public int keylength() {
    final int primaryKeyLength = index.row().primaryKeyLength;
    return primaryKeyLength;
}
/**
 * Writes a whole byte array as BLOB to the table, replacing any entry that is
 * already stored under the same key.
 * <p>
 * The old entry is removed first. The new record is then placed, in this order
 * of preference: into a free record of exactly the required size; into the
 * largest free record if that leaves room for a non-empty remainder record;
 * or at the end of the file.
 *
 * @param key the primary key; its length must equal the key length of the index
 * @param b the content to store; must not be null
 * @throws IOException if reading or writing the heap file or the index fails
 * @throws NullPointerException if {@code b} is null; raised before the old
 *         entry is removed, so a failed call does not lose stored data
 */
public synchronized void put(byte[] key, byte[] b) throws IOException {
    assert key.length == index.row().primaryKeyLength;
    if (b == null) throw new NullPointerException("blob must not be null");

    // first remove the old entry
    this.remove(key);

    // then look if we can use a free entry; remember the largest one on the way
    final int reclen = b.length + index.row().primaryKeyLength;
    Long[] largest = null;
    long lsize = 0;
    final Iterator<Long[]> i = this.free.iterator();
    while (i.hasNext()) {
        final Long[] entry = i.next();
        final long size = entry[0].longValue();
        final long seek = entry[1].longValue();
        if (size == (long) reclen) {
            // we found an entry that has exactly the size that we need!
            // we use that entry and stop looking for a larger entry
            file.seek(seek);
            final int reclenf = file.readInt();
            assert reclenf == reclen;
            file.write(key);
            file.write(b);
            // remove the entry from the free list
            i.remove();
            // add the entry to the index
            this.index.putl(key, seek);
            serverLog.logFine("kelondroBLOBHeap", "*** DEBUG BLOB: replaced-fit record at " + seek + ", reclen=" + reclen + ", key=" + new String(key));
            return;
        }
        // look for the biggest size
        if (size > lsize) {
            lsize = size;
            largest = entry;
        }
    }

    // check if the found entry is large enough to be split into two entries.
    // lsize == reclen + 4 would be sufficient, but that would create an empty
    // entry with zero bytes for key and BLOB, which is not wanted in the file.
    // The comparison is done in long arithmetic so that reclen + 4 cannot overflow.
    if (largest != null && lsize > reclen + 4L) {
        final long lseek = largest[1].longValue();
        // write the new entry
        file.seek(lseek);
        file.writeInt(reclen);
        file.write(key);
        file.write(b);
        // add the index to the new entry
        index.putl(key, lseek);
        // define the new empty entry directly behind the new record
        final int newfreereclen = (int) (lsize - reclen - 4L);
        assert newfreereclen > 0;
        file.writeInt(newfreereclen);
        // replace the old free entry (removed by reference) with the remainder
        free.remove(largest);
        free.add(new Long[]{Long.valueOf((long) newfreereclen), Long.valueOf(lseek + 4L + reclen)});
        serverLog.logFine("kelondroBLOBHeap", "*** DEBUG BLOB: replaced-split record at " + lseek + ", reclen=" + reclen + ", new reclen=" + newfreereclen + ", key=" + new String(key));
        return;
    }

    // if there is no free entry or no free entry is large enough, append the entry at the end of the file
    this.add(key, b);
}
/**
 * Removes a BLOB: the record is added to the free list as a {size, seek}
 * pair, its key and content bytes are overwritten with zeros, and the key is
 * removed from the index. Does nothing if the key is not in the index.
 *
 * @param key the primary key; its length must equal the key length of the index
 * @throws IOException if reading or writing the heap file or the index fails
 */
public synchronized void remove(byte[] key) throws IOException {
    assert index.row().primaryKeyLength == key.length;

    // check if the index contains the key
    final long pos = index.getl(key);
    if (pos < 0) return;

    // access the file and read the record length
    file.seek(pos);
    final int len = file.readInt();

    // add entry to free array
    this.free.add(new Long[]{Long.valueOf((long) len), Long.valueOf(pos)});

    // fill zeros to the content; written in chunks because every single-byte
    // write on a RandomAccessFile is a separate, unbuffered file operation
    if (len > 0) {
        final byte[] zeros = new byte[Math.min(len, 4096)];
        int remaining = len;
        while (remaining > 0) {
            final int chunk = Math.min(remaining, zeros.length);
            file.write(zeros, 0, chunk);
            remaining -= chunk;
        }
    }

    // remove entry from index
    this.index.removel(key);
}
/**
 * iterator over all keys
 * @param up the direction flag, passed through to the index key iterator
 * @param rotating if true, the key iterator is wrapped into a kelondroRotateIterator
 *        that is given the current index size as its count argument; if false, the
 *        plain index key iterator is returned
 * @return an iterator over the keys in the index
 * @throws IOException
 */
public synchronized kelondroCloneableIterator<byte[]> keys(boolean up, boolean rotating) throws IOException {
    kelondroCloneableIterator<byte[]> i = this.index.keys(up, null);
    // honor the rotating flag instead of always wrapping with a fixed count of 1
    // NOTE(review): assumes the third constructor argument bounds the number of
    // returned elements, as the index.size() usage in other BLOB classes suggests - confirm
    if (rotating) return new kelondroRotateIterator<byte[]>(i, null, this.index.size());
    return i;
}
/**
 * iterate over all keys, starting at a given key
 * This is a plain delegation to the key iterator of the index.
 * @param up the direction flag, passed through to the index
 * @param firstKey the start key, passed through to the index
 * @return the key iterator produced by the index
 * @throws IOException
 */
public synchronized kelondroCloneableIterator<byte[]> keys(boolean up, byte[] firstKey) throws IOException {
    final kelondroCloneableIterator<byte[]> indexKeys = this.index.keys(up, firstKey);
    return indexKeys;
}
}

@ -113,7 +113,7 @@ public class kelondroBLOBTree implements kelondroBLOB {
}
public int keylength() {
return this.rowdef.primaryKeyLength;
return this.keylen;
}
public synchronized int size() {
@ -140,7 +140,7 @@ public class kelondroBLOBTree implements kelondroBLOB {
return new String(rawKey, 0, n + 1);
}
public class keyIterator implements kelondroCloneableIterator<String> {
public class keyIterator implements kelondroCloneableIterator<byte[]> {
// the iterator iterates all keys
kelondroCloneableIterator<kelondroRow.Entry> ri;
String nextKey;
@ -158,10 +158,10 @@ public class kelondroBLOBTree implements kelondroBLOB {
return nextKey != null;
}
public String next() {
public byte[] next() {
String result = nextKey;
nextKey = n();
return origKey(result.getBytes());
return origKey(result.getBytes()).getBytes();
}
public void remove() {
@ -190,16 +190,17 @@ public class kelondroBLOBTree implements kelondroBLOB {
}
return null;
}
}
public synchronized kelondroCloneableIterator<String> keys(boolean up, boolean rotating) throws IOException {
public synchronized kelondroCloneableIterator<byte[]> keys(boolean up, boolean rotating) throws IOException {
// iterates only the keys of the Nodes
// enumerated objects are of type String
keyIterator i = new keyIterator(index.rows(up, null));
if (rotating) return new kelondroRotateIterator<String>(i, null, index.size()); else return i;
if (rotating) return new kelondroRotateIterator<byte[]>(i, null, index.size()); else return i;
}
public synchronized keyIterator keys(boolean up, byte[] firstKey) throws IOException {
public synchronized kelondroCloneableIterator<byte[]> keys(boolean up, byte[] firstKey) throws IOException {
return new keyIterator(index.rows(up, firstKey));
}
@ -235,13 +236,13 @@ public class kelondroBLOBTree implements kelondroBLOB {
return buf[recpos] & 0xFF;
}
public synchronized byte[] get(String key) throws IOException {
kelondroRA ra = getRA(key);
public synchronized byte[] get(byte[] key) throws IOException {
kelondroRA ra = getRA(new String(key));
if (ra == null) return null;
return ra.readFully();
}
public synchronized byte[] get(String key, int pos, int len) throws IOException {
private synchronized byte[] get(String key, int pos, int len) throws IOException {
int recpos = pos % reclen;
int reccnt = pos / reclen;
byte[] segment1;
@ -285,11 +286,11 @@ public class kelondroBLOBTree implements kelondroBLOB {
return result;
}
public synchronized void put(String key, byte[] b) throws IOException {
put(key, 0, b, 0, b.length);
public synchronized void put(byte[] key, byte[] b) throws IOException {
put(new String(key), 0, b, 0, b.length);
}
public synchronized void put(String key, int pos, byte[] b, int off, int len) throws IOException {
private synchronized void put(String key, int pos, byte[] b, int off, int len) throws IOException {
int recpos = pos % reclen;
int reccnt = pos / reclen;
byte[] buf;
@ -326,12 +327,30 @@ public class kelondroBLOBTree implements kelondroBLOB {
}
}
public synchronized void remove(String key) throws IOException {
/**
 * write a single byte at a position of the BLOB that belongs to the given key
 * The position is translated into a record number (pos / reclen) and an offset
 * inside that record (pos % reclen). The record is loaded, created or padded to
 * reclen bytes if necessary, modified and stored again.
 * @param key the key of the BLOB
 * @param pos the byte position inside the BLOB
 * @param b the value to write; only the low-order byte is stored
 * @throws IOException
 */
private synchronized void put(String key, int pos, int b) throws IOException {
    final int offset = pos % reclen;
    final int segment = pos / reclen;

    // load the record that holds the position
    byte[] record = getValueCached(elementKey(key, segment));
    if (record == null) {
        // no such record yet: start with an empty one
        record = new byte[reclen];
    } else if (record.length < reclen) {
        // short record: enlarge to the full record length, keeping the content
        final byte[] padded = new byte[reclen];
        System.arraycopy(record, 0, padded, 0, record.length);
        record = padded;
    }

    // set the byte and write the record back
    record[offset] = (byte) b;
    setValueCached(elementKey(key, segment), record);
}
public synchronized void remove(byte[] key) throws IOException {
// remove value in cache and tree
if (key == null) return;
int recpos = 0;
byte[] k;
while (index.get(k = elementKey(key, recpos)) != null) {
while (index.get(k = elementKey(new String(key), recpos)) != null) {
index.remove(k);
buffer.remove(k);
recpos++;
@ -339,8 +358,8 @@ public class kelondroBLOBTree implements kelondroBLOB {
//segmentCount--; writeSegmentCount();
}
public synchronized boolean has(String key) throws IOException {
return (key != null) && (getValueCached(elementKey(key, 0)) != null);
public synchronized boolean has(byte[] key) throws IOException {
return (key != null) && (getValueCached(elementKey(new String(key), 0)) != null);
}
public synchronized kelondroRA getRA(String filekey) {
@ -372,9 +391,7 @@ public class kelondroBLOBTree implements kelondroBLOB {
}
public void write(int i) throws IOException {
byte[] b = new byte[1];
b[0] = (byte) i;
put(filekey, seekpos++, b, 0, 1);
put(filekey, seekpos++, i);
}
public int read(byte[] b, int off, int len) throws IOException {
@ -415,11 +432,11 @@ public class kelondroBLOBTree implements kelondroBLOB {
if (args.length == 1) {
// open a db and list keys
try {
kelondroBLOBTree kd = new kelondroBLOBTree(new File(args[0]), true, true, 4 ,100, '_', kelondroNaturalOrder.naturalOrder, false, false, true);
kelondroBLOB kd = new kelondroBLOBTree(new File(args[0]), true, true, 4 ,100, '_', kelondroNaturalOrder.naturalOrder, false, false, true);
System.out.println(kd.size() + " elements in DB");
Iterator<String> i = kd.keys(true, false);
Iterator<byte[]> i = kd.keys(true, false);
while (i.hasNext())
System.out.println(i.next());
System.out.println(new String(i.next()));
kd.close();
} catch (IOException e) {
e.printStackTrace();
@ -430,7 +447,7 @@ public class kelondroBLOBTree implements kelondroBLOB {
public static int countElements(kelondroBLOBTree t) {
int count = 0;
try {
Iterator<String> iter = t.keys(true, false);
Iterator<byte[]> iter = t.keys(true, false);
while (iter.hasNext()) {count++; if (iter.next() == null) System.out.println("ERROR! null element found");}
return count;
} catch (IOException e) {

@ -41,7 +41,7 @@ public class kelondroBytesLongMap {
}
public kelondroBytesLongMap(int keylength, kelondroByteOrder objectOrder, int space) {
this.rowdef = new kelondroRow(new kelondroColumn[]{new kelondroColumn("key", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, keylength, "key"), new kelondroColumn("int c-8 {b256}")}, objectOrder, 0);
this.rowdef = new kelondroRow(new kelondroColumn[]{new kelondroColumn("key", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, keylength, "key"), new kelondroColumn("long c-8 {b256}")}, objectOrder, 0);
this.index = new kelondroRAMIndex(rowdef, space);
}

@ -1,191 +0,0 @@
// kelondroHeap.java
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 30.04.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2008-03-14 01:16:04 +0100 (Fr, 14 Mrz 2008) $
// $LastChangedRevision: 4558 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
/**
 * A heap file: an arbitrary number of BLOBs stored one after another in a flat file.
 * Each record consists of a 4-byte length of the BLOB content, the key bytes and the
 * BLOB content. The start position of every record is kept in an index in RAM.
 */
public final class kelondroHeap {

    private kelondroBytesLongMap index;
    private File heapFile;
    private kelondroByteOrder ordering;

    /**
     * create a heap file: an arbitrary number of BLOBs, indexed by an access key
     * The heap file will be opened at initialization time, indexed and closed again.
     * Heap files are only opened when BLOBs are read from it or new ones are appended
     * @param heapFile the file that holds the records; must exist
     * @param keylength the number of bytes of every key
     * @param ordering the byte order that is handed to the index
     * @throws IOException if the file does not exist, is too large, or cannot be read
     */
    public kelondroHeap(File heapFile, int keylength, kelondroByteOrder ordering) throws IOException {
        this.index = null;
        this.ordering = ordering;
        this.heapFile = heapFile;
        if (!(heapFile.exists())) throw new IOException("file " + heapFile + " does not exist");
        if (heapFile.length() >= Integer.MAX_VALUE) throw new IOException("file " + heapFile + " too large, index can only be crated for files less than 2GB");

        this.index = new kelondroBytesLongMap(keylength, this.ordering, 0);
        byte[] key = new byte[keylength];
        int reclen;
        long seek = 0, seek0;

        DataInputStream is = new DataInputStream(new BufferedInputStream(new FileInputStream(heapFile), 64*1024));
        try {
            // don't test available() here because this does not work for files > 2GB
            loop: while (true) {

                // remember seek position
                seek0 = seek;

                // read length of the following record without the length of the record size bytes
                try {
                    reclen = is.readInt();
                } catch (IOException e) {
                    break loop; // terminate loop
                }
                // a negative length cannot be a valid record; stop indexing here
                if (reclen < 0) break loop;
                seek += 4L;

                // read key
                try {
                    is.readFully(key);
                } catch (IOException e) {
                    break loop; // terminate loop
                }
                // advance by the number of key BYTES; the length of a decoded String
                // may differ from the byte count depending on the platform charset
                seek += key.length;

                // skip content
                if (!skipFully(is, reclen)) break loop;
                seek += reclen;

                // store access address to entry
                try {
                    index.addl(key, seek0);
                } catch (IOException e) {
                    e.printStackTrace();
                    break loop;
                }
            }
        } finally {
            // close the stream also if an unexpected exception leaves the loop
            is.close();
        }
    }

    /**
     * skip a number of bytes; the loop cannot spin forever when skip() makes no progress
     * @param is the stream to skip in
     * @param n the number of bytes to skip
     * @return true if n bytes were skipped, false if the end of the stream was detected before
     * @throws IOException
     */
    private static boolean skipFully(DataInputStream is, long n) throws IOException {
        while (n > 0) {
            long skipped = is.skip(n);
            if (skipped > 0) {
                n -= skipped;
            } else {
                // skip() made no progress: probe with a single read to detect the end of the stream
                if (is.read() < 0) return false;
                n--;
            }
        }
        return true;
    }

    /**
     * the number of BLOBs in the heap
     * @return the number of BLOBs in the heap
     */
    public int size() {
        return this.index.size();
    }

    /**
     * test if a key is in the heap file
     * @param key
     * @return true if the key exists, false otherwise (also if the index lookup fails)
     */
    public boolean has(String key) {
        assert index != null;
        assert index.row().primaryKeyLength == key.length();

        // check if the index contains the key
        try {
            return index.getl(key.getBytes()) >= 0;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * add a BLOB to the heap
     * Nothing is written if the blob is null or empty.
     * @param key
     * @param blob
     * @throws IOException
     */
    public synchronized void add(String key, byte[] blob) throws IOException {
        // guard here as well: blob.length below must not be evaluated on null
        if (blob == null) return;
        add(key, blob, 0, blob.length);
    }

    /**
     * add a BLOB to the heap
     * The record is appended at the end of the heap file and its start position
     * is stored in the index. Nothing is written if the blob is null or empty.
     * @param key
     * @param blob the array that contains the content
     * @param offset the start of the content inside the array
     * @param len the number of content bytes
     * @throws IOException
     */
    public synchronized void add(String key, byte[] blob, int offset, int len) throws IOException {
        assert index.row().primaryKeyLength == key.length();
        if ((blob == null) || (blob.length == 0)) return;

        // the new record starts at the current end of the file
        long pos = heapFile.length();

        // open in append mode; without it the existing heap content would be truncated
        DataOutputStream os = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(heapFile, true)));
        try {
            os.writeInt(len);
            os.write(key.getBytes());
            os.write(blob, offset, len);
        } finally {
            os.close();
        }
        index.putl(key.getBytes(), pos);
    }

    /**
     * read a blob from the heap
     * @param key
     * @return the content of the BLOB, or null if the key is not in the index
     * @throws IOException
     */
    public byte[] get(String key) throws IOException {
        assert index.row().primaryKeyLength == key.length();

        // check if the index contains the key
        long pos = index.getl(key.getBytes());
        if (pos < 0) return null;

        // access the file and read the container
        RandomAccessFile raf = new RandomAccessFile(heapFile, "r");
        try {
            // the length field is at the start of THIS record, not at the start of the file
            raf.seek(pos);
            int len = raf.readInt();
            byte[] record = new byte[len];

            // the content follows the length field and the key
            raf.seek(pos + 4 + index.row().primaryKeyLength);
            raf.readFully(record);
            return record;
        } finally {
            raf.close();
        }
    }

}

@ -91,8 +91,9 @@ public final class kelondroMScoreCluster<E> {
double d = 1000d * ((Double) o).doubleValue();
return (int) Math.round(d);
}
String s = "";
String s = null;
if (o instanceof String) s = (String) o;
if (o instanceof byte[]) s = new String((byte[]) o);
// this can be used to calculate a score from a string
if ((s == null) || (s.length() == 0) || (s.charAt(0) == '-')) return 0;

@ -85,7 +85,7 @@ public class kelondroMapObjects extends kelondroObjects {
// fill cluster and accumulator with values
if ((sortfields != null) || (longaccfields != null) || (doubleaccfields != null)) try {
kelondroCloneableIterator<String> it = dyn.keys(true, false);
kelondroCloneableIterator<byte[]> it = dyn.keys(true, false);
String mapname;
Object cell;
long valuel;
@ -93,8 +93,8 @@ public class kelondroMapObjects extends kelondroObjects {
Map<String, String> map;
this.elementCount = 0;
while (it.hasNext()) {
mapname = it.next();
map = getMap(mapname);
mapname = new String(it.next());
map = getMap(new String(mapname));
if (map == null) break;
if (sortfields != null) for (int i = 0; i < sortfields.length; i++) {
@ -299,13 +299,37 @@ public class kelondroMapObjects extends kelondroObjects {
}
}
public synchronized Iterator<String> keys(final boolean up, /* sorted by */ String field) {
public synchronized Iterator<byte[]> keys(final boolean up, /* sorted by */ String field) {
// sorted iteration using the sortClusters
if (sortClusterMap == null) return null;
final kelondroMScoreCluster<String> cluster = sortClusterMap.get(field);
if (cluster == null) return null; // sort field does not exist
//System.out.println("DEBUG: cluster for field " + field + ": " + cluster.toString());
return cluster.scores(up);
return new string2bytearrayIterator(cluster.scores(up));
}
/**
 * Adapter that presents an Iterator of String as an Iterator of byte[].
 * Every element is converted with String.getBytes(); a null element stays null.
 */
public class string2bytearrayIterator implements Iterator<byte[]> {

    // the wrapped String iterator
    Iterator<String> s;

    public string2bytearrayIterator(Iterator<String> s) {
        this.s = s;
    }

    /** true as long as the wrapped iterator has more elements */
    public boolean hasNext() {
        return this.s.hasNext();
    }

    /** the next element of the wrapped iterator as byte array, or null for a null element */
    public byte[] next() {
        final String element = this.s.next();
        return (element == null) ? null : element.getBytes();
    }

    /** removes the last returned element through the wrapped iterator */
    public void remove() {
        this.s.remove();
    }
}
public synchronized mapIterator maps(final boolean up, final String field) {
@ -351,11 +375,11 @@ public class kelondroMapObjects extends kelondroObjects {
// enumerates Map-Type elements
// the key is also included in every map that is returned; it's key is 'key'
Iterator<String> keyIterator;
Iterator<byte[]> keyIterator;
boolean finish;
HashMap<String, String> n;
public mapIterator(Iterator<String> keyIterator) {
public mapIterator(Iterator<byte[]> keyIterator) {
this.keyIterator = keyIterator;
this.finish = false;
this.n = next0();
@ -377,7 +401,7 @@ public class kelondroMapObjects extends kelondroObjects {
String nextKey;
HashMap<String, String> map;
while (keyIterator.hasNext()) {
nextKey = keyIterator.next();
nextKey = new String(keyIterator.next());
if (nextKey == null) {
finish = true;
return null;

@ -35,6 +35,8 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import de.anomic.server.serverDate;
public class kelondroObjects {
private kelondroBLOB blob;
@ -58,11 +60,6 @@ public class kelondroObjects {
this.cacheScore = new kelondroMScoreCluster<String>();
}
public int keySize() {
return blob.keylength();
}
private static String map2string(final Map<String, String> map, final String comment) throws IOException {
final Iterator<Map.Entry<String, String>> iter = map.entrySet().iterator();
Map.Entry<String, String> entry;
@ -99,9 +96,10 @@ public class kelondroObjects {
assert (key.length() > 0);
assert (newMap != null);
if (cacheScore == null) return; // may appear during shutdown
while (key.length() < blob.keylength()) key += "_";
// write entry
blob.put(key, map2string(newMap, "").getBytes());
blob.put(key.getBytes(), map2string(newMap, "W" + serverDate.formatShortSecond() + " ").getBytes());
// check for space in cache
checkCacheSpace();
@ -114,13 +112,14 @@ public class kelondroObjects {
public synchronized void remove(String key) throws IOException {
// update elementCount
if (key == null) return;
while (key.length() < blob.keylength()) key += "_";
// remove from cache
cacheScore.deleteScore(key);
cache.remove(key);
// remove from file
blob.remove(key);
blob.remove(key.getBytes());
}
public synchronized HashMap<String, String> get(final String key) throws IOException {
@ -128,18 +127,20 @@ public class kelondroObjects {
return get(key, true);
}
protected synchronized HashMap<String, String> get(final String key, final boolean storeCache) throws IOException {
protected synchronized HashMap<String, String> get(String key, final boolean storeCache) throws IOException {
// load map from cache
assert key != null;
if (cache == null) return null; // case may appear during shutdown
while (key.length() < blob.keylength()) key += "_";
HashMap<String, String> map = cache.get(key);
if (map != null) return map;
// load map from kra
if (!(blob.has(key))) return null;
if (!(blob.has(key.getBytes()))) return null;
// read object
byte[] b = blob.get(key);
byte[] b = blob.get(key.getBytes());
if (b == null) return null;
map = string2map(new String(b));
@ -166,15 +167,15 @@ public class kelondroObjects {
}
}
public synchronized kelondroCloneableIterator<String> keys(final boolean up, final boolean rotating) throws IOException {
public synchronized kelondroCloneableIterator<byte[]> keys(final boolean up, final boolean rotating) throws IOException {
// simple enumeration of key names without special ordering
return blob.keys(up, rotating);
}
public synchronized kelondroCloneableIterator<String> keys(final boolean up, final boolean rotating, final byte[] firstKey, final byte[] secondKey) throws IOException {
public synchronized kelondroCloneableIterator<byte[]> keys(final boolean up, final boolean rotating, final byte[] firstKey, final byte[] secondKey) throws IOException {
// simple enumeration of key names without special ordering
kelondroCloneableIterator<String> i = blob.keys(up, firstKey);
if (rotating) return new kelondroRotateIterator<String>(i, secondKey, blob.size()); else return i;
kelondroCloneableIterator<byte[]> i = blob.keys(up, firstKey);
if (rotating) return new kelondroRotateIterator<byte[]>(i, secondKey, blob.size()); else return i;
}
@ -205,10 +206,10 @@ public class kelondroObjects {
// enumerates Map-Type elements
// the key is also included in every map that is returned; it's key is 'key'
Iterator<String> keyIterator;
Iterator<byte[]> keyIterator;
boolean finish;
public objectIterator(Iterator<String> keyIterator) {
public objectIterator(Iterator<byte[]> keyIterator) {
this.keyIterator = keyIterator;
this.finish = false;
}
@ -218,13 +219,13 @@ public class kelondroObjects {
}
public HashMap<String, String> next() {
final String nextKey = keyIterator.next();
final byte[] nextKey = keyIterator.next();
if (nextKey == null) {
finish = true;
return null;
}
try {
final HashMap<String, String> obj = get(nextKey);
final HashMap<String, String> obj = get(new String(nextKey));
if (obj == null) throw new kelondroException("no more elements available");
return obj;
} catch (IOException e) {

@ -71,6 +71,8 @@ import java.util.regex.Pattern;
import de.anomic.crawler.CrawlProfile;
import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroBLOB;
import de.anomic.kelondro.kelondroBLOBHeap;
import de.anomic.kelondro.kelondroBLOBTree;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroMScoreCluster;
@ -92,7 +94,7 @@ import de.anomic.yacy.yacyURL;
public final class plasmaHTCache {
public static final String DB_NAME = "responseHeader2.db";
public static final String DB_NAME = "responseHeader.heap";
private static final int stackLimit = 150; // if we exceed that limit, we do not check idle
public static final long oneday = 1000 * 60 * 60 * 24; // milliseconds of a day
@ -279,7 +281,17 @@ public final class plasmaHTCache {
private static void openResponseHeaderDB() {
// open the response header database
File dbfile = new File(cachePath, DB_NAME);
responseHeaderDB = new kelondroMapObjects(new kelondroBLOBTree(dbfile, true, true, yacySeedDB.commonHashLength, 150, '#', kelondroBase64Order.enhancedCoder, false, false, true), 500);
kelondroBLOB blob = null;
if (DB_NAME.endsWith("heap")) {
try {
blob = new kelondroBLOBHeap(dbfile, yacySeedDB.commonHashLength, kelondroBase64Order.enhancedCoder);
} catch (IOException e) {
e.printStackTrace();
}
} else {
blob = new kelondroBLOBTree(dbfile, true, true, yacySeedDB.commonHashLength, 150, '#', kelondroBase64Order.enhancedCoder, false, false, true);
}
responseHeaderDB = new kelondroMapObjects(blob, 500);
}
private static void deleteOldHTCache(File directory) {
@ -895,7 +907,7 @@ public final class plasmaHTCache {
String initiator,
CrawlProfile.entry profile
) {
return new Entry(
Entry entry = new Entry(
initDate,
depth,
url,
@ -905,6 +917,8 @@ public final class plasmaHTCache {
initiator,
profile
);
entry.writeResourceInfo();
return entry;
}
public final static class Entry {
@ -1039,11 +1053,14 @@ public final class plasmaHTCache {
return this.resInfo;
}
public boolean writeResourceInfo() {
private boolean writeResourceInfo() {
if (this.resInfo == null) return false;
try {
HashMap<String, String> hm = new HashMap<String, String>();
hm.putAll(this.resInfo.getMap());
hm.put("@@URL", this.url.toNormalform(false, false));
hm.put("@@DEPTH", Integer.toString(this.depth));
if (this.initiator != null) hm.put("@@INITIATOR", this.initiator);
responseHeaderDB.set(this.url.hash(), hm);
} catch (Exception e) {
resetResponseHeaderDB();

@ -809,7 +809,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
*
* @see plasmaSwitchboard#PLASMA_PATH for the folder this file lies in
*/
public static final String DBFILE_CRAWL_ROBOTS = "crawlRobotsTxt1.db";
public static final String DBFILE_CRAWL_ROBOTS = "crawlRobotsTxt.heap";
/**
* <p><code>public static final String <strong>DBFILE_USER</strong> = "DATA/SETTINGS/user.db"</code></p>
* <p>Path to the user-DB, beginning from the YaCy-installation's top-folder. It holds all rights the created
@ -1556,10 +1556,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
* b) the content should be indexed
* ========================================================================= */
if (((entry.profile() != null) && (entry.profile().storeHTCache())) || (doIndexing && isSupportedContent)) {
// store response header
// store response header
/*
if (entry.writeResourceInfo()) {
this.log.logInfo("WROTE HEADER for " + entry.cacheFile());
}
}
*/
// work off unwritten files
if (entry.cacheArray() != null) {

Loading…
Cancel
Save