diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java index 5bd0fe0fa..ba7d831c8 100644 --- a/htroot/IndexCreate_p.java +++ b/htroot/IndexCreate_p.java @@ -60,7 +60,6 @@ import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterWriter; import de.anomic.http.httpHeader; import de.anomic.index.indexURL; -import de.anomic.index.indexRWIEntryOld; import de.anomic.net.URL; import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlProfile; @@ -205,7 +204,7 @@ public class IndexCreate_p { prop.put("error_reasonString", reasonString); plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, - crawlingStartURL.getHost(), reasonString, new bitfield(indexRWIEntryOld.urlFlagLength)); + crawlingStartURL.getHost(), reasonString, new bitfield()); ee.store(); switchboard.urlPool.errorURL.stackPushEntry(ee); } @@ -283,7 +282,7 @@ public class IndexCreate_p { c++; } else { plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, - (String) e.getValue(), rejectReason, new bitfield(indexRWIEntryOld.urlFlagLength)); + (String) e.getValue(), rejectReason, new bitfield()); ee.store(); switchboard.urlPool.errorURL.stackPushEntry(ee); } diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index 88a0d10eb..5409041e5 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -50,7 +50,6 @@ import java.io.IOException; import de.anomic.http.httpHeader; import de.anomic.index.indexURL; -import de.anomic.index.indexRWIEntryOld; import de.anomic.index.indexURLEntry; import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlNURL; @@ -157,7 +156,7 @@ public final class crawlReceipt { } else { try { plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash); - 
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(indexRWIEntryOld.urlFlagLength)); + plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield()); ee.store(); switchboard.urlPool.errorURL.stackPushEntry(ee); switchboard.urlPool.noticeURL.remove(receivedUrlhash); diff --git a/source/de/anomic/index/indexRWIEntryOld.java b/source/de/anomic/index/indexRWIEntryOld.java index 1461ad77e..0d56ac0f1 100644 --- a/source/de/anomic/index/indexRWIEntryOld.java +++ b/source/de/anomic/index/indexRWIEntryOld.java @@ -36,35 +36,14 @@ import de.anomic.yacy.yacySeedDB; public class indexRWIEntryOld implements Cloneable, indexRWIEntry { // this object stores attributes to URL references inside RWI collections - - // statics for value lengths - public static final int urlStringLength = 256;// not too short for links without parameters - public static final int urlDescrLength = 80; // The headline of a web page (meta-tag or

) - public static final int urlNameLength = 40; // the tag content between and - public static final int urldescrtagsLength = 320;// the url, the description and tags in one string - public static final int urlErrorLength = 80; // a reason description for unavailable urls - public static final int urlDateLength = 4; // any date, shortened - public static final int urlCopyCountLength = 2; // counter for numbers of copies of this index - public static final int urlFlagLength = 2; // any stuff - public static final int urlLanguageLength = 2; // taken from TLD suffix as quick-hack - public static final int urlDoctypeLength = 1; // taken from extension - public static final int urlSizeLength = 6; // the source size, from cache - public static final int urlWordCountLength = 3; // the number of words, from condenser - public static final int urlCrawlProfileHandleLength = 4; // name of the prefetch profile - public static final int urlCrawlDepthLength = 2; // prefetch depth, first is '0' - public static final int urlParentBranchesLength = 3; // number of anchors of the parent - public static final int urlForkFactorLength = 4; // sum of anchors of all ancestors - public static final int urlRetryLength = 2; // number of load retries - public static final int urlHostLength = 8; // the host as struncated name - public static final int urlHandleLength = 4; // a handle - public static final int urlQualityLength = 3; // taken from heuristic + public static kelondroRow urlEntryRow = new kelondroRow(new kelondroColumn[]{ new kelondroColumn("h", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, yacySeedDB.commonHashLength, "urlhash"), - new kelondroColumn("q", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, urlQualityLength, "quality"), + new kelondroColumn("q", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 3, "quality"), new kelondroColumn("a", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 3, "lastModified"), new 
kelondroColumn("c", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "hitcount"), - new kelondroColumn("l", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, urlLanguageLength, "language"), + new kelondroColumn("l", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, 2, "language"), new kelondroColumn("d", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "doctype"), new kelondroColumn("f", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "localflag"), new kelondroColumn("t", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posintext"), @@ -118,7 +97,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry { // - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag, hervorhebungen, meta-tags, word in link etc // - boolean: URL attributes assert (urlHash.length() == 12) : "urlhash = " + urlHash; - if ((language == null) || (language.length() != urlLanguageLength)) language = "uk"; + if ((language == null) || (language.length() != urlEntryRow.width(col_language))) language = "uk"; this.entry = urlEntryRow.newEntry(); this.entry.setCol(col_urlhash, urlHash, null); this.entry.setCol(col_quality, quality); diff --git a/source/de/anomic/index/indexURLEntryOld.java b/source/de/anomic/index/indexURLEntryOld.java index 4e0ca13d0..ae0feeb8f 100644 --- a/source/de/anomic/index/indexURLEntryOld.java +++ b/source/de/anomic/index/indexURLEntryOld.java @@ -40,21 +40,21 @@ import de.anomic.tools.crypt; import de.anomic.yacy.yacySeedDB; public class indexURLEntryOld implements indexURLEntry { - + public static final kelondroRow rowdef = new kelondroRow( "String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash - "String urlstring-" + indexRWIEntryOld.urlStringLength + ", " + // the url as string - "String urldescr-" + indexRWIEntryOld.urlDescrLength + ", " + // the description of the url - "Cardinal moddate-" + 
indexRWIEntryOld.urlDateLength + " {b64e}, " + // last-modified from the httpd - "Cardinal loaddate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // time when the url was loaded + "String urlstring-256, " + // the url as string + "String urldescr-80, " + // the description of the url + "Cardinal moddate-4 {b64e}, " + // last-modified from the httpd + "Cardinal loaddate-4 {b64e}, " + // time when the url was loaded "String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash - "Cardinal copycount-" + indexRWIEntryOld.urlCopyCountLength + " {b64e}, " + // - "byte[] flags-" + indexRWIEntryOld.urlFlagLength + ", " + // flags - "Cardinal quality-" + indexRWIEntryOld.urlQualityLength + " {b64e}, " + // - "String language-" + indexRWIEntryOld.urlLanguageLength + ", " + // - "byte[] doctype-" + indexRWIEntryOld.urlDoctypeLength + ", " + // - "Cardinal size-" + indexRWIEntryOld.urlSizeLength + " {b64e}, " + // size of file in bytes - "Cardinal wc-" + indexRWIEntryOld.urlWordCountLength + " {b64e}"); // word count + "Cardinal copycount-2 {b64e}, " + // not used + "byte[] flags-2, " + // flags + "Cardinal quality-3 {b64e}, " + // deprecated + "String language-2, " + // language key; mainly the TDL + "byte[] doctype-1, " + // + "Cardinal size-6 {b64e}, " + // size of file in bytes + "Cardinal wc-3 {b64e}"); // word count private URL url; private String descr; @@ -176,8 +176,8 @@ public class indexURLEntryOld implements indexURLEntry { } public kelondroRow.Entry toRowEntry() throws IOException { - final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength); - final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength); + final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, rowdef.width(3)); + final String loaddatestr = 
kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, rowdef.width(4)); final byte[][] entry = new byte[][] { urlHash.getBytes(), @@ -186,13 +186,13 @@ public class indexURLEntryOld implements indexURLEntry { moddatestr.getBytes(), loaddatestr.getBytes(), referrerHash.getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(copyCount, indexRWIEntryOld.urlCopyCountLength).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(copyCount, rowdef.width(6)).getBytes(), flags.getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(quality, indexRWIEntryOld.urlQualityLength).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(quality, rowdef.width(8)).getBytes(), language.getBytes(), new byte[] { (byte) doctype }, - kelondroBase64Order.enhancedCoder.encodeLong(size, indexRWIEntryOld.urlSizeLength).getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(wordCount, indexRWIEntryOld.urlWordCountLength).getBytes()}; + kelondroBase64Order.enhancedCoder.encodeLong(size, rowdef.width(11)).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(wordCount, rowdef.width(12)).getBytes()}; return rowdef.newEntry(entry); } @@ -288,9 +288,7 @@ public class indexURLEntryOld implements indexURLEntry { .append(",size=").append(size).append(",wc=").append( wordCount).append(",cc=").append(copyCount).append( ",local=").append(((local()) ? 
"true" : "false")) - .append(",q=").append( - kelondroBase64Order.enhancedCoder.encodeLong( - quality, indexRWIEntryOld.urlQualityLength)) + .append(",q=0") .append(",dt=").append(doctype).append(",lang=").append( language).append(",url=").append( crypt.simpleEncode(url.toString())).append( diff --git a/source/de/anomic/kelondro/kelondroRecords.java b/source/de/anomic/kelondro/kelondroRecords.java index 113d87b33..1b88d2057 100644 --- a/source/de/anomic/kelondro/kelondroRecords.java +++ b/source/de/anomic/kelondro/kelondroRecords.java @@ -1179,6 +1179,8 @@ public class kelondroRecords { byte[] key = nn.getKey(); if ((key == null) || ((key.length == 1) && (key[0] == (byte) 0x80)) || // the NUL pointer ('lost' chain terminator) + (key.length < 3) || + ((key.length > 3) && (key[2] == 0) && (key[3] == 0)) || ((key.length > 3) && (key[0] == (byte) 0x80) && (key[1] == 0) && (key[2] == 0) && (key[3] == 0)) || ((key.length > 0) && (key[0] == 0)) // a 'lost' pointer within a deleted-chain ) { diff --git a/source/de/anomic/kelondro/kelondroRow.java b/source/de/anomic/kelondro/kelondroRow.java index 16347829d..3a5b0359e 100644 --- a/source/de/anomic/kelondro/kelondroRow.java +++ b/source/de/anomic/kelondro/kelondroRow.java @@ -162,6 +162,7 @@ public class kelondroRow { } public Entry(byte[] rowinstance, int start, int length) { + assert objectsize == length; this.rowinstance = new byte[objectsize]; int ll = Math.min(objectsize, length); System.arraycopy(rowinstance, start, this.rowinstance, 0, ll); @@ -169,15 +170,20 @@ public class kelondroRow { } public Entry(byte[][] cols) { + assert row.length == cols.length; rowinstance = new byte[objectsize]; int ll; + int cs, cw; for (int i = 0; i < row.length; i++) { + cs = colstart[i]; + cw = row[i].cellwidth(); if ((i >= cols.length) || (cols[i] == null)) { - for (int j = 0; j < row[i].cellwidth(); j++) this.rowinstance[colstart[i] + j] = 0; + for (int j = 0; j < cw; j++) this.rowinstance[cs + j] = 0; } else { - ll = 
Math.min(cols[i].length, row[i].cellwidth()); - System.arraycopy(cols[i], 0, rowinstance, colstart[i], ll); - for (int j = ll; j < row[i].cellwidth(); j++) this.rowinstance[colstart[i] + j] = 0; + //assert cols[i].length <= cw : "i = " + i + ", cols[i].length = " + cols[i].length + ", cw = " + cw; + ll = Math.min(cols[i].length, cw); + System.arraycopy(cols[i], 0, rowinstance, cs, ll); + for (int j = ll; j < cw; j++) this.rowinstance[cs + j] = 0; } } } @@ -252,6 +258,7 @@ public class kelondroRow { System.arraycopy(cell, 0, rowinstance, offset, cell.length); while (length-- > cell.length) rowinstance[offset + length] = 0; } else { + //assert cell.length == length; System.arraycopy(cell, 0, rowinstance, offset, length); } } diff --git a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java index 95caa46c3..1e4c806df 100644 --- a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java +++ b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java @@ -51,7 +51,6 @@ import java.io.File; import java.io.IOException; import de.anomic.index.indexURL; -import de.anomic.index.indexRWIEntryOld; import de.anomic.net.URL; import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlLoaderMessage; @@ -298,7 +297,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW yacyCore.seedDB.mySeed.hash, this.name, (failreason==null)?"Unknown reason":failreason, - new bitfield(indexRWIEntryOld.urlFlagLength) + new bitfield() ); // store the entry diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java index 0bb32a489..52039c7d9 100644 --- a/source/de/anomic/plasma/plasmaCrawlEURL.java +++ b/source/de/anomic/plasma/plasmaCrawlEURL.java @@ -54,7 +54,6 @@ import java.util.Iterator; import java.util.LinkedList; import de.anomic.index.indexURL; -import de.anomic.index.indexRWIEntryOld; import de.anomic.kelondro.kelondroBase64Order; import 
de.anomic.kelondro.kelondroFlexTable; import de.anomic.kelondro.kelondroRow; @@ -133,20 +132,22 @@ public class plasmaCrawlEURL extends indexURL { * ======================================================================= */ private LinkedList rejectedStack = new LinkedList(); // strings: url + public final static kelondroRow rowdef = new kelondroRow( + "String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash + "String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash + "String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator + "String executor-" + yacySeedDB.commonHashLength + ", " + // the crawling executor + "String urlstring-256, " + // the url as string + "String urlname-40, " + // the name of the url, from anchor tag name + "Cardinal appdate-4 {b64e}, " + // the time when the url was first time appeared + "Cardinal loaddate-4 {b64e}, " + // the time when the url was last time tried to load + "Cardinal retrycount-2 {b64e}, " + // number of load retries + "String failcause-80, " + // string describing load failure + "byte[] flags-2"); // extra space + public plasmaCrawlEURL(File cachePath, int bufferkb, long preloadTime, boolean newdb) { super(); - kelondroRow rowdef = new kelondroRow( - "String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash - "String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash - "String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator - "String executor-" + yacySeedDB.commonHashLength + ", " + // the crawling executor - "String urlstring-" + indexRWIEntryOld.urlStringLength + ", " + // the url as string - "String urlname-" + indexRWIEntryOld.urlNameLength + ", " + // the name of the url, from anchor tag name - "Cardinal appdate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // the time when the url was first time appeared - "Cardinal loaddate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " 
+ // the time when the url was last time tried to load - "Cardinal retrycount-" + indexRWIEntryOld.urlRetryLength + " {b64e}, " + // number of load retries - "String failcause-" + indexRWIEntryOld.urlErrorLength + ", " + // string describing load failure - "byte[] flags-" + indexRWIEntryOld.urlFlagLength); // extra space + if (newdb) { String newCacheName = "urlErr3.table"; @@ -291,8 +292,8 @@ public class plasmaCrawlEURL extends indexURL { // stores the values from the object variables into the database if (this.stored) return; if (this.hash == null) return; - String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, indexRWIEntryOld.urlDateLength); - String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, indexRWIEntryOld.urlDateLength); + String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, rowdef.width(6)); + String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, rowdef.width(7)); // store the hash in the hash cache try { @@ -306,7 +307,7 @@ public class plasmaCrawlEURL extends indexURL { this.name.getBytes(), initdatestr.getBytes(), trydatestr.getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(this.trycount, indexRWIEntryOld.urlRetryLength).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(this.trycount, rowdef.width(8)).getBytes(), this.failreason.getBytes(), this.flags.getBytes() }; diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java index 9e07210df..7b31da4cc 100644 --- a/source/de/anomic/plasma/plasmaCrawlNURL.java +++ b/source/de/anomic/plasma/plasmaCrawlNURL.java @@ -51,7 +51,6 @@ import java.util.HashSet; import java.util.Iterator; import de.anomic.index.indexURL; -import de.anomic.index.indexRWIEntryOld; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroCache; import 
de.anomic.kelondro.kelondroException; @@ -66,7 +65,7 @@ import de.anomic.tools.bitfield; import de.anomic.yacy.yacySeedDB; public class plasmaCrawlNURL extends indexURL { - + public static final int STACK_TYPE_NULL = 0; // do not stack public static final int STACK_TYPE_CORE = 1; // put on local stack public static final int STACK_TYPE_LIMIT = 2; // put on global stack @@ -80,18 +79,19 @@ public class plasmaCrawlNURL extends indexURL { * column length definition for the {@link plasmaURL#urlIndexFile} DB */ public final static kelondroRow rowdef = new kelondroRow( - "String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash - "String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator - "String urlstring-" + indexRWIEntryOld.urlStringLength + ", " + // the url as string - "String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash - "String urlname-" + indexRWIEntryOld.urlNameLength + ", " + // the name of the url, from anchor tag name - "Cardinal appdate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // the time when the url was first time appeared - "String profile-" + indexRWIEntryOld.urlCrawlProfileHandleLength + ", " + // the name of the prefetch profile handle - "Cardinal depth-" + indexRWIEntryOld.urlCrawlDepthLength + " {b64e}, " + // the prefetch depth so far, starts at 0 - "Cardinal parentbr-" + indexRWIEntryOld.urlParentBranchesLength + " {b64e}, " + // number of anchors of the parent - "Cardinal forkfactor-" + indexRWIEntryOld.urlForkFactorLength + " {b64e}, " + // sum of anchors of all ancestors - "byte[] flags-" + indexRWIEntryOld.urlFlagLength + ", " + // flags - "String handle-" + indexRWIEntryOld.urlHandleLength); // extra handle + "String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash + "String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator + "String urlstring-256, " + // the url as string + "String refhash-" + 
yacySeedDB.commonHashLength + ", " + // the url's referrer hash + "String urlname-40, " + // the name of the url, from anchor tag name + "Cardinal appdate-4 {b64e}, " + // the time when the url was first time appeared + "String profile-4, " + // the name of the prefetch profile handle + "Cardinal depth-2 {b64e}, " + // the prefetch depth so far, starts at 0 + "Cardinal parentbr-3 {b64e}, " + // number of anchors of the parent + "Cardinal forkfactor-4 {b64e}, " + // sum of anchors of all ancestors + "byte[] flags-2, " + // flags + "String handle-4" // extra handle + ); private final plasmaCrawlBalancer coreStack; // links found by crawling to depth-1 private final plasmaCrawlBalancer limitStack; // links found by crawling at target depth @@ -259,7 +259,7 @@ public class plasmaCrawlNURL extends indexURL { private static String normalizeHandle(int h) { String d = Integer.toHexString(h); - while (d.length() < indexRWIEntryOld.urlHandleLength) d = "0" + d; + while (d.length() < rowdef.width(11)) d = "0" + d; return d; } @@ -481,7 +481,7 @@ public class plasmaCrawlNURL extends indexURL { this.depth = depth; this.anchors = anchors; this.forkfactor = forkfactor; - this.flags = new bitfield(indexRWIEntryOld.urlFlagLength); + this.flags = new bitfield(rowdef.width(10)); this.handle = 0; this.stored = false; } @@ -535,7 +535,7 @@ public class plasmaCrawlNURL extends indexURL { public void store() { // stores the values from the object variables into the database if (this.stored) return; - String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength); + String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, rowdef.width(5)); // store the hash in the hash cache try { // even if the entry exists, we simply overwrite it @@ -547,9 +547,9 @@ public class plasmaCrawlNURL extends indexURL { this.name.getBytes("UTF-8"), loaddatestr.getBytes(), (this.profileHandle == null) ? 
null : this.profileHandle.getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(this.depth, indexRWIEntryOld.urlCrawlDepthLength).getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, indexRWIEntryOld.urlParentBranchesLength).getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, indexRWIEntryOld.urlForkFactorLength).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(this.depth, rowdef.width(7)).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, rowdef.width(8)).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, rowdef.width(9)).getBytes(), this.flags.getBytes(), normalizeHandle(this.handle).getBytes() }; diff --git a/source/de/anomic/plasma/plasmaCrawlProfile.java b/source/de/anomic/plasma/plasmaCrawlProfile.java index b8bfeffbf..e9eb1d787 100644 --- a/source/de/anomic/plasma/plasmaCrawlProfile.java +++ b/source/de/anomic/plasma/plasmaCrawlProfile.java @@ -48,7 +48,6 @@ import java.util.HashSet; import java.util.Iterator; import java.util.Map; -import de.anomic.index.indexRWIEntryOld; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroDyn; import de.anomic.kelondro.kelondroException; @@ -63,12 +62,14 @@ public class plasmaCrawlProfile { private int bufferkb; private long preloadTime; + public static final int crawlProfileHandleLength = 4; // name of the prefetch profile + public plasmaCrawlProfile(File file, int bufferkb, long preloadTime) { this.profileTableFile = file; this.bufferkb = bufferkb; this.preloadTime = preloadTime; profileTableFile.getParentFile().mkdirs(); - kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, indexRWIEntryOld.urlCrawlProfileHandleLength, 2000, '#'); + kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, crawlProfileHandleLength, 2000, '#'); profileTable = new kelondroMap(dyn); domsCache = new HashMap(); } @@ -94,7 +95,7 @@ public class 
plasmaCrawlProfile { if (profileTable != null) try { profileTable.close(); } catch (IOException e) {} if (!(profileTableFile.delete())) throw new RuntimeException("cannot delete crawl profile database"); profileTableFile.getParentFile().mkdirs(); - kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, indexRWIEntryOld.urlCrawlProfileHandleLength, 2000, '#'); + kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, crawlProfileHandleLength, 2000, '#'); profileTable = new kelondroMap(dyn); } @@ -256,7 +257,7 @@ public class plasmaCrawlProfile { boolean storeHTCache, boolean storeTXCache, boolean localIndexing, boolean remoteIndexing, boolean xsstopw, boolean xdstopw, boolean xpstopw) { - String handle = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, indexRWIEntryOld.urlCrawlProfileHandleLength); + String handle = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, crawlProfileHandleLength); mem = new HashMap(); mem.put("handle", handle); mem.put("name", name); diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 27fc0d9ed..8ea724ec9 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -60,7 +60,6 @@ import org.apache.commons.pool.impl.GenericObjectPool; import de.anomic.data.robotsParser; import de.anomic.http.httpc; import de.anomic.index.indexURL; -import de.anomic.index.indexRWIEntryOld; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroCache; @@ -492,7 +491,7 @@ public final class plasmaCrawlStacker { this.depth = depth; this.anchors = anchors; this.forkfactor = forkfactor; - this.flags = new bitfield(indexRWIEntryOld.urlFlagLength); + this.flags = new bitfield(); 
this.handle = 0; } catch (Exception e) { e.printStackTrace(); @@ -571,11 +570,11 @@ public final class plasmaCrawlStacker { //.append("flags: ").append((flags==null) ? "null" : flags.toString()) ; return str.toString(); - } + } public byte[][] getBytes() { // stores the values from the object variables into the database - String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength); + String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, plasmaCrawlNURL.rowdef.width(5)); // store the hash in the hash cache // even if the entry exists, we simply overwrite it @@ -589,19 +588,19 @@ public final class plasmaCrawlStacker { this.name.getBytes("UTF-8"), loaddatestr.getBytes(), (this.profileHandle == null) ? null : this.profileHandle.getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(this.depth, indexRWIEntryOld.urlCrawlDepthLength).getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, indexRWIEntryOld.urlParentBranchesLength).getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, indexRWIEntryOld.urlForkFactorLength).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(this.depth, plasmaCrawlNURL.rowdef.width(7)).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, plasmaCrawlNURL.rowdef.width(8)).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, plasmaCrawlNURL.rowdef.width(9)).getBytes(), this.flags.getBytes(), normalizeHandle(this.handle).getBytes() - }; + }; } catch (UnsupportedEncodingException e) { /* ignore this */ } return entry; } private String normalizeHandle(int h) { String d = Integer.toHexString(h); - while (d.length() < indexRWIEntryOld.urlHandleLength) d = "0" + d; + while (d.length() < plasmaCrawlNURL.rowdef.width(11)) d = "0" + d; return d; } } @@ -1059,7 +1058,7 @@ public final class plasmaCrawlStacker { yacyCore.seedDB.mySeed.hash, 
this.theMsg.name, rejectReason, - new bitfield(indexRWIEntryOld.urlFlagLength) + new bitfield() ); ee.store(); sb.urlPool.errorURL.stackPushEntry(ee); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 572120f86..07746041b 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1497,7 +1497,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (document == null) return; } catch (ParserException e) { this.log.logInfo("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage()); - addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorPeerHash, entry.anchorName(), e.getErrorCode(), new bitfield(indexRWIEntryOld.urlFlagLength)); + addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorPeerHash, entry.anchorName(), e.getErrorCode(), new bitfield()); if (document != null) { document.close(); document = null; @@ -1764,7 +1764,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } } else { log.logFine("Not Indexed Resource '" + entry.normalizedURLString() + "': process case=" + processCase); - addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new bitfield(indexRWIEntryOld.urlFlagLength)); + addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new bitfield()); } } catch (Exception ee) { if (ee instanceof InterruptedException) throw (InterruptedException)ee; @@ -1776,7 +1776,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) { yacyClient.crawlReceipt(initiatorPeer, "crawl", "exception", ee.getMessage(), null, ""); } - addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, 
docDescription, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new bitfield(indexRWIEntryOld.urlFlagLength)); + addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new bitfield()); } } else { @@ -1784,7 +1784,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser checkInterruption(); log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason); - addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, noIndexReason, new bitfield(indexRWIEntryOld.urlFlagLength)); + addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, noIndexReason, new bitfield()); if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) { yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, ""); } diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java index a6f28f456..16fa24d26 100644 --- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java +++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java @@ -51,7 +51,6 @@ import java.util.ArrayList; import java.util.Date; import de.anomic.index.indexURL; -import de.anomic.index.indexRWIEntryOld; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroException; @@ -78,18 +77,18 @@ public class plasmaSwitchboardQueue { initQueueStack(); } - + + public static final kelondroRow rowdef = new kelondroRow( + "String url-256, " + // the url + "String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash + "Cardinal modifiedsince-11 {b64e}, " + // from ifModifiedSince + "byte[] flags-1, " + // flags + "String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator + "Cardinal depth-2 {b64e}, " + // the prefetch depth so far, starts at 0 + "String 
profile-" + plasmaCrawlProfile.crawlProfileHandleLength + ", " + // the name of the prefetch profile handle + "String urldescr-80"); + private void initQueueStack() { - kelondroRow rowdef = new kelondroRow( - "String url-" + indexRWIEntryOld.urlStringLength + ", " + // the url - "String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash - "Cardinal modifiedsince-11" + " {b64e}, " + // from ifModifiedSince - "byte[] flags-1" + ", " + // flags - "String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator - "Cardinal depth-" + indexRWIEntryOld.urlCrawlDepthLength + " {b64e}, " + // the prefetch depth so far, starts at 0 - "String profile-" + indexRWIEntryOld.urlCrawlProfileHandleLength + ", " + // the name of the prefetch profile handle - "String urldescr-" + indexRWIEntryOld.urlDescrLength); // - sbQueueStack = kelondroStack.open(sbQueueStackPath, rowdef); } @@ -110,7 +109,7 @@ public class plasmaSwitchboardQueue { kelondroBase64Order.enhancedCoder.encodeLong((entry.ifModifiedSince == null) ? 0 : entry.ifModifiedSince.getTime(), 11).getBytes(), new byte[]{entry.flags}, (entry.initiator == null) ? indexURL.dummyHash.getBytes() : entry.initiator.getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong((long) entry.depth, indexRWIEntryOld.urlCrawlDepthLength).getBytes(), + kelondroBase64Order.enhancedCoder.encodeLong((long) entry.depth, rowdef.width(5)).getBytes(), (entry.profileHandle == null) ? indexURL.dummyHash.getBytes() : entry.profileHandle.getBytes(), (entry.anchorName == null) ? 
"-".getBytes("UTF-8") : entry.anchorName.getBytes("UTF-8") })); diff --git a/source/de/anomic/tools/bitfield.java b/source/de/anomic/tools/bitfield.java index b417145ab..7138ddbf1 100644 --- a/source/de/anomic/tools/bitfield.java +++ b/source/de/anomic/tools/bitfield.java @@ -44,6 +44,10 @@ public class bitfield { private byte[] bb; + public bitfield() { + this(0); + } + public bitfield(int bytelength) { this.bb= new byte[bytelength]; for (int i = 0 ; i < bytelength; i++) bb[i] = 0; @@ -58,20 +62,29 @@ public class bitfield { return (byte) ((64 | ((a + 16) | (1< 5) || (pos < 0)) throw new RuntimeException("atom position out of bounds: " + pos); return (byte) (((a + 16) & (0xff ^ (1< bb.length)) throw new RuntimeException("position out of bounds: " + pos); + if (pos < 0) throw new RuntimeException("position out of bounds: " + pos); + if (slot >= bb.length) { + // extend capacity + byte[] nb = new byte[slot + 1]; + System.arraycopy(bb, 0, nb, 0, bb.length); + for (int i = bb.length; i < nb.length; i++) nb[i] = 0; + bb = nb; + nb = null; + } bb[slot] = (value) ? setAtom(bb[slot], pos % 6) : unsetAtom(bb[slot], pos % 6); } public boolean get(int pos) { int slot = pos / 6; - if ((pos < 0) || (slot > bb.length)) throw new RuntimeException("position out of bounds: " + pos); + if (pos < 0) throw new RuntimeException("position out of bounds: " + pos); + if (slot >= bb.length) return false; return (bb[slot] & (1<<(pos%6))) > 0; }