refactoring

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2945 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent f77d624b94
commit 76fceb9997

@ -60,7 +60,6 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter; import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlProfile; import de.anomic.plasma.plasmaCrawlProfile;
@ -205,7 +204,7 @@ public class IndexCreate_p {
prop.put("error_reasonString", reasonString); prop.put("error_reasonString", reasonString);
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
crawlingStartURL.getHost(), reasonString, new bitfield(indexRWIEntryOld.urlFlagLength)); crawlingStartURL.getHost(), reasonString, new bitfield());
ee.store(); ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee); switchboard.urlPool.errorURL.stackPushEntry(ee);
} }
@ -283,7 +282,7 @@ public class IndexCreate_p {
c++; c++;
} else { } else {
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
(String) e.getValue(), rejectReason, new bitfield(indexRWIEntryOld.urlFlagLength)); (String) e.getValue(), rejectReason, new bitfield());
ee.store(); ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee); switchboard.urlPool.errorURL.stackPushEntry(ee);
} }

@ -50,7 +50,6 @@ import java.io.IOException;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry; import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlNURL;
@ -157,7 +156,7 @@ public final class crawlReceipt {
} else { } else {
try { try {
plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash); plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash);
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(indexRWIEntryOld.urlFlagLength)); plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield());
ee.store(); ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee); switchboard.urlPool.errorURL.stackPushEntry(ee);
switchboard.urlPool.noticeURL.remove(receivedUrlhash); switchboard.urlPool.noticeURL.remove(receivedUrlhash);

@ -36,35 +36,14 @@ import de.anomic.yacy.yacySeedDB;
public class indexRWIEntryOld implements Cloneable, indexRWIEntry { public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
// this object stores attributes to URL references inside RWI collections // this object stores attributes to URL references inside RWI collections
// statics for value lengths
public static final int urlStringLength = 256;// not too short for links without parameters
public static final int urlDescrLength = 80; // The headline of a web page (meta-tag or <h1>)
public static final int urlNameLength = 40; // the tag content between <a> and </a>
public static final int urldescrtagsLength = 320;// the url, the description and tags in one string
public static final int urlErrorLength = 80; // a reason description for unavailable urls
public static final int urlDateLength = 4; // any date, shortened
public static final int urlCopyCountLength = 2; // counter for numbers of copies of this index
public static final int urlFlagLength = 2; // any stuff
public static final int urlLanguageLength = 2; // taken from TLD suffix as quick-hack
public static final int urlDoctypeLength = 1; // taken from extension
public static final int urlSizeLength = 6; // the source size, from cache
public static final int urlWordCountLength = 3; // the number of words, from condenser
public static final int urlCrawlProfileHandleLength = 4; // name of the prefetch profile
public static final int urlCrawlDepthLength = 2; // prefetch depth, first is '0'
public static final int urlParentBranchesLength = 3; // number of anchors of the parent
public static final int urlForkFactorLength = 4; // sum of anchors of all ancestors
public static final int urlRetryLength = 2; // number of load retries
public static final int urlHostLength = 8; // the host as truncated name
public static final int urlHandleLength = 4; // a handle
public static final int urlQualityLength = 3; // taken from heuristic
public static kelondroRow urlEntryRow = new kelondroRow(new kelondroColumn[]{ public static kelondroRow urlEntryRow = new kelondroRow(new kelondroColumn[]{
new kelondroColumn("h", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, yacySeedDB.commonHashLength, "urlhash"), new kelondroColumn("h", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, yacySeedDB.commonHashLength, "urlhash"),
new kelondroColumn("q", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, urlQualityLength, "quality"), new kelondroColumn("q", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 3, "quality"),
new kelondroColumn("a", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 3, "lastModified"), new kelondroColumn("a", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 3, "lastModified"),
new kelondroColumn("c", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "hitcount"), new kelondroColumn("c", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "hitcount"),
new kelondroColumn("l", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, urlLanguageLength, "language"), new kelondroColumn("l", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, 2, "language"),
new kelondroColumn("d", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "doctype"), new kelondroColumn("d", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "doctype"),
new kelondroColumn("f", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "localflag"), new kelondroColumn("f", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "localflag"),
new kelondroColumn("t", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posintext"), new kelondroColumn("t", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posintext"),
@ -118,7 +97,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
// - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag, emphasis, meta-tags, word in link etc // - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag, emphasis, meta-tags, word in link etc
// - boolean: URL attributes // - boolean: URL attributes
assert (urlHash.length() == 12) : "urlhash = " + urlHash; assert (urlHash.length() == 12) : "urlhash = " + urlHash;
if ((language == null) || (language.length() != urlLanguageLength)) language = "uk"; if ((language == null) || (language.length() != urlEntryRow.width(col_language))) language = "uk";
this.entry = urlEntryRow.newEntry(); this.entry = urlEntryRow.newEntry();
this.entry.setCol(col_urlhash, urlHash, null); this.entry.setCol(col_urlhash, urlHash, null);
this.entry.setCol(col_quality, quality); this.entry.setCol(col_quality, quality);

@ -40,21 +40,21 @@ import de.anomic.tools.crypt;
import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacySeedDB;
public class indexURLEntryOld implements indexURLEntry { public class indexURLEntryOld implements indexURLEntry {
public static final kelondroRow rowdef = new kelondroRow( public static final kelondroRow rowdef = new kelondroRow(
"String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash "String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
"String urlstring-" + indexRWIEntryOld.urlStringLength + ", " + // the url as string "String urlstring-256, " + // the url as string
"String urldescr-" + indexRWIEntryOld.urlDescrLength + ", " + // the description of the url "String urldescr-80, " + // the description of the url
"Cardinal moddate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // last-modified from the httpd "Cardinal moddate-4 {b64e}, " + // last-modified from the httpd
"Cardinal loaddate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // time when the url was loaded "Cardinal loaddate-4 {b64e}, " + // time when the url was loaded
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash "String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"Cardinal copycount-" + indexRWIEntryOld.urlCopyCountLength + " {b64e}, " + // "Cardinal copycount-2 {b64e}, " + // not used
"byte[] flags-" + indexRWIEntryOld.urlFlagLength + ", " + // flags "byte[] flags-2, " + // flags
"Cardinal quality-" + indexRWIEntryOld.urlQualityLength + " {b64e}, " + // "Cardinal quality-3 {b64e}, " + // deprecated
"String language-" + indexRWIEntryOld.urlLanguageLength + ", " + // "String language-2, " + // language key; mainly the TLD
"byte[] doctype-" + indexRWIEntryOld.urlDoctypeLength + ", " + // "byte[] doctype-1, " + //
"Cardinal size-" + indexRWIEntryOld.urlSizeLength + " {b64e}, " + // size of file in bytes "Cardinal size-6 {b64e}, " + // size of file in bytes
"Cardinal wc-" + indexRWIEntryOld.urlWordCountLength + " {b64e}"); // word count "Cardinal wc-3 {b64e}"); // word count
private URL url; private URL url;
private String descr; private String descr;
@ -176,8 +176,8 @@ public class indexURLEntryOld implements indexURLEntry {
} }
public kelondroRow.Entry toRowEntry() throws IOException { public kelondroRow.Entry toRowEntry() throws IOException {
final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength); final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, rowdef.width(3));
final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength); final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, rowdef.width(4));
final byte[][] entry = new byte[][] { final byte[][] entry = new byte[][] {
urlHash.getBytes(), urlHash.getBytes(),
@ -186,13 +186,13 @@ public class indexURLEntryOld implements indexURLEntry {
moddatestr.getBytes(), moddatestr.getBytes(),
loaddatestr.getBytes(), loaddatestr.getBytes(),
referrerHash.getBytes(), referrerHash.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(copyCount, indexRWIEntryOld.urlCopyCountLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(copyCount, rowdef.width(6)).getBytes(),
flags.getBytes(), flags.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(quality, indexRWIEntryOld.urlQualityLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(quality, rowdef.width(8)).getBytes(),
language.getBytes(), language.getBytes(),
new byte[] { (byte) doctype }, new byte[] { (byte) doctype },
kelondroBase64Order.enhancedCoder.encodeLong(size, indexRWIEntryOld.urlSizeLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(size, rowdef.width(11)).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(wordCount, indexRWIEntryOld.urlWordCountLength).getBytes()}; kelondroBase64Order.enhancedCoder.encodeLong(wordCount, rowdef.width(12)).getBytes()};
return rowdef.newEntry(entry); return rowdef.newEntry(entry);
} }
@ -288,9 +288,7 @@ public class indexURLEntryOld implements indexURLEntry {
.append(",size=").append(size).append(",wc=").append( .append(",size=").append(size).append(",wc=").append(
wordCount).append(",cc=").append(copyCount).append( wordCount).append(",cc=").append(copyCount).append(
",local=").append(((local()) ? "true" : "false")) ",local=").append(((local()) ? "true" : "false"))
.append(",q=").append( .append(",q=0")
kelondroBase64Order.enhancedCoder.encodeLong(
quality, indexRWIEntryOld.urlQualityLength))
.append(",dt=").append(doctype).append(",lang=").append( .append(",dt=").append(doctype).append(",lang=").append(
language).append(",url=").append( language).append(",url=").append(
crypt.simpleEncode(url.toString())).append( crypt.simpleEncode(url.toString())).append(

@ -1179,6 +1179,8 @@ public class kelondroRecords {
byte[] key = nn.getKey(); byte[] key = nn.getKey();
if ((key == null) || if ((key == null) ||
((key.length == 1) && (key[0] == (byte) 0x80)) || // the NUL pointer ('lost' chain terminator) ((key.length == 1) && (key[0] == (byte) 0x80)) || // the NUL pointer ('lost' chain terminator)
(key.length < 3) ||
((key.length > 3) && (key[2] == 0) && (key[3] == 0)) ||
((key.length > 3) && (key[0] == (byte) 0x80) && (key[1] == 0) && (key[2] == 0) && (key[3] == 0)) || ((key.length > 3) && (key[0] == (byte) 0x80) && (key[1] == 0) && (key[2] == 0) && (key[3] == 0)) ||
((key.length > 0) && (key[0] == 0)) // a 'lost' pointer within a deleted-chain ((key.length > 0) && (key[0] == 0)) // a 'lost' pointer within a deleted-chain
) { ) {

@ -162,6 +162,7 @@ public class kelondroRow {
} }
public Entry(byte[] rowinstance, int start, int length) { public Entry(byte[] rowinstance, int start, int length) {
assert objectsize == length;
this.rowinstance = new byte[objectsize]; this.rowinstance = new byte[objectsize];
int ll = Math.min(objectsize, length); int ll = Math.min(objectsize, length);
System.arraycopy(rowinstance, start, this.rowinstance, 0, ll); System.arraycopy(rowinstance, start, this.rowinstance, 0, ll);
@ -169,15 +170,20 @@ public class kelondroRow {
} }
public Entry(byte[][] cols) { public Entry(byte[][] cols) {
assert row.length == cols.length;
rowinstance = new byte[objectsize]; rowinstance = new byte[objectsize];
int ll; int ll;
int cs, cw;
for (int i = 0; i < row.length; i++) { for (int i = 0; i < row.length; i++) {
cs = colstart[i];
cw = row[i].cellwidth();
if ((i >= cols.length) || (cols[i] == null)) { if ((i >= cols.length) || (cols[i] == null)) {
for (int j = 0; j < row[i].cellwidth(); j++) this.rowinstance[colstart[i] + j] = 0; for (int j = 0; j < cw; j++) this.rowinstance[cs + j] = 0;
} else { } else {
ll = Math.min(cols[i].length, row[i].cellwidth()); //assert cols[i].length <= cw : "i = " + i + ", cols[i].length = " + cols[i].length + ", cw = " + cw;
System.arraycopy(cols[i], 0, rowinstance, colstart[i], ll); ll = Math.min(cols[i].length, cw);
for (int j = ll; j < row[i].cellwidth(); j++) this.rowinstance[colstart[i] + j] = 0; System.arraycopy(cols[i], 0, rowinstance, cs, ll);
for (int j = ll; j < cw; j++) this.rowinstance[cs + j] = 0;
} }
} }
} }
@ -252,6 +258,7 @@ public class kelondroRow {
System.arraycopy(cell, 0, rowinstance, offset, cell.length); System.arraycopy(cell, 0, rowinstance, offset, cell.length);
while (length-- > cell.length) rowinstance[offset + length] = 0; while (length-- > cell.length) rowinstance[offset + length] = 0;
} else { } else {
//assert cell.length == length;
System.arraycopy(cell, 0, rowinstance, offset, length); System.arraycopy(cell, 0, rowinstance, offset, length);
} }
} }

@ -51,7 +51,6 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlLoaderMessage; import de.anomic.plasma.plasmaCrawlLoaderMessage;
@ -298,7 +297,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
this.name, this.name,
(failreason==null)?"Unknown reason":failreason, (failreason==null)?"Unknown reason":failreason,
new bitfield(indexRWIEntryOld.urlFlagLength) new bitfield()
); );
// store the entry // store the entry

@ -54,7 +54,6 @@ import java.util.Iterator;
import java.util.LinkedList; import java.util.LinkedList;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroFlexTable; import de.anomic.kelondro.kelondroFlexTable;
import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow;
@ -133,20 +132,22 @@ public class plasmaCrawlEURL extends indexURL {
* ======================================================================= */ * ======================================================================= */
private LinkedList rejectedStack = new LinkedList(); // strings: url private LinkedList rejectedStack = new LinkedList(); // strings: url
public final static kelondroRow rowdef = new kelondroRow(
"String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"String executor-" + yacySeedDB.commonHashLength + ", " + // the crawling executor
"String urlstring-256, " + // the url as string
"String urlname-40, " + // the name of the url, from anchor tag <a>name</a>
"Cardinal appdate-4 {b64e}, " + // the time when the url was first time appeared
"Cardinal loaddate-4 {b64e}, " + // the time when the url was last time tried to load
"Cardinal retrycount-2 {b64e}, " + // number of load retries
"String failcause-80, " + // string describing load failure
"byte[] flags-2"); // extra space
public plasmaCrawlEURL(File cachePath, int bufferkb, long preloadTime, boolean newdb) { public plasmaCrawlEURL(File cachePath, int bufferkb, long preloadTime, boolean newdb) {
super(); super();
kelondroRow rowdef = new kelondroRow(
"String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"String executor-" + yacySeedDB.commonHashLength + ", " + // the crawling executor
"String urlstring-" + indexRWIEntryOld.urlStringLength + ", " + // the url as string
"String urlname-" + indexRWIEntryOld.urlNameLength + ", " + // the name of the url, from anchor tag <a>name</a>
"Cardinal appdate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // the time when the url was first time appeared
"Cardinal loaddate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // the time when the url was last time tried to load
"Cardinal retrycount-" + indexRWIEntryOld.urlRetryLength + " {b64e}, " + // number of load retries
"String failcause-" + indexRWIEntryOld.urlErrorLength + ", " + // string describing load failure
"byte[] flags-" + indexRWIEntryOld.urlFlagLength); // extra space
if (newdb) { if (newdb) {
String newCacheName = "urlErr3.table"; String newCacheName = "urlErr3.table";
@ -291,8 +292,8 @@ public class plasmaCrawlEURL extends indexURL {
// stores the values from the object variables into the database // stores the values from the object variables into the database
if (this.stored) return; if (this.stored) return;
if (this.hash == null) return; if (this.hash == null) return;
String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, indexRWIEntryOld.urlDateLength); String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, rowdef.width(6));
String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, indexRWIEntryOld.urlDateLength); String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, rowdef.width(7));
// store the hash in the hash cache // store the hash in the hash cache
try { try {
@ -306,7 +307,7 @@ public class plasmaCrawlEURL extends indexURL {
this.name.getBytes(), this.name.getBytes(),
initdatestr.getBytes(), initdatestr.getBytes(),
trydatestr.getBytes(), trydatestr.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.trycount, indexRWIEntryOld.urlRetryLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(this.trycount, rowdef.width(8)).getBytes(),
this.failreason.getBytes(), this.failreason.getBytes(),
this.flags.getBytes() this.flags.getBytes()
}; };

@ -51,7 +51,6 @@ import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroCache; import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
@ -66,7 +65,7 @@ import de.anomic.tools.bitfield;
import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlNURL extends indexURL { public class plasmaCrawlNURL extends indexURL {
public static final int STACK_TYPE_NULL = 0; // do not stack public static final int STACK_TYPE_NULL = 0; // do not stack
public static final int STACK_TYPE_CORE = 1; // put on local stack public static final int STACK_TYPE_CORE = 1; // put on local stack
public static final int STACK_TYPE_LIMIT = 2; // put on global stack public static final int STACK_TYPE_LIMIT = 2; // put on global stack
@ -80,18 +79,19 @@ public class plasmaCrawlNURL extends indexURL {
* column length definition for the {@link plasmaURL#urlIndexFile} DB * column length definition for the {@link plasmaURL#urlIndexFile} DB
*/ */
public final static kelondroRow rowdef = new kelondroRow( public final static kelondroRow rowdef = new kelondroRow(
"String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash "String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator "String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"String urlstring-" + indexRWIEntryOld.urlStringLength + ", " + // the url as string "String urlstring-256, " + // the url as string
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash "String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"String urlname-" + indexRWIEntryOld.urlNameLength + ", " + // the name of the url, from anchor tag <a>name</a> "String urlname-40, " + // the name of the url, from anchor tag <a>name</a>
"Cardinal appdate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // the time when the url was first time appeared "Cardinal appdate-4 {b64e}, " + // the time when the url was first time appeared
"String profile-" + indexRWIEntryOld.urlCrawlProfileHandleLength + ", " + // the name of the prefetch profile handle "String profile-4, " + // the name of the prefetch profile handle
"Cardinal depth-" + indexRWIEntryOld.urlCrawlDepthLength + " {b64e}, " + // the prefetch depth so far, starts at 0 "Cardinal depth-2 {b64e}, " + // the prefetch depth so far, starts at 0
"Cardinal parentbr-" + indexRWIEntryOld.urlParentBranchesLength + " {b64e}, " + // number of anchors of the parent "Cardinal parentbr-3 {b64e}, " + // number of anchors of the parent
"Cardinal forkfactor-" + indexRWIEntryOld.urlForkFactorLength + " {b64e}, " + // sum of anchors of all ancestors "Cardinal forkfactor-4 {b64e}, " + // sum of anchors of all ancestors
"byte[] flags-" + indexRWIEntryOld.urlFlagLength + ", " + // flags "byte[] flags-2, " + // flags
"String handle-" + indexRWIEntryOld.urlHandleLength); // extra handle "String handle-4" // extra handle
);
private final plasmaCrawlBalancer coreStack; // links found by crawling to depth-1 private final plasmaCrawlBalancer coreStack; // links found by crawling to depth-1
private final plasmaCrawlBalancer limitStack; // links found by crawling at target depth private final plasmaCrawlBalancer limitStack; // links found by crawling at target depth
@ -259,7 +259,7 @@ public class plasmaCrawlNURL extends indexURL {
private static String normalizeHandle(int h) { private static String normalizeHandle(int h) {
String d = Integer.toHexString(h); String d = Integer.toHexString(h);
while (d.length() < indexRWIEntryOld.urlHandleLength) d = "0" + d; while (d.length() < rowdef.width(11)) d = "0" + d;
return d; return d;
} }
@ -481,7 +481,7 @@ public class plasmaCrawlNURL extends indexURL {
this.depth = depth; this.depth = depth;
this.anchors = anchors; this.anchors = anchors;
this.forkfactor = forkfactor; this.forkfactor = forkfactor;
this.flags = new bitfield(indexRWIEntryOld.urlFlagLength); this.flags = new bitfield(rowdef.width(10));
this.handle = 0; this.handle = 0;
this.stored = false; this.stored = false;
} }
@ -535,7 +535,7 @@ public class plasmaCrawlNURL extends indexURL {
public void store() { public void store() {
// stores the values from the object variables into the database // stores the values from the object variables into the database
if (this.stored) return; if (this.stored) return;
String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength); String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, rowdef.width(5));
// store the hash in the hash cache // store the hash in the hash cache
try { try {
// even if the entry exists, we simply overwrite it // even if the entry exists, we simply overwrite it
@ -547,9 +547,9 @@ public class plasmaCrawlNURL extends indexURL {
this.name.getBytes("UTF-8"), this.name.getBytes("UTF-8"),
loaddatestr.getBytes(), loaddatestr.getBytes(),
(this.profileHandle == null) ? null : this.profileHandle.getBytes(), (this.profileHandle == null) ? null : this.profileHandle.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.depth, indexRWIEntryOld.urlCrawlDepthLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(this.depth, rowdef.width(7)).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, indexRWIEntryOld.urlParentBranchesLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, rowdef.width(8)).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, indexRWIEntryOld.urlForkFactorLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, rowdef.width(9)).getBytes(),
this.flags.getBytes(), this.flags.getBytes(),
normalizeHandle(this.handle).getBytes() normalizeHandle(this.handle).getBytes()
}; };

@ -48,7 +48,6 @@ import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroDyn; import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
@ -63,12 +62,14 @@ public class plasmaCrawlProfile {
private int bufferkb; private int bufferkb;
private long preloadTime; private long preloadTime;
public static final int crawlProfileHandleLength = 4; // name of the prefetch profile
public plasmaCrawlProfile(File file, int bufferkb, long preloadTime) { public plasmaCrawlProfile(File file, int bufferkb, long preloadTime) {
this.profileTableFile = file; this.profileTableFile = file;
this.bufferkb = bufferkb; this.bufferkb = bufferkb;
this.preloadTime = preloadTime; this.preloadTime = preloadTime;
profileTableFile.getParentFile().mkdirs(); profileTableFile.getParentFile().mkdirs();
kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, indexRWIEntryOld.urlCrawlProfileHandleLength, 2000, '#'); kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, crawlProfileHandleLength, 2000, '#');
profileTable = new kelondroMap(dyn); profileTable = new kelondroMap(dyn);
domsCache = new HashMap(); domsCache = new HashMap();
} }
@ -94,7 +95,7 @@ public class plasmaCrawlProfile {
if (profileTable != null) try { profileTable.close(); } catch (IOException e) {} if (profileTable != null) try { profileTable.close(); } catch (IOException e) {}
if (!(profileTableFile.delete())) throw new RuntimeException("cannot delete crawl profile database"); if (!(profileTableFile.delete())) throw new RuntimeException("cannot delete crawl profile database");
profileTableFile.getParentFile().mkdirs(); profileTableFile.getParentFile().mkdirs();
kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, indexRWIEntryOld.urlCrawlProfileHandleLength, 2000, '#'); kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, crawlProfileHandleLength, 2000, '#');
profileTable = new kelondroMap(dyn); profileTable = new kelondroMap(dyn);
} }
@ -256,7 +257,7 @@ public class plasmaCrawlProfile {
boolean storeHTCache, boolean storeTXCache, boolean storeHTCache, boolean storeTXCache,
boolean localIndexing, boolean remoteIndexing, boolean localIndexing, boolean remoteIndexing,
boolean xsstopw, boolean xdstopw, boolean xpstopw) { boolean xsstopw, boolean xdstopw, boolean xpstopw) {
String handle = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, indexRWIEntryOld.urlCrawlProfileHandleLength); String handle = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, crawlProfileHandleLength);
mem = new HashMap(); mem = new HashMap();
mem.put("handle", handle); mem.put("handle", handle);
mem.put("name", name); mem.put("name", name);

@ -60,7 +60,6 @@ import org.apache.commons.pool.impl.GenericObjectPool;
import de.anomic.data.robotsParser; import de.anomic.data.robotsParser;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry; import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroCache; import de.anomic.kelondro.kelondroCache;
@ -492,7 +491,7 @@ public final class plasmaCrawlStacker {
this.depth = depth; this.depth = depth;
this.anchors = anchors; this.anchors = anchors;
this.forkfactor = forkfactor; this.forkfactor = forkfactor;
this.flags = new bitfield(indexRWIEntryOld.urlFlagLength); this.flags = new bitfield();
this.handle = 0; this.handle = 0;
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
@ -571,11 +570,11 @@ public final class plasmaCrawlStacker {
//.append("flags: ").append((flags==null) ? "null" : flags.toString()) //.append("flags: ").append((flags==null) ? "null" : flags.toString())
; ;
return str.toString(); return str.toString();
} }
public byte[][] getBytes() { public byte[][] getBytes() {
// stores the values from the object variables into the database // stores the values from the object variables into the database
String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength); String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, plasmaCrawlNURL.rowdef.width(5));
// store the hash in the hash cache // store the hash in the hash cache
// even if the entry exists, we simply overwrite it // even if the entry exists, we simply overwrite it
@ -589,19 +588,19 @@ public final class plasmaCrawlStacker {
this.name.getBytes("UTF-8"), this.name.getBytes("UTF-8"),
loaddatestr.getBytes(), loaddatestr.getBytes(),
(this.profileHandle == null) ? null : this.profileHandle.getBytes(), (this.profileHandle == null) ? null : this.profileHandle.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.depth, indexRWIEntryOld.urlCrawlDepthLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(this.depth, plasmaCrawlNURL.rowdef.width(7)).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, indexRWIEntryOld.urlParentBranchesLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, plasmaCrawlNURL.rowdef.width(8)).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, indexRWIEntryOld.urlForkFactorLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, plasmaCrawlNURL.rowdef.width(9)).getBytes(),
this.flags.getBytes(), this.flags.getBytes(),
normalizeHandle(this.handle).getBytes() normalizeHandle(this.handle).getBytes()
}; };
} catch (UnsupportedEncodingException e) { /* ignore this */ } } catch (UnsupportedEncodingException e) { /* ignore this */ }
return entry; return entry;
} }
private String normalizeHandle(int h) { private String normalizeHandle(int h) {
String d = Integer.toHexString(h); String d = Integer.toHexString(h);
while (d.length() < indexRWIEntryOld.urlHandleLength) d = "0" + d; while (d.length() < plasmaCrawlNURL.rowdef.width(11)) d = "0" + d;
return d; return d;
} }
} }
@ -1059,7 +1058,7 @@ public final class plasmaCrawlStacker {
yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
this.theMsg.name, this.theMsg.name,
rejectReason, rejectReason,
new bitfield(indexRWIEntryOld.urlFlagLength) new bitfield()
); );
ee.store(); ee.store();
sb.urlPool.errorURL.stackPushEntry(ee); sb.urlPool.errorURL.stackPushEntry(ee);

@ -1497,7 +1497,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (document == null) return; if (document == null) return;
} catch (ParserException e) { } catch (ParserException e) {
this.log.logInfo("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage()); this.log.logInfo("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage());
addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorPeerHash, entry.anchorName(), e.getErrorCode(), new bitfield(indexRWIEntryOld.urlFlagLength)); addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorPeerHash, entry.anchorName(), e.getErrorCode(), new bitfield());
if (document != null) { if (document != null) {
document.close(); document.close();
document = null; document = null;
@ -1764,7 +1764,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} }
} else { } else {
log.logFine("Not Indexed Resource '" + entry.normalizedURLString() + "': process case=" + processCase); log.logFine("Not Indexed Resource '" + entry.normalizedURLString() + "': process case=" + processCase);
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new bitfield(indexRWIEntryOld.urlFlagLength)); addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new bitfield());
} }
} catch (Exception ee) { } catch (Exception ee) {
if (ee instanceof InterruptedException) throw (InterruptedException)ee; if (ee instanceof InterruptedException) throw (InterruptedException)ee;
@ -1776,7 +1776,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) { if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
yacyClient.crawlReceipt(initiatorPeer, "crawl", "exception", ee.getMessage(), null, ""); yacyClient.crawlReceipt(initiatorPeer, "crawl", "exception", ee.getMessage(), null, "");
} }
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new bitfield(indexRWIEntryOld.urlFlagLength)); addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new bitfield());
} }
} else { } else {
@ -1784,7 +1784,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption(); checkInterruption();
log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason); log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason);
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, noIndexReason, new bitfield(indexRWIEntryOld.urlFlagLength)); addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, noIndexReason, new bitfield());
if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) { if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, ""); yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, "");
} }

@ -51,7 +51,6 @@ import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import de.anomic.index.indexURL; import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry; import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
@ -78,18 +77,18 @@ public class plasmaSwitchboardQueue {
initQueueStack(); initQueueStack();
} }
// Row layout of one switchboard queue entry. Columns are addressed by index
// elsewhere in this class (e.g. rowdef.width(5) is the width of the depth column):
//   0 url, 1 refhash, 2 modifiedsince, 3 flags, 4 initiator, 5 depth, 6 profile, 7 urldescr
public static final kelondroRow rowdef = new kelondroRow(
"String url-256, " + // the url
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"Cardinal modifiedsince-11 {b64e}, " + // from ifModifiedSince
"byte[] flags-1, " + // flags
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"Cardinal depth-2 {b64e}, " + // the prefetch depth so far, starts at 0
"String profile-" + plasmaCrawlProfile.crawlProfileHandleLength + ", " + // the name of the prefetch profile handle
"String urldescr-80"); // description column; the entry's anchor name is written here
private void initQueueStack() { private void initQueueStack() {
kelondroRow rowdef = new kelondroRow(
"String url-" + indexRWIEntryOld.urlStringLength + ", " + // the url
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"Cardinal modifiedsince-11" + " {b64e}, " + // from ifModifiedSince
"byte[] flags-1" + ", " + // flags
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"Cardinal depth-" + indexRWIEntryOld.urlCrawlDepthLength + " {b64e}, " + // the prefetch depth so far, starts at 0
"String profile-" + indexRWIEntryOld.urlCrawlProfileHandleLength + ", " + // the name of the prefetch profile handle
"String urldescr-" + indexRWIEntryOld.urlDescrLength); //
sbQueueStack = kelondroStack.open(sbQueueStackPath, rowdef); sbQueueStack = kelondroStack.open(sbQueueStackPath, rowdef);
} }
@ -110,7 +109,7 @@ public class plasmaSwitchboardQueue {
kelondroBase64Order.enhancedCoder.encodeLong((entry.ifModifiedSince == null) ? 0 : entry.ifModifiedSince.getTime(), 11).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong((entry.ifModifiedSince == null) ? 0 : entry.ifModifiedSince.getTime(), 11).getBytes(),
new byte[]{entry.flags}, new byte[]{entry.flags},
(entry.initiator == null) ? indexURL.dummyHash.getBytes() : entry.initiator.getBytes(), (entry.initiator == null) ? indexURL.dummyHash.getBytes() : entry.initiator.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong((long) entry.depth, indexRWIEntryOld.urlCrawlDepthLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong((long) entry.depth, rowdef.width(5)).getBytes(),
(entry.profileHandle == null) ? indexURL.dummyHash.getBytes() : entry.profileHandle.getBytes(), (entry.profileHandle == null) ? indexURL.dummyHash.getBytes() : entry.profileHandle.getBytes(),
(entry.anchorName == null) ? "-".getBytes("UTF-8") : entry.anchorName.getBytes("UTF-8") (entry.anchorName == null) ? "-".getBytes("UTF-8") : entry.anchorName.getBytes("UTF-8")
})); }));

@ -44,6 +44,10 @@ public class bitfield {
private byte[] bb; private byte[] bb;
/**
 * Creates a bitfield by delegating to the byte-length constructor with a
 * length of 0, i.e. with an empty backing array.
 */
public bitfield() {
this(0);
}
public bitfield(int bytelength) { public bitfield(int bytelength) {
this.bb= new byte[bytelength]; this.bb= new byte[bytelength];
for (int i = 0 ; i < bytelength; i++) bb[i] = 0; for (int i = 0 ; i < bytelength; i++) bb[i] = 0;
@ -58,20 +62,29 @@ public class bitfield {
return (byte) ((64 | ((a + 16) | (1<<pos))) - 16); return (byte) ((64 | ((a + 16) | (1<<pos))) - 16);
} }
private static byte unsetAtom(byte a, int pos) { private static byte unsetAtom(byte a, int pos) {
if ((pos > 5) || (pos < 0)) throw new RuntimeException("atom position out of bounds: " + pos); if ((pos > 5) || (pos < 0)) throw new RuntimeException("atom position out of bounds: " + pos);
return (byte) (((a + 16) & (0xff ^ (1<<pos))) - 16); return (byte) (((a + 16) & (0xff ^ (1<<pos))) - 16);
} }
public void set(int pos, boolean value) { public void set(int pos, boolean value) {
int slot = pos / 6; int slot = pos / 6;
if ((pos < 0) || (slot > bb.length)) throw new RuntimeException("position out of bounds: " + pos); if (pos < 0) throw new RuntimeException("position out of bounds: " + pos);
if (slot > bb.length) {
// extend capacity
byte[] nb = new byte[slot + 1];
System.arraycopy(bb, 0, nb, 0, bb.length);
for (int i = bb.length; i < nb.length; i++) nb[i] = 0;
bb = nb;
nb = null;
}
bb[slot] = (value) ? setAtom(bb[slot], pos % 6) : unsetAtom(bb[slot], pos % 6); bb[slot] = (value) ? setAtom(bb[slot], pos % 6) : unsetAtom(bb[slot], pos % 6);
} }
public boolean get(int pos) { public boolean get(int pos) {
int slot = pos / 6; int slot = pos / 6;
if ((pos < 0) || (slot > bb.length)) throw new RuntimeException("position out of bounds: " + pos); if (pos < 0) throw new RuntimeException("position out of bounds: " + pos);
if (slot > bb.length) return false;
return (bb[slot] & (1<<(pos%6))) > 0; return (bb[slot] & (1<<(pos%6))) > 0;
} }

Loading…
Cancel
Save