refactoring

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2945 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent f77d624b94
commit 76fceb9997

@ -60,7 +60,6 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlProfile;
@ -205,7 +204,7 @@ public class IndexCreate_p {
prop.put("error_reasonString", reasonString);
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
crawlingStartURL.getHost(), reasonString, new bitfield(indexRWIEntryOld.urlFlagLength));
crawlingStartURL.getHost(), reasonString, new bitfield());
ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee);
}
@ -283,7 +282,7 @@ public class IndexCreate_p {
c++;
} else {
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
(String) e.getValue(), rejectReason, new bitfield(indexRWIEntryOld.urlFlagLength));
(String) e.getValue(), rejectReason, new bitfield());
ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee);
}

@ -50,7 +50,6 @@ import java.io.IOException;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlNURL;
@ -157,7 +156,7 @@ public final class crawlReceipt {
} else {
try {
plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash);
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(indexRWIEntryOld.urlFlagLength));
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield());
ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee);
switchboard.urlPool.noticeURL.remove(receivedUrlhash);

@ -36,35 +36,14 @@ import de.anomic.yacy.yacySeedDB;
public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
// this object stores attributes to URL references inside RWI collections
// statics for value lengths
public static final int urlStringLength = 256;// not too short for links without parameters
public static final int urlDescrLength = 80; // The headline of a web page (meta-tag or <h1>)
public static final int urlNameLength = 40; // the tag content between <a> and </a>
public static final int urldescrtagsLength = 320;// the url, the description and tags in one string
public static final int urlErrorLength = 80; // a reason description for unavailable urls
public static final int urlDateLength = 4; // any date, shortened
public static final int urlCopyCountLength = 2; // counter for numbers of copies of this index
public static final int urlFlagLength = 2; // any stuff
public static final int urlLanguageLength = 2; // taken from TLD suffix as quick-hack
public static final int urlDoctypeLength = 1; // taken from extension
public static final int urlSizeLength = 6; // the source size, from cache
public static final int urlWordCountLength = 3; // the number of words, from condenser
public static final int urlCrawlProfileHandleLength = 4; // name of the prefetch profile
public static final int urlCrawlDepthLength = 2; // prefetch depth, first is '0'
public static final int urlParentBranchesLength = 3; // number of anchors of the parent
public static final int urlForkFactorLength = 4; // sum of anchors of all ancestors
public static final int urlRetryLength = 2; // number of load retries
public static final int urlHostLength = 8; // the host as struncated name
public static final int urlHandleLength = 4; // a handle
public static final int urlQualityLength = 3; // taken from heuristic
public static kelondroRow urlEntryRow = new kelondroRow(new kelondroColumn[]{
new kelondroColumn("h", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, yacySeedDB.commonHashLength, "urlhash"),
new kelondroColumn("q", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, urlQualityLength, "quality"),
new kelondroColumn("q", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 3, "quality"),
new kelondroColumn("a", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 3, "lastModified"),
new kelondroColumn("c", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "hitcount"),
new kelondroColumn("l", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, urlLanguageLength, "language"),
new kelondroColumn("l", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, 2, "language"),
new kelondroColumn("d", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "doctype"),
new kelondroColumn("f", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "localflag"),
new kelondroColumn("t", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b64e, 2, "posintext"),
@ -118,7 +97,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
// - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag, hervorhebungen, meta-tags, word in link etc
// - boolean: URL attributes
assert (urlHash.length() == 12) : "urlhash = " + urlHash;
if ((language == null) || (language.length() != urlLanguageLength)) language = "uk";
if ((language == null) || (language.length() != urlEntryRow.width(col_language))) language = "uk";
this.entry = urlEntryRow.newEntry();
this.entry.setCol(col_urlhash, urlHash, null);
this.entry.setCol(col_quality, quality);

@ -40,21 +40,21 @@ import de.anomic.tools.crypt;
import de.anomic.yacy.yacySeedDB;
public class indexURLEntryOld implements indexURLEntry {
public static final kelondroRow rowdef = new kelondroRow(
"String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
"String urlstring-" + indexRWIEntryOld.urlStringLength + ", " + // the url as string
"String urldescr-" + indexRWIEntryOld.urlDescrLength + ", " + // the description of the url
"Cardinal moddate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // last-modified from the httpd
"Cardinal loaddate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // time when the url was loaded
"String urlstring-256, " + // the url as string
"String urldescr-80, " + // the description of the url
"Cardinal moddate-4 {b64e}, " + // last-modified from the httpd
"Cardinal loaddate-4 {b64e}, " + // time when the url was loaded
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"Cardinal copycount-" + indexRWIEntryOld.urlCopyCountLength + " {b64e}, " + //
"byte[] flags-" + indexRWIEntryOld.urlFlagLength + ", " + // flags
"Cardinal quality-" + indexRWIEntryOld.urlQualityLength + " {b64e}, " + //
"String language-" + indexRWIEntryOld.urlLanguageLength + ", " + //
"byte[] doctype-" + indexRWIEntryOld.urlDoctypeLength + ", " + //
"Cardinal size-" + indexRWIEntryOld.urlSizeLength + " {b64e}, " + // size of file in bytes
"Cardinal wc-" + indexRWIEntryOld.urlWordCountLength + " {b64e}"); // word count
"Cardinal copycount-2 {b64e}, " + // not used
"byte[] flags-2, " + // flags
"Cardinal quality-3 {b64e}, " + // deprecated
"String language-2, " + // language key; mainly the TDL
"byte[] doctype-1, " + //
"Cardinal size-6 {b64e}, " + // size of file in bytes
"Cardinal wc-3 {b64e}"); // word count
private URL url;
private String descr;
@ -176,8 +176,8 @@ public class indexURLEntryOld implements indexURLEntry {
}
public kelondroRow.Entry toRowEntry() throws IOException {
final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength);
final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength);
final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, rowdef.width(3));
final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, rowdef.width(4));
final byte[][] entry = new byte[][] {
urlHash.getBytes(),
@ -186,13 +186,13 @@ public class indexURLEntryOld implements indexURLEntry {
moddatestr.getBytes(),
loaddatestr.getBytes(),
referrerHash.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(copyCount, indexRWIEntryOld.urlCopyCountLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(copyCount, rowdef.width(6)).getBytes(),
flags.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(quality, indexRWIEntryOld.urlQualityLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(quality, rowdef.width(8)).getBytes(),
language.getBytes(),
new byte[] { (byte) doctype },
kelondroBase64Order.enhancedCoder.encodeLong(size, indexRWIEntryOld.urlSizeLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(wordCount, indexRWIEntryOld.urlWordCountLength).getBytes()};
kelondroBase64Order.enhancedCoder.encodeLong(size, rowdef.width(11)).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(wordCount, rowdef.width(12)).getBytes()};
return rowdef.newEntry(entry);
}
@ -288,9 +288,7 @@ public class indexURLEntryOld implements indexURLEntry {
.append(",size=").append(size).append(",wc=").append(
wordCount).append(",cc=").append(copyCount).append(
",local=").append(((local()) ? "true" : "false"))
.append(",q=").append(
kelondroBase64Order.enhancedCoder.encodeLong(
quality, indexRWIEntryOld.urlQualityLength))
.append(",q=0")
.append(",dt=").append(doctype).append(",lang=").append(
language).append(",url=").append(
crypt.simpleEncode(url.toString())).append(

@ -1179,6 +1179,8 @@ public class kelondroRecords {
byte[] key = nn.getKey();
if ((key == null) ||
((key.length == 1) && (key[0] == (byte) 0x80)) || // the NUL pointer ('lost' chain terminator)
(key.length < 3) ||
((key.length > 3) && (key[2] == 0) && (key[3] == 0)) ||
((key.length > 3) && (key[0] == (byte) 0x80) && (key[1] == 0) && (key[2] == 0) && (key[3] == 0)) ||
((key.length > 0) && (key[0] == 0)) // a 'lost' pointer within a deleted-chain
) {

@ -162,6 +162,7 @@ public class kelondroRow {
}
public Entry(byte[] rowinstance, int start, int length) {
assert objectsize == length;
this.rowinstance = new byte[objectsize];
int ll = Math.min(objectsize, length);
System.arraycopy(rowinstance, start, this.rowinstance, 0, ll);
@ -169,15 +170,20 @@ public class kelondroRow {
}
/**
 * Builds a row instance from one byte array per column.
 * Each cell is copied to its column's start offset; a cell shorter than the
 * column width is padded with zero bytes, a longer one is cut off at the
 * column width, and a missing ({@code null}) cell leaves the column zeroed.
 *
 * @param cols the cell contents, one array per column of the row definition
 */
public Entry(byte[][] cols) {
    assert row.length == cols.length;
    rowinstance = new byte[objectsize];
    for (int col = 0; col < row.length; col++) {
        final int offset = colstart[col];
        final int width = row[col].cellwidth();
        final byte[] cell = (col < cols.length) ? cols[col] : null;
        // number of bytes taken from the cell; surplus bytes are dropped silently
        final int copied = (cell == null) ? 0 : Math.min(cell.length, width);
        if (cell != null) System.arraycopy(cell, 0, rowinstance, offset, copied);
        // zero the remainder of the column
        for (int k = copied; k < width; k++) rowinstance[offset + k] = 0;
    }
}
@ -252,6 +258,7 @@ public class kelondroRow {
System.arraycopy(cell, 0, rowinstance, offset, cell.length);
while (length-- > cell.length) rowinstance[offset + length] = 0;
} else {
//assert cell.length == length;
System.arraycopy(cell, 0, rowinstance, offset, length);
}
}

@ -51,7 +51,6 @@ import java.io.File;
import java.io.IOException;
import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlLoaderMessage;
@ -298,7 +297,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
yacyCore.seedDB.mySeed.hash,
this.name,
(failreason==null)?"Unknown reason":failreason,
new bitfield(indexRWIEntryOld.urlFlagLength)
new bitfield()
);
// store the entry

@ -54,7 +54,6 @@ import java.util.Iterator;
import java.util.LinkedList;
import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroFlexTable;
import de.anomic.kelondro.kelondroRow;
@ -133,20 +132,22 @@ public class plasmaCrawlEURL extends indexURL {
* ======================================================================= */
private LinkedList rejectedStack = new LinkedList(); // strings: url
public final static kelondroRow rowdef = new kelondroRow(
"String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"String executor-" + yacySeedDB.commonHashLength + ", " + // the crawling executor
"String urlstring-256, " + // the url as string
"String urlname-40, " + // the name of the url, from anchor tag <a>name</a>
"Cardinal appdate-4 {b64e}, " + // the time when the url was first time appeared
"Cardinal loaddate-4 {b64e}, " + // the time when the url was last time tried to load
"Cardinal retrycount-2 {b64e}, " + // number of load retries
"String failcause-80, " + // string describing load failure
"byte[] flags-2"); // extra space
public plasmaCrawlEURL(File cachePath, int bufferkb, long preloadTime, boolean newdb) {
super();
kelondroRow rowdef = new kelondroRow(
"String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"String executor-" + yacySeedDB.commonHashLength + ", " + // the crawling executor
"String urlstring-" + indexRWIEntryOld.urlStringLength + ", " + // the url as string
"String urlname-" + indexRWIEntryOld.urlNameLength + ", " + // the name of the url, from anchor tag <a>name</a>
"Cardinal appdate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // the time when the url was first time appeared
"Cardinal loaddate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // the time when the url was last time tried to load
"Cardinal retrycount-" + indexRWIEntryOld.urlRetryLength + " {b64e}, " + // number of load retries
"String failcause-" + indexRWIEntryOld.urlErrorLength + ", " + // string describing load failure
"byte[] flags-" + indexRWIEntryOld.urlFlagLength); // extra space
if (newdb) {
String newCacheName = "urlErr3.table";
@ -291,8 +292,8 @@ public class plasmaCrawlEURL extends indexURL {
// stores the values from the object variables into the database
if (this.stored) return;
if (this.hash == null) return;
String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, indexRWIEntryOld.urlDateLength);
String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, indexRWIEntryOld.urlDateLength);
String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, rowdef.width(6));
String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, rowdef.width(7));
// store the hash in the hash cache
try {
@ -306,7 +307,7 @@ public class plasmaCrawlEURL extends indexURL {
this.name.getBytes(),
initdatestr.getBytes(),
trydatestr.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.trycount, indexRWIEntryOld.urlRetryLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.trycount, rowdef.width(8)).getBytes(),
this.failreason.getBytes(),
this.flags.getBytes()
};

@ -51,7 +51,6 @@ import java.util.HashSet;
import java.util.Iterator;
import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroException;
@ -66,7 +65,7 @@ import de.anomic.tools.bitfield;
import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlNURL extends indexURL {
public static final int STACK_TYPE_NULL = 0; // do not stack
public static final int STACK_TYPE_CORE = 1; // put on local stack
public static final int STACK_TYPE_LIMIT = 2; // put on global stack
@ -80,18 +79,19 @@ public class plasmaCrawlNURL extends indexURL {
* column length definition for the {@link plasmaURL#urlIndexFile} DB
*/
public final static kelondroRow rowdef = new kelondroRow(
"String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"String urlstring-" + indexRWIEntryOld.urlStringLength + ", " + // the url as string
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"String urlname-" + indexRWIEntryOld.urlNameLength + ", " + // the name of the url, from anchor tag <a>name</a>
"Cardinal appdate-" + indexRWIEntryOld.urlDateLength + " {b64e}, " + // the time when the url was first time appeared
"String profile-" + indexRWIEntryOld.urlCrawlProfileHandleLength + ", " + // the name of the prefetch profile handle
"Cardinal depth-" + indexRWIEntryOld.urlCrawlDepthLength + " {b64e}, " + // the prefetch depth so far, starts at 0
"Cardinal parentbr-" + indexRWIEntryOld.urlParentBranchesLength + " {b64e}, " + // number of anchors of the parent
"Cardinal forkfactor-" + indexRWIEntryOld.urlForkFactorLength + " {b64e}, " + // sum of anchors of all ancestors
"byte[] flags-" + indexRWIEntryOld.urlFlagLength + ", " + // flags
"String handle-" + indexRWIEntryOld.urlHandleLength); // extra handle
"String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"String urlstring-256, " + // the url as string
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"String urlname-40, " + // the name of the url, from anchor tag <a>name</a>
"Cardinal appdate-4 {b64e}, " + // the time when the url was first time appeared
"String profile-4, " + // the name of the prefetch profile handle
"Cardinal depth-2 {b64e}, " + // the prefetch depth so far, starts at 0
"Cardinal parentbr-3 {b64e}, " + // number of anchors of the parent
"Cardinal forkfactor-4 {b64e}, " + // sum of anchors of all ancestors
"byte[] flags-2, " + // flags
"String handle-4" // extra handle
);
private final plasmaCrawlBalancer coreStack; // links found by crawling to depth-1
private final plasmaCrawlBalancer limitStack; // links found by crawling at target depth
@ -259,7 +259,7 @@ public class plasmaCrawlNURL extends indexURL {
private static String normalizeHandle(int h) {
String d = Integer.toHexString(h);
while (d.length() < indexRWIEntryOld.urlHandleLength) d = "0" + d;
while (d.length() < rowdef.width(11)) d = "0" + d;
return d;
}
@ -481,7 +481,7 @@ public class plasmaCrawlNURL extends indexURL {
this.depth = depth;
this.anchors = anchors;
this.forkfactor = forkfactor;
this.flags = new bitfield(indexRWIEntryOld.urlFlagLength);
this.flags = new bitfield(rowdef.width(10));
this.handle = 0;
this.stored = false;
}
@ -535,7 +535,7 @@ public class plasmaCrawlNURL extends indexURL {
public void store() {
// stores the values from the object variables into the database
if (this.stored) return;
String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength);
String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, rowdef.width(5));
// store the hash in the hash cache
try {
// even if the entry exists, we simply overwrite it
@ -547,9 +547,9 @@ public class plasmaCrawlNURL extends indexURL {
this.name.getBytes("UTF-8"),
loaddatestr.getBytes(),
(this.profileHandle == null) ? null : this.profileHandle.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.depth, indexRWIEntryOld.urlCrawlDepthLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, indexRWIEntryOld.urlParentBranchesLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, indexRWIEntryOld.urlForkFactorLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.depth, rowdef.width(7)).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, rowdef.width(8)).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, rowdef.width(9)).getBytes(),
this.flags.getBytes(),
normalizeHandle(this.handle).getBytes()
};

@ -48,7 +48,6 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroException;
@ -63,12 +62,14 @@ public class plasmaCrawlProfile {
private int bufferkb;
private long preloadTime;
public static final int crawlProfileHandleLength = 4; // name of the prefetch profile
public plasmaCrawlProfile(File file, int bufferkb, long preloadTime) {
this.profileTableFile = file;
this.bufferkb = bufferkb;
this.preloadTime = preloadTime;
profileTableFile.getParentFile().mkdirs();
kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, indexRWIEntryOld.urlCrawlProfileHandleLength, 2000, '#');
kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, crawlProfileHandleLength, 2000, '#');
profileTable = new kelondroMap(dyn);
domsCache = new HashMap();
}
@ -94,7 +95,7 @@ public class plasmaCrawlProfile {
if (profileTable != null) try { profileTable.close(); } catch (IOException e) {}
if (!(profileTableFile.delete())) throw new RuntimeException("cannot delete crawl profile database");
profileTableFile.getParentFile().mkdirs();
kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, indexRWIEntryOld.urlCrawlProfileHandleLength, 2000, '#');
kelondroDyn dyn = kelondroDyn.open(profileTableFile, bufferkb * 1024, preloadTime, crawlProfileHandleLength, 2000, '#');
profileTable = new kelondroMap(dyn);
}
@ -256,7 +257,7 @@ public class plasmaCrawlProfile {
boolean storeHTCache, boolean storeTXCache,
boolean localIndexing, boolean remoteIndexing,
boolean xsstopw, boolean xdstopw, boolean xpstopw) {
String handle = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, indexRWIEntryOld.urlCrawlProfileHandleLength);
String handle = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, crawlProfileHandleLength);
mem = new HashMap();
mem.put("handle", handle);
mem.put("name", name);

@ -60,7 +60,6 @@ import org.apache.commons.pool.impl.GenericObjectPool;
import de.anomic.data.robotsParser;
import de.anomic.http.httpc;
import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroCache;
@ -492,7 +491,7 @@ public final class plasmaCrawlStacker {
this.depth = depth;
this.anchors = anchors;
this.forkfactor = forkfactor;
this.flags = new bitfield(indexRWIEntryOld.urlFlagLength);
this.flags = new bitfield();
this.handle = 0;
} catch (Exception e) {
e.printStackTrace();
@ -571,11 +570,11 @@ public final class plasmaCrawlStacker {
//.append("flags: ").append((flags==null) ? "null" : flags.toString())
;
return str.toString();
}
}
public byte[][] getBytes() {
// stores the values from the object variables into the database
String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, indexRWIEntryOld.urlDateLength);
String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, plasmaCrawlNURL.rowdef.width(5));
// store the hash in the hash cache
// even if the entry exists, we simply overwrite it
@ -589,19 +588,19 @@ public final class plasmaCrawlStacker {
this.name.getBytes("UTF-8"),
loaddatestr.getBytes(),
(this.profileHandle == null) ? null : this.profileHandle.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.depth, indexRWIEntryOld.urlCrawlDepthLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, indexRWIEntryOld.urlParentBranchesLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, indexRWIEntryOld.urlForkFactorLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.depth, plasmaCrawlNURL.rowdef.width(7)).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, plasmaCrawlNURL.rowdef.width(8)).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, plasmaCrawlNURL.rowdef.width(9)).getBytes(),
this.flags.getBytes(),
normalizeHandle(this.handle).getBytes()
};
};
} catch (UnsupportedEncodingException e) { /* ignore this */ }
return entry;
}
private String normalizeHandle(int h) {
String d = Integer.toHexString(h);
while (d.length() < indexRWIEntryOld.urlHandleLength) d = "0" + d;
while (d.length() < plasmaCrawlNURL.rowdef.width(11)) d = "0" + d;
return d;
}
}
@ -1059,7 +1058,7 @@ public final class plasmaCrawlStacker {
yacyCore.seedDB.mySeed.hash,
this.theMsg.name,
rejectReason,
new bitfield(indexRWIEntryOld.urlFlagLength)
new bitfield()
);
ee.store();
sb.urlPool.errorURL.stackPushEntry(ee);

@ -1497,7 +1497,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (document == null) return;
} catch (ParserException e) {
this.log.logInfo("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage());
addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorPeerHash, entry.anchorName(), e.getErrorCode(), new bitfield(indexRWIEntryOld.urlFlagLength));
addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorPeerHash, entry.anchorName(), e.getErrorCode(), new bitfield());
if (document != null) {
document.close();
document = null;
@ -1764,7 +1764,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
} else {
log.logFine("Not Indexed Resource '" + entry.normalizedURLString() + "': process case=" + processCase);
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new bitfield(indexRWIEntryOld.urlFlagLength));
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new bitfield());
}
} catch (Exception ee) {
if (ee instanceof InterruptedException) throw (InterruptedException)ee;
@ -1776,7 +1776,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
yacyClient.crawlReceipt(initiatorPeer, "crawl", "exception", ee.getMessage(), null, "");
}
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new bitfield(indexRWIEntryOld.urlFlagLength));
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new bitfield());
}
} else {
@ -1784,7 +1784,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption();
log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason);
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, noIndexReason, new bitfield(indexRWIEntryOld.urlFlagLength));
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, noIndexReason, new bitfield());
if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, "");
}

@ -51,7 +51,6 @@ import java.util.ArrayList;
import java.util.Date;
import de.anomic.index.indexURL;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException;
@ -78,18 +77,18 @@ public class plasmaSwitchboardQueue {
initQueueStack();
}
public static final kelondroRow rowdef = new kelondroRow(
"String url-256, " + // the url
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"Cardinal modifiedsince-11 {b64e}, " + // from ifModifiedSince
"byte[] flags-1, " + // flags
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"Cardinal depth-2 {b64e}, " + // the prefetch depth so far, starts at 0
"String profile-" + plasmaCrawlProfile.crawlProfileHandleLength + ", " + // the name of the prefetch profile handle
"String urldescr-80");
private void initQueueStack() {
kelondroRow rowdef = new kelondroRow(
"String url-" + indexRWIEntryOld.urlStringLength + ", " + // the url
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"Cardinal modifiedsince-11" + " {b64e}, " + // from ifModifiedSince
"byte[] flags-1" + ", " + // flags
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"Cardinal depth-" + indexRWIEntryOld.urlCrawlDepthLength + " {b64e}, " + // the prefetch depth so far, starts at 0
"String profile-" + indexRWIEntryOld.urlCrawlProfileHandleLength + ", " + // the name of the prefetch profile handle
"String urldescr-" + indexRWIEntryOld.urlDescrLength); //
sbQueueStack = kelondroStack.open(sbQueueStackPath, rowdef);
}
@ -110,7 +109,7 @@ public class plasmaSwitchboardQueue {
kelondroBase64Order.enhancedCoder.encodeLong((entry.ifModifiedSince == null) ? 0 : entry.ifModifiedSince.getTime(), 11).getBytes(),
new byte[]{entry.flags},
(entry.initiator == null) ? indexURL.dummyHash.getBytes() : entry.initiator.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong((long) entry.depth, indexRWIEntryOld.urlCrawlDepthLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong((long) entry.depth, rowdef.width(5)).getBytes(),
(entry.profileHandle == null) ? indexURL.dummyHash.getBytes() : entry.profileHandle.getBytes(),
(entry.anchorName == null) ? "-".getBytes("UTF-8") : entry.anchorName.getBytes("UTF-8")
}));

@ -44,6 +44,10 @@ public class bitfield {
private byte[] bb;
/**
 * Creates a bitfield without any backing bytes by delegating to
 * {@code bitfield(int)} with a byte length of 0.
 */
public bitfield() {
this(0);
}
public bitfield(int bytelength) {
this.bb= new byte[bytelength];
for (int i = 0 ; i < bytelength; i++) bb[i] = 0;
@ -58,20 +62,29 @@ public class bitfield {
return (byte) ((64 | ((a + 16) | (1<<pos))) - 16);
}
private static byte unsetAtom(byte a, int pos) {
private static byte unsetAtom(byte a, int pos) {
if ((pos > 5) || (pos < 0)) throw new RuntimeException("atom position out of bounds: " + pos);
return (byte) (((a + 16) & (0xff ^ (1<<pos))) - 16);
}
/**
 * Sets or clears the flag at position {@code pos}.
 * Every byte of the backing array carries six flags, so the flag is stored
 * in byte {@code pos / 6} at atom position {@code pos % 6}. When that byte
 * lies beyond the current backing array, the array is grown first.
 *
 * @param pos   zero-based flag position; must not be negative
 * @param value {@code true} to set the flag, {@code false} to clear it
 * @throws RuntimeException if {@code pos} is negative
 */
public void set(int pos, boolean value) {
    if (pos < 0) throw new RuntimeException("position out of bounds: " + pos);
    int slot = pos / 6;
    // slot == bb.length is already one past the last valid index, so the
    // capacity test must be '>=': with '>' the access bb[slot] below throws
    // ArrayIndexOutOfBoundsException (e.g. for any set() on a zero-length field)
    if (slot >= bb.length) {
        // extend capacity; a freshly allocated array is zero-filled already
        byte[] nb = new byte[slot + 1];
        System.arraycopy(bb, 0, nb, 0, bb.length);
        bb = nb;
    }
    bb[slot] = (value) ? setAtom(bb[slot], pos % 6) : unsetAtom(bb[slot], pos % 6);
}
/**
 * Reads the flag at position {@code pos}.
 * The flag is looked up in byte {@code pos / 6}, bit {@code pos % 6}.
 * A position beyond the backing array is reported as not set.
 *
 * @param pos zero-based flag position; must not be negative
 * @return {@code true} if the flag's bit is set, {@code false} otherwise
 * @throws RuntimeException if {@code pos} is negative
 */
public boolean get(int pos) {
    if (pos < 0) throw new RuntimeException("position out of bounds: " + pos);
    int slot = pos / 6;
    // '>=' rather than '>': slot == bb.length is already outside the array
    if (slot >= bb.length) return false;
    return (bb[slot] & (1<<(pos%6))) > 0;
}

Loading…
Cancel
Save