* strict handling of NURL entry element generation, storage and stacking

* more space for EURL reason strings (you must delete the EURL db to use this)


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2324 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 5f72be2a95
commit 7fd90ca7c8

@ -49,7 +49,7 @@ public class indexURL {
public static final int urlStringLength = 256;// not too short for links without parameters
public static final int urlDescrLength = 80; // The headline of a web page (meta-tag or <h1>)
public static final int urlNameLength = 40; // the tag content between <a> and </a>
public static final int urlErrorLength = 20; // a reason description for unavailable urls
public static final int urlErrorLength = 80; // a reason description for unavailable urls
public static final int urlDateLength = 4; // any date, shortened
public static final int urlCopyCountLength = 2; // counter for numbers of copies of this index
public static final int urlFlagLength = 2; // any stuff

@ -176,7 +176,9 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
// if the url does not already exist in the destination stack we insert it now
if (!this.sb.urlPool.noticeURL.existsInStack(nextHash)) {
this.sb.urlPool.noticeURL.newEntry(urlEntry,(stackTypes[i] != -1)?stackTypes[i]:plasmaCrawlNURL.STACK_TYPE_CORE);
plasmaCrawlNURL.Entry ne = this.sb.urlPool.noticeURL.newEntry(urlEntry);
ne.store();
this.sb.urlPool.noticeURL.push((stackTypes[i] != -1) ? stackTypes[i] : plasmaCrawlNURL.STACK_TYPE_CORE, ne.url().getHost(), ne.hash());
}
// removing hash from the import db

@ -188,7 +188,8 @@ public class plasmaCrawlEURL extends indexURL {
this.initiator = entry.getColString(2, "UTF-8");
this.executor = entry.getColString(3, "UTF-8");
this.url = new URL(entry.getColString(4, "UTF-8").trim());
this.name = entry.getColString(5, "UTF-8").trim();
String n = entry.getColString(5, "UTF-8");
this.name = (n == null) ? "" : n.trim();
this.initdate = new Date(86400000 * entry.getColLongB64E(6));
this.trydate = new Date(86400000 * entry.getColLongB64E(7));
this.trycount = (int) entry.getColLongB64E(8);

@ -288,30 +288,27 @@ public class plasmaCrawlNURL extends indexURL {
public synchronized Entry newEntry(String initiator, URL url, Date loaddate,
String referrer, String name, String profile,
int depth, int anchors, int forkfactor, int stackMode) {
Entry e = new Entry(initiator, url, referrer, name, loaddate,
int depth, int anchors, int forkfactor) {
return new Entry(initiator, url, referrer, name, loaddate,
profile, depth, anchors, forkfactor);
push(stackMode, url.getHost(), e.hash);
return e;
}
public synchronized Entry newEntry(Entry oldEntry, int stackMode) {
public synchronized Entry newEntry(Entry oldEntry) {
if (oldEntry == null) return null;
return newEntry(
return new Entry(
oldEntry.initiator(),
oldEntry.url(),
oldEntry.loaddate(),
oldEntry.referrerHash(),
oldEntry.name(),
oldEntry.loaddate(),
oldEntry.profileHandle(),
oldEntry.depth(),
oldEntry.anchors,
oldEntry.forkfactor,
stackMode
oldEntry.forkfactor
);
}
private void push(int stackType, String domain, String hash) {
public void push(int stackType, String domain, String hash) {
try {
switch (stackType) {
case STACK_TYPE_CORE: coreStack.add(domain, hash.getBytes()); break;

@ -394,7 +394,7 @@ public final class plasmaCrawlStacker {
this.log.logSevere("URL '" + nexturlString + "' can neither be crawled local nor global.");
}
plasmaCrawlNURL.Entry ee = this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
plasmaCrawlNURL.Entry ne = this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
nexturl, /* url clear text string */
loadDate, /* load date */
referrerHash, /* last url in crawling queue */
@ -402,11 +402,14 @@ public final class plasmaCrawlStacker {
(profile == null) ? null : profile.handle(), // profile must not be null!
currentdepth, /*depth so far*/
0, /*anchors, default value */
0, /*forkfactor, default value */
((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT :
((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/
0 /*forkfactor, default value */
);
ee.store();
ne.store();
this.sb.urlPool.noticeURL.push(
((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT :
((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/,
nexturl.getHost(),
ne.hash());
return null;
}

Loading…
Cancel
Save