* strict handling of NURL entry element generation, storage and stacking

* more space for EURL reason strings (you must delete the EURL db to use this)


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2324 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 5f72be2a95
commit 7fd90ca7c8

@ -49,7 +49,7 @@ public class indexURL {
public static final int urlStringLength = 256;// not too short for links without parameters public static final int urlStringLength = 256;// not too short for links without parameters
public static final int urlDescrLength = 80; // The headline of a web page (meta-tag or <h1>) public static final int urlDescrLength = 80; // The headline of a web page (meta-tag or <h1>)
public static final int urlNameLength = 40; // the tag content between <a> and </a> public static final int urlNameLength = 40; // the tag content between <a> and </a>
public static final int urlErrorLength = 20; // a reason description for unavailable urls public static final int urlErrorLength = 80; // a reason description for unavailable urls
public static final int urlDateLength = 4; // any date, shortened public static final int urlDateLength = 4; // any date, shortened
public static final int urlCopyCountLength = 2; // counter for numbers of copies of this index public static final int urlCopyCountLength = 2; // counter for numbers of copies of this index
public static final int urlFlagLength = 2; // any stuff public static final int urlFlagLength = 2; // any stuff

@ -176,7 +176,9 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
// if the url does not already exist in the destination stack we insert it now // if the url does not already exist in the destination stack we insert it now
if (!this.sb.urlPool.noticeURL.existsInStack(nextHash)) { if (!this.sb.urlPool.noticeURL.existsInStack(nextHash)) {
this.sb.urlPool.noticeURL.newEntry(urlEntry,(stackTypes[i] != -1)?stackTypes[i]:plasmaCrawlNURL.STACK_TYPE_CORE); plasmaCrawlNURL.Entry ne = this.sb.urlPool.noticeURL.newEntry(urlEntry);
ne.store();
this.sb.urlPool.noticeURL.push((stackTypes[i] != -1) ? stackTypes[i] : plasmaCrawlNURL.STACK_TYPE_CORE, ne.url().getHost(), ne.hash());
} }
// removing hash from the import db // removing hash from the import db

@ -188,7 +188,8 @@ public class plasmaCrawlEURL extends indexURL {
this.initiator = entry.getColString(2, "UTF-8"); this.initiator = entry.getColString(2, "UTF-8");
this.executor = entry.getColString(3, "UTF-8"); this.executor = entry.getColString(3, "UTF-8");
this.url = new URL(entry.getColString(4, "UTF-8").trim()); this.url = new URL(entry.getColString(4, "UTF-8").trim());
this.name = entry.getColString(5, "UTF-8").trim(); String n = entry.getColString(5, "UTF-8");
this.name = (n == null) ? "" : n.trim();
this.initdate = new Date(86400000 * entry.getColLongB64E(6)); this.initdate = new Date(86400000 * entry.getColLongB64E(6));
this.trydate = new Date(86400000 * entry.getColLongB64E(7)); this.trydate = new Date(86400000 * entry.getColLongB64E(7));
this.trycount = (int) entry.getColLongB64E(8); this.trycount = (int) entry.getColLongB64E(8);

@ -288,30 +288,27 @@ public class plasmaCrawlNURL extends indexURL {
public synchronized Entry newEntry(String initiator, URL url, Date loaddate, public synchronized Entry newEntry(String initiator, URL url, Date loaddate,
String referrer, String name, String profile, String referrer, String name, String profile,
int depth, int anchors, int forkfactor, int stackMode) { int depth, int anchors, int forkfactor) {
Entry e = new Entry(initiator, url, referrer, name, loaddate, return new Entry(initiator, url, referrer, name, loaddate,
profile, depth, anchors, forkfactor); profile, depth, anchors, forkfactor);
push(stackMode, url.getHost(), e.hash);
return e;
} }
public synchronized Entry newEntry(Entry oldEntry, int stackMode) { public synchronized Entry newEntry(Entry oldEntry) {
if (oldEntry == null) return null; if (oldEntry == null) return null;
return newEntry( return new Entry(
oldEntry.initiator(), oldEntry.initiator(),
oldEntry.url(), oldEntry.url(),
oldEntry.loaddate(),
oldEntry.referrerHash(), oldEntry.referrerHash(),
oldEntry.name(), oldEntry.name(),
oldEntry.loaddate(),
oldEntry.profileHandle(), oldEntry.profileHandle(),
oldEntry.depth(), oldEntry.depth(),
oldEntry.anchors, oldEntry.anchors,
oldEntry.forkfactor, oldEntry.forkfactor
stackMode
); );
} }
private void push(int stackType, String domain, String hash) { public void push(int stackType, String domain, String hash) {
try { try {
switch (stackType) { switch (stackType) {
case STACK_TYPE_CORE: coreStack.add(domain, hash.getBytes()); break; case STACK_TYPE_CORE: coreStack.add(domain, hash.getBytes()); break;

@ -394,7 +394,7 @@ public final class plasmaCrawlStacker {
this.log.logSevere("URL '" + nexturlString + "' can neither be crawled local nor global."); this.log.logSevere("URL '" + nexturlString + "' can neither be crawled local nor global.");
} }
plasmaCrawlNURL.Entry ee = this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */ plasmaCrawlNURL.Entry ne = this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
nexturl, /* url clear text string */ nexturl, /* url clear text string */
loadDate, /* load date */ loadDate, /* load date */
referrerHash, /* last url in crawling queue */ referrerHash, /* last url in crawling queue */
@ -402,11 +402,14 @@ public final class plasmaCrawlStacker {
(profile == null) ? null : profile.handle(), // profile must not be null! (profile == null) ? null : profile.handle(), // profile must not be null!
currentdepth, /*depth so far*/ currentdepth, /*depth so far*/
0, /*anchors, default value */ 0, /*anchors, default value */
0, /*forkfactor, default value */ 0 /*forkfactor, default value */
((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT :
((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/
); );
ee.store(); ne.store();
this.sb.urlPool.noticeURL.push(
((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT :
((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/,
nexturl.getHost(),
ne.hash());
return null; return null;
} }

Loading…
Cancel
Save