diff --git a/source/de/anomic/crawler/retrieval/Request.java b/source/de/anomic/crawler/retrieval/Request.java index 6769ca80b..7d8328072 100755 --- a/source/de/anomic/crawler/retrieval/Request.java +++ b/source/de/anomic/crawler/retrieval/Request.java @@ -38,45 +38,64 @@ import net.yacy.kelondro.order.Bitfield; import net.yacy.kelondro.order.NaturalOrder; import net.yacy.kelondro.workflow.WorkflowJob; -public class Request extends WorkflowJob { +public class Request extends WorkflowJob +{ // row definition for balancer-related NURL-entries - public final static Row rowdef = new Row( - "String urlhash-" + Word.commonHashLength + ", " + // the url's hash - "String initiator-" + Word.commonHashLength + ", " + // the crawling initiator - "String urlstring-256, " + // the url as string - "String refhash-" + Word.commonHashLength + ", " + // the url's referrer hash - "String urlname-80, " + // the name of the url, from anchor tag name - "Cardinal appdate-8 {b256}, " + // the date of the resource; either file date or first appearance - "String profile-" + Word.commonHashLength + ", " + // the name of the prefetch profile handle - "Cardinal depth-2 {b256}, " + // the prefetch depth so far, starts at 0 - "Cardinal parentbr-3 {b256}, " + // number of anchors of the parent - "Cardinal forkfactor-4 {b256}, " + // sum of anchors of all ancestors - "byte[] flags-4, " + // flags - "Cardinal handle-4 {b256}, " + // handle (NOT USED) - "Cardinal loaddate-8 {b256}, " + // NOT USED - "Cardinal lastmodified-8 {b256}, " + // NOT USED - "Cardinal size-8 {b256}", // size of resource in bytes (if known) or 0 if not known - Base64Order.enhancedCoder - ); + public final static Row rowdef = new Row("String urlhash-" + Word.commonHashLength + ", " + // the url's hash + "String initiator-" + + Word.commonHashLength + + ", " + + // the crawling initiator + "String urlstring-256, " + + // the url as string + "String refhash-" + + Word.commonHashLength + + ", " + + // the url's referrer hash + 
"String urlname-80, " + + // the name of the url, from anchor tag name + "Cardinal appdate-8 {b256}, " + + // the date of the resource; either file date or first appearance + "String profile-" + + Word.commonHashLength + + ", " + + // the name of the prefetch profile handle + "Cardinal depth-2 {b256}, " + + // the prefetch depth so far, starts at 0 + "Cardinal parentbr-3 {b256}, " + + // number of anchors of the parent + "Cardinal forkfactor-4 {b256}, " + + // sum of anchors of all ancestors + "byte[] flags-4, " + + // flags + "Cardinal handle-4 {b256}, " + + // handle (NOT USED) + "Cardinal loaddate-8 {b256}, " + + // NOT USED + "Cardinal lastmodified-8 {b256}, " + + // NOT USED + "Cardinal size-8 {b256}", // size of resource in bytes (if known) or 0 if not known + Base64Order.enhancedCoder); - private byte[] initiator; // the initiator hash, is NULL or "" if it is the own proxy; - // if this is generated by a crawl, the own peer hash in entered - private byte[] refhash; // the url's referrer hash - private DigestURI url; // the url as string - private String name; // the name of the url, from anchor tag name - private long appdate; // the time when the url was first time appeared. - private String profileHandle; // the name of the fetch profile - private int depth; // the prefetch depth so far, starts at 0 - private int anchors; // number of anchors of the parent - private int forkfactor; // sum of anchors of all ancestors + private byte[] initiator; // the initiator hash, is NULL or "" if it is the own proxy; + // if this is generated by a crawl, the own peer hash in entered + private byte[] refhash; // the url's referrer hash + private DigestURI url; // the url as string + private String name; // the name of the url, from anchor tag name + private long appdate; // the time when the url was first time appeared. 
+ private String profileHandle; // the name of the fetch profile + private int depth; // the prefetch depth so far, starts at 0 + private int anchors; // number of anchors of the parent + private int forkfactor; // sum of anchors of all ancestors private Bitfield flags; - private long size; // size of resource in bytes (if known) or 0 if not known - private String statusMessage; - private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection + private long size; // size of resource in bytes (if known) or 0 if not known + private String statusMessage; + private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection /** * convenience method for 'full' request object + * * @param url * @param referrerhash */ @@ -85,9 +104,8 @@ public class Request extends WorkflowJob { } /** - * A Request Entry is a object that is created to provide - * all information to load a specific resource. - * + * A Request Entry is a object that is created to provide all information to load a specific resource. 
+ * * @param initiator the hash of the initiator peer * @param url the {@link URL} to crawl * @param referrer the hash of the referrer URL @@ -99,35 +117,36 @@ public class Request extends WorkflowJob { * @param forkfactor sum of anchors of all ancestors */ public Request( - final byte[] initiator, - final DigestURI url, - final byte[] referrerhash, - final String name, - final Date appdate, - final String profileHandle, - final int depth, - final int anchors, - final int forkfactor, - final long size - ) { + final byte[] initiator, + final DigestURI url, + final byte[] referrerhash, + final String name, + final Date appdate, + final String profileHandle, + final int depth, + final int anchors, + final int forkfactor, + final long size) { // create new entry and store it into database assert url != null; - assert profileHandle == null || profileHandle.length() == Word.commonHashLength : profileHandle + " != " + Word.commonHashLength; + assert profileHandle == null || profileHandle.length() == Word.commonHashLength : profileHandle + + " != " + + Word.commonHashLength; url.removeRef(); // remove anchor reference - this.initiator = (initiator == null) ? null : ((initiator.length == 0) ? null : initiator); - this.url = url; - this.refhash = referrerhash; - this.name = (name == null) ? "" : name; - this.appdate = (appdate == null) ? 0 : appdate.getTime(); + this.initiator = (initiator == null) ? null : ((initiator.length == 0) ? null : initiator); + this.url = url; + this.refhash = referrerhash; + this.name = (name == null) ? "" : name; + this.appdate = (appdate == null) ? 
0 : appdate.getTime(); this.profileHandle = profileHandle; // must not be null - this.depth = depth; - this.anchors = anchors; - this.forkfactor = forkfactor; - this.flags = new Bitfield(rowdef.width(10)); + this.depth = depth; + this.anchors = anchors; + this.forkfactor = forkfactor; + this.flags = new Bitfield(rowdef.width(10)); this.statusMessage = "loaded(args)"; - this.initialHash = url.hashCode(); - this.status = WorkflowJob.STATUS_INITIATED; - this.size = size; + this.initialHash = url.hashCode(); + this.status = WorkflowJob.STATUS_INITIATED; + this.size = size; } public Request(final Row.Entry entry) throws IOException { @@ -136,27 +155,35 @@ public class Request extends WorkflowJob { } private void insertEntry(final Row.Entry entry) throws IOException { - final String urlstring = entry.getColUTF8(2); - if (urlstring == null) throw new IOException ("url string is null"); - this.initiator = entry.getColBytes(1, true); - this.initiator = (this.initiator == null) ? null : ((this.initiator.length == 0) ? null : this.initiator); - this.url = new DigestURI(urlstring, entry.getPrimaryKeyBytes()); - this.refhash = (entry.empty(3)) ? null : entry.getColBytes(3, true); - this.name = (entry.empty(4)) ? "" : entry.getColUTF8(4).trim(); - this.appdate = entry.getColLong(5); - this.profileHandle = (entry.empty(6)) ? 
null : entry.getColASCII(6).trim(); - this.depth = (int) entry.getColLong(7); - this.anchors = (int) entry.getColLong(8); - this.forkfactor = (int) entry.getColLong(9); - this.flags = new Bitfield(entry.getColBytes(10, true)); - //this.loaddate = entry.getColLong(12); - //this.lastmodified = entry.getColLong(13); - this.size = entry.getColLong(14); - this.statusMessage = "loaded(kelondroRow.Entry)"; - this.initialHash = this.url.hashCode(); + try { + final String urlstring = entry.getColUTF8(2); + if ( urlstring == null ) { + throw new IOException("url string is null"); + } + this.initiator = entry.getColBytes(1, true); + this.initiator = + (this.initiator == null) ? null : ((this.initiator.length == 0) ? null : this.initiator); + this.url = new DigestURI(urlstring, entry.getPrimaryKeyBytes()); + this.refhash = (entry.empty(3)) ? null : entry.getColBytes(3, true); + this.name = (entry.empty(4)) ? "" : entry.getColUTF8(4).trim(); + this.appdate = entry.getColLong(5); + this.profileHandle = (entry.empty(6)) ? 
null : entry.getColASCII(6).trim(); + this.depth = (int) entry.getColLong(7); + this.anchors = (int) entry.getColLong(8); + this.forkfactor = (int) entry.getColLong(9); + this.flags = new Bitfield(entry.getColBytes(10, true)); + //this.loaddate = entry.getColLong(12); + //this.lastmodified = entry.getColLong(13); + this.size = entry.getColLong(14); + this.statusMessage = "loaded(kelondroRow.Entry)"; + this.initialHash = this.url.hashCode(); + } catch ( Throwable e ) { + throw new IOException(e.getMessage(), e); /* chain e as the cause so the original exception type and stack trace are not lost */ + } return; } + @Override public int hashCode() { // overloads Object.hashCode() return this.initialHash; @@ -179,7 +206,8 @@ public class Request extends WorkflowJob { final byte[] sizestr = NaturalOrder.encodeLong(this.size, rowdef.width(14)); // store the hash in the hash cache final byte[] namebytes = UTF8.getBytes(this.name); - final byte[][] entry = new byte[][] { + final byte[][] entry = + new byte[][] { this.url.hash(), this.initiator, this.url.toString().getBytes(), @@ -194,7 +222,8 @@ public class Request extends WorkflowJob { NaturalOrder.encodeLong(0, rowdef.width(11)), loaddatestr, serverdatestr, - sizestr}; + sizestr + }; return rowdef.newEntry(entry); } @@ -227,6 +256,7 @@ public class Request extends WorkflowJob { // the date when the url appeared first return new Date(this.appdate); } + /* public Date loaddate() { // the date when the url was loaded @@ -255,7 +285,9 @@ public class Request extends WorkflowJob { public String profileHandle() { // the handle of the crawl profile - assert this.profileHandle.length() == Word.commonHashLength : this.profileHandle + " != " + Word.commonHashLength; + assert this.profileHandle.length() == Word.commonHashLength : this.profileHandle + + " != " + + Word.commonHashLength; return this.profileHandle; }