fixed dates in metadata

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6860 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent 0a5fd15703
commit c45117f81f

@ -249,7 +249,6 @@ public class Crawler_p {
null, null,
"CRAWLING-ROOT", "CRAWLING-ROOT",
new Date(), new Date(),
null,
pe.handle(), pe.handle(),
0, 0,
0, 0,
@ -303,7 +302,6 @@ public class Crawler_p {
null, null,
"", "",
new Date(), new Date(),
null,
pe.handle(), pe.handle(),
0, 0,
0, 0,
@ -386,7 +384,6 @@ public class Crawler_p {
null, null,
e.getValue(), e.getValue(),
new Date(), new Date(),
null,
profile.handle(), profile.handle(),
0, 0,
0, 0,

@ -106,7 +106,7 @@ public class IndexCreateWWWGlobalQueue_p {
prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) ); prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name())); prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth()); prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.loaddate()) ); prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.appdate()) );
prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name()); prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name());
prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true)); prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash()); prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash());

@ -127,7 +127,7 @@ public class IndexCreateWWWLocalQueue_p {
case INITIATOR: case INITIATOR:
value = (entry.initiator() == null || entry.initiator().length == 0) ? "proxy" : new String(entry.initiator()); value = (entry.initiator() == null || entry.initiator().length == 0) ? "proxy" : new String(entry.initiator());
break; break;
case MODIFIED: value = daydate(entry.loaddate()); break; case MODIFIED: value = daydate(entry.appdate()); break;
default: value = null; default: value = null;
} }
@ -177,7 +177,7 @@ public class IndexCreateWWWLocalQueue_p {
prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) ); prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name())); prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth()); prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.loaddate()) ); prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.appdate()) );
prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name()); prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name());
prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true)); prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash()); prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash());

@ -103,7 +103,7 @@ public class IndexCreateWWWRemoteQueue_p {
prop.putHTML("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName())); prop.putHTML("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name())); prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
prop.put("crawler-queue_list_" + showNum + "_depth", urle.depth()); prop.put("crawler-queue_list_" + showNum + "_depth", urle.depth());
prop.put("crawler-queue_list_" + showNum + "_modified", daydate(urle.loaddate()) ); prop.put("crawler-queue_list_" + showNum + "_modified", daydate(urle.appdate()) );
prop.putHTML("crawler-queue_list_" + showNum + "_anchor", urle.name()); prop.putHTML("crawler-queue_list_" + showNum + "_anchor", urle.name());
prop.putHTML("crawler-queue_list_" + showNum + "_url", urle.url().toString()); prop.putHTML("crawler-queue_list_" + showNum + "_url", urle.url().toString());
prop.put("crawler-queue_list_" + showNum + "_hash", urle.url().hash()); prop.put("crawler-queue_list_" + showNum + "_hash", urle.url().hash());

@ -181,7 +181,6 @@ public class QuickCrawlLink_p {
null, null,
(title==null)?"CRAWLING-ROOT":title, (title==null)?"CRAWLING-ROOT":title,
new Date(), new Date(),
null,
pe.handle(), pe.handle(),
0, 0,
0, 0,

@ -186,9 +186,11 @@ public class ViewFile {
} catch (IOException e) { } catch (IOException e) {
Log.logException(e); Log.logException(e);
} }
if (response != null) resource = response.getContent(); if (response != null) {
resource = response.getContent();
responseHeader = response.getResponseHeader(); responseHeader = response.getResponseHeader();
} }
}
if (responseHeader == null) responseHeader = Cache.getResponseHeader(url); if (responseHeader == null) responseHeader = Cache.getResponseHeader(url);

@ -105,7 +105,7 @@ public class queues_p {
prop.put(tableName + "_" + showNum + "_profile", urle.profileHandle()); prop.put(tableName + "_" + showNum + "_profile", urle.profileHandle());
prop.put(tableName + "_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName())); prop.put(tableName + "_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put(tableName + "_" + showNum + "_depth", urle.depth()); prop.put(tableName + "_" + showNum + "_depth", urle.depth());
prop.put(tableName + "_" + showNum + "_modified", daydate(urle.loaddate())); prop.put(tableName + "_" + showNum + "_modified", daydate(urle.appdate()));
prop.putXML(tableName + "_" + showNum + "_anchor", urle.name()); prop.putXML(tableName + "_" + showNum + "_anchor", urle.name());
prop.putXML(tableName + "_" + showNum + "_url", urle.url().toNormalform(false, true)); prop.putXML(tableName + "_" + showNum + "_url", urle.url().toNormalform(false, true));
prop.put(tableName + "_" + showNum + "_hash", urle.url().hash()); prop.put(tableName + "_" + showNum + "_hash", urle.url().hash());

@ -83,7 +83,6 @@ public class rct_p {
url, url,
(referrer == null) ? null : referrer.hash(), (referrer == null) ? null : referrer.hash(),
"REMOTE-CRAWLING", "REMOTE-CRAWLING",
null,
loaddate, loaddate,
sb.crawler.defaultRemoteProfile.handle(), sb.crawler.defaultRemoteProfile.handle(),
0, 0,

@ -448,7 +448,6 @@ public class CrawlQueues {
url, url,
(referrer == null) ? null : referrer.hash(), (referrer == null) ? null : referrer.hash(),
item.getDescription(), item.getDescription(),
null,
loaddate, loaddate,
sb.crawler.defaultRemoteProfile.handle(), sb.crawler.defaultRemoteProfile.handle(),
0, 0,

@ -155,7 +155,7 @@ public final class CrawlStacker {
public void enqueueEntry(final Request entry) { public void enqueueEntry(final Request entry) {
// DEBUG // DEBUG
if (log.isFinest()) log.logFinest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + new String(entry.initiator()) + ", name=" + entry.name() + ", load=" + entry.loaddate() + ", depth=" + entry.depth()); if (log.isFinest()) log.logFinest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + new String(entry.initiator()) + ", name=" + entry.name() + ", appdate=" + entry.appdate() + ", depth=" + entry.depth());
if (prefetchHost(entry.url().getHost())) { if (prefetchHost(entry.url().getHost())) {
try { try {

@ -54,8 +54,8 @@ public class Request extends WorkflowJob {
"Cardinal forkfactor-4 {b256}, " + // sum of anchors of all ancestors "Cardinal forkfactor-4 {b256}, " + // sum of anchors of all ancestors
"byte[] flags-4, " + // flags "byte[] flags-4, " + // flags
"String handle-4, " + // extra handle "String handle-4, " + // extra handle
"Cardinal loaddate-8 {b256}," + // time when the file was loaded "Cardinal loaddate-8 {b256}," + // NOT USED
"Cardinal serverdate-8 {b256}," + // time when that the server returned as document date "Cardinal lastmodified-8 {b256}," + // NOT USED
"Cardinal modifiedSince-8 {b256}", // time that was given to server as ifModifiedSince "Cardinal modifiedSince-8 {b256}", // time that was given to server as ifModifiedSince
Base64Order.enhancedCoder Base64Order.enhancedCoder
); );
@ -66,8 +66,6 @@ public class Request extends WorkflowJob {
private DigestURI url; // the url as string private DigestURI url; // the url as string
private String name; // the name of the url, from anchor tag <a>name</a> private String name; // the name of the url, from anchor tag <a>name</a>
private long appdate; // the time when the url first appeared private long appdate; // the time when the url first appeared
private long loaddate; // the time when the url was loaded
private long serverdate; // the document date from the target server
private long imsdate; // the time of a ifModifiedSince request private long imsdate; // the time of a ifModifiedSince request
private String profileHandle; // the name of the fetch profile private String profileHandle; // the name of the fetch profile
private int depth; // the prefetch depth so far, starts at 0 private int depth; // the prefetch depth so far, starts at 0
@ -84,7 +82,7 @@ public class Request extends WorkflowJob {
* @param referrerhash * @param referrerhash
*/ */
public Request(final DigestURI url, final byte[] referrerhash) { public Request(final DigestURI url, final byte[] referrerhash) {
this(null, url, referrerhash, null, null, null, null, 0, 0, 0); this(null, url, referrerhash, null, null, null, 0, 0, 0);
} }
/** /**
@ -107,7 +105,6 @@ public class Request extends WorkflowJob {
final byte[] referrerhash, final byte[] referrerhash,
final String name, final String name,
final Date appdate, final Date appdate,
final Date loaddate,
final String profileHandle, final String profileHandle,
final int depth, final int depth,
final int anchors, final int anchors,
@ -122,14 +119,12 @@ public class Request extends WorkflowJob {
this.refhash = referrerhash; this.refhash = referrerhash;
this.name = (name == null) ? "" : name; this.name = (name == null) ? "" : name;
this.appdate = (appdate == null) ? 0 : appdate.getTime(); this.appdate = (appdate == null) ? 0 : appdate.getTime();
this.loaddate = (loaddate == null) ? 0 : loaddate.getTime();
this.profileHandle = profileHandle; // must not be null this.profileHandle = profileHandle; // must not be null
this.depth = depth; this.depth = depth;
this.anchors = anchors; this.anchors = anchors;
this.forkfactor = forkfactor; this.forkfactor = forkfactor;
this.flags = new Bitfield(rowdef.width(10)); this.flags = new Bitfield(rowdef.width(10));
this.handle = 0; this.handle = 0;
this.serverdate = 0;
this.imsdate = 0; this.imsdate = 0;
this.statusMessage = "loaded(args)"; this.statusMessage = "loaded(args)";
this.initialHash = url.hashCode(); this.initialHash = url.hashCode();
@ -156,8 +151,8 @@ public class Request extends WorkflowJob {
this.forkfactor = (int) entry.getColLong(9); this.forkfactor = (int) entry.getColLong(9);
this.flags = new Bitfield(entry.getColBytes(10, true)); this.flags = new Bitfield(entry.getColBytes(10, true));
this.handle = Integer.parseInt(entry.getColString(11, null), 16); this.handle = Integer.parseInt(entry.getColString(11, null), 16);
this.loaddate = entry.getColLong(12); //this.loaddate = entry.getColLong(12);
this.serverdate = entry.getColLong(13); //this.lastmodified = entry.getColLong(13);
this.imsdate = entry.getColLong(14); this.imsdate = entry.getColLong(14);
this.statusMessage = "loaded(kelondroRow.Entry)"; this.statusMessage = "loaded(kelondroRow.Entry)";
this.initialHash = url.hashCode(); this.initialHash = url.hashCode();
@ -187,8 +182,8 @@ public class Request extends WorkflowJob {
public Row.Entry toRow() { public Row.Entry toRow() {
final byte[] appdatestr = NaturalOrder.encodeLong(appdate, rowdef.width(5)); final byte[] appdatestr = NaturalOrder.encodeLong(appdate, rowdef.width(5));
final byte[] loaddatestr = NaturalOrder.encodeLong(loaddate, rowdef.width(12)); final byte[] loaddatestr = NaturalOrder.encodeLong(0 /*loaddate*/, rowdef.width(12));
final byte[] serverdatestr = NaturalOrder.encodeLong(serverdate, rowdef.width(13)); final byte[] serverdatestr = NaturalOrder.encodeLong(0 /*lastmodified*/, rowdef.width(13));
final byte[] imsdatestr = NaturalOrder.encodeLong(imsdate, rowdef.width(14)); final byte[] imsdatestr = NaturalOrder.encodeLong(imsdate, rowdef.width(14));
// store the hash in the hash cache // store the hash in the hash cache
byte[] namebytes; byte[] namebytes;
@ -245,17 +240,17 @@ public class Request extends WorkflowJob {
// the date when the url appeared first // the date when the url appeared first
return new Date(this.appdate); return new Date(this.appdate);
} }
/*
public Date loaddate() { public Date loaddate() {
// the date when the url was loaded // the date when the url was loaded
return new Date(this.loaddate); return new Date(this.loaddate);
} }
public Date serverdate() { public Date lastmodified() {
// the date that the server returned as document date // the date that the server returned as document date
return new Date(this.serverdate); return new Date(this.lastmodified);
} }
*/
public Date imsdate() { public Date imsdate() {
// the date that the client (browser) send as ifModifiedSince in proxy mode // the date that the client (browser) send as ifModifiedSince in proxy mode
return new Date(this.imsdate); return new Date(this.imsdate);

@ -201,6 +201,7 @@ public class Response {
docDate = responseHeader.lastModified(); docDate = responseHeader.lastModified();
if (docDate == null) docDate = responseHeader.date(); if (docDate == null) docDate = responseHeader.date();
} }
if (docDate == null && request != null) docDate = request.appdate();
if (docDate == null) docDate = new Date(DateFormatter.correctedUTCTime()); if (docDate == null) docDate = new Date(DateFormatter.correctedUTCTime());
return docDate; return docDate;

@ -282,7 +282,6 @@ public class SitemapParser extends DefaultHandler {
null, // this.siteMapURL.toString(), null, // this.siteMapURL.toString(),
this.nextURL, this.nextURL,
new Date(), new Date(),
null,
this.crawlingProfile.handle(), this.crawlingProfile.handle(),
0, 0,
0, 0,

@ -269,7 +269,6 @@ public class bookmarksDB {
null, null,
"CRAWLING-ROOT", "CRAWLING-ROOT",
new Date(), new Date(),
null,
pe.handle(), pe.handle(),
0, 0,
0, 0,

@ -387,8 +387,7 @@ public final class HTTPDProxyHandler {
url, url,
requestHeader.referer() == null ? null : requestHeader.referer().hash(), requestHeader.referer() == null ? null : requestHeader.referer().hash(),
"", "",
new Date(), cachedResponseHeader.lastModified(),
new Date(),
sb.crawler.defaultProxyProfile.handle(), sb.crawler.defaultProxyProfile.handle(),
0, 0,
0, 0,
@ -510,8 +509,7 @@ public final class HTTPDProxyHandler {
url, url,
requestHeader.referer() == null ? null : requestHeader.referer().hash(), requestHeader.referer() == null ? null : requestHeader.referer().hash(),
"", "",
new Date(), responseHeader.lastModified(),
new Date(),
sb.crawler.defaultProxyProfile.handle(), sb.crawler.defaultProxyProfile.handle(),
0, 0,
0, 0,

@ -134,6 +134,7 @@ public class DocumentIndex extends Segment {
url, url,
null, null,
new Date(url.lastModified()), new Date(url.lastModified()),
new Date(),
url.length(), url.length(),
document, document,
condenser condenser

@ -241,7 +241,8 @@ public class Segment {
public URIMetadataRow storeDocument( public URIMetadataRow storeDocument(
final DigestURI url, final DigestURI url,
final DigestURI referrerURL, final DigestURI referrerURL,
final Date docDate, final Date modDate,
final Date loadDate,
final long sourcesize, final long sourcesize,
final Document document, final Document document,
final Condenser condenser final Condenser condenser
@ -295,16 +296,16 @@ public class Segment {
} }
// create a new loaded URL db entry // create a new loaded URL db entry
final long ldate = System.currentTimeMillis(); assert modDate.getTime() <= loadDate.getTime() : "modDate = " + modDate + ", loadDate = " + loadDate;
final URIMetadataRow newEntry = new URIMetadataRow( final URIMetadataRow newEntry = new URIMetadataRow(
url, // URL url, // URL
dc_title, // document description dc_title, // document description
document.dc_creator(), // author document.dc_creator(), // author
document.dc_subject(' '), // tags document.dc_subject(' '), // tags
"", // ETag "", // ETag
docDate, // modification date modDate, // modification date
new Date(), // loaded date loadDate, // loaded date
new Date(ldate + Math.max(0, ldate - docDate.getTime()) / 2), // freshdate, computed with Proxy-TTL formula new Date(loadDate.getTime() + Math.max(0, loadDate.getTime() - modDate.getTime()) / 2), // freshdate, computed with Proxy-TTL formula
(referrerURL == null) ? null : new String(referrerURL.hash()), // referer hash (referrerURL == null) ? null : new String(referrerURL.hash()), // referer hash
new byte[0], // md5 new byte[0], // md5
(int) sourcesize, // size (int) sourcesize, // size
@ -328,7 +329,7 @@ public class Segment {
// STORE PAGE INDEX INTO WORD INDEX DB // STORE PAGE INDEX INTO WORD INDEX DB
final int words = addPageIndex( final int words = addPageIndex(
url, // document url url, // document url
docDate, // document mod date modDate, // document mod date
document, // document content document, // document content
condenser, // document condenser condenser, // document condenser
language, // document language language, // document language

@ -204,7 +204,8 @@ public class Segments implements Iterable<Segment> {
final String segmentName, final String segmentName,
final DigestURI url, final DigestURI url,
final DigestURI referrerURL, final DigestURI referrerURL,
final Date docDate, final Date modDate,
final Date loadDate,
final long sourcesize, final long sourcesize,
final Document document, final Document document,
final Condenser condenser final Condenser condenser
@ -212,7 +213,8 @@ public class Segments implements Iterable<Segment> {
return segment(segmentName).storeDocument( return segment(segmentName).storeDocument(
url, url,
referrerURL, referrerURL,
docDate, modDate,
loadDate,
sourcesize, sourcesize,
document, document,
condenser condenser

@ -1332,8 +1332,7 @@ public final class Switchboard extends serverSwitch {
surrogate.getIdentifier(true), surrogate.getIdentifier(true),
null, null,
"", "",
new Date(), surrogate.getDate(),
new Date(),
this.crawler.defaultSurrogateProfile.handle(), this.crawler.defaultSurrogateProfile.handle(),
0, 0,
0, 0,
@ -1670,7 +1669,7 @@ public final class Switchboard extends serverSwitch {
in.queueEntry.updateStatus(Response.QUEUE_STATE_PARSING); in.queueEntry.updateStatus(Response.QUEUE_STATE_PARSING);
// debug // debug
if (log.isFinest()) log.logFinest("PARSE "+ in.queueEntry.toString()); if (log.isFinest()) log.logFinest("PARSE "+ in.queueEntry);
Document document = null; Document document = null;
try { try {
@ -1733,9 +1732,6 @@ public final class Switchboard extends serverSwitch {
final long parsingEndTime = System.currentTimeMillis(); final long parsingEndTime = System.currentTimeMillis();
// get the document date
final Date docDate = response.lastModified();
// put anchors on crawl stack // put anchors on crawl stack
final long stackStartTime = System.currentTimeMillis(); final long stackStartTime = System.currentTimeMillis();
if ( if (
@ -1767,8 +1763,7 @@ public final class Switchboard extends serverSwitch {
new DigestURI(u, null), new DigestURI(u, null),
response.url().hash(), response.url().hash(),
nextEntry.getValue(), nextEntry.getValue(),
null, new Date(),
docDate,
response.profile().handle(), response.profile().handle(),
response.depth() + 1, response.depth() + 1,
0, 0,
@ -1860,6 +1855,7 @@ public final class Switchboard extends serverSwitch {
queueEntry.url(), queueEntry.url(),
referrerURL, referrerURL,
queueEntry.lastModified(), queueEntry.lastModified(),
new Date(),
queueEntry.size(), queueEntry.size(),
document, document,
condenser); condenser);
@ -2125,7 +2121,6 @@ public final class Switchboard extends serverSwitch {
(name == null) ? "" : name, (name == null) ? "" : name,
new Date(), new Date(),
null, null,
null,
0, 0,
0, 0,
0); 0);

@ -780,7 +780,6 @@ public class yacySeed implements Cloneable {
// name // name
final String peerName = this.dna.get(yacySeed.NAME); final String peerName = this.dna.get(yacySeed.NAME);
if (peerName == null) return "no peer name given"; if (peerName == null) return "no peer name given";
if (peerName.equalsIgnoreCase("VegaYacyB")) return "bad peer VegaYacyB [ " + this.hash + " ]"; // hack for wrong "VegaYacyB" peers
dna.put(yacySeed.NAME, checkPeerName(peerName)); dna.put(yacySeed.NAME, checkPeerName(peerName));
// type // type

@ -81,42 +81,24 @@ public class URIMetadataRow implements URIMetadata {
/* =========================================================================== /* ===========================================================================
* Constants to access the various columns of an URL entry * Constants to access the various columns of an URL entry
* =========================================================================== */ * =========================================================================== */
/** the url's hash */ private static final int col_hash = 0; // the url's hash
private static final int col_hash = 0; private static final int col_comp = 1; // components: the url, description, author and tags. As 5th element, an ETag is possible
/** components: the url, description, author and tags. As 5th element, an ETag is possible */ private static final int col_mod = 2; // the modified-date time from the server (servertime in row)
private static final int col_comp = 1; private static final int col_load = 3; // time when the url was loaded
/** components: the url, description, author and tags. As 5th element, an ETag is possible */ private static final int col_fresh = 4; // time until this url is fresh
private static final int col_mod = 2; private static final int col_referrer = 5; // a referrer of the url (there may be several, but this is the one that was actually referring to this one)
/** time when the url was loaded */ private static final int col_md5 = 6; // the md5 of the url content (to identify changes)
private static final int col_load = 3; private static final int col_size = 7; // size of file in bytes
/** time until this url is fresh */ private static final int col_wc = 8; // size of file by number of words; for video and audio: seconds
private static final int col_fresh = 4; private static final int col_dt = 9; // doctype, taken from extension or any other heuristic
/** time when the url was loaded */ private static final int col_flags = 10; // flags; any stuff (see Word-Entity definition)
private static final int col_referrer = 5; private static final int col_lang = 11; // language
/** the md5 of the url content (to identify changes) */ private static final int col_llocal = 12; // # of outlinks to same domain; for video and image: width
private static final int col_md5 = 6; private static final int col_lother = 13; // # of outlinks to outside domain; for video and image: height
/** size of file in bytes */ private static final int col_limage = 14; // # of embedded image links
private static final int col_size = 7; private static final int col_laudio = 15; // # of embedded audio links; for audio: track number; for video: number of audio tracks
/** size of file by number of words; for video and audio: seconds */ private static final int col_lvideo = 16; // # of embedded video links
private static final int col_wc = 8; private static final int col_lapp = 17; // # of embedded links to applications
/** doctype, taken from extension or any other heuristic */
private static final int col_dt = 9;
/** flags; any stuff (see Word-Entity definition) */
private static final int col_flags = 10;
/** language */
private static final int col_lang = 11;
/** of outlinks to same domain; for video and image: width */
private static final int col_llocal = 12;
/** of outlinks to outside domain; for video and image: height */
private static final int col_lother = 13;
/** of embedded image links */
private static final int col_limage = 14;
/** of embedded audio links; for audio: track number; for video: number of audio tracks */
private static final int col_laudio = 15;
/** of embedded video links */
private static final int col_lvideo = 16;
/** of embedded links to applications */
private static final int col_lapp = 17;
private final Row.Entry entry; private final Row.Entry entry;
private final String snippet; private final String snippet;
@ -522,8 +504,7 @@ public class URIMetadataRow implements URIMetadata {
metadata().url(), metadata().url(),
referrerHash(), referrerHash(),
metadata().dc_title(), metadata().dc_title(),
null, moddate(),
loaddate(),
null, null,
0, 0,
0, 0,

@ -68,7 +68,7 @@ public final class Row {
os+= this.row[i].cellwidth; os+= this.row[i].cellwidth;
} }
this.objectsize = os; this.objectsize = os;
this.primaryKeyLength = row[0].cellwidth; this.primaryKeyLength = this.row[0].cellwidth;
} }
public Row(final String structure, final ByteOrder objectOrder) { public Row(final String structure, final ByteOrder objectOrder) {
@ -102,7 +102,7 @@ public final class Row {
os += this.row[i].cellwidth; os += this.row[i].cellwidth;
} }
this.objectsize = os; this.objectsize = os;
this.primaryKeyLength = row[0].cellwidth; this.primaryKeyLength = this.row[0].cellwidth;
} }
public final ByteOrder getOrdering() { public final ByteOrder getOrdering() {
@ -150,8 +150,8 @@ public final class Row {
public final Entry newEntry(final byte[] rowinstance) { public final Entry newEntry(final byte[] rowinstance) {
if (rowinstance == null) return null; if (rowinstance == null) return null;
//assert (rowinstance[0] != 0); //assert (rowinstance[0] != 0);
if (!(this.objectOrder.wellformed(rowinstance, 0, row[0].cellwidth))) { if (!(this.objectOrder.wellformed(rowinstance, 0, this.primaryKeyLength))) {
Log.logWarning("kelondroRow", "row not well-formed: rowinstance[0] = " + new String(rowinstance, 0, row[0].cellwidth) + " / " + NaturalOrder.arrayList(rowinstance, 0, row[0].cellwidth)); Log.logWarning("kelondroRow", "row not well-formed: rowinstance[0] = " + new String(rowinstance, 0, this.primaryKeyLength) + " / " + NaturalOrder.arrayList(rowinstance, 0, this.primaryKeyLength));
return null; return null;
} }
return new Entry(rowinstance, false); return new Entry(rowinstance, false);
@ -160,14 +160,14 @@ public final class Row {
public final Entry newEntry(final Entry oldrow, final int fromColumn) { public final Entry newEntry(final Entry oldrow, final int fromColumn) {
if (oldrow == null) return null; if (oldrow == null) return null;
assert (oldrow.getColBytes(0, false)[0] != 0); assert (oldrow.getColBytes(0, false)[0] != 0);
assert (this.objectOrder.wellformed(oldrow.getColBytes(0, false), 0, row[0].cellwidth)); assert (this.objectOrder.wellformed(oldrow.getColBytes(0, false), 0, this.primaryKeyLength));
return new Entry(oldrow, fromColumn, false); return new Entry(oldrow, fromColumn, false);
} }
public final Entry newEntry(final byte[] rowinstance, final int start, final boolean clone) { public final Entry newEntry(final byte[] rowinstance, final int start, final boolean clone) {
if (rowinstance == null) return null; if (rowinstance == null) return null;
//assert (rowinstance[0] != 0); //assert (rowinstance[0] != 0);
assert (this.objectOrder.wellformed(rowinstance, start, row[0].cellwidth)) : "rowinstance = " + new String(rowinstance); assert (this.objectOrder.wellformed(rowinstance, start, this.primaryKeyLength)) : "rowinstance = " + new String(rowinstance);
// this method offers the option to clone the content // this method offers the option to clone the content
// this is necessary if it is known that the underlying byte array may change and therefore // this is necessary if it is known that the underlying byte array may change and therefore
// the reference to the byte array does not contain the original content // the reference to the byte array does not contain the original content
@ -177,7 +177,7 @@ public final class Row {
public final Entry newEntry(final byte[][] cells) { public final Entry newEntry(final byte[][] cells) {
if (cells == null) return null; if (cells == null) return null;
assert (cells[0][0] != 0); assert (cells[0][0] != 0);
assert (this.objectOrder.wellformed(cells[0], 0, row[0].cellwidth)); assert (this.objectOrder.wellformed(cells[0], 0, this.primaryKeyLength));
return new Entry(cells); return new Entry(cells);
} }
@ -189,7 +189,7 @@ public final class Row {
public final EntryIndex newEntryIndex(final byte[] rowinstance, final int index) { public final EntryIndex newEntryIndex(final byte[] rowinstance, final int index) {
if (rowinstance == null) return null; if (rowinstance == null) return null;
assert (rowinstance[0] != 0); assert (rowinstance[0] != 0);
assert (this.objectOrder.wellformed(rowinstance, 0, row[0].cellwidth)); assert (this.objectOrder.wellformed(rowinstance, 0, this.primaryKeyLength));
return new EntryIndex(rowinstance, index); return new EntryIndex(rowinstance, index);
} }

@ -574,16 +574,22 @@ public class Table implements ObjectIndex, Iterable<Row.Entry> {
private void removeInFile(final int i) throws IOException, RowSpaceExceededException { private void removeInFile(final int i) throws IOException, RowSpaceExceededException {
assert i >= 0; assert i >= 0;
final byte[] p = new byte[rowdef.objectsize]; final byte[] p = new byte[this.rowdef.objectsize];
if (table == null) { if (this.table == null) {
if (i == index.size() - 1) { if (i == this.index.size() - 1) {
file.cleanLast(); this.file.cleanLast();
} else { } else {
file.cleanLast(p, 0); while (this.file.size() > 0) {
file.put(i, p, 0); this.file.cleanLast(p, 0);
final byte[] k = new byte[rowdef.primaryKeyLength]; if (!(this.rowdef.objectOrder.wellformed(p, 0, this.rowdef.primaryKeyLength))) {
System.arraycopy(p, 0, k, 0, rowdef.primaryKeyLength); continue;
index.put(k, i); }
this.file.put(i, p, 0);
final byte[] k = new byte[this.rowdef.primaryKeyLength];
System.arraycopy(p, 0, k, 0, this.rowdef.primaryKeyLength);
this.index.put(k, i);
break;
}
} }
} else { } else {
if (i == index.size() - 1) { if (i == index.size() - 1) {

@ -151,7 +151,6 @@ public final class LoaderDispatcher {
null, null,
"", "",
new Date(), new Date(),
new Date(),
(forText) ? (forText) ?
((global) ? ((global) ?
sb.crawler.defaultTextSnippetGlobalProfile.handle() : sb.crawler.defaultTextSnippetGlobalProfile.handle() :

Loading…
Cancel
Save