You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
yacy_search_server/source/de/anomic/crawler/CrawlEntry.java

371 lines
16 KiB

// CrawlEntry.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 14.03.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.crawler;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Date;
import java.util.concurrent.ConcurrentHashMap;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.order.NaturalOrder;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverProcessorJob;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;
public class CrawlEntry extends serverProcessorJob {
// row definition for balancer-related NURL-entries
public final static Row rowdef = new Row(
"String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"String urlstring-256, " + // the url as string
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"String urlname-80, " + // the name of the url, from anchor tag <a>name</a>
"Cardinal appdate-8 {b256}, " + // the time when the url was first time appeared
"String profile-" + yacySeedDB.commonHashLength + ", " + // the name of the prefetch profile handle
"Cardinal depth-2 {b256}, " + // the prefetch depth so far, starts at 0
"Cardinal parentbr-3 {b256}, " + // number of anchors of the parent
"Cardinal forkfactor-4 {b256}, " + // sum of anchors of all ancestors
"byte[] flags-4, " + // flags
"String handle-4, " + // extra handle
"Cardinal loaddate-8 {b256}," + // time when the file was loaded
"Cardinal serverdate-8 {b256}," + // time when that the server returned as document date
"Cardinal modifiedSince-8 {b256}", // time that was given to server as ifModifiedSince
Base64Order.enhancedCoder,
0
);
// a shared domainAccess map for all balancers. the key is a domain-hash (6 bytes)
public static final ConcurrentHashMap<String, domaccess> domainAccess = new ConcurrentHashMap<String, domaccess>();
private String initiator; // the initiator hash, is NULL or "" if it is the own proxy;
// if this is generated by a crawl, the own peer hash in entered
private String refhash; // the url's referrer hash
private yacyURL url; // the url as string
private String name; // the name of the url, from anchor tag <a>name</a>
private long appdate; // the time when the url was first time appeared
private long loaddate; // the time when the url was loaded
private long serverdate; // the document date from the target server
private long imsdate; // the time of a ifModifiedSince request
private String profileHandle; // the name of the prefetch profile
private int depth; // the prefetch depth so far, starts at 0
private int anchors; // number of anchors of the parent
private int forkfactor; // sum of anchors of all ancestors
private Bitfield flags;
private int handle;
private String statusMessage;
private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection
public static class domaccess {
public long time;
public long robotsMinDelay;
public int count;
public String host;
public domaccess(String host) {
this.host = host;
this.time = System.currentTimeMillis();
this.robotsMinDelay = 0;
this.count = 0;
}
public void update() {
this.time = System.currentTimeMillis();
this.count++;
}
public long flux(long range) {
return count >= 1000 ? range * Math.min(5000, count) / 1000 : range / (1000 - count);
}
}
/**
* @param initiator the hash of the initiator peer
* @param url the {@link URL} to crawl
* @param referrer the hash of the referrer URL
* @param name the name of the document to crawl
* @param appdate the time when the url was first time appeared
* @param profileHandle the name of the prefetch profile. This must not be null!
* @param depth the crawling depth of the entry
* @param anchors number of anchors of the parent
* @param forkfactor sum of anchors of all ancestors
*/
public CrawlEntry(
final String initiator,
final yacyURL url,
final String referrerhash,
final String name,
final Date appdate,
final Date loaddate,
final String profileHandle,
final int depth,
final int anchors,
final int forkfactor
) {
// create new entry and store it into database
assert url != null;
assert initiator != null;
assert profileHandle == null || profileHandle.length() == yacySeedDB.commonHashLength : profileHandle + " != " + yacySeedDB.commonHashLength;
url.removeRef(); // remove anchor reference
this.initiator = initiator;
this.url = url;
this.refhash = (referrerhash == null) ? "" : referrerhash;
this.name = (name == null) ? "" : name;
this.appdate = (appdate == null) ? 0 : appdate.getTime();
this.loaddate = (loaddate == null) ? 0 : loaddate.getTime();
this.profileHandle = profileHandle; // must not be null
this.depth = depth;
this.anchors = anchors;
this.forkfactor = forkfactor;
this.flags = new Bitfield(rowdef.width(10));
this.handle = 0;
this.serverdate = 0;
this.imsdate = 0;
this.statusMessage = "loaded(args)";
this.initialHash = url.hashCode();
this.status = serverProcessorJob.STATUS_INITIATED;
}
public CrawlEntry(final Row.Entry entry) throws IOException {
assert (entry != null);
insertEntry(entry);
}
private void insertEntry(final Row.Entry entry) throws IOException {
final String urlstring = entry.getColString(2, null);
if (urlstring == null) throw new IOException ("url string is null");
this.initiator = entry.getColString(1, null);
this.url = new yacyURL(urlstring, entry.getColString(0, null));
this.refhash = (entry.empty(3)) ? "" : entry.getColString(3, null);
this.name = (entry.empty(4)) ? "" : entry.getColString(4, "UTF-8").trim();
this.appdate = entry.getColLong(5);
this.profileHandle = (entry.empty(6)) ? null : entry.getColString(6, null).trim();
this.depth = (int) entry.getColLong(7);
this.anchors = (int) entry.getColLong(8);
this.forkfactor = (int) entry.getColLong(9);
this.flags = new Bitfield(entry.getColBytes(10));
this.handle = Integer.parseInt(entry.getColString(11, null), 16);
this.loaddate = entry.getColLong(12);
this.serverdate = entry.getColLong(13);
this.imsdate = entry.getColLong(14);
this.statusMessage = "loaded(kelondroRow.Entry)";
this.initialHash = url.hashCode();
return;
}
public int hashCode() {
// overloads Object.hashCode()
return this.initialHash;
}
public void setStatus(final String s, int code) {
this.statusMessage = s;
this.status = code;
}
public String getStatus() {
return this.statusMessage;
}
private static String normalizeHandle(final int h) {
String d = Integer.toHexString(h);
while (d.length() < rowdef.width(11)) d = "0" + d;
return d;
}
public Row.Entry toRow() {
final byte[] appdatestr = NaturalOrder.encodeLong(appdate, rowdef.width(5));
final byte[] loaddatestr = NaturalOrder.encodeLong(loaddate, rowdef.width(12));
final byte[] serverdatestr = NaturalOrder.encodeLong(serverdate, rowdef.width(13));
final byte[] imsdatestr = NaturalOrder.encodeLong(imsdate, rowdef.width(14));
// store the hash in the hash cache
byte[] namebytes;
try {
namebytes = this.name.getBytes("UTF-8");
} catch (final UnsupportedEncodingException e) {
namebytes = this.name.getBytes();
}
final byte[][] entry = new byte[][] {
this.url.hash().getBytes(),
(initiator == null) ? "".getBytes() : this.initiator.getBytes(),
this.url.toString().getBytes(),
this.refhash.getBytes(),
namebytes,
appdatestr,
(this.profileHandle == null) ? null : this.profileHandle.getBytes(),
NaturalOrder.encodeLong(this.depth, rowdef.width(7)),
NaturalOrder.encodeLong(this.anchors, rowdef.width(8)),
NaturalOrder.encodeLong(this.forkfactor, rowdef.width(9)),
this.flags.bytes(),
normalizeHandle(this.handle).getBytes(),
loaddatestr,
serverdatestr,
imsdatestr};
return rowdef.newEntry(entry);
}
public yacyURL url() {
// the url
return url;
}
public void redirectURL(final yacyURL redirectedURL) {
// replace old URL by new one. This should only be used in case of url redirection
this.url = redirectedURL;
}
public String referrerhash() {
// the urlhash of a referer url
return this.refhash;
}
public String initiator() {
// returns the hash of the initiating peer
if (initiator == null) return "";
if (initiator.length() == 0) return "";
return initiator;
}
public boolean proxy() {
// true when the url was retrieved using the proxy
return (initiator() == null);
}
public Date appdate() {
// the date when the url appeared first
return new Date(this.appdate);
}
public Date loaddate() {
// the date when the url was loaded
return new Date(this.loaddate);
}
public Date serverdate() {
// the date that the server returned as document date
return new Date(this.serverdate);
}
public Date imsdate() {
// the date that the client (browser) send as ifModifiedSince in proxy mode
return new Date(this.imsdate);
}
public String name() {
// return the anchor name (text inside <a> tag)
return this.name;
}
public int depth() {
// crawl depth where the url appeared
return this.depth;
}
public String profileHandle() {
// the handle of the crawl profile
assert profileHandle.length() == yacySeedDB.commonHashLength : profileHandle + " != " + yacySeedDB.commonHashLength;
return this.profileHandle;
}
/**
* check a domain flag so it can be calculated when a domain was accessed the last time
*/
public void updateAccess() {
String domhash = url.hash().substring(6);
domaccess lastAccess = domainAccess.get(domhash);
if (lastAccess == null) {
lastAccess = new domaccess(url.getHost());
domainAccess.put(domhash, lastAccess);
} else {
lastAccess.update();
}
}
/**
* calculates how long should be waited until the domain can be accessed again
* this follows from given minimum access times, the fact that an url is a CGI url or now, the times that the domain was accessed
* and a given minimum access time as given in robots.txt
* @param minimumLocalDelta
* @param minimumGlobalDelta
* @return the remaining waiting time in milliseconds
*/
public long waitingRemaining(final long minimumLocalDelta, final long minimumGlobalDelta) {
final long delta = lastAccessDelta(this.url.hash());
if (delta == Long.MAX_VALUE) return 0;
final boolean local = this.url.isLocal();
long deltaBase = (local) ? minimumLocalDelta : minimumGlobalDelta;
if (this.url.isCGI()) deltaBase = deltaBase * 2; // mostly there is a database access in the background which creates a lot of unwanted IO on target site
domaccess lastAccess = domainAccess.get(this.url.hash().substring(6));
lastAccess.robotsMinDelay = (local) ? 0 : plasmaSwitchboard.getSwitchboard().robots.crawlDelayMillis(this.url);
final long genericDelta = Math.min(
60000,
Math.max(
deltaBase + ((lastAccess == null || local) ? 0 : lastAccess.flux(deltaBase)),
(local || lastAccess == null) ? 0 : lastAccess.robotsMinDelay)
); // prevent that that robots file can stop our indexer completely
return (delta < genericDelta) ? genericDelta - delta : 0;
}
/**
* guess a minimum waiting time
* the time is not correct, because if the domain was not checked yet by the robots.txt delay value, it is too low
* also the 'isCGI' property is missing, because the full text of the domain is unknown here
* @param urlhash
* @param minimumLocalDelta
* @param minimumGlobalDelta
* @return the remaining waiting time in milliseconds
*/
public static long waitingRemainingGuessed(String urlhash, final long minimumLocalDelta, final long minimumGlobalDelta) {
final long delta = lastAccessDelta(urlhash);
if (delta == Long.MAX_VALUE) return 0;
final boolean local = yacyURL.isLocal(urlhash);
long deltaBase = (local) ? minimumLocalDelta : minimumGlobalDelta;
domaccess lastAccess = domainAccess.get(urlhash.substring(6));
final long genericDelta = Math.min(
60000,
Math.max(
deltaBase + ((lastAccess == null || local) ? 0 : lastAccess.flux(deltaBase)),
(local || lastAccess == null) ? 0 : lastAccess.robotsMinDelay)
); // prevent that that robots file can stop our indexer completely
return (delta < genericDelta) ? genericDelta - delta : 0;
}
/**
* calculates the time since the last access of the domain as referenced by the url hash
* @param urlhash
* @return a time in milliseconds since last access of the domain or Long.MAX_VALUE if the domain was not accessed before
*/
private static long lastAccessDelta(final String hash) {
assert hash != null;
assert hash.length() == 6 || hash.length() == 12;
final domaccess lastAccess = domainAccess.get((hash.length() > 6) ? hash.substring(6) : hash);
if (lastAccess == null) return Long.MAX_VALUE; // never accessed
return System.currentTimeMillis() - lastAccess.time;
}
}