- the general NURL-index for all crawl stack types was split into separate indexes for these stacks
- the new NURL-index is managed by the crawl balancer
- the crawl balancer no longer needs an internal index; it is replaced by the NURL-index
- the NURL.Entry was generalized and is now a new class, plasmaCrawlEntry
- the new class plasmaCrawlEntry also replaces the preNURL.Entry class and will replace the switchboardEntry class in the future
- the new class plasmaCrawlEntry is more accurate for date entries (holds milliseconds) and can contain larger 'name' entries (anchor tag names)
- the EURL object was replaced by a new ZURL object, which is a container for a plasmaCrawlEntry plus some tracking information
- the EURL index is now filled with ZURL objects
- a new index, delegatedURL, holds ZURL objects about plasmaCrawlEntry objects to track which urls were handed over to other peers
- redesigned handling of the plasmaCrawlEntry handover, because there is no longer any need to convert one entry object into another
- found and fixed numerous bugs in the context of crawl state handling
- fixed a serious bug in kelondroCache which prevented entries from being removed
- fixed some bugs in the online interface and adapted the monitor output to the new entry objects
- adapted the yacy protocol to handle the new delegatedURL entries

All old crawl queues will disappear after this update!

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3483 6c8d7289-2bf4-0310-a012-ef5d649a1542
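How the new classes fit together can be sketched as follows. This is hypothetical usage, not part of the commit: the helper name delegateUrl, the table name "urlDelegated", and the variable names are invented for illustration, and the sketch assumes the peer's seed database (yacyCore.seedDB) is initialized. The constructors and the newEntry()/store() calls are the ones defined in the two new files below.

    // hypothetical helper: wrap a url into the new entry objects and record a handover
    static void delegateUrl(java.io.File cachePath, de.anomic.net.URL url,
                            String profileHandle, String executorPeerHash) {
        plasmaCrawlEntry crawlEntry = new plasmaCrawlEntry(
                de.anomic.yacy.yacyCore.seedDB.mySeed.hash, // initiator: the own peer hash
                url,                                        // the url to be crawled
                null,                                       // referrer hash, defaults to plasmaURL.dummyHash
                "anchor text",                              // anchor name, up to 80 characters in the row
                new java.util.Date(),                       // appearance date, millisecond precision
                profileHandle,                              // crawl profile handle, must not be null
                0, 0, 0);                                   // depth, anchors of the parent, forkfactor

        // the delegatedURL index stores ZURL objects that wrap the crawl entry
        plasmaCrawlZURL delegatedURL = new plasmaCrawlZURL(cachePath, "urlDelegated"); // table name assumed
        plasmaCrawlZURL.Entry zentry = delegatedURL.newEntry(
                crawlEntry, executorPeerHash, new java.util.Date(), 0, "handed over to other peer");
        zentry.store();        // persists the wrapped plasmaCrawlEntry plus the tracking fields
        delegatedURL.close();
    }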
parent 094a1482f4
commit 861f41e67e
@@ -0,0 +1,238 @@
// plasmaCrawlEntry.java
// (C) 2007 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 14.03.2007 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.plasma;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Date;

import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeedDB;

public class plasmaCrawlEntry {

    // row definition for balancer-related NURL entries
    public final static kelondroRow rowdef = new kelondroRow(
            "String urlhash-" + yacySeedDB.commonHashLength + ", " +   // the url's hash
            "String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
            "String urlstring-256, " +                                 // the url as string
            "String refhash-" + yacySeedDB.commonHashLength + ", " +   // the url's referrer hash
            "String urlname-80, " +                                    // the name of the url, from anchor tag <a>name</a>
            "Cardinal appdate-8 {b256}, " +                            // the time when the url first appeared
            "String profile-4, " +                                     // the handle of the prefetch profile
            "Cardinal depth-2 {b256}, " +                              // the prefetch depth so far, starts at 0
            "Cardinal parentbr-3 {b256}, " +                           // number of anchors of the parent
            "Cardinal forkfactor-4 {b256}, " +                         // sum of anchors of all ancestors
            "byte[] flags-4, " +                                       // flags
            "String handle-4, " +                                      // extra handle
            "Cardinal loaddate-8 {b256}," +                            // the time when the file was loaded
            "Cardinal serverdate-8 {b256}," +                          // the time that the server returned as document date
            "Cardinal modifiedSince-8 {b256}",                         // the time that was sent to the server as ifModifiedSince
            kelondroBase64Order.enhancedCoder,
            0
    );
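    // Note on the layout above: columns are addressed 0-based by insertEntry() and
    // toRow() below (urlhash=0, initiator=1, urlstring=2, refhash=3, urlname=4,
    // appdate=5, profile=6, depth=7, parentbr=8, forkfactor=9, flags=10, handle=11,
    // loaddate=12, serverdate=13, modifiedSince=14). The "{b256}" Cardinal columns
    // are written with kelondroNaturalOrder.encodeLong(value, width) as base-256
    // numbers, so the 8-byte date columns keep full millisecond precision.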

    private String initiator;     // the initiator hash; is NULL or "" if it is the own proxy;
                                  // if this is generated by a crawl, the own peer hash is entered
    private String urlhash;       // the url's hash
    private String referrer;      // the url's referrer hash
    private URL    url;           // the url
    private String name;          // the name of the url, from anchor tag <a>name</a>
    private long   appdate;       // the time when the url first appeared
    private long   loaddate;      // the time when the url was loaded
    private long   serverdate;    // the document date from the target server
    private long   imsdate;       // the time of an ifModifiedSince request
    private String profileHandle; // the name of the prefetch profile
    private int    depth;         // the prefetch depth so far, starts at 0
    private int    anchors;       // number of anchors of the parent
    private int    forkfactor;    // sum of anchors of all ancestors
    private kelondroBitfield flags;
    private int    handle;

    public plasmaCrawlEntry(URL url) {
        this(yacyCore.seedDB.mySeed.hash, url, null, null, new Date(), null, 0, 0, 0);
    }

    public plasmaCrawlEntry(
            String initiator,
            URL url,
            String referrer,
            String name,
            Date appdate,
            String profileHandle,
            int depth,
            int anchors,
            int forkfactor
            ) {
        // create a new entry
        this.urlhash = plasmaURL.urlHash(url);
        this.initiator = initiator;
        this.url = url;
        this.referrer = (referrer == null) ? plasmaURL.dummyHash : referrer;
        this.name = (name == null) ? "" : name;
        this.appdate = (appdate == null) ? 0 : appdate.getTime();
        this.profileHandle = profileHandle; // must not be null
        this.depth = depth;
        this.anchors = anchors;
        this.forkfactor = forkfactor;
        this.flags = new kelondroBitfield(rowdef.width(10));
        this.handle = 0;
        this.loaddate = 0;
        this.serverdate = 0;
        this.imsdate = 0;
    }

    public plasmaCrawlEntry(kelondroRow.Entry entry) throws IOException {
        assert (entry != null);
        insertEntry(entry);
    }

    private void insertEntry(kelondroRow.Entry entry) throws IOException {
        String urlstring = entry.getColString(2, null);
        if (urlstring == null) throw new IOException("url string is null");
        this.urlhash = entry.getColString(0, null);
        this.initiator = entry.getColString(1, null);
        this.url = new URL(urlstring);
        this.referrer = (entry.empty(3)) ? plasmaURL.dummyHash : entry.getColString(3, null);
        this.name = (entry.empty(4)) ? "" : entry.getColString(4, "UTF-8").trim();
        this.appdate = entry.getColLong(5);
        this.profileHandle = (entry.empty(6)) ? null : entry.getColString(6, null).trim();
        this.depth = (int) entry.getColLong(7);
        this.anchors = (int) entry.getColLong(8);
        this.forkfactor = (int) entry.getColLong(9);
        this.flags = new kelondroBitfield(entry.getColBytes(10));
        this.handle = Integer.parseInt(entry.getColString(11, null), 16);
        this.loaddate = entry.getColLong(12);
        this.serverdate = entry.getColLong(13);
        this.imsdate = entry.getColLong(14);
        return;
    }

    private static String normalizeHandle(int h) {
        String d = Integer.toHexString(h);
        while (d.length() < rowdef.width(11)) d = "0" + d;
        return d;
    }

    public kelondroRow.Entry toRow() {
        byte[] appdatestr = kelondroNaturalOrder.encodeLong(appdate, rowdef.width(5));
        byte[] loaddatestr = kelondroNaturalOrder.encodeLong(loaddate, rowdef.width(12));
        byte[] serverdatestr = kelondroNaturalOrder.encodeLong(serverdate, rowdef.width(13));
        byte[] imsdatestr = kelondroNaturalOrder.encodeLong(imsdate, rowdef.width(14));
        // encode the anchor name as UTF-8, falling back to the platform encoding
        byte[] namebytes;
        try {
            namebytes = this.name.getBytes("UTF-8");
        } catch (UnsupportedEncodingException e) {
            namebytes = this.name.getBytes();
        }
        byte[][] entry = new byte[][] {
                this.urlhash.getBytes(),
                (initiator == null) ? "".getBytes() : this.initiator.getBytes(),
                this.url.toString().getBytes(),
                this.referrer.getBytes(),
                namebytes,
                appdatestr,
                (this.profileHandle == null) ? null : this.profileHandle.getBytes(),
                kelondroNaturalOrder.encodeLong(this.depth, rowdef.width(7)),
                kelondroNaturalOrder.encodeLong(this.anchors, rowdef.width(8)),
                kelondroNaturalOrder.encodeLong(this.forkfactor, rowdef.width(9)),
                this.flags.bytes(),
                normalizeHandle(this.handle).getBytes(),
                loaddatestr,
                serverdatestr,
                imsdatestr};
        return rowdef.newEntry(entry);
    }

    public URL url() {
        // the url
        return url;
    }

    public String urlhash() {
        // the hash of this url
        return this.urlhash;
    }

    public String referrerhash() {
        // the urlhash of the referrer url
        return this.referrer;
    }

    public String initiator() {
        // returns the hash of the initiating peer
        if (initiator == null) return null;
        if (initiator.length() == 0) return null;
        return initiator;
    }

    public boolean proxy() {
        // true when the url was retrieved using the proxy
        return (initiator() == null);
    }

    public Date appdate() {
        // the date when the url appeared first
        return new Date(appdate);
    }

    public Date loaddate() {
        // the date when the url was loaded
        return new Date(loaddate);
    }

    public Date serverdate() {
        // the date that the server returned as document date
        return new Date(serverdate);
    }

    public Date imsdate() {
        // the date that the client (browser) sent as ifModifiedSince in proxy mode
        return new Date(imsdate);
    }

    public String name() {
        // return the anchor name (the text inside the <a> tag)
        return name;
    }

    public int depth() {
        // crawl depth where the url appeared
        return depth;
    }

    public String profileHandle() {
        // the handle of the crawl profile
        return profileHandle;
    }
}
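The serialization above is symmetric: toRow() produces a kelondroRow.Entry in the rowdef layout, and the plasmaCrawlEntry(kelondroRow.Entry) constructor parses it back. A minimal round-trip sketch (hypothetical check, not part of the commit; the helper name checkRoundTrip is invented):

    // hypothetical round-trip check for the serialization defined above
    static void checkRoundTrip(plasmaCrawlEntry e) throws java.io.IOException {
        de.anomic.kelondro.kelondroRow.Entry row = e.toRow();  // serialize into the rowdef layout
        plasmaCrawlEntry copy = new plasmaCrawlEntry(row);     // parse the row back into an object
        assert copy.urlhash().equals(e.urlhash());             // same url hash
        assert copy.depth() == e.depth();                      // same crawl depth
        assert copy.url().toString().equals(e.url().toString());
    }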
@@ -0,0 +1,274 @@
// plasmaCrawlZURL.java
// (C) 2007 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 15.03.2007 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.plasma;

import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;

import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroFlexTable;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeedDB;

public class plasmaCrawlZURL {

    public final static kelondroRow rowdef = new kelondroRow(
            "String urlhash-" + yacySeedDB.commonHashLength + ", " +  // the url's hash
            "String executor-" + yacySeedDB.commonHashLength + ", " + // the crawling executor
            "Cardinal workdate-8 {b256}, " +                          // the time of the last load attempt
            "Cardinal workcount-4 {b256}, " +                         // number of load retries
            "String anycause-80, " +                                  // string describing the load failure
            "byte[] entry-" + plasmaCrawlEntry.rowdef.objectsize(),   // the wrapped plasmaCrawlEntry row
            kelondroBase64Order.enhancedCoder,
            0);
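    // The last column embeds a complete serialized plasmaCrawlEntry row of
    // plasmaCrawlEntry.rowdef.objectsize() bytes; a ZURL entry is therefore a
    // container around the crawl entry plus the executor/workdate/workcount/anycause
    // tracking fields (see Entry.insertEntry() and Entry.store() below).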

    // the on-disk index holding the ZURL entries
    private kelondroIndex urlIndexFile = null;
    private LinkedList rejectedStack = new LinkedList(); // strings: url hashes

    public plasmaCrawlZURL(File cachePath, String tablename) {
        cachePath.mkdirs();
        try {
            urlIndexFile = new kelondroFlexTable(cachePath, tablename, -1, rowdef);
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(-1);
        }
    }

    public int size() {
        try {
            return urlIndexFile.size();
        } catch (IOException e) {
            return 0;
        }
    }

    public void close() {
        if (urlIndexFile != null) {
            urlIndexFile.close();
            urlIndexFile = null;
        }
    }

    public synchronized Entry newEntry(
            plasmaCrawlEntry bentry, String executor, Date workdate,
            int workcount, String anycause) {
        if ((executor == null) || (executor.length() < yacySeedDB.commonHashLength)) executor = plasmaURL.dummyHash;
        if (anycause == null) anycause = "unknown";
        return new Entry(bentry, executor, workdate, workcount, anycause);
    }

    public synchronized Entry newEntry(URL url, String anycause) {
        return new Entry(url, anycause);
    }

    public boolean remove(String hash) {
        if (hash == null) return false;
        try {
            urlIndexFile.remove(hash.getBytes());
            return true;
        } catch (IOException e) {
            return false;
        }
    }

    public synchronized void stackPushEntry(Entry e) {
        rejectedStack.add(e.hash());
    }

    public Entry stackPopEntry(int pos) throws IOException {
        String urlhash = (String) rejectedStack.get(pos);
        if (urlhash == null) return null;
        return new Entry(urlhash);
    }

    public synchronized Entry getEntry(String hash) throws IOException {
        return new Entry(hash);
    }

    public boolean getUseNewDB() {
        return (urlIndexFile instanceof kelondroFlexTable);
    }

    public boolean exists(String urlHash) {
        try {
            return urlIndexFile.has(urlHash.getBytes());
        } catch (IOException e) {
            return false;
        }
    }

    public void clearStack() {
        rejectedStack.clear();
    }

    public int stackSize() {
        return rejectedStack.size();
    }

    public class Entry {

        plasmaCrawlEntry bentry;    // the wrapped balancer entry
        private String  executor;   // the crawling executor
        private Date    workdate;   // the time of the last load attempt
        private int     workcount;  // number of load retries
        private String  anycause;   // string describing the reason for the load failure
        private boolean stored;

        public Entry(URL url, String reason) {
            this(new plasmaCrawlEntry(url), null, new Date(), 0, reason);
        }

        public Entry(
                plasmaCrawlEntry bentry, String executor, Date workdate,
                int workcount, String anycause) {
            // create new entry
            this.bentry = bentry;
            this.executor = (executor == null) ? yacyCore.seedDB.mySeed.hash : executor;
            this.workdate = (workdate == null) ? new Date() : workdate;
            this.workcount = workcount;
            this.anycause = (anycause == null) ? "" : anycause;
            stored = false;
        }

        public Entry(String hash) throws IOException {
            kelondroRow.Entry entry = urlIndexFile.get(hash.getBytes());
            if (entry != null) {
                insertEntry(entry);
            }
            this.stored = true;
        }

        public Entry(kelondroRow.Entry entry) throws IOException {
            insertEntry(entry);
            this.stored = false;
        }

        private void insertEntry(kelondroRow.Entry entry) throws IOException {
            assert (entry != null);
            this.executor = entry.getColString(1, "UTF-8");
            this.workdate = new Date(entry.getColLong(2));
            this.workcount = (int) entry.getColLong(3);
            this.anycause = entry.getColString(4, "UTF-8");
            this.bentry = new plasmaCrawlEntry(plasmaCrawlEntry.rowdef.newEntry(entry.getColBytes(5)));
            assert ((new String(entry.getColBytes(0))).equals(bentry.urlhash()));
            return;
        }

        public void store() {
            // stores the values from the object variables into the database
            if (this.stored) return;
            if (this.bentry == null) return;
            kelondroRow.Entry newrow = rowdef.newEntry();
            newrow.setCol(0, this.bentry.urlhash().getBytes());
            newrow.setCol(1, this.executor.getBytes());
            newrow.setCol(2, this.workdate.getTime());
            newrow.setCol(3, this.workcount);
            newrow.setCol(4, this.anycause.getBytes());
            newrow.setCol(5, this.bentry.toRow().bytes());
            try {
                urlIndexFile.put(newrow);
                this.stored = true;
            } catch (IOException e) {
                System.out.println("INTERNAL ERROR AT plasmaCrawlZURL:store:" + e.toString());
            }
        }

        public URL url() {
            return this.bentry.url();
        }

        public String initiator() {
            return this.bentry.initiator();
        }

        public String hash() {
            // return a url-hash, based on the md5 algorithm
            // the result is a String of 12 bytes within a 72-bit space
            // (each byte has a 6-bit range)
            // that should be enough for all web pages in the world
            return this.bentry.urlhash();
        }

        public Date workdate() {
            return workdate;
        }

        public String executor() {
            // return the executor's hash
            return executor;
        }

        public String anycause() {
            return anycause;
        }

    }

    public class kiter implements Iterator {
        // enumerates entry elements
        Iterator i;
        boolean error = false;

        public kiter(boolean up, String firstHash) throws IOException {
            i = urlIndexFile.rows(up, (firstHash == null) ? null : firstHash.getBytes());
            error = false;
        }

        public boolean hasNext() {
            if (error) return false;
            return i.hasNext();
        }

        public Object next() throws RuntimeException {
            kelondroRow.Entry e = (kelondroRow.Entry) i.next();
            if (e == null) return null;
            try {
                return new Entry(e);
            } catch (IOException ex) {
                throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null));
            }
        }

        public void remove() {
            i.remove();
        }

    }

    public Iterator entries(boolean up, String firstHash) throws IOException {
        // enumerates entry elements
        return new kiter(up, firstHash);
    }
}
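A ZURL table can be enumerated through entries(up, firstHash), which returns a kiter. A minimal usage sketch (hypothetical, not part of the commit: the method name dumpCauses and the table name "urlError" are invented for illustration):

    // hypothetical usage: list the url hashes and failure causes stored in a ZURL table
    static void dumpCauses(java.io.File cachePath) throws java.io.IOException {
        plasmaCrawlZURL errorURL = new plasmaCrawlZURL(cachePath, "urlError"); // table name assumed
        java.util.Iterator i = errorURL.entries(true, null); // ascending, starting at the first entry
        while (i.hasNext()) {
            plasmaCrawlZURL.Entry entry = (plasmaCrawlZURL.Entry) i.next();
            if (entry == null) continue;
            System.out.println(entry.hash() + " " + entry.anycause());
        }
        errorURL.close();
    }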