added new yacy protocol for mass url-pull for better remote crawling distribution

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4056 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent 4f6d56330d
commit bb426565f0

@ -1,6 +1,6 @@
// AccessStatistics_p.java
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 14.01.2007 on http://www.anomic.de
// AccessTracker_p.java
// (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 14.01.2007 on http://www.yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//

@ -275,7 +275,7 @@ public class CrawlURLFetchStack_p {
plasmaCrawlEntry entry;
int failed = 0;
for (int i=0; i<count; i++) try {
entry = nurl.pop(fromStackType);
entry = nurl.pop(fromStackType, false);
stack.push(entry.url());
} catch (IOException e) { failed++; }
return failed;

@ -53,7 +53,6 @@ import java.util.Map;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
import de.anomic.server.serverDomains;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -122,6 +121,8 @@ public final class hello {
prop.putASIS("yourip", reportedip);
remoteSeed.put(yacySeed.IP, reportedip);
urls = yacyClient.queryUrlCount(remoteSeed);
} else {
prop.putASIS("yourip", "unknown");
}
// if the previous attempt (using the reported ip address) was not successful, try the ip where
@ -209,7 +210,6 @@ public final class hello {
seeds.append("seed0=").append(yacyCore.seedDB.mySeed.genSeedStr(key)).append(serverCore.crlfString);
}
prop.putASIS("mytime", serverDate.shortSecondTime());
prop.putASIS("seedlist", seeds.toString());
// return rewrite properties
return prop;

@ -0,0 +1,87 @@
// urls.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 22.08.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.IOException;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverDate;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore;
/**
 * Servlet behind the url-pull interface: a caller asks for a number of urls
 * and receives them as an rss-like item list (see the accompanying template).
 */
public class urls {

    /**
     * Builds the rewrite properties for one request.
     *
     * Without post arguments, or when the "call" argument is not "remotecrawl",
     * the pre-filled rejection response is returned with zero items. For a
     * "remotecrawl" call, up to min(100, "count") entries are popped from the
     * LIMIT stack of the switchboard's noticeURL (with the delay flag off) and
     * written as item_&lt;n&gt;_* properties; the response is then set to "ok".
     *
     * @param header the http header of the request (not used here)
     * @param post   the request arguments, may be null
     * @param env    the server switch; cast to plasmaSwitchboard
     * @return the rewrite properties for the template
     */
    public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
        final plasmaSwitchboard sb = (plasmaSwitchboard) env;

        // rewrite table, pre-filled with the values of a rejected request
        final serverObjects prop = new serverObjects();
        prop.putASIS("iam", yacyCore.seedDB.mySeed.hash);
        prop.putASIS("response", "rejected - insufficient call parameters");
        prop.putASIS("channel_title", "");
        prop.putASIS("channel_description", "");
        prop.putASIS("channel_pubDate", "");
        prop.put("item", 0);

        // nothing to do unless a remote crawl handover was requested
        if (post == null) {
            return prop;
        }
        if (!post.get("call", "").equals("remotecrawl")) {
            return prop;
        }

        // urls are taken from the limit stack (plasmaCrawlNURL.STACK_TYPE_CORE would be the alternative)
        final int stackType = plasmaCrawlNURL.STACK_TYPE_LIMIT;
        // never hand over more than 100 urls per call
        final int limit = Math.min(100, post.getInt("count", 0));

        int delivered = 0;
        while ((delivered < limit) && (sb.noticeURL.stackSize(stackType) > 0)) {
            plasmaCrawlEntry entry;
            try {
                entry = sb.noticeURL.pop(stackType, false);
            } catch (IOException e) {
                // stop here; whatever was collected so far is still returned
                break;
            }
            if (entry == null) {
                break;
            }

            final String itemKey = "item_" + delivered + "_";
            prop.put(itemKey + "title", "");
            prop.put(itemKey + "link", entry.url().toNormalform(true, false));
            prop.put(itemKey + "description", entry.name());
            prop.put(itemKey + "author", "");
            prop.put(itemKey + "pubDate", serverDate.shortSecondTime(entry.appdate()));
            prop.put(itemKey + "guid", entry.urlhash());
            delivered++;
        }

        prop.put("item", delivered);
        prop.put("response", "ok");

        // return rewrite properties
        return prop;
    }
}

@ -0,0 +1,31 @@
<?xml version="1.0"?>
<!-- this is not exactly rss format, but similar -->
<rss>
<!-- YaCy standard response header -->
<yacy version="#[version]#">
<iam>#[iam]#</iam>
<uptime>#[uptime]#</uptime>
<mytime>#[mytime]#</mytime>
<response>#[response]#</response>
</yacy>
<!-- rss standard channel -->
<channel>
<title>#[channel_title]#</title>
<description>#[channel_description]#</description>
<pubDate>#[channel_pubDate]#</pubDate>
<!-- url items -->
#{item}#
<item>
<title>#[title]#</title>
<link>#[link]#</link>
<description>#[description]#</description>
<author>#[author]#</author>
<pubDate>#[pubDate]#</pubDate>
<guid>#[guid]#</guid>
</item>
#{/item}#
</channel>
</rss>

@ -106,6 +106,7 @@ import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverClassLoader;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -631,8 +632,9 @@ public final class httpdFileHandler {
}
// add the application version, the uptime and the client name to every rewrite table
tp.put(servletProperties.PEER_STAT_VERSION, switchboard.getConfig("version", ""));
tp.put(servletProperties.PEER_STAT_UPTIME, ((System.currentTimeMillis() - Long.parseLong(switchboard.getConfig("startupTime","0"))) / 1000) / 60); // uptime in minutes
tp.put(servletProperties.PEER_STAT_UPTIME, ((System.currentTimeMillis() - serverCore.startupTime) / 1000) / 60); // uptime in minutes
tp.put(servletProperties.PEER_STAT_CLIENTNAME, switchboard.getConfig("peerName", "anomic"));
tp.put(servletProperties.PEER_STAT_MYTIME, serverDate.shortSecondTime());
//System.out.println("respond props: " + ((tp == null) ? "null" : tp.toString())); // debug
} catch (InvocationTargetException e) {
if (e.getCause() instanceof InterruptedException) {

@ -49,6 +49,7 @@ package de.anomic.plasma.crawler;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import de.anomic.plasma.plasmaURL;
import de.anomic.net.URL;
@ -288,7 +289,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
this.url,
referrerHash,
this.name,
null,
new Date(),
this.profile.handle(),
this.depth,
0,

@ -141,7 +141,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
if (this.importNurlDB.stackSize(stackTypes[stackType]) == 0) break;
this.urlCount++;
nextEntry = this.importNurlDB.pop(stackTypes[stackType]);
nextEntry = this.importNurlDB.pop(stackTypes[stackType], false);
nextHash = nextEntry.urlhash();
} else {
if (!entryIter.hasNext()) break;

@ -56,7 +56,7 @@ public class plasmaCrawlEntry {
"String handle-4, " + // extra handle
"Cardinal loaddate-8 {b256}," + // time when the file was loaded
"Cardinal serverdate-8 {b256}," + // time when that the server returned as document date
"Cardinal modifiedSince-8 {b256}", // time that was given to server as ifModifiedSince
"Cardinal modifiedSince-8 {b256}", // time that was given to server as ifModifiedSince
kelondroBase64Order.enhancedCoder,
0
);
@ -105,6 +105,7 @@ public class plasmaCrawlEntry {
int forkfactor
) {
// create new entry and store it into database
assert appdate != null;
this.urlhash = plasmaURL.urlHash(url);
this.initiator = initiator;
this.url = url;
@ -214,36 +215,36 @@ public class plasmaCrawlEntry {
public Date appdate() {
// the date when the url appeared first
return new Date(appdate);
return new Date(this.appdate);
}
public Date loaddate() {
// the date when the url was loaded
return new Date(loaddate);
return new Date(this.loaddate);
}
public Date serverdate() {
// the date that the server returned as document date
return new Date(serverdate);
return new Date(this.serverdate);
}
public Date imsdate() {
// the date that the client (browser) sent as ifModifiedSince in proxy mode
return new Date(imsdate);
return new Date(this.imsdate);
}
public String name() {
// return the anchor name (text inside <a> tag)
return name;
return this.name;
}
public int depth() {
// crawl depth where the url appeared
return depth;
return this.depth;
}
public String profileHandle() {
// the handle of the crawl profile
return profileHandle;
return this.profileHandle;
}
}

@ -156,18 +156,18 @@ public class plasmaCrawlNURL {
}
}
public plasmaCrawlEntry pop(int stackType) throws IOException {
public plasmaCrawlEntry pop(int stackType, boolean delay) throws IOException {
switch (stackType) {
case STACK_TYPE_CORE: return pop(coreStack);
case STACK_TYPE_LIMIT: return pop(limitStack);
case STACK_TYPE_REMOTE: return pop(remoteStack);
case STACK_TYPE_CORE: return pop(coreStack, delay);
case STACK_TYPE_LIMIT: return pop(limitStack, delay);
case STACK_TYPE_REMOTE: return pop(remoteStack, delay);
default: return null;
}
}
public void shift(int fromStack, int toStack) {
try {
plasmaCrawlEntry entry = pop(fromStack);
plasmaCrawlEntry entry = pop(fromStack, false);
if (entry != null) push(toStack, entry);
} catch (IOException e) {
return;
@ -183,13 +183,13 @@ public class plasmaCrawlNURL {
}
}
private plasmaCrawlEntry pop(plasmaCrawlBalancer balancer) throws IOException {
private plasmaCrawlEntry pop(plasmaCrawlBalancer balancer, boolean delay) throws IOException {
// this is a filo - pop
int s;
plasmaCrawlEntry entry;
synchronized (balancer) {
while ((s = balancer.size()) > 0) {
entry = balancer.pop(minimumLocalDelta, minimumGlobalDelta, maximumDomAge);
entry = balancer.pop((delay) ? minimumLocalDelta : 0, (delay) ? minimumGlobalDelta : 0, maximumDomAge);
if (entry == null) {
if (s > balancer.size()) continue;
int aftersize = balancer.size();

@ -247,7 +247,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
private plasmaDHTChunk dhtTransferChunk = null;
public ArrayList localSearches, remoteSearches; // array of search result properties as HashMaps
public HashMap localSearchTracker, remoteSearchTracker; // mappings from requesting host to a TreeSet of Long(access time)
public long startupTime = 0;
public long lastseedcheckuptime = -1;
public long indexedPages = 0;
public long lastindexedPages = 0;
@ -2172,7 +2171,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
while (urlEntry == null && noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) > 0) {
String stats = "LOCALCRAWL[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
try {
urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE);
urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE, true);
String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
@ -2241,7 +2240,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String stats = "REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", "
+ noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
try {
plasmaCrawlEntry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT);
plasmaCrawlEntry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT, true);
String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
@ -2328,7 +2327,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String stats = "REMOTETRIGGEREDCRAWL[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", "
+ noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
try {
plasmaCrawlEntry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE);
plasmaCrawlEntry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE, true);
String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " +

@ -94,6 +94,7 @@ public final class serverCore extends serverAbstractThread implements serverThre
public static final String crlfString = new String(crlf);
public static final String lfstring = new String(new byte[]{lf});
public static final Class[] stringType = {"".getClass()}; // set up some reflection
public static final long startupTime = System.currentTimeMillis();
//Class[] exceptionType = {Class.forName("java.lang.Throwable")};

@ -32,6 +32,7 @@ public class servletProperties extends serverObjects {
public static final String PEER_STAT_VERSION = "version";
public static final String PEER_STAT_UPTIME = "uptime";
public static final String PEER_STAT_MYTIME = "mytime";
public static final String PEER_STAT_CLIENTNAME = "clientname";
private String prefix="";

@ -81,7 +81,6 @@ public class yacyCore {
// statics
public static ThreadGroup publishThreadGroup = new ThreadGroup("publishThreadGroup");
public static long startupTime = System.currentTimeMillis();
public static yacySeedDB seedDB = null;
public static yacyNewsPool newsPool = null;
public static final HashMap seedUploadMethods = new HashMap();
@ -115,7 +114,7 @@ public class yacyCore {
public static int yacyTime() {
// the time since startup of yacy in seconds
return (int) ((System.currentTimeMillis() - startupTime) / 1000);
return (int) ((System.currentTimeMillis() - serverCore.startupTime) / 1000);
}
public yacyCore(plasmaSwitchboard sb) {

@ -52,6 +52,7 @@ import java.util.Iterator;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
@ -97,7 +98,7 @@ public class yacyPeerActions {
seedDB.mySeed.put(yacySeed.PORT, Integer.toString(serverCore.getPortNr(sb.getConfig("port", "8080"))));
}
long uptime = (System.currentTimeMillis() - sb.startupTime) / 1000;
long uptime = (System.currentTimeMillis() - serverCore.startupTime) / 1000;
long uptimediff = uptime - sb.lastseedcheckuptime;
long indexedcdiff = sb.indexedPages - sb.lastindexedPages;
//double requestcdiff = sb.requestedQueries - sb.lastrequestedQueries;
@ -116,6 +117,7 @@ public class yacyPeerActions {
seedDB.mySeed.put(yacySeed.UPTIME, Long.toString(uptime/60)); // the number of minutes that the peer is up in minutes/day (moving average MA30)
seedDB.mySeed.put(yacySeed.LCOUNT, Integer.toString(sb.wordIndex.loadedURL.size())); // the number of links that the peer has stored (LURL's)
seedDB.mySeed.put(yacySeed.NCOUNT, Integer.toString(sb.noticeURL.size())); // the number of links that the peer has noticed, but not loaded (NURL's)
seedDB.mySeed.put(yacySeed.RCOUNT, Integer.toString(sb.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT))); // the number of links that the peer provides for remote crawling (ZURL's)
seedDB.mySeed.put(yacySeed.ICOUNT, Integer.toString(sb.wordIndex.size())); // the minimum number of words that the peer has indexed (as it says)
seedDB.mySeed.put(yacySeed.SCOUNT, Integer.toString(seedDB.sizeConnected())); // the number of seeds that the peer has stored
seedDB.mySeed.put(yacySeed.CCOUNT, Double.toString(((int) ((seedDB.sizeConnected() + seedDB.sizeDisconnected() + seedDB.sizePotential()) * 60.0 / (uptime + 1.01)) * 100) / 100.0)); // the number of clients that the peer connects (as connects/hour)

@ -154,6 +154,8 @@ public class yacySeed {
public static final String LCOUNT = "LCount";
/** the number of links that the peer has noticed, but not loaded (NURL's) */
public static final String NCOUNT = "NCount";
/** the number of links that the peer provides for remote crawls (ZURL's) */
public static final String RCOUNT = "RCount";
/** the number of words the peer has indexed (as it says) */
public static final String ICOUNT = "ICount";
/** the number of seeds that the peer has stored */
@ -205,6 +207,7 @@ public class yacySeed {
this.dna.put(yacySeed.UPTIME, yacySeed.ZERO);
this.dna.put(yacySeed.LCOUNT, yacySeed.ZERO);
this.dna.put(yacySeed.NCOUNT, yacySeed.ZERO);
this.dna.put(yacySeed.RCOUNT, yacySeed.ZERO);
this.dna.put(yacySeed.ICOUNT, yacySeed.ZERO);
this.dna.put(yacySeed.SCOUNT, yacySeed.ZERO);
this.dna.put(yacySeed.CCOUNT, yacySeed.ZERO);

@ -167,8 +167,6 @@ public final class yacy {
* @param startupMemFree free memory at startup time, to be used later for statistics
*/
private static void startup(String homePath, long startupMemFree, long startupMemTotal) {
long startup = System.currentTimeMillis();
int oldRev=0;
int newRev=0;
@ -268,7 +266,6 @@ public final class yacy {
sb.setConfig("vString", yacyVersion.combined2prettyVersion(Double.toString(version)));
sb.setConfig("vdate", (vDATE.startsWith("@")) ? serverDate.shortDayTime() : vDATE);
sb.setConfig("applicationRoot", homePath);
sb.startupTime = startup;
serverLog.logConfig("STARTUP", "YACY Version: " + version + ", Built " + sb.getConfig("vdate", "00000000"));
yacyVersion.latestRelease = version;

Loading…
Cancel
Save