enhanced crawler:

- added a new queue 'noload', which is filled with URLs whose content is already known to be unloadable, either because no parser is available or because the file is too big (a minimal sketch of the stacking decision follows this list)
- the noload queue is emptied by the parser process, which indexes the file names only
- the 'start from file' functionality now also hands ftp URLs over to the ftp crawler
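
A minimal sketch of the new stacking decision, using the CrawlStacker/NoticedURL/TextParser code changed below; the standalone method signature and the plain CORE fallback are illustrative only, not the actual API:

    // decide whether a request is loaded or only name-indexed (sketch)
    String stackRequest(final Request entry, final long maxFileSize, final CrawlQueues nextQueue) {
        // too large, or no parser matches the file extension: do not load, index the name only
        if (entry.size() > maxFileSize ||
            (entry.url().getFileExtension().length() > 0 &&
             TextParser.supports(entry.url(), null) != null)) {
            nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry);
            return null; // handled, no error
        }
        // otherwise stack for loading as before (CORE/LIMIT/REMOTE depending on the initiator)
        nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
        return null;
    }

The NOLOAD stack is drained in CrawlQueues.coreCrawlJob(): an entry popped from it is wrapped in a Response built from the Request and its crawl profile and pushed straight onto the indexing queue, so only the URL and file name are indexed.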

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7368 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter
parent c36da90261
commit a563b05b60

@ -254,7 +254,7 @@ public class Crawler_p {
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
final DigestURI url = crawlingStartURL;
sb.crawlStacker.queueEntries(sb.peers.mySeed().hash.getBytes(), profile.handle(), "ftp", url.getHost(), url.getPort(), false);
sb.crawlStacker.enqueueEntries(sb.peers.mySeed().hash.getBytes(), profile.handle(), "ftp", url.getHost(), url.getPort(), false);
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); // crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
@ -316,6 +316,7 @@ public class Crawler_p {
pe.handle(),
0,
0,
0,
0
));
@ -369,6 +370,7 @@ public class Crawler_p {
pe.handle(),
0,
0,
0,
0),
sb.peers.mySeed().hash.getBytes(),
new Date(),
@ -420,7 +422,7 @@ public class Crawler_p {
cachePolicy);
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
sb.crawlStacker.queueEntries(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, true);
sb.crawlStacker.enqueueEntries(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, true);
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); // crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
@ -522,6 +524,7 @@ public class Crawler_p {
profile.handle(),
0,
0,
0,
0
));
}

@ -65,13 +65,13 @@ public class IndexCreateWWWGlobalQueue_p {
}
if (post.containsKey("clearcrawlqueue")) {
final int c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT);
sb.crawlQueues.noticeURL.clear(NoticedURL.STACK_TYPE_LIMIT);
final int c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
sb.crawlQueues.noticeURL.clear(NoticedURL.StackType.LIMIT);
try { sb.cleanProfiles(); } catch (final InterruptedException e) { /* Ignore this */}
/*
int c = 0;
while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) > 0) {
urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT).hash();
while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.StackType.LIMIT) > 0) {
urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.StackType.LIMIT).hash();
if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; }
}
*/
@ -85,12 +85,12 @@ public class IndexCreateWWWGlobalQueue_p {
}
}
int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT);
int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
if (stackSize == 0) {
prop.put("crawler-queue", "0");
} else {
prop.put("crawler-queue", "1");
final ArrayList<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_LIMIT, showLimit);
final ArrayList<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.LIMIT, showLimit);
Request urle;
boolean dark = true;

@ -84,8 +84,8 @@ public class IndexCreateWWWLocalQueue_p {
final String pattern = post.get("pattern", ".*").trim();
final int option = post.getInt("option", INVALID);
if (pattern.equals(".*")) {
c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
sb.crawlQueues.noticeURL.clear(NoticedURL.STACK_TYPE_CORE);
c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE);
sb.crawlQueues.noticeURL.clear(NoticedURL.StackType.CORE);
try { sb.cleanProfiles(); } catch (final InterruptedException e) {/* ignore this */}
} else if (option > INVALID) {
Pattern compiledPattern = null;
@ -112,7 +112,7 @@ public class IndexCreateWWWLocalQueue_p {
}
} else {
// iterating through the list of URLs
final Iterator<Request> iter = sb.crawlQueues.noticeURL.iterator(NoticedURL.STACK_TYPE_CORE);
final Iterator<Request> iter = sb.crawlQueues.noticeURL.iterator(NoticedURL.StackType.CORE);
Request entry;
List<byte[]> removehashes = new ArrayList<byte[]>();
while (iter.hasNext()) {
@ -152,12 +152,12 @@ public class IndexCreateWWWLocalQueue_p {
}
}
int showNum = 0, stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
int showNum = 0, stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE);
if (stackSize == 0) {
prop.put("crawler-queue", "0");
} else {
prop.put("crawler-queue", "1");
final ArrayList<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_CORE, (int) (showLimit * 1.20));
final ArrayList<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.CORE, (int) (showLimit * 1.20));
Request urle;
boolean dark = true;

@ -62,13 +62,13 @@ public class IndexCreateWWWRemoteQueue_p {
}
if (post.containsKey("clearcrawlqueue")) {
final int c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE);
sb.crawlQueues.noticeURL.clear(NoticedURL.STACK_TYPE_REMOTE);
final int c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.REMOTE);
sb.crawlQueues.noticeURL.clear(NoticedURL.StackType.REMOTE);
try { sb.cleanProfiles(); } catch (final InterruptedException e) { /* Ignore this */}
/*
int c = 0;
while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) > 0) {
urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT).hash();
while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.StackType.LIMIT) > 0) {
urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.StackType.LIMIT).hash();
if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; }
}
*/
@ -82,12 +82,12 @@ public class IndexCreateWWWRemoteQueue_p {
}
}
int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE);
int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.REMOTE);
if (stackSize == 0) {
prop.put("crawler-queue", "0");
} else {
prop.put("crawler-queue", "1");
final ArrayList<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_REMOTE, showLimit);
final ArrayList<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.REMOTE, showLimit);
Request urle;
boolean dark = true;

@ -182,6 +182,7 @@ public class QuickCrawlLink_p {
pe.handle(),
0,
0,
0,
0
));

@ -70,23 +70,23 @@ public class queues_p {
//local crawl queue
prop.putNum("localCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL).getJobCount());
prop.put("localCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL) ? STATE_PAUSED : STATE_RUNNING);
int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
addNTable(sb, prop, "list-local", sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_CORE, Math.min(10, stackSize)));
int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE);
addNTable(sb, prop, "list-local", sb.crawlQueues.noticeURL.top(NoticedURL.StackType.CORE, Math.min(10, stackSize)));
//global crawl queue
prop.putNum("limitCrawlSize", sb.crawlQueues.limitCrawlJobSize());
prop.put("limitCrawlState", STATE_RUNNING);
stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT);
stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
//global crawl queue
prop.putNum("remoteCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount());
prop.put("remoteCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) ? STATE_PAUSED : STATE_RUNNING);
stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT);
stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
if (stackSize == 0) {
prop.put("list-remote", "0");
} else {
addNTable(sb, prop, "list-remote", sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_LIMIT, Math.min(10, stackSize)));
addNTable(sb, prop, "list-remote", sb.crawlQueues.noticeURL.top(NoticedURL.StackType.LIMIT, Math.min(10, stackSize)));
}
// return rewrite properties

@ -81,7 +81,8 @@ public class rct_p {
sb.crawler.defaultRemoteProfile.handle(),
0,
0,
0
0,
item.getSize()
));
} else {
env.getLog().logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);

@ -59,7 +59,7 @@ public class urls {
if (post.get("call", "").equals("remotecrawl")) {
// perform a remote crawl url handover
final int stackType = NoticedURL.STACK_TYPE_LIMIT;
final NoticedURL.StackType stackType = NoticedURL.StackType.LIMIT;
int maxCount = Math.min(100, post.getInt("count", 10));
long maxTime = Math.min(20000, Math.max(1000, post.getInt("time", 10000)));
long timeout = System.currentTimeMillis() + maxTime;

@ -44,11 +44,14 @@ import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.workflow.WorkflowJob;
import de.anomic.crawler.NoticedURL.StackType;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants;
import de.anomic.search.Switchboard.indexingQueueEntry;
import de.anomic.yacy.yacyClient;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.dht.PeerSelection;
@ -60,7 +63,7 @@ public class CrawlQueues {
protected Switchboard sb;
protected Log log;
protected Map<Integer, crawlWorker> workers; // mapping from url hash to Worker thread object
protected Map<Integer, Loader> workers; // mapping from url hash to Worker thread object
private final ArrayList<String> remoteCrawlProviderHashes;
public NoticedURL noticeURL;
@ -69,7 +72,7 @@ public class CrawlQueues {
public CrawlQueues(final Switchboard sb, final File queuePath) {
this.sb = sb;
this.log = new Log("CRAWLER");
this.workers = new ConcurrentHashMap<Integer, crawlWorker>();
this.workers = new ConcurrentHashMap<Integer, Loader>();
this.remoteCrawlProviderHashes = new ArrayList<String>();
// start crawling management
@ -83,7 +86,7 @@ public class CrawlQueues {
public void relocate(final File newQueuePath) {
this.close();
this.workers = new ConcurrentHashMap<Integer, crawlWorker>();
this.workers = new ConcurrentHashMap<Integer, Loader>();
this.remoteCrawlProviderHashes.clear();
noticeURL = new NoticedURL(newQueuePath, sb.useTailCache, sb.exceed134217727);
@ -94,10 +97,10 @@ public class CrawlQueues {
public void close() {
// wait for all workers to finish
for (final crawlWorker w: workers.values()) {
for (final Loader w: workers.values()) {
w.interrupt();
}
for (final crawlWorker w: workers.values()) {
for (final Loader w: workers.values()) {
try {
w.join();
} catch (InterruptedException e) {
@ -111,7 +114,7 @@ public class CrawlQueues {
public void clear() {
// wait for all workers to finish
for (final crawlWorker w: workers.values()) {
for (final Loader w: workers.values()) {
w.interrupt();
}
// TODO: wait some more time until all threads are finished
@ -139,7 +142,7 @@ public class CrawlQueues {
if (delegatedURL.exists(hash)) return "delegated";
if (errorURL.exists(hash)) return "errors";
if (noticeURL.existsInStack(hash)) return "crawler";
for (final crawlWorker worker: workers.values()) {
for (final Loader worker: workers.values()) {
if (Base64Order.enhancedCoder.equal(worker.request.url().hash(), hash)) return "worker";
}
return null;
@ -158,7 +161,7 @@ public class CrawlQueues {
if (ee != null) return ee.url();
ee = errorURL.get(urlhash);
if (ee != null) return ee.url();
for (final crawlWorker w: workers.values()) {
for (final Loader w: workers.values()) {
if (Base64Order.enhancedCoder.equal(w.request.url().hash(), urlhash)) return w.request.url();
}
final Request ne = noticeURL.get(urlhash);
@ -169,7 +172,7 @@ public class CrawlQueues {
public void cleanup() {
// wait for all workers to finish
int timeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000);
for (final crawlWorker w: workers.values()) {
for (final Loader w: workers.values()) {
if (w.age() > timeout) w.interrupt();
}
}
@ -178,7 +181,7 @@ public class CrawlQueues {
synchronized (workers) {
final Request[] e = new Request[workers.size()];
int i = 0;
for (final crawlWorker w: workers.values()) {
for (final Loader w: workers.values()) {
if (i >= e.length) break;
e[i++] = w.request;
}
@ -187,7 +190,7 @@ public class CrawlQueues {
}
public int coreCrawlJobSize() {
return noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
return noticeURL.stackSize(NoticedURL.StackType.CORE);
}
public boolean coreCrawlJob() {
@ -200,14 +203,14 @@ public class CrawlQueues {
// move some tasks to the core crawl job so we have something to do
final int toshift = Math.min(10, limitCrawlJobSize()); // this cannot be a big number because the balancer makes a forced waiting if it cannot balance
for (int i = 0; i < toshift; i++) {
noticeURL.shift(NoticedURL.STACK_TYPE_LIMIT, NoticedURL.STACK_TYPE_CORE, sb.crawler.profilesActiveCrawls);
noticeURL.shift(NoticedURL.StackType.LIMIT, NoticedURL.StackType.CORE, sb.crawler.profilesActiveCrawls);
}
log.logInfo("shifted " + toshift + " jobs from global crawl to local crawl (coreCrawlJobSize()=" + coreCrawlJobSize() +
", limitCrawlJobSize()=" + limitCrawlJobSize() + ", cluster.mode=" + sb.getConfig(SwitchboardConstants.CLUSTER_MODE, "") +
", robinsonMode=" + ((sb.isRobinsonMode()) ? "on" : "off"));
}
String queueCheck = crawlIsPossible(NoticedURL.STACK_TYPE_CORE);
String queueCheck = loadIsPossible(NoticedURL.StackType.CORE);
if (queueCheck != null) {
if (log.isFine()) log.logFine("omitting de-queue/local: " + queueCheck);
return false;
@ -219,11 +222,39 @@ public class CrawlQueues {
}
// do a local crawl
Request urlEntry = null;
while (urlEntry == null && noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) > 0) {
final String stats = "LOCALCRAWL[" + noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_OVERHANG) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE) + "]";
Request urlEntry;
while (noticeURL.stackSize(NoticedURL.StackType.CORE) > 0 || noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) {
final String stats = "LOCALCRAWL[" +
noticeURL.stackSize(NoticedURL.StackType.NOLOAD) + ", " +
noticeURL.stackSize(NoticedURL.StackType.CORE) + ", " +
noticeURL.stackSize(NoticedURL.StackType.LIMIT) + ", " +
noticeURL.stackSize(NoticedURL.StackType.OVERHANG) +
", " + noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]";
try {
urlEntry = noticeURL.pop(NoticedURL.STACK_TYPE_CORE, true, sb.crawler.profilesActiveCrawls);
if (noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) {
// get one entry that will not be loaded, just indexed
urlEntry = noticeURL.pop(NoticedURL.StackType.NOLOAD, true, sb.crawler.profilesActiveCrawls);
if (urlEntry == null) continue;
final String profileHandle = urlEntry.profileHandle();
if (profileHandle == null) {
log.logSevere(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true;
}
Map<String, String> map = sb.crawler.profilesActiveCrawls.get(profileHandle);
if (map == null) {
log.logSevere(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true;
}
try {
sb.indexingDocumentProcessor.enQueue(new indexingQueueEntry(Segments.Process.LOCALCRAWLING, new Response(urlEntry, new CrawlProfile(map)), null, null));
Log.logInfo("CrawlQueues", "placed NOLOAD URL on indexing queue: " + urlEntry.url().toNormalform(true, false));
} catch (InterruptedException e) {
Log.logException(e);
}
return true;
}
urlEntry = noticeURL.pop(NoticedURL.StackType.CORE, true, sb.crawler.profilesActiveCrawls);
if (urlEntry == null) continue;
final String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
@ -232,11 +263,11 @@ public class CrawlQueues {
log.logSevere(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
return true;
}
generateCrawl(urlEntry, stats, profileHandle);
load(urlEntry, stats, profileHandle);
return true;
} catch (final IOException e) {
log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e);
if (e.getMessage().indexOf("hash is null") > 0) noticeURL.clear(NoticedURL.STACK_TYPE_CORE);
if (e.getMessage().indexOf("hash is null") > 0) noticeURL.clear(NoticedURL.StackType.CORE);
}
}
return true;
@ -250,7 +281,7 @@ public class CrawlQueues {
* @param stats String for log prefixing
* @return
*/
private void generateCrawl(Request urlEntry, final String stats, final String profileHandle) {
private void load(Request urlEntry, final String stats, final String profileHandle) {
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(profileHandle.getBytes());
if (mp != null) {
@ -270,10 +301,10 @@ public class CrawlQueues {
+ ", permission=" + ((sb.peers == null) ? "undefined" : (((sb.peers.mySeed().isSenior()) || (sb.peers.mySeed().isPrincipal())) ? "true" : "false")));
// work off one Crawl stack entry
if ((urlEntry == null) || (urlEntry.url() == null)) {
if (urlEntry == null || urlEntry.url() == null) {
log.logInfo(stats + ": urlEntry = null");
} else {
new crawlWorker(urlEntry);
new Loader(urlEntry);
}
} else {
@ -309,7 +340,7 @@ public class CrawlQueues {
* @param stackType
* @return
*/
private String crawlIsPossible(int stackType) {
private String loadIsPossible(StackType stackType) {
//System.out.println("stacksize = " + noticeURL.stackSize(stackType));
if (noticeURL.stackSize(stackType) == 0) {
//log.logDebug("GlobalCrawl: queue is empty");
@ -443,7 +474,8 @@ public class CrawlQueues {
sb.crawler.defaultRemoteProfile.handle(),
0,
0,
0
0,
item.getSize()
));
} else {
log.logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
@ -461,11 +493,11 @@ public class CrawlQueues {
}
public int limitCrawlJobSize() {
return noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT);
return noticeURL.stackSize(NoticedURL.StackType.LIMIT);
}
public int remoteTriggeredCrawlJobSize() {
return noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE);
return noticeURL.stackSize(NoticedURL.StackType.REMOTE);
}
public boolean remoteTriggeredCrawlJob() {
@ -473,7 +505,7 @@ public class CrawlQueues {
// do nothing if either there are private processes to be done
// or there is no global crawl on the stack
String queueCheck = crawlIsPossible(NoticedURL.STACK_TYPE_REMOTE);
String queueCheck = loadIsPossible(NoticedURL.StackType.REMOTE);
if (queueCheck != null) {
if (log.isFinest()) log.logFinest("omitting de-queue/remote: " + queueCheck);
return false;
@ -485,19 +517,19 @@ public class CrawlQueues {
}
// we don't want to crawl a global URL globally, since WE are the global part. (from this point of view)
final String stats = "REMOTETRIGGEREDCRAWL[" + noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_OVERHANG) + ", "
+ noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE) + "]";
final String stats = "REMOTETRIGGEREDCRAWL[" + noticeURL.stackSize(NoticedURL.StackType.CORE) + ", " + noticeURL.stackSize(NoticedURL.StackType.LIMIT) + ", " + noticeURL.stackSize(NoticedURL.StackType.OVERHANG) + ", "
+ noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]";
try {
final Request urlEntry = noticeURL.pop(NoticedURL.STACK_TYPE_REMOTE, true, sb.crawler.profilesActiveCrawls);
final Request urlEntry = noticeURL.pop(NoticedURL.StackType.REMOTE, true, sb.crawler.profilesActiveCrawls);
final String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " +
// urlEntry.url());
generateCrawl(urlEntry, stats, profileHandle);
load(urlEntry, stats, profileHandle);
return true;
} catch (final IOException e) {
log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e);
if (e.getMessage().indexOf("hash is null") > 0) noticeURL.clear(NoticedURL.STACK_TYPE_REMOTE);
if (e.getMessage().indexOf("hash is null") > 0) noticeURL.clear(NoticedURL.StackType.REMOTE);
return true;
}
}
@ -507,13 +539,13 @@ public class CrawlQueues {
return workers.size();
}
protected final class crawlWorker extends Thread {
protected final class Loader extends Thread {
protected Request request;
private final Integer code;
private final long start;
public crawlWorker(final Request entry) {
public Loader(final Request entry) {
this.start = System.currentTimeMillis();
this.request = entry;
this.request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED);
@ -600,7 +632,7 @@ public class CrawlQueues {
// Client.initConnectionManager();
this.request.setStatus("worker-exception", WorkflowJob.STATUS_FINISHED);
} finally {
crawlWorker w = workers.remove(code);
Loader w = workers.remove(code);
assert w != null;
}
}

@ -39,6 +39,7 @@ import java.util.concurrent.BlockingQueue;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
@ -47,7 +48,10 @@ import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.repository.Blacklist;
import net.yacy.repository.FilterEngine;
import de.anomic.crawler.retrieval.FTPLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.SMBLoader;
import de.anomic.search.Segment;
import de.anomic.search.Switchboard;
import de.anomic.yacy.yacySeedDB;
@ -177,7 +181,7 @@ public final class CrawlStacker {
}
}
public void queueEntries(byte[] initiator, String profileHandle, Map<MultiProtocolURI, String> hyperlinks, boolean replace) {
public void enqueueEntries(byte[] initiator, String profileHandle, Map<MultiProtocolURI, String> hyperlinks, boolean replace) {
for (Map.Entry<MultiProtocolURI, String> e: hyperlinks.entrySet()) {
if (e.getKey() == null) continue;
@ -190,22 +194,28 @@ public final class CrawlStacker {
this.nextQueue.errorURL.remove(urlhash);
}
// put entry on crawl stack
enqueueEntry(new Request(
initiator,
url,
null,
e.getValue(),
new Date(),
profileHandle,
0,
0,
0
));
if (url.getProtocol().equals("ftp")) {
// put the whole ftp site on the crawl stack
enqueueEntries(initiator, profileHandle, "ftp", url.getHost(), url.getPort(), replace);
} else {
// put entry on crawl stack
enqueueEntry(new Request(
initiator,
url,
null,
e.getValue(),
new Date(),
profileHandle,
0,
0,
0,
0
));
}
}
}
public void queueEntries(final byte[] initiator, final String profileHandle, final String protocol, final String host, final int port, final boolean replace) {
public void enqueueEntries(final byte[] initiator, final String profileHandle, final String protocol, final String host, final int port, final boolean replace) {
final CrawlQueues cq = this.nextQueue;
new Thread() {
public void run() {
@ -242,7 +252,8 @@ public final class CrawlStacker {
profileHandle,
0,
0,
0
0,
entry.size
));
}
} catch (IOException e1) {
@ -295,30 +306,46 @@ public final class CrawlStacker {
return error;
}
long maxFileSize = Long.MAX_VALUE;
if (entry.size() > 0) {
String protocol = entry.url().getProtocol();
if (protocol.equals("http") || protocol.equals("https")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
if (protocol.equals("ftp")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.ftp.maxFileSize", FTPLoader.DEFAULT_MAXFILESIZE);
if (protocol.equals("smb")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.smb.maxFileSize", SMBLoader.DEFAULT_MAXFILESIZE);
}
// check availability of parser and maxfilesize
if (entry.size() > maxFileSize ||
(entry.url().getFileExtension().length() > 0 && TextParser.supports(entry.url(), null) != null)
) {
nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry);
return null;
}
if (global) {
// it may be possible that global == true and local == true, so do not check an error case against it
if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle());
//int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT);
nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_LIMIT, entry);
//assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT);
//this.log.logInfo("stacked/global: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT));
//int b = nextQueue.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
nextQueue.noticeURL.push(NoticedURL.StackType.LIMIT, entry);
//assert b < nextQueue.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
//this.log.logInfo("stacked/global: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.StackType.LIMIT));
} else if (local) {
if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle());
//int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry);
//assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
//this.log.logInfo("stacked/local: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE));
//int b = nextQueue.noticeURL.stackSize(NoticedURL.StackType.CORE);
nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
//assert b < nextQueue.noticeURL.stackSize(NoticedURL.StackType.CORE);
//this.log.logInfo("stacked/local: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.StackType.CORE));
} else if (proxy) {
if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle());
//int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry);
//assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
//this.log.logInfo("stacked/proxy: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE));
//int b = nextQueue.noticeURL.stackSize(NoticedURL.StackType.CORE);
nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
//assert b < nextQueue.noticeURL.stackSize(NoticedURL.StackType.CORE);
//this.log.logInfo("stacked/proxy: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.StackType.CORE));
} else if (remote) {
//int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE);
nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_REMOTE, entry);
nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry);
//assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE);
//this.log.logInfo("stacked/remote: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE));
}

@ -40,14 +40,9 @@ import de.anomic.crawler.retrieval.Request;
public class NoticedURL {
public static final int STACK_TYPE_NULL = 0; // do not stack
public static final int STACK_TYPE_CORE = 1; // put on local stack
public static final int STACK_TYPE_LIMIT = 2; // put on global stack
public static final int STACK_TYPE_OVERHANG = 3; // put on overhang stack; links that are known but not crawled
public static final int STACK_TYPE_REMOTE = 4; // put on remote-triggered stack
public static final int STACK_TYPE_IMAGE = 11; // put on image stack
public static final int STACK_TYPE_MOVIE = 12; // put on movie stack
public static final int STACK_TYPE_MUSIC = 13; // put on music stack
public enum StackType {
NULL, CORE, LIMIT, OVERHANG, REMOTE, NOLOAD, IMAGE, MOVIE, MUSIC;
}
public static final long minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
public static final long minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
@ -55,6 +50,7 @@ public class NoticedURL {
private Balancer coreStack; // links found by crawling to depth-1
private Balancer limitStack; // links found by crawling at target depth
private Balancer remoteStack; // links from remote crawl orders
private Balancer noloadStack; // links that are not passed to a loader; the index will be generated from the Request entry
public NoticedURL(
final File cachePath,
@ -65,6 +61,7 @@ public class NoticedURL {
this.limitStack = new Balancer(cachePath, "urlNoticeLimitStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, useTailCache, exceed134217727);
//overhangStack = new plasmaCrawlBalancer(overhangStackFile);
this.remoteStack = new Balancer(cachePath, "urlNoticeRemoteStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, useTailCache, exceed134217727);
this.noloadStack = new Balancer(cachePath, "urlNoticeNoLoadStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, useTailCache, exceed134217727);
}
public long getMinimumLocalDelta() {
@ -79,6 +76,7 @@ public class NoticedURL {
this.coreStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
this.limitStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
this.remoteStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
this.noloadStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta);
}
public void clear() {
@ -86,6 +84,7 @@ public class NoticedURL {
coreStack.clear();
limitStack.clear();
remoteStack.clear();
noloadStack.clear();
}
public void close() {
@ -103,6 +102,10 @@ public class NoticedURL {
remoteStack.close();
remoteStack = null;
}
if (noloadStack != null) {
noloadStack.close();
noloadStack = null;
}
}
protected void finalize() {
@ -113,11 +116,11 @@ public class NoticedURL {
}
public boolean notEmpty() {
return coreStack.notEmpty() || limitStack.notEmpty() || remoteStack.notEmpty();
return coreStack.notEmpty() || limitStack.notEmpty() || remoteStack.notEmpty() || noloadStack.notEmpty();
}
public boolean notEmptyLocal() {
return coreStack.notEmpty() || limitStack.notEmpty();
return coreStack.notEmpty() || limitStack.notEmpty() || noloadStack.notEmpty();
}
public int size() {
@ -130,15 +133,17 @@ public class NoticedURL {
if (!coreStack.isEmpty()) return false;
if (!limitStack.isEmpty()) return false;
if (!remoteStack.isEmpty()) return false;
if (!noloadStack.isEmpty()) return false;
return true;
}
public int stackSize(final int stackType) {
public int stackSize(final StackType stackType) {
switch (stackType) {
case STACK_TYPE_CORE: return (coreStack == null) ? 0 : coreStack.size();
case STACK_TYPE_LIMIT: return (limitStack == null) ? 0 : limitStack.size();
case STACK_TYPE_OVERHANG: return 0;
case STACK_TYPE_REMOTE: return (remoteStack == null) ? 0 : remoteStack.size();
case NOLOAD: return (noloadStack == null) ? 0 : noloadStack.size();
case CORE: return (coreStack == null) ? 0 : coreStack.size();
case LIMIT: return (limitStack == null) ? 0 : limitStack.size();
case OVERHANG: return 0;
case REMOTE: return (remoteStack == null) ? 0 : remoteStack.size();
default: return -1;
}
}
@ -148,21 +153,25 @@ public class NoticedURL {
coreStack.has(urlhashb) ||
limitStack.has(urlhashb) ||
//overhangStack.has(urlhashb) ||
remoteStack.has(urlhashb);
remoteStack.has(urlhashb) ||
noloadStack.has(urlhashb);
}
public void push(final int stackType, final Request entry) {
public void push(final StackType stackType, final Request entry) {
try {
switch (stackType) {
case STACK_TYPE_CORE:
case CORE:
coreStack.push(entry);
break;
case STACK_TYPE_LIMIT:
case LIMIT:
limitStack.push(entry);
break;
case STACK_TYPE_REMOTE:
case REMOTE:
remoteStack.push(entry);
break;
case NOLOAD:
noloadStack.push(entry);
break;
default: break;
}
} catch (final Exception er) {
@ -172,6 +181,7 @@ public class NoticedURL {
public Request get(final byte[] urlhash) {
Request entry = null;
try {if ((entry = noloadStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
try {if ((entry = coreStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
try {if ((entry = limitStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
try {if ((entry = remoteStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
@ -188,6 +198,7 @@ public class NoticedURL {
try {
HandleSet urlHashes = Base64Order.enhancedCoder.getHandleSet(12, 1);
urlHashes.put(urlhashBytes);
try {return noloadStack.remove(urlHashes) > 0;} catch (final IOException e) {}
try {return coreStack.remove(urlHashes) > 0;} catch (final IOException e) {}
try {return limitStack.remove(urlHashes) > 0;} catch (final IOException e) {}
try {return remoteStack.remove(urlHashes) > 0;} catch (final IOException e) {}
@ -200,31 +211,34 @@ public class NoticedURL {
public int removeByProfileHandle(final String handle, final long timeout) throws RowSpaceExceededException {
int removed = 0;
try {removed += noloadStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {}
try {removed += coreStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {}
try {removed += limitStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {}
try {removed += remoteStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {}
return removed;
}
public ArrayList<Request> top(final int stackType, final int count) {
public ArrayList<Request> top(final StackType stackType, final int count) {
switch (stackType) {
case STACK_TYPE_CORE: return top(coreStack, count);
case STACK_TYPE_LIMIT: return top(limitStack, count);
case STACK_TYPE_REMOTE: return top(remoteStack, count);
case CORE: return top(coreStack, count);
case LIMIT: return top(limitStack, count);
case REMOTE: return top(remoteStack, count);
case NOLOAD: return top(noloadStack, count);
default: return null;
}
}
public Request pop(final int stackType, final boolean delay, Map<byte[], Map<String, String>> profiles) throws IOException {
public Request pop(final StackType stackType, final boolean delay, Map<byte[], Map<String, String>> profiles) throws IOException {
switch (stackType) {
case STACK_TYPE_CORE: return pop(coreStack, delay, profiles);
case STACK_TYPE_LIMIT: return pop(limitStack, delay, profiles);
case STACK_TYPE_REMOTE: return pop(remoteStack, delay, profiles);
case CORE: return pop(coreStack, delay, profiles);
case LIMIT: return pop(limitStack, delay, profiles);
case REMOTE: return pop(remoteStack, delay, profiles);
case NOLOAD: return pop(noloadStack, false, profiles);
default: return null;
}
}
public void shift(final int fromStack, final int toStack, Map<byte[], Map<String, String>> profiles) {
public void shift(final StackType fromStack, final StackType toStack, Map<byte[], Map<String, String>> profiles) {
try {
final Request entry = pop(fromStack, false, profiles);
if (entry != null) push(toStack, entry);
@ -233,12 +247,13 @@ public class NoticedURL {
}
}
public void clear(final int stackType) {
public void clear(final StackType stackType) {
Log.logInfo("NoticedURL", "CLEARING STACK " + stackType);
switch (stackType) {
case STACK_TYPE_CORE: coreStack.clear(); break;
case STACK_TYPE_LIMIT: limitStack.clear(); break;
case STACK_TYPE_REMOTE: remoteStack.clear(); break;
case CORE: coreStack.clear(); break;
case LIMIT: limitStack.clear(); break;
case REMOTE: remoteStack.clear(); break;
case NOLOAD: noloadStack.clear(); break;
default: return;
}
}
@ -273,12 +288,13 @@ public class NoticedURL {
return list;
}
public Iterator<Request> iterator(final int stackType) {
public Iterator<Request> iterator(final StackType stackType) {
// returns an iterator of plasmaCrawlBalancerEntry Objects
try {switch (stackType) {
case STACK_TYPE_CORE: return coreStack.iterator();
case STACK_TYPE_LIMIT: return limitStack.iterator();
case STACK_TYPE_REMOTE: return remoteStack.iterator();
case CORE: return coreStack.iterator();
case LIMIT: return limitStack.iterator();
case REMOTE: return remoteStack.iterator();
case NOLOAD: return noloadStack.iterator();
default: return null;
}} catch (final IOException e) {
return new HashSet<Request>().iterator();

@ -97,6 +97,7 @@ public class SitemapImporter extends Thread {
this.crawlingProfile.handle(),
0,
0,
0,
0
));
logger.logInfo("New URL '" + entry.url() + "' added for loading.");

@ -48,14 +48,16 @@ import de.anomic.search.Switchboard;
public class FTPLoader {
public static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10;
private final Switchboard sb;
private final Log log;
private final int maxFileSize;
private final long maxFileSize;
public FTPLoader(final Switchboard sb, final Log log) {
this.sb = sb;
this.log = log;
this.maxFileSize = (int) sb.getConfigLong("crawler.ftp.maxFileSize", -1l);
this.maxFileSize = sb.getConfigLong("crawler.ftp.maxFileSize", -1l);
}
/**
@ -228,7 +230,7 @@ public class FTPLoader {
responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
// if the mimetype and file extension is supported we start to download the file
final int size = ftpClient.fileSize(path);
final long size = ftpClient.fileSize(path);
String parserError = null;
if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
(size > maxFileSize && maxFileSize >= 0)) {

@ -47,16 +47,16 @@ public class Request extends WorkflowJob {
"String urlstring-256, " + // the url as string
"String refhash-" + Word.commonHashLength + ", " + // the url's referrer hash
"String urlname-80, " + // the name of the url, from anchor tag <a>name</a>
"Cardinal appdate-8 {b256}, " + // the time when the url was first time appeared
"Cardinal appdate-8 {b256}, " + // the date of the resource; either file date or first appearance
"String profile-" + Word.commonHashLength + ", " + // the name of the prefetch profile handle
"Cardinal depth-2 {b256}, " + // the prefetch depth so far, starts at 0
"Cardinal parentbr-3 {b256}, " + // number of anchors of the parent
"Cardinal forkfactor-4 {b256}, " + // sum of anchors of all ancestors
"byte[] flags-4, " + // flags
"String handle-4, " + // extra handle
"Cardinal loaddate-8 {b256}," + // NOT USED
"Cardinal lastmodified-8 {b256}," + // NOT USED
"Cardinal modifiedSince-8 {b256}", // time that was given to server as ifModifiedSince
"Cardinal handle-4 {b256}, " + // handle (NOT USED)
"Cardinal loaddate-8 {b256}, " + // NOT USED
"Cardinal lastmodified-8 {b256}, " + // NOT USED
"Cardinal size-8 {b256}", // size of resource in bytes (if known) or 0 if not known
Base64Order.enhancedCoder
);
@ -65,14 +65,13 @@ public class Request extends WorkflowJob {
private byte[] refhash; // the url's referrer hash
private DigestURI url; // the url as string
private String name; // the name of the url, from anchor tag <a>name</a>
private long appdate; // the time when the url was first time appeared. may be negative in case that the date is before epoch (1970)!
private long imsdate; // the time of a ifModifiedSince request
private long appdate; // the time when the url was first time appeared.
private String profileHandle; // the name of the fetch profile
private int depth; // the prefetch depth so far, starts at 0
private int anchors; // number of anchors of the parent
private int forkfactor; // sum of anchors of all ancestors
private Bitfield flags;
private int handle;
private Bitfield flags;
private long size; // size of resource in bytes (if known) or 0 if not known
private String statusMessage;
private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection
@ -82,7 +81,7 @@ public class Request extends WorkflowJob {
* @param referrerhash
*/
public Request(final DigestURI url, final byte[] referrerhash) {
this(null, url, referrerhash, null, null, null, 0, 0, 0);
this(null, url, referrerhash, null, null, null, 0, 0, 0, 0);
}
/**
@ -108,7 +107,8 @@ public class Request extends WorkflowJob {
final String profileHandle,
final int depth,
final int anchors,
final int forkfactor
final int forkfactor,
final long size
) {
// create new entry and store it into database
assert url != null;
@ -124,11 +124,10 @@ public class Request extends WorkflowJob {
this.anchors = anchors;
this.forkfactor = forkfactor;
this.flags = new Bitfield(rowdef.width(10));
this.handle = 0;
this.imsdate = 0;
this.statusMessage = "loaded(args)";
this.initialHash = url.hashCode();
this.status = WorkflowJob.STATUS_INITIATED;
this.size = size;
}
public Request(final Row.Entry entry) throws IOException {
@ -150,10 +149,9 @@ public class Request extends WorkflowJob {
this.anchors = (int) entry.getColLong(8);
this.forkfactor = (int) entry.getColLong(9);
this.flags = new Bitfield(entry.getColBytes(10, true));
this.handle = Integer.parseInt(entry.getColString(11, null), 16);
//this.loaddate = entry.getColLong(12);
//this.lastmodified = entry.getColLong(13);
this.imsdate = entry.getColLong(14);
this.size = entry.getColLong(14);
this.statusMessage = "loaded(kelondroRow.Entry)";
this.initialHash = url.hashCode();
return;
@ -174,17 +172,11 @@ public class Request extends WorkflowJob {
return this.statusMessage;
}
private static String normalizeHandle(final int h) {
String d = Integer.toHexString(h);
while (d.length() < rowdef.width(11)) d = "0" + d;
return d;
}
public Row.Entry toRow() {
final byte[] appdatestr = NaturalOrder.encodeLong(appdate, rowdef.width(5));
final byte[] loaddatestr = NaturalOrder.encodeLong(0 /*loaddate*/, rowdef.width(12));
final byte[] serverdatestr = NaturalOrder.encodeLong(0 /*lastmodified*/, rowdef.width(13));
final byte[] imsdatestr = NaturalOrder.encodeLong(imsdate, rowdef.width(14));
final byte[] sizestr = NaturalOrder.encodeLong(this.size, rowdef.width(14));
// store the hash in the hash cache
byte[] namebytes;
try {
@ -204,10 +196,10 @@ public class Request extends WorkflowJob {
NaturalOrder.encodeLong(this.anchors, rowdef.width(8)),
NaturalOrder.encodeLong(this.forkfactor, rowdef.width(9)),
this.flags.bytes(),
normalizeHandle(this.handle).getBytes(),
NaturalOrder.encodeLong(0, rowdef.width(11)),
loaddatestr,
serverdatestr,
imsdatestr};
sizestr};
return rowdef.newEntry(entry);
}
@ -251,9 +243,9 @@ public class Request extends WorkflowJob {
return new Date(this.lastmodified);
}
*/
public Date imsdate() {
public long size() {
// the date that the client (browser) send as ifModifiedSince in proxy mode
return new Date(this.imsdate);
return this.size;
}
public String name() {

@ -144,7 +144,7 @@ public class Response {
public static final int QUEUE_STATE_FINISHED = 5;
public Response(
Request request,
final Request request,
final RequestHeader requestHeader,
final ResponseHeader responseHeader,
final String responseStatus,
@ -160,8 +160,19 @@ public class Response {
this.content = content;
}
public Response(final Request request, final CrawlProfile profile) {
this.request = request;
// request and response headers may be zero in case that we process surrogates
this.requestHeader = new RequestHeader();
this.responseHeader = new ResponseHeader();
this.responseStatus = "200";
this.profile = profile;
this.status = QUEUE_STATE_FRESH;
this.content = request.url().toNormalform(true, true).getBytes();
}
public Response(
Request request,
final Request request,
final RequestHeader requestHeader,
final ResponseHeader responseHeader,
final String responseStatus,

@ -56,14 +56,16 @@ import net.yacy.kelondro.util.FileUtils;
public class SMBLoader {
public static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10;
private final Switchboard sb;
private final Log log;
private final int maxFileSize;
private final long maxFileSize;
public SMBLoader(final Switchboard sb, final Log log) {
this.sb = sb;
this.log = log;
maxFileSize = (int) sb.getConfigLong("crawler.smb.maxFileSize", -1l);
maxFileSize = sb.getConfigLong("crawler.smb.maxFileSize", -1l);
}

@ -399,6 +399,7 @@ public final class HTTPDProxyHandler {
sb.crawler.defaultProxyProfile.handle(),
0,
0,
0,
0);
final Response response = new Response(
request,
@ -517,7 +518,8 @@ public final class HTTPDProxyHandler {
sb.crawler.defaultProxyProfile.handle(),
0,
0,
0);
0,
sizeBeforeDelete < 0 ? 0 : sizeBeforeDelete);
// handle incoming cookies

@ -1348,7 +1348,8 @@ public final class Switchboard extends serverSwitch {
this.crawler.defaultSurrogateProfile.handle(),
0,
0,
0
0,
0
);
response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile);
indexingQueueEntry queueEntry = new indexingQueueEntry(Segments.Process.SURROGATES, response, new Document[]{document}, null);
@ -1800,7 +1801,8 @@ public final class Switchboard extends serverSwitch {
response.profile().handle(),
response.depth() + 1,
0,
0
0,
response.size() < 0 ? 0 : response.size()
));
} catch (MalformedURLException e) {
Log.logException(e);
@ -2261,6 +2263,7 @@ public final class Switchboard extends serverSwitch {
null,
0,
0,
0,
0);
crawlQueues.errorURL.push(bentry, initiator, new Date(), 0, failreason);
}
@ -2433,7 +2436,7 @@ public final class Switchboard extends serverSwitch {
peers.mySeed().put(yacySeed.UPTIME, Long.toString(uptime/60)); // the number of minutes that the peer is up in minutes/day (moving average MA30)
peers.mySeed().put(yacySeed.LCOUNT, Long.toString(indexSegments.URLCount())); // the number of links that the peer has stored (LURL's)
peers.mySeed().put(yacySeed.NCOUNT, Integer.toString(crawlQueues.noticeURL.size())); // the number of links that the peer has noticed, but not loaded (NURL's)
peers.mySeed().put(yacySeed.RCOUNT, Integer.toString(crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT))); // the number of links that the peer provides for remote crawling (ZURL's)
peers.mySeed().put(yacySeed.RCOUNT, Integer.toString(crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT))); // the number of links that the peer provides for remote crawling (ZURL's)
peers.mySeed().put(yacySeed.ICOUNT, Long.toString(indexSegments.RWICount())); // the minimum number of words that the peer has indexed (as it says)
peers.mySeed().put(yacySeed.SCOUNT, Integer.toString(peers.sizeConnected())); // the number of seeds that the peer has stored
peers.mySeed().put(yacySeed.CCOUNT, Double.toString(((int) ((peers.sizeConnected() + peers.sizeDisconnected() + peers.sizePotential()) * 60.0 / (uptime + 1.01)) * 100) / 100.0)); // the number of clients that the peer connects (as connects/hour)

@ -50,6 +50,8 @@ public interface Hit {
public void setSubject(String[] tags);
public void setSize(long size);
public String getAuthor();
public String getCopyright();
@ -73,5 +75,7 @@ public interface Hit {
public String getDocs();
public String[] getSubject();
public long getSize();
}

@ -47,7 +47,8 @@ public class RSSMessage implements Hit {
language("language"),
guid("guid"),
ttl("ttl"),
docs("docs");
docs("docs"),
size("size,length");
private Set<String> keys;
@ -172,6 +173,11 @@ public class RSSMessage implements Hit {
return Token.docs.valueFrom(this.map);
}
public long getSize() {
String size = Token.size.valueFrom(this.map);
return (size == null) ? 0 : Long.parseLong(size);
}
public String getFulltext() {
StringBuilder sb = new StringBuilder(300);
for (String s: map.values()) sb.append(s).append(" ");
@ -230,13 +236,7 @@ public class RSSMessage implements Hit {
}
public void setSize(long size) {
// TODO Auto-generated method stub
}
public void setSizename(String sizename) {
// TODO Auto-generated method stub
setValue("size", Long.toString(size));
}
public void setTitle(String title) {

@ -1048,9 +1048,9 @@ public class FTPClient {
filetype type = filetype.file;
if (tokens.group(1).startsWith("d")) type = filetype.directory;
if (tokens.group(1).startsWith("l")) type = filetype.link;
int size = -1;
long size = -1;
try {
size = Integer.parseInt(tokens.group(2));
size = Long.parseLong(tokens.group(2));
} catch (final NumberFormatException e) {
log.warn("not a number in list-entry: ", e);
return null;
@ -1078,7 +1078,8 @@ public class FTPClient {
log.warn("---- Error: not ls date-format '" + dateString, e);
date = new Date();
}
return new entryInfo(type, size, date, tokens.group(6));
String filename = tokens.group(6);
return new entryInfo(type, size, date, filename);
}
return null;
}
@ -1104,7 +1105,7 @@ public class FTPClient {
/**
* size in bytes
*/
public final int size;
public final long size;
/**
* date of file
*/
@ -1130,7 +1131,7 @@ public class FTPClient {
* @param date
* @param name
*/
public entryInfo(final filetype type, final int size, final Date date, final String name) {
public entryInfo(final filetype type, final long size, final Date date, final String name) {
this.type = type;
this.size = size;
this.date = date;
@ -1680,8 +1681,8 @@ public class FTPClient {
* @param path
* @return size in bytes or -1 if size cannot be determinied
*/
public int fileSize(final String path) {
int size = -1;
public long fileSize(final String path) {
long size = -1;
try {
// extended FTP
size = size(path);

@ -290,7 +290,7 @@ public final class TextParser {
try {
// try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok.
List<Parser> idioms = parsers(url, mimeType);
return (idioms == null || idioms.isEmpty()) ? "no parser found" : null;
return (idioms == null || idioms.isEmpty() || (idioms.size() == 1 && idioms.get(0).getName().equals(genericIdiom.getName()))) ? "no parser found" : null;
} catch (Parser.Failure e) {
// in case that a parser is not available, return a error string describing the problem.
return e.getMessage();
@ -333,9 +333,7 @@ public final class TextParser {
// check mime type computed from extension
String mimeType2 = ext2mime.get(ext);
if (mimeType2 == null || denyMime.containsKey(mimeType2)) return idioms; // in this case we are a bit more lazy
idiom = mime2parser.get(mimeType2);
if (idiom != null && !idioms.contains(idiom)) idioms.add(idiom);
if (mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.contains(idiom)) idioms.add(idiom);
// always add the generic parser
idioms.add(genericIdiom);

@ -519,6 +519,7 @@ public class URIMetadataRow implements URIMetadata {
null,
0,
0,
0,
0);
}

@ -129,6 +129,7 @@ public final class LoaderDispatcher {
sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile
0,
0,
0,
0);
}
