diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index e24d3de29..6dda2bd60 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -254,7 +254,7 @@ public class Crawler_p { sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile); sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); final DigestURI url = crawlingStartURL; - sb.crawlStacker.queueEntries(sb.peers.mySeed().hash.getBytes(), profile.handle(), "ftp", url.getHost(), url.getPort(), false); + sb.crawlStacker.enqueueEntries(sb.peers.mySeed().hash.getBytes(), profile.handle(), "ftp", url.getHost(), url.getPort(), false); } catch (final PatternSyntaxException e) { prop.put("info", "4"); // crawlfilter does not match url prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); @@ -316,6 +316,7 @@ public class Crawler_p { pe.handle(), 0, 0, + 0, 0 )); @@ -369,6 +370,7 @@ public class Crawler_p { pe.handle(), 0, 0, + 0, 0), sb.peers.mySeed().hash.getBytes(), new Date(), @@ -420,7 +422,7 @@ public class Crawler_p { cachePolicy); sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile); sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - sb.crawlStacker.queueEntries(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, true); + sb.crawlStacker.enqueueEntries(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, true); } catch (final PatternSyntaxException e) { prop.put("info", "4"); // crawlfilter does not match url prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); @@ -522,6 +524,7 @@ public class Crawler_p { profile.handle(), 0, 0, + 0, 0 )); } diff --git a/htroot/IndexCreateWWWGlobalQueue_p.java b/htroot/IndexCreateWWWGlobalQueue_p.java index a328a83ba..5d4019346 100644 --- a/htroot/IndexCreateWWWGlobalQueue_p.java +++ b/htroot/IndexCreateWWWGlobalQueue_p.java @@ -65,13 +65,13 @@ public class IndexCreateWWWGlobalQueue_p { } if (post.containsKey("clearcrawlqueue")) { - final int c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT); - sb.crawlQueues.noticeURL.clear(NoticedURL.STACK_TYPE_LIMIT); + final int c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT); + sb.crawlQueues.noticeURL.clear(NoticedURL.StackType.LIMIT); try { sb.cleanProfiles(); } catch (final InterruptedException e) { /* Ignore this */} /* int c = 0; - while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) > 0) { - urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT).hash(); + while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.StackType.LIMIT) > 0) { + urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.StackType.LIMIT).hash(); if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; } } */ @@ -85,12 +85,12 @@ public class IndexCreateWWWGlobalQueue_p { } } - int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT); + int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT); if (stackSize == 0) { prop.put("crawler-queue", "0"); } else { prop.put("crawler-queue", "1"); - final ArrayList crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_LIMIT, showLimit); + final ArrayList crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.LIMIT, showLimit); Request urle; boolean dark = true; diff --git a/htroot/IndexCreateWWWLocalQueue_p.java b/htroot/IndexCreateWWWLocalQueue_p.java index 55f2c4204..db542a83c 100644 --- a/htroot/IndexCreateWWWLocalQueue_p.java +++ 
b/htroot/IndexCreateWWWLocalQueue_p.java @@ -84,8 +84,8 @@ public class IndexCreateWWWLocalQueue_p { final String pattern = post.get("pattern", ".*").trim(); final int option = post.getInt("option", INVALID); if (pattern.equals(".*")) { - c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE); - sb.crawlQueues.noticeURL.clear(NoticedURL.STACK_TYPE_CORE); + c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE); + sb.crawlQueues.noticeURL.clear(NoticedURL.StackType.CORE); try { sb.cleanProfiles(); } catch (final InterruptedException e) {/* ignore this */} } else if (option > INVALID) { Pattern compiledPattern = null; @@ -112,7 +112,7 @@ public class IndexCreateWWWLocalQueue_p { } } else { // iterating through the list of URLs - final Iterator iter = sb.crawlQueues.noticeURL.iterator(NoticedURL.STACK_TYPE_CORE); + final Iterator iter = sb.crawlQueues.noticeURL.iterator(NoticedURL.StackType.CORE); Request entry; List removehashes = new ArrayList(); while (iter.hasNext()) { @@ -152,12 +152,12 @@ public class IndexCreateWWWLocalQueue_p { } } - int showNum = 0, stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE); + int showNum = 0, stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE); if (stackSize == 0) { prop.put("crawler-queue", "0"); } else { prop.put("crawler-queue", "1"); - final ArrayList crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_CORE, (int) (showLimit * 1.20)); + final ArrayList crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.CORE, (int) (showLimit * 1.20)); Request urle; boolean dark = true; diff --git a/htroot/IndexCreateWWWRemoteQueue_p.java b/htroot/IndexCreateWWWRemoteQueue_p.java index 6fb7980c7..16eccfab4 100644 --- a/htroot/IndexCreateWWWRemoteQueue_p.java +++ b/htroot/IndexCreateWWWRemoteQueue_p.java @@ -62,13 +62,13 @@ public class IndexCreateWWWRemoteQueue_p { } if (post.containsKey("clearcrawlqueue")) { - final int c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE); - sb.crawlQueues.noticeURL.clear(NoticedURL.STACK_TYPE_REMOTE); + final int c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.REMOTE); + sb.crawlQueues.noticeURL.clear(NoticedURL.StackType.REMOTE); try { sb.cleanProfiles(); } catch (final InterruptedException e) { /* Ignore this */} /* int c = 0; - while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) > 0) { - urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT).hash(); + while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.StackType.LIMIT) > 0) { + urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.StackType.LIMIT).hash(); if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; } } */ @@ -82,12 +82,12 @@ public class IndexCreateWWWRemoteQueue_p { } } - int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE); + int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.REMOTE); if (stackSize == 0) { prop.put("crawler-queue", "0"); } else { prop.put("crawler-queue", "1"); - final ArrayList crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_REMOTE, showLimit); + final ArrayList crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.REMOTE, showLimit); Request urle; boolean dark = true; diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 2abb32999..5226f0c92 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -182,6 +182,7 @@ public 
class QuickCrawlLink_p { pe.handle(), 0, 0, + 0, 0 )); diff --git a/htroot/api/queues_p.java b/htroot/api/queues_p.java index 1ea531a9e..f81e9229c 100755 --- a/htroot/api/queues_p.java +++ b/htroot/api/queues_p.java @@ -70,23 +70,23 @@ public class queues_p { //local crawl queue prop.putNum("localCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL).getJobCount()); prop.put("localCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL) ? STATE_PAUSED : STATE_RUNNING); - int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE); - addNTable(sb, prop, "list-local", sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_CORE, Math.min(10, stackSize))); + int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE); + addNTable(sb, prop, "list-local", sb.crawlQueues.noticeURL.top(NoticedURL.StackType.CORE, Math.min(10, stackSize))); //global crawl queue prop.putNum("limitCrawlSize", sb.crawlQueues.limitCrawlJobSize()); prop.put("limitCrawlState", STATE_RUNNING); - stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT); + stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT); //global crawl queue prop.putNum("remoteCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount()); prop.put("remoteCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) ? STATE_PAUSED : STATE_RUNNING); - stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT); + stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT); if (stackSize == 0) { prop.put("list-remote", "0"); } else { - addNTable(sb, prop, "list-remote", sb.crawlQueues.noticeURL.top(NoticedURL.STACK_TYPE_LIMIT, Math.min(10, stackSize))); + addNTable(sb, prop, "list-remote", sb.crawlQueues.noticeURL.top(NoticedURL.StackType.LIMIT, Math.min(10, stackSize))); } // return rewrite properties diff --git a/htroot/rct_p.java b/htroot/rct_p.java index 6e12ed07d..23d4ac04a 100644 --- a/htroot/rct_p.java +++ b/htroot/rct_p.java @@ -81,7 +81,8 @@ public class rct_p { sb.crawler.defaultRemoteProfile.handle(), 0, 0, - 0 + 0, + item.getSize() )); } else { env.getLog().logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason); diff --git a/htroot/yacy/urls.java b/htroot/yacy/urls.java index 5e9e39dd5..962b4fe8c 100644 --- a/htroot/yacy/urls.java +++ b/htroot/yacy/urls.java @@ -59,7 +59,7 @@ public class urls { if (post.get("call", "").equals("remotecrawl")) { // perform a remote crawl url handover - final int stackType = NoticedURL.STACK_TYPE_LIMIT; + final NoticedURL.StackType stackType = NoticedURL.StackType.LIMIT; int maxCount = Math.min(100, post.getInt("count", 10)); long maxTime = Math.min(20000, Math.max(1000, post.getInt("time", 10000))); long timeout = System.currentTimeMillis() + maxTime; diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index d0221ebe7..fe40aab89 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -44,11 +44,14 @@ import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.workflow.WorkflowJob; +import de.anomic.crawler.NoticedURL.StackType; import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Response; +import de.anomic.search.Segments; import de.anomic.search.Switchboard; import 
de.anomic.search.SwitchboardConstants; +import de.anomic.search.Switchboard.indexingQueueEntry; import de.anomic.yacy.yacyClient; import de.anomic.yacy.yacySeed; import de.anomic.yacy.dht.PeerSelection; @@ -60,7 +63,7 @@ public class CrawlQueues { protected Switchboard sb; protected Log log; - protected Map workers; // mapping from url hash to Worker thread object + protected Map workers; // mapping from url hash to Worker thread object private final ArrayList remoteCrawlProviderHashes; public NoticedURL noticeURL; @@ -69,7 +72,7 @@ public class CrawlQueues { public CrawlQueues(final Switchboard sb, final File queuePath) { this.sb = sb; this.log = new Log("CRAWLER"); - this.workers = new ConcurrentHashMap(); + this.workers = new ConcurrentHashMap(); this.remoteCrawlProviderHashes = new ArrayList(); // start crawling management @@ -83,7 +86,7 @@ public class CrawlQueues { public void relocate(final File newQueuePath) { this.close(); - this.workers = new ConcurrentHashMap(); + this.workers = new ConcurrentHashMap(); this.remoteCrawlProviderHashes.clear(); noticeURL = new NoticedURL(newQueuePath, sb.useTailCache, sb.exceed134217727); @@ -94,10 +97,10 @@ public class CrawlQueues { public void close() { // wait for all workers to finish - for (final crawlWorker w: workers.values()) { + for (final Loader w: workers.values()) { w.interrupt(); } - for (final crawlWorker w: workers.values()) { + for (final Loader w: workers.values()) { try { w.join(); } catch (InterruptedException e) { @@ -111,7 +114,7 @@ public class CrawlQueues { public void clear() { // wait for all workers to finish - for (final crawlWorker w: workers.values()) { + for (final Loader w: workers.values()) { w.interrupt(); } // TODO: wait some more time until all threads are finished @@ -139,7 +142,7 @@ public class CrawlQueues { if (delegatedURL.exists(hash)) return "delegated"; if (errorURL.exists(hash)) return "errors"; if (noticeURL.existsInStack(hash)) return "crawler"; - for (final crawlWorker worker: workers.values()) { + for (final Loader worker: workers.values()) { if (Base64Order.enhancedCoder.equal(worker.request.url().hash(), hash)) return "worker"; } return null; @@ -158,7 +161,7 @@ public class CrawlQueues { if (ee != null) return ee.url(); ee = errorURL.get(urlhash); if (ee != null) return ee.url(); - for (final crawlWorker w: workers.values()) { + for (final Loader w: workers.values()) { if (Base64Order.enhancedCoder.equal(w.request.url().hash(), urlhash)) return w.request.url(); } final Request ne = noticeURL.get(urlhash); @@ -169,7 +172,7 @@ public class CrawlQueues { public void cleanup() { // wait for all workers to finish int timeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000); - for (final crawlWorker w: workers.values()) { + for (final Loader w: workers.values()) { if (w.age() > timeout) w.interrupt(); } } @@ -178,7 +181,7 @@ public class CrawlQueues { synchronized (workers) { final Request[] e = new Request[workers.size()]; int i = 0; - for (final crawlWorker w: workers.values()) { + for (final Loader w: workers.values()) { if (i >= e.length) break; e[i++] = w.request; } @@ -187,7 +190,7 @@ public class CrawlQueues { } public int coreCrawlJobSize() { - return noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE); + return noticeURL.stackSize(NoticedURL.StackType.CORE); } public boolean coreCrawlJob() { @@ -200,14 +203,14 @@ public class CrawlQueues { // move some tasks to the core crawl job so we have something to do final int toshift = Math.min(10, limitCrawlJobSize()); // this cannot be a big 
number because the balancer makes a forced waiting if it cannot balance for (int i = 0; i < toshift; i++) { - noticeURL.shift(NoticedURL.STACK_TYPE_LIMIT, NoticedURL.STACK_TYPE_CORE, sb.crawler.profilesActiveCrawls); + noticeURL.shift(NoticedURL.StackType.LIMIT, NoticedURL.StackType.CORE, sb.crawler.profilesActiveCrawls); } log.logInfo("shifted " + toshift + " jobs from global crawl to local crawl (coreCrawlJobSize()=" + coreCrawlJobSize() + ", limitCrawlJobSize()=" + limitCrawlJobSize() + ", cluster.mode=" + sb.getConfig(SwitchboardConstants.CLUSTER_MODE, "") + ", robinsonMode=" + ((sb.isRobinsonMode()) ? "on" : "off")); } - String queueCheck = crawlIsPossible(NoticedURL.STACK_TYPE_CORE); + String queueCheck = loadIsPossible(NoticedURL.StackType.CORE); if (queueCheck != null) { if (log.isFine()) log.logFine("omitting de-queue/local: " + queueCheck); return false; @@ -219,11 +222,39 @@ public class CrawlQueues { } // do a local crawl - Request urlEntry = null; - while (urlEntry == null && noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) > 0) { - final String stats = "LOCALCRAWL[" + noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_OVERHANG) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE) + "]"; + Request urlEntry; + while (noticeURL.stackSize(NoticedURL.StackType.CORE) > 0 || noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) { + final String stats = "LOCALCRAWL[" + + noticeURL.stackSize(NoticedURL.StackType.NOLOAD) + ", " + + noticeURL.stackSize(NoticedURL.StackType.CORE) + ", " + + noticeURL.stackSize(NoticedURL.StackType.LIMIT) + ", " + + noticeURL.stackSize(NoticedURL.StackType.OVERHANG) + + ", " + noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]"; try { - urlEntry = noticeURL.pop(NoticedURL.STACK_TYPE_CORE, true, sb.crawler.profilesActiveCrawls); + if (noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) { + // get one entry that will not be loaded, just indexed + urlEntry = noticeURL.pop(NoticedURL.StackType.NOLOAD, true, sb.crawler.profilesActiveCrawls); + if (urlEntry == null) continue; + final String profileHandle = urlEntry.profileHandle(); + if (profileHandle == null) { + log.logSevere(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url()); + return true; + } + Map map = sb.crawler.profilesActiveCrawls.get(profileHandle); + if (map == null) { + log.logSevere(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url()); + return true; + } + try { + sb.indexingDocumentProcessor.enQueue(new indexingQueueEntry(Segments.Process.LOCALCRAWLING, new Response(urlEntry, new CrawlProfile(map)), null, null)); + Log.logInfo("CrawlQueues", "placed NOLOAD URL on indexing queue: " + urlEntry.url().toNormalform(true, false)); + } catch (InterruptedException e) { + Log.logException(e); + } + return true; + } + + urlEntry = noticeURL.pop(NoticedURL.StackType.CORE, true, sb.crawler.profilesActiveCrawls); if (urlEntry == null) continue; final String profileHandle = urlEntry.profileHandle(); // System.out.println("DEBUG plasmaSwitchboard.processCrawling: @@ -232,11 +263,11 @@ public class CrawlQueues { log.logSevere(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url()); return true; } - generateCrawl(urlEntry, stats, profileHandle); + load(urlEntry, stats, profileHandle); return true; } catch (final IOException e) { log.logSevere(stats + ": CANNOT FETCH ENTRY: " + 
e.getMessage(), e); - if (e.getMessage().indexOf("hash is null") > 0) noticeURL.clear(NoticedURL.STACK_TYPE_CORE); + if (e.getMessage().indexOf("hash is null") > 0) noticeURL.clear(NoticedURL.StackType.CORE); } } return true; @@ -250,7 +281,7 @@ public class CrawlQueues { * @param stats String for log prefixing * @return */ - private void generateCrawl(Request urlEntry, final String stats, final String profileHandle) { + private void load(Request urlEntry, final String stats, final String profileHandle) { final Map mp = sb.crawler.profilesActiveCrawls.get(profileHandle.getBytes()); if (mp != null) { @@ -270,10 +301,10 @@ public class CrawlQueues { + ", permission=" + ((sb.peers == null) ? "undefined" : (((sb.peers.mySeed().isSenior()) || (sb.peers.mySeed().isPrincipal())) ? "true" : "false"))); // work off one Crawl stack entry - if ((urlEntry == null) || (urlEntry.url() == null)) { + if (urlEntry == null || urlEntry.url() == null) { log.logInfo(stats + ": urlEntry = null"); } else { - new crawlWorker(urlEntry); + new Loader(urlEntry); } } else { @@ -309,7 +340,7 @@ public class CrawlQueues { * @param stackType * @return */ - private String crawlIsPossible(int stackType) { + private String loadIsPossible(StackType stackType) { //System.out.println("stacksize = " + noticeURL.stackSize(stackType)); if (noticeURL.stackSize(stackType) == 0) { //log.logDebug("GlobalCrawl: queue is empty"); @@ -443,7 +474,8 @@ public class CrawlQueues { sb.crawler.defaultRemoteProfile.handle(), 0, 0, - 0 + 0, + item.getSize() )); } else { log.logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason); @@ -461,11 +493,11 @@ public class CrawlQueues { } public int limitCrawlJobSize() { - return noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT); + return noticeURL.stackSize(NoticedURL.StackType.LIMIT); } public int remoteTriggeredCrawlJobSize() { - return noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE); + return noticeURL.stackSize(NoticedURL.StackType.REMOTE); } public boolean remoteTriggeredCrawlJob() { @@ -473,7 +505,7 @@ public class CrawlQueues { // do nothing if either there are private processes to be done // or there is no global crawl on the stack - String queueCheck = crawlIsPossible(NoticedURL.STACK_TYPE_REMOTE); + String queueCheck = loadIsPossible(NoticedURL.StackType.REMOTE); if (queueCheck != null) { if (log.isFinest()) log.logFinest("omitting de-queue/remote: " + queueCheck); return false; @@ -485,19 +517,19 @@ public class CrawlQueues { } // we don't want to crawl a global URL globally, since WE are the global part. 
(from this point of view) - final String stats = "REMOTETRIGGEREDCRAWL[" + noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(NoticedURL.STACK_TYPE_OVERHANG) + ", " - + noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE) + "]"; + final String stats = "REMOTETRIGGEREDCRAWL[" + noticeURL.stackSize(NoticedURL.StackType.CORE) + ", " + noticeURL.stackSize(NoticedURL.StackType.LIMIT) + ", " + noticeURL.stackSize(NoticedURL.StackType.OVERHANG) + ", " + + noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]"; try { - final Request urlEntry = noticeURL.pop(NoticedURL.STACK_TYPE_REMOTE, true, sb.crawler.profilesActiveCrawls); + final Request urlEntry = noticeURL.pop(NoticedURL.StackType.REMOTE, true, sb.crawler.profilesActiveCrawls); final String profileHandle = urlEntry.profileHandle(); // System.out.println("DEBUG plasmaSwitchboard.processCrawling: // profileHandle = " + profileHandle + ", urlEntry.url = " + // urlEntry.url()); - generateCrawl(urlEntry, stats, profileHandle); + load(urlEntry, stats, profileHandle); return true; } catch (final IOException e) { log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e); - if (e.getMessage().indexOf("hash is null") > 0) noticeURL.clear(NoticedURL.STACK_TYPE_REMOTE); + if (e.getMessage().indexOf("hash is null") > 0) noticeURL.clear(NoticedURL.StackType.REMOTE); return true; } } @@ -507,13 +539,13 @@ public class CrawlQueues { return workers.size(); } - protected final class crawlWorker extends Thread { + protected final class Loader extends Thread { protected Request request; private final Integer code; private final long start; - public crawlWorker(final Request entry) { + public Loader(final Request entry) { this.start = System.currentTimeMillis(); this.request = entry; this.request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED); @@ -600,7 +632,7 @@ public class CrawlQueues { // Client.initConnectionManager(); this.request.setStatus("worker-exception", WorkflowJob.STATUS_FINISHED); } finally { - crawlWorker w = workers.remove(code); + Loader w = workers.remove(code); assert w != null; } } diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index 50821a7f0..514855552 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -39,6 +39,7 @@ import java.util.concurrent.BlockingQueue; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.ftp.FTPClient; +import net.yacy.document.TextParser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; @@ -47,7 +48,10 @@ import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.repository.Blacklist; import net.yacy.repository.FilterEngine; +import de.anomic.crawler.retrieval.FTPLoader; +import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.Request; +import de.anomic.crawler.retrieval.SMBLoader; import de.anomic.search.Segment; import de.anomic.search.Switchboard; import de.anomic.yacy.yacySeedDB; @@ -177,7 +181,7 @@ public final class CrawlStacker { } } - public void queueEntries(byte[] initiator, String profileHandle, Map hyperlinks, boolean replace) { + public void enqueueEntries(byte[] initiator, String profileHandle, Map hyperlinks, boolean replace) { for (Map.Entry e: hyperlinks.entrySet()) { if (e.getKey() == null) continue; 
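
Note on the hunks above: the CrawlQueues changes switch from the old int STACK_TYPE_* constants to the NoticedURL.StackType enum (defined further down in this diff) and add a NOLOAD branch to coreCrawlJob(), where entries popped from the NOLOAD stack are wrapped in a synthetic Response and placed directly on the indexing queue instead of being fetched by a Loader. The following standalone sketch only illustrates that dispatch pattern; every class and field name in it is a hypothetical stand-in, not the actual YaCy type.

// Sketch of the enum-keyed stack dispatch with a no-load fast path.
// Hypothetical, simplified types; not the YaCy classes themselves.
import java.util.ArrayDeque;
import java.util.EnumMap;
import java.util.Map;
import java.util.Queue;

public class NoLoadQueueSketch {

    enum StackType { CORE, LIMIT, REMOTE, NOLOAD }

    static class Request {
        final String url;
        final long size; // size in bytes if known, 0 if unknown
        Request(String url, long size) { this.url = url; this.size = size; }
    }

    // one queue per stack type; EnumMap gives cheap, type-safe dispatch
    private final Map<StackType, Queue<Request>> stacks = new EnumMap<>(StackType.class);

    public NoLoadQueueSketch() {
        for (StackType t : StackType.values()) stacks.put(t, new ArrayDeque<>());
    }

    public void push(StackType type, Request entry) { stacks.get(type).add(entry); }

    /** pops one entry; NOLOAD entries are indexed from metadata instead of being fetched */
    public void processOne() {
        Request noload = stacks.get(StackType.NOLOAD).poll();
        if (noload != null) {
            System.out.println("index without loading: " + noload.url);
            return;
        }
        Request core = stacks.get(StackType.CORE).poll();
        if (core != null) {
            System.out.println("hand over to loader: " + core.url);
        }
    }

    public static void main(String[] args) {
        NoLoadQueueSketch q = new NoLoadQueueSketch();
        q.push(StackType.NOLOAD, new Request("ftp://example.net/huge.iso", 700L * 1024 * 1024));
        q.push(StackType.CORE, new Request("http://example.net/page.html", 0));
        q.processOne(); // NOLOAD entry is handled first
        q.processOne(); // then the CORE entry goes to a loader
    }
}

Using an enum instead of int constants lets the compiler reject invalid stack identifiers and keeps the switch statements in NoticedURL checkable for missing cases.
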
@@ -190,22 +194,28 @@ public final class CrawlStacker { this.nextQueue.errorURL.remove(urlhash); } - // put entry on crawl stack - enqueueEntry(new Request( - initiator, - url, - null, - e.getValue(), - new Date(), - profileHandle, - 0, - 0, - 0 - )); + if (url.getProtocol().equals("ftp")) { + // put the whole ftp site on the crawl stack + enqueueEntries(initiator, profileHandle, "ftp", url.getHost(), url.getPort(), replace); + } else { + // put entry on crawl stack + enqueueEntry(new Request( + initiator, + url, + null, + e.getValue(), + new Date(), + profileHandle, + 0, + 0, + 0, + 0 + )); + } } } - public void queueEntries(final byte[] initiator, final String profileHandle, final String protocol, final String host, final int port, final boolean replace) { + public void enqueueEntries(final byte[] initiator, final String profileHandle, final String protocol, final String host, final int port, final boolean replace) { final CrawlQueues cq = this.nextQueue; new Thread() { public void run() { @@ -242,7 +252,8 @@ public final class CrawlStacker { profileHandle, 0, 0, - 0 + 0, + entry.size )); } } catch (IOException e1) { @@ -295,30 +306,46 @@ public final class CrawlStacker { return error; } + long maxFileSize = Long.MAX_VALUE; + if (entry.size() > 0) { + String protocol = entry.url().getProtocol(); + if (protocol.equals("http") || protocol.equals("https")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); + if (protocol.equals("ftp")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.ftp.maxFileSize", FTPLoader.DEFAULT_MAXFILESIZE); + if (protocol.equals("smb")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.smb.maxFileSize", SMBLoader.DEFAULT_MAXFILESIZE); + + } + // check availability of parser and maxfilesize + if (entry.size() > maxFileSize || + (entry.url().getFileExtension().length() > 0 && TextParser.supports(entry.url(), null) != null) + ) { + nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry); + return null; + } + if (global) { // it may be possible that global == true and local == true, so do not check an error case against it if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle()); if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle()); - //int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT); - nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_LIMIT, entry); - //assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT); - //this.log.logInfo("stacked/global: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT)); + //int b = nextQueue.noticeURL.stackSize(NoticedURL.StackType.LIMIT); + nextQueue.noticeURL.push(NoticedURL.StackType.LIMIT, entry); + //assert b < nextQueue.noticeURL.stackSize(NoticedURL.StackType.LIMIT); + //this.log.logInfo("stacked/global: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.StackType.LIMIT)); } else if (local) { if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle()); if 
(remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle()); - //int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE); - nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry); - //assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE); - //this.log.logInfo("stacked/local: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE)); + //int b = nextQueue.noticeURL.stackSize(NoticedURL.StackType.CORE); + nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry); + //assert b < nextQueue.noticeURL.stackSize(NoticedURL.StackType.CORE); + //this.log.logInfo("stacked/local: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.StackType.CORE)); } else if (proxy) { if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle()); - //int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE); - nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry); - //assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE); - //this.log.logInfo("stacked/proxy: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE)); + //int b = nextQueue.noticeURL.stackSize(NoticedURL.StackType.CORE); + nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry); + //assert b < nextQueue.noticeURL.stackSize(NoticedURL.StackType.CORE); + //this.log.logInfo("stacked/proxy: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.StackType.CORE)); } else if (remote) { //int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE); - nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_REMOTE, entry); + nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry); //assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE); //this.log.logInfo("stacked/remote: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE)); } diff --git a/source/de/anomic/crawler/NoticedURL.java b/source/de/anomic/crawler/NoticedURL.java index 652552d28..69e728654 100755 --- a/source/de/anomic/crawler/NoticedURL.java +++ b/source/de/anomic/crawler/NoticedURL.java @@ -40,14 +40,9 @@ import de.anomic.crawler.retrieval.Request; public class NoticedURL { - public static final int STACK_TYPE_NULL = 0; // do not stack - public static final int STACK_TYPE_CORE = 1; // put on local stack - public static final int STACK_TYPE_LIMIT = 2; // put on global stack - public static final int STACK_TYPE_OVERHANG = 3; // put on overhang stack; links that are known but not crawled - public static final int STACK_TYPE_REMOTE = 4; // put on remote-triggered stack - public static final int STACK_TYPE_IMAGE = 11; // put on image stack - public static final int STACK_TYPE_MOVIE = 12; // put on movie stack - public static final int STACK_TYPE_MUSIC = 13; // put on music stack + public enum StackType { + NULL, CORE, LIMIT, OVERHANG, REMOTE, NOLOAD, IMAGE, MOVIE, MUSIC; + } public static final long minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain public static final long minimumGlobalDeltaInit = 500; // the minimum time difference 
between access of the same global domain @@ -55,6 +50,7 @@ public class NoticedURL { private Balancer coreStack; // links found by crawling to depth-1 private Balancer limitStack; // links found by crawling at target depth private Balancer remoteStack; // links from remote crawl orders + private Balancer noloadStack; // links that are not passed to a loader; the index will be generated from the Request entry public NoticedURL( final File cachePath, @@ -65,6 +61,7 @@ public class NoticedURL { this.limitStack = new Balancer(cachePath, "urlNoticeLimitStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, useTailCache, exceed134217727); //overhangStack = new plasmaCrawlBalancer(overhangStackFile); this.remoteStack = new Balancer(cachePath, "urlNoticeRemoteStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, useTailCache, exceed134217727); + this.noloadStack = new Balancer(cachePath, "urlNoticeNoLoadStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, useTailCache, exceed134217727); } public long getMinimumLocalDelta() { @@ -79,6 +76,7 @@ public class NoticedURL { this.coreStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta); this.limitStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta); this.remoteStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta); + this.noloadStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta); } public void clear() { @@ -86,6 +84,7 @@ public class NoticedURL { coreStack.clear(); limitStack.clear(); remoteStack.clear(); + noloadStack.clear(); } public void close() { @@ -103,6 +102,10 @@ public class NoticedURL { remoteStack.close(); remoteStack = null; } + if (noloadStack != null) { + noloadStack.close(); + noloadStack = null; + } } protected void finalize() { @@ -113,11 +116,11 @@ public class NoticedURL { } public boolean notEmpty() { - return coreStack.notEmpty() || limitStack.notEmpty() || remoteStack.notEmpty(); + return coreStack.notEmpty() || limitStack.notEmpty() || remoteStack.notEmpty() || noloadStack.notEmpty(); } public boolean notEmptyLocal() { - return coreStack.notEmpty() || limitStack.notEmpty(); + return coreStack.notEmpty() || limitStack.notEmpty() || noloadStack.notEmpty(); } public int size() { @@ -130,15 +133,17 @@ public class NoticedURL { if (!coreStack.isEmpty()) return false; if (!limitStack.isEmpty()) return false; if (!remoteStack.isEmpty()) return false; + if (!noloadStack.isEmpty()) return false; return true; } - public int stackSize(final int stackType) { + public int stackSize(final StackType stackType) { switch (stackType) { - case STACK_TYPE_CORE: return (coreStack == null) ? 0 : coreStack.size(); - case STACK_TYPE_LIMIT: return (limitStack == null) ? 0 : limitStack.size(); - case STACK_TYPE_OVERHANG: return 0; - case STACK_TYPE_REMOTE: return (remoteStack == null) ? 0 : remoteStack.size(); + case NOLOAD: return (noloadStack == null) ? 0 : noloadStack.size(); + case CORE: return (coreStack == null) ? 0 : coreStack.size(); + case LIMIT: return (limitStack == null) ? 0 : limitStack.size(); + case OVERHANG: return 0; + case REMOTE: return (remoteStack == null) ? 
0 : remoteStack.size(); default: return -1; } } @@ -148,21 +153,25 @@ public class NoticedURL { coreStack.has(urlhashb) || limitStack.has(urlhashb) || //overhangStack.has(urlhashb) || - remoteStack.has(urlhashb); + remoteStack.has(urlhashb) || + noloadStack.has(urlhashb); } - public void push(final int stackType, final Request entry) { + public void push(final StackType stackType, final Request entry) { try { switch (stackType) { - case STACK_TYPE_CORE: + case CORE: coreStack.push(entry); break; - case STACK_TYPE_LIMIT: + case LIMIT: limitStack.push(entry); break; - case STACK_TYPE_REMOTE: + case REMOTE: remoteStack.push(entry); break; + case NOLOAD: + noloadStack.push(entry); + break; default: break; } } catch (final Exception er) { @@ -172,6 +181,7 @@ public class NoticedURL { public Request get(final byte[] urlhash) { Request entry = null; + try {if ((entry = noloadStack.get(urlhash)) != null) return entry;} catch (final IOException e) {} try {if ((entry = coreStack.get(urlhash)) != null) return entry;} catch (final IOException e) {} try {if ((entry = limitStack.get(urlhash)) != null) return entry;} catch (final IOException e) {} try {if ((entry = remoteStack.get(urlhash)) != null) return entry;} catch (final IOException e) {} @@ -188,6 +198,7 @@ public class NoticedURL { try { HandleSet urlHashes = Base64Order.enhancedCoder.getHandleSet(12, 1); urlHashes.put(urlhashBytes); + try {return noloadStack.remove(urlHashes) > 0;} catch (final IOException e) {} try {return coreStack.remove(urlHashes) > 0;} catch (final IOException e) {} try {return limitStack.remove(urlHashes) > 0;} catch (final IOException e) {} try {return remoteStack.remove(urlHashes) > 0;} catch (final IOException e) {} @@ -200,31 +211,34 @@ public class NoticedURL { public int removeByProfileHandle(final String handle, final long timeout) throws RowSpaceExceededException { int removed = 0; + try {removed += noloadStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {} try {removed += coreStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {} try {removed += limitStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {} try {removed += remoteStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {} return removed; } - public ArrayList top(final int stackType, final int count) { + public ArrayList top(final StackType stackType, final int count) { switch (stackType) { - case STACK_TYPE_CORE: return top(coreStack, count); - case STACK_TYPE_LIMIT: return top(limitStack, count); - case STACK_TYPE_REMOTE: return top(remoteStack, count); + case CORE: return top(coreStack, count); + case LIMIT: return top(limitStack, count); + case REMOTE: return top(remoteStack, count); + case NOLOAD: return top(noloadStack, count); default: return null; } } - public Request pop(final int stackType, final boolean delay, Map> profiles) throws IOException { + public Request pop(final StackType stackType, final boolean delay, Map> profiles) throws IOException { switch (stackType) { - case STACK_TYPE_CORE: return pop(coreStack, delay, profiles); - case STACK_TYPE_LIMIT: return pop(limitStack, delay, profiles); - case STACK_TYPE_REMOTE: return pop(remoteStack, delay, profiles); + case CORE: return pop(coreStack, delay, profiles); + case LIMIT: return pop(limitStack, delay, profiles); + case REMOTE: return pop(remoteStack, delay, profiles); + case NOLOAD: return pop(noloadStack, false, profiles); default: return null; } } - public void shift(final int 
fromStack, final int toStack, Map> profiles) { + public void shift(final StackType fromStack, final StackType toStack, Map> profiles) { try { final Request entry = pop(fromStack, false, profiles); if (entry != null) push(toStack, entry); @@ -233,12 +247,13 @@ public class NoticedURL { } } - public void clear(final int stackType) { + public void clear(final StackType stackType) { Log.logInfo("NoticedURL", "CLEARING STACK " + stackType); switch (stackType) { - case STACK_TYPE_CORE: coreStack.clear(); break; - case STACK_TYPE_LIMIT: limitStack.clear(); break; - case STACK_TYPE_REMOTE: remoteStack.clear(); break; + case CORE: coreStack.clear(); break; + case LIMIT: limitStack.clear(); break; + case REMOTE: remoteStack.clear(); break; + case NOLOAD: noloadStack.clear(); break; default: return; } } @@ -273,12 +288,13 @@ public class NoticedURL { return list; } - public Iterator iterator(final int stackType) { + public Iterator iterator(final StackType stackType) { // returns an iterator of plasmaCrawlBalancerEntry Objects try {switch (stackType) { - case STACK_TYPE_CORE: return coreStack.iterator(); - case STACK_TYPE_LIMIT: return limitStack.iterator(); - case STACK_TYPE_REMOTE: return remoteStack.iterator(); + case CORE: return coreStack.iterator(); + case LIMIT: return limitStack.iterator(); + case REMOTE: return remoteStack.iterator(); + case NOLOAD: return noloadStack.iterator(); default: return null; }} catch (final IOException e) { return new HashSet().iterator(); diff --git a/source/de/anomic/crawler/SitemapImporter.java b/source/de/anomic/crawler/SitemapImporter.java index 97a380656..b239e7ca5 100644 --- a/source/de/anomic/crawler/SitemapImporter.java +++ b/source/de/anomic/crawler/SitemapImporter.java @@ -97,6 +97,7 @@ public class SitemapImporter extends Thread { this.crawlingProfile.handle(), 0, 0, + 0, 0 )); logger.logInfo("New URL '" + entry.url() + "' added for loading."); diff --git a/source/de/anomic/crawler/retrieval/FTPLoader.java b/source/de/anomic/crawler/retrieval/FTPLoader.java index d19373b8a..9a598e054 100644 --- a/source/de/anomic/crawler/retrieval/FTPLoader.java +++ b/source/de/anomic/crawler/retrieval/FTPLoader.java @@ -48,14 +48,16 @@ import de.anomic.search.Switchboard; public class FTPLoader { + public static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10; + private final Switchboard sb; private final Log log; - private final int maxFileSize; + private final long maxFileSize; public FTPLoader(final Switchboard sb, final Log log) { this.sb = sb; this.log = log; - this.maxFileSize = (int) sb.getConfigLong("crawler.ftp.maxFileSize", -1l); + this.maxFileSize = sb.getConfigLong("crawler.ftp.maxFileSize", -1l); } /** @@ -228,7 +230,7 @@ public class FTPLoader { responseHeader.put(HeaderFramework.CONTENT_TYPE, mime); // if the mimetype and file extension is supported we start to download the file - final int size = ftpClient.fileSize(path); + final long size = ftpClient.fileSize(path); String parserError = null; if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) || (size > maxFileSize && maxFileSize >= 0)) { diff --git a/source/de/anomic/crawler/retrieval/Request.java b/source/de/anomic/crawler/retrieval/Request.java index 229b3b84f..cfb26d77a 100755 --- a/source/de/anomic/crawler/retrieval/Request.java +++ b/source/de/anomic/crawler/retrieval/Request.java @@ -47,16 +47,16 @@ public class Request extends WorkflowJob { "String urlstring-256, " + // the url as string "String refhash-" + Word.commonHashLength + ", " + // the url's referrer 
hash "String urlname-80, " + // the name of the url, from anchor tag name - "Cardinal appdate-8 {b256}, " + // the time when the url was first time appeared + "Cardinal appdate-8 {b256}, " + // the date of the resource; either file date or first appearance "String profile-" + Word.commonHashLength + ", " + // the name of the prefetch profile handle "Cardinal depth-2 {b256}, " + // the prefetch depth so far, starts at 0 "Cardinal parentbr-3 {b256}, " + // number of anchors of the parent "Cardinal forkfactor-4 {b256}, " + // sum of anchors of all ancestors "byte[] flags-4, " + // flags - "String handle-4, " + // extra handle - "Cardinal loaddate-8 {b256}," + // NOT USED - "Cardinal lastmodified-8 {b256}," + // NOT USED - "Cardinal modifiedSince-8 {b256}", // time that was given to server as ifModifiedSince + "Cardinal handle-4 {b256}, " + // handle (NOT USED) + "Cardinal loaddate-8 {b256}, " + // NOT USED + "Cardinal lastmodified-8 {b256}, " + // NOT USED + "Cardinal size-8 {b256}", // size of resource in bytes (if known) or 0 if not known Base64Order.enhancedCoder ); @@ -65,14 +65,13 @@ public class Request extends WorkflowJob { private byte[] refhash; // the url's referrer hash private DigestURI url; // the url as string private String name; // the name of the url, from anchor tag name - private long appdate; // the time when the url was first time appeared. may be negative in case that the date is before epoch (1970)! - private long imsdate; // the time of a ifModifiedSince request + private long appdate; // the time when the url was first time appeared. private String profileHandle; // the name of the fetch profile private int depth; // the prefetch depth so far, starts at 0 private int anchors; // number of anchors of the parent private int forkfactor; // sum of anchors of all ancestors - private Bitfield flags; - private int handle; + private Bitfield flags; + private long size; // size of resource in bytes (if known) or 0 if not known private String statusMessage; private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection @@ -82,7 +81,7 @@ public class Request extends WorkflowJob { * @param referrerhash */ public Request(final DigestURI url, final byte[] referrerhash) { - this(null, url, referrerhash, null, null, null, 0, 0, 0); + this(null, url, referrerhash, null, null, null, 0, 0, 0, 0); } /** @@ -108,7 +107,8 @@ public class Request extends WorkflowJob { final String profileHandle, final int depth, final int anchors, - final int forkfactor + final int forkfactor, + final long size ) { // create new entry and store it into database assert url != null; @@ -124,11 +124,10 @@ public class Request extends WorkflowJob { this.anchors = anchors; this.forkfactor = forkfactor; this.flags = new Bitfield(rowdef.width(10)); - this.handle = 0; - this.imsdate = 0; this.statusMessage = "loaded(args)"; this.initialHash = url.hashCode(); this.status = WorkflowJob.STATUS_INITIATED; + this.size = size; } public Request(final Row.Entry entry) throws IOException { @@ -150,10 +149,9 @@ public class Request extends WorkflowJob { this.anchors = (int) entry.getColLong(8); this.forkfactor = (int) entry.getColLong(9); this.flags = new Bitfield(entry.getColBytes(10, true)); - this.handle = Integer.parseInt(entry.getColString(11, null), 16); //this.loaddate = entry.getColLong(12); //this.lastmodified = entry.getColLong(13); - this.imsdate = entry.getColLong(14); + this.size = entry.getColLong(14); this.statusMessage = "loaded(kelondroRow.Entry)"; 
this.initialHash = url.hashCode(); return; @@ -174,17 +172,11 @@ public class Request extends WorkflowJob { return this.statusMessage; } - private static String normalizeHandle(final int h) { - String d = Integer.toHexString(h); - while (d.length() < rowdef.width(11)) d = "0" + d; - return d; - } - public Row.Entry toRow() { final byte[] appdatestr = NaturalOrder.encodeLong(appdate, rowdef.width(5)); final byte[] loaddatestr = NaturalOrder.encodeLong(0 /*loaddate*/, rowdef.width(12)); final byte[] serverdatestr = NaturalOrder.encodeLong(0 /*lastmodified*/, rowdef.width(13)); - final byte[] imsdatestr = NaturalOrder.encodeLong(imsdate, rowdef.width(14)); + final byte[] sizestr = NaturalOrder.encodeLong(this.size, rowdef.width(14)); // store the hash in the hash cache byte[] namebytes; try { @@ -204,10 +196,10 @@ public class Request extends WorkflowJob { NaturalOrder.encodeLong(this.anchors, rowdef.width(8)), NaturalOrder.encodeLong(this.forkfactor, rowdef.width(9)), this.flags.bytes(), - normalizeHandle(this.handle).getBytes(), + NaturalOrder.encodeLong(0, rowdef.width(11)), loaddatestr, serverdatestr, - imsdatestr}; + sizestr}; return rowdef.newEntry(entry); } @@ -251,9 +243,9 @@ public class Request extends WorkflowJob { return new Date(this.lastmodified); } */ - public Date imsdate() { + public long size() { // the date that the client (browser) send as ifModifiedSince in proxy mode - return new Date(this.imsdate); + return this.size; } public String name() { diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java index 62065ccb4..306c304d3 100755 --- a/source/de/anomic/crawler/retrieval/Response.java +++ b/source/de/anomic/crawler/retrieval/Response.java @@ -144,7 +144,7 @@ public class Response { public static final int QUEUE_STATE_FINISHED = 5; public Response( - Request request, + final Request request, final RequestHeader requestHeader, final ResponseHeader responseHeader, final String responseStatus, @@ -160,8 +160,19 @@ public class Response { this.content = content; } + public Response(final Request request, final CrawlProfile profile) { + this.request = request; + // request and response headers may be zero in case that we process surrogates + this.requestHeader = new RequestHeader(); + this.responseHeader = new ResponseHeader(); + this.responseStatus = "200"; + this.profile = profile; + this.status = QUEUE_STATE_FRESH; + this.content = request.url().toNormalform(true, true).getBytes(); + } + public Response( - Request request, + final Request request, final RequestHeader requestHeader, final ResponseHeader responseHeader, final String responseStatus, diff --git a/source/de/anomic/crawler/retrieval/SMBLoader.java b/source/de/anomic/crawler/retrieval/SMBLoader.java index 67cf1b9b1..d58f3f81a 100644 --- a/source/de/anomic/crawler/retrieval/SMBLoader.java +++ b/source/de/anomic/crawler/retrieval/SMBLoader.java @@ -56,14 +56,16 @@ import net.yacy.kelondro.util.FileUtils; public class SMBLoader { + public static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10; + private final Switchboard sb; private final Log log; - private final int maxFileSize; + private final long maxFileSize; public SMBLoader(final Switchboard sb, final Log log) { this.sb = sb; this.log = log; - maxFileSize = (int) sb.getConfigLong("crawler.smb.maxFileSize", -1l); + maxFileSize = sb.getConfigLong("crawler.smb.maxFileSize", -1l); } diff --git a/source/de/anomic/http/server/HTTPDProxyHandler.java b/source/de/anomic/http/server/HTTPDProxyHandler.java index 
5057388ec..58aead8aa 100644 --- a/source/de/anomic/http/server/HTTPDProxyHandler.java +++ b/source/de/anomic/http/server/HTTPDProxyHandler.java @@ -399,6 +399,7 @@ public final class HTTPDProxyHandler { sb.crawler.defaultProxyProfile.handle(), 0, 0, + 0, 0); final Response response = new Response( request, @@ -517,7 +518,8 @@ public final class HTTPDProxyHandler { sb.crawler.defaultProxyProfile.handle(), 0, 0, - 0); + 0, + sizeBeforeDelete < 0 ? 0 : sizeBeforeDelete); // handle incoming cookies diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index cfcfc1bb6..411424fd7 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -1348,7 +1348,8 @@ public final class Switchboard extends serverSwitch { this.crawler.defaultSurrogateProfile.handle(), 0, 0, - 0 + 0, + 0 ); response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile); indexingQueueEntry queueEntry = new indexingQueueEntry(Segments.Process.SURROGATES, response, new Document[]{document}, null); @@ -1800,7 +1801,8 @@ public final class Switchboard extends serverSwitch { response.profile().handle(), response.depth() + 1, 0, - 0 + 0, + response.size() < 0 ? 0 : response.size() )); } catch (MalformedURLException e) { Log.logException(e); @@ -2261,6 +2263,7 @@ public final class Switchboard extends serverSwitch { null, 0, 0, + 0, 0); crawlQueues.errorURL.push(bentry, initiator, new Date(), 0, failreason); } @@ -2433,7 +2436,7 @@ public final class Switchboard extends serverSwitch { peers.mySeed().put(yacySeed.UPTIME, Long.toString(uptime/60)); // the number of minutes that the peer is up in minutes/day (moving average MA30) peers.mySeed().put(yacySeed.LCOUNT, Long.toString(indexSegments.URLCount())); // the number of links that the peer has stored (LURL's) peers.mySeed().put(yacySeed.NCOUNT, Integer.toString(crawlQueues.noticeURL.size())); // the number of links that the peer has noticed, but not loaded (NURL's) - peers.mySeed().put(yacySeed.RCOUNT, Integer.toString(crawlQueues.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT))); // the number of links that the peer provides for remote crawling (ZURL's) + peers.mySeed().put(yacySeed.RCOUNT, Integer.toString(crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT))); // the number of links that the peer provides for remote crawling (ZURL's) peers.mySeed().put(yacySeed.ICOUNT, Long.toString(indexSegments.RWICount())); // the minimum number of words that the peer has indexed (as it says) peers.mySeed().put(yacySeed.SCOUNT, Integer.toString(peers.sizeConnected())); // the number of seeds that the peer has stored peers.mySeed().put(yacySeed.CCOUNT, Double.toString(((int) ((peers.sizeConnected() + peers.sizeDisconnected() + peers.sizePotential()) * 60.0 / (uptime + 1.01)) * 100) / 100.0)); // the number of clients that the peer connects (as connects/hour) diff --git a/source/net/yacy/cora/document/Hit.java b/source/net/yacy/cora/document/Hit.java index 6245fd2f4..b93d97ece 100644 --- a/source/net/yacy/cora/document/Hit.java +++ b/source/net/yacy/cora/document/Hit.java @@ -50,6 +50,8 @@ public interface Hit { public void setSubject(String[] tags); + public void setSize(long size); + public String getAuthor(); public String getCopyright(); @@ -73,5 +75,7 @@ public interface Hit { public String getDocs(); public String[] getSubject(); + + public long getSize(); } diff --git a/source/net/yacy/cora/document/RSSMessage.java b/source/net/yacy/cora/document/RSSMessage.java index 
45395e49f..d13eaf08e 100644 --- a/source/net/yacy/cora/document/RSSMessage.java +++ b/source/net/yacy/cora/document/RSSMessage.java @@ -47,7 +47,8 @@ public class RSSMessage implements Hit { language("language"), guid("guid"), ttl("ttl"), - docs("docs"); + docs("docs"), + size("size,length"); private Set keys; @@ -172,6 +173,11 @@ public class RSSMessage implements Hit { return Token.docs.valueFrom(this.map); } + public long getSize() { + String size = Token.size.valueFrom(this.map); + return (size == null) ? 0 : Long.parseLong(size); + } + public String getFulltext() { StringBuilder sb = new StringBuilder(300); for (String s: map.values()) sb.append(s).append(" "); @@ -230,13 +236,7 @@ public class RSSMessage implements Hit { } public void setSize(long size) { - // TODO Auto-generated method stub - - } - - public void setSizename(String sizename) { - // TODO Auto-generated method stub - + setValue("size", Long.toString(size)); } public void setTitle(String title) { diff --git a/source/net/yacy/cora/protocol/ftp/FTPClient.java b/source/net/yacy/cora/protocol/ftp/FTPClient.java index 9bb8e2623..5ce8464e9 100644 --- a/source/net/yacy/cora/protocol/ftp/FTPClient.java +++ b/source/net/yacy/cora/protocol/ftp/FTPClient.java @@ -1048,9 +1048,9 @@ public class FTPClient { filetype type = filetype.file; if (tokens.group(1).startsWith("d")) type = filetype.directory; if (tokens.group(1).startsWith("l")) type = filetype.link; - int size = -1; + long size = -1; try { - size = Integer.parseInt(tokens.group(2)); + size = Long.parseLong(tokens.group(2)); } catch (final NumberFormatException e) { log.warn("not a number in list-entry: ", e); return null; @@ -1078,7 +1078,8 @@ public class FTPClient { log.warn("---- Error: not ls date-format '" + dateString, e); date = new Date(); } - return new entryInfo(type, size, date, tokens.group(6)); + String filename = tokens.group(6); + return new entryInfo(type, size, date, filename); } return null; } @@ -1104,7 +1105,7 @@ public class FTPClient { /** * size in bytes */ - public final int size; + public final long size; /** * date of file */ @@ -1130,7 +1131,7 @@ public class FTPClient { * @param date * @param name */ - public entryInfo(final filetype type, final int size, final Date date, final String name) { + public entryInfo(final filetype type, final long size, final Date date, final String name) { this.type = type; this.size = size; this.date = date; @@ -1680,8 +1681,8 @@ public class FTPClient { * @param path * @return size in bytes or -1 if size cannot be determinied */ - public int fileSize(final String path) { - int size = -1; + public long fileSize(final String path) { + long size = -1; try { // extended FTP size = size(path); diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index 0a988cb0b..2f20a4db1 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -290,7 +290,7 @@ public final class TextParser { try { // try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok. List idioms = parsers(url, mimeType); - return (idioms == null || idioms.isEmpty()) ? "no parser found" : null; + return (idioms == null || idioms.isEmpty() || (idioms.size() == 1 && idioms.get(0).getName().equals(genericIdiom.getName()))) ? "no parser found" : null; } catch (Parser.Failure e) { // in case that a parser is not available, return a error string describing the problem. 
return e.getMessage(); @@ -333,9 +333,7 @@ public final class TextParser { // check mime type computed from extension String mimeType2 = ext2mime.get(ext); - if (mimeType2 == null || denyMime.containsKey(mimeType2)) return idioms; // in this case we are a bit more lazy - idiom = mime2parser.get(mimeType2); - if (idiom != null && !idioms.contains(idiom)) idioms.add(idiom); + if (mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.contains(idiom)) idioms.add(idiom); // always add the generic parser idioms.add(genericIdiom); diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java index 5951ad9c4..9d864e05a 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java @@ -519,6 +519,7 @@ public class URIMetadataRow implements URIMetadata { null, 0, 0, + 0, 0); } diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index fe2183880..6ad4ca8c3 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -129,6 +129,7 @@ public final class LoaderDispatcher { sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile 0, 0, + 0, 0); }
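
Summary of the Request/size changes above: Request rows now store the resource size (reusing column 14, formerly the unused ifModifiedSince field), FTP listing and proxy code paths fill it in, and CrawlStacker.stackCrawl() uses it together with parser availability to decide whether a URL is loaded at all or pushed to the NOLOAD stack. The sketch below only illustrates that gate; the 10 MB defaults mirror the DEFAULT_MAXFILESIZE added to FTPLoader/SMBLoader, the HTTP value and all helper names are assumptions for the example, not the real YaCy config keys.

// Sketch of the size/parser gate that routes a URL to NOLOAD.
// Hypothetical names and limits; illustrative only.
public class SizeGateSketch {

    static final long HTTP_MAX = 10L * 1024 * 1024; // assumed default
    static final long FTP_MAX  = 10L * 1024 * 1024; // mirrors FTPLoader.DEFAULT_MAXFILESIZE
    static final long SMB_MAX  = 10L * 1024 * 1024; // mirrors SMBLoader.DEFAULT_MAXFILESIZE

    static long maxFileSizeFor(String protocol) {
        switch (protocol) {
            case "http":
            case "https": return HTTP_MAX;
            case "ftp":   return FTP_MAX;
            case "smb":   return SMB_MAX;
            default:      return Long.MAX_VALUE;
        }
    }

    /** true = push to NOLOAD (index metadata only), false = push to a loader stack */
    static boolean noLoad(String protocol, long announcedSize, boolean parserAvailable) {
        if (announcedSize > 0 && announcedSize > maxFileSizeFor(protocol)) return true;
        return !parserAvailable;
    }

    public static void main(String[] args) {
        System.out.println(noLoad("ftp", 700L * 1024 * 1024, true)); // true: too large to fetch
        System.out.println(noLoad("http", 0, true));                 // false: size unknown, parser available
        System.out.println(noLoad("http", 4096, false));             // true: no parser for this type
    }
}

A size of 0 means "unknown", so the gate only rejects URLs whose size is actually announced, for example by FTP directory listings, proxy responses, or RSS items carrying the new size/length token.
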