From 65eaf30f77c53afe9998c5df9d4bcc3df848b472 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 31 Aug 2010 15:47:47 +0000 Subject: [PATCH] redesign of crawl profiles data structure. target will be: - permanent storage of auto-dom statistics in profile - storage of profiles in WorkTable data structure not finished yet. No functional change yet. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7088 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CrawlProfileEditor_p.java | 96 ++- htroot/Crawler_p.java | 13 +- htroot/IndexCreateWWWGlobalQueue_p.java | 6 +- htroot/IndexCreateWWWLocalQueue_p.java | 17 +- htroot/IndexCreateWWWRemoteQueue_p.java | 6 +- htroot/ProxyIndexingMonitor_p.java | 74 +- htroot/QuickCrawlLink_p.java | 8 +- htroot/WatchWebStructure_p.java | 11 +- source/de/anomic/crawler/Balancer.java | 5 +- source/de/anomic/crawler/CrawlProfile.java | 685 +++++++----------- source/de/anomic/crawler/CrawlQueues.java | 10 +- source/de/anomic/crawler/CrawlStacker.java | 6 +- .../de/anomic/crawler/CrawlSwitchboard.java | 119 +-- source/de/anomic/crawler/NoticedURL.java | 17 +- source/de/anomic/crawler/SitemapImporter.java | 2 +- .../anomic/crawler/retrieval/FTPLoader.java | 11 +- .../anomic/crawler/retrieval/FileLoader.java | 11 +- .../anomic/crawler/retrieval/HTTPLoader.java | 5 +- .../de/anomic/crawler/retrieval/Response.java | 8 +- .../anomic/crawler/retrieval/SMBLoader.java | 11 +- source/de/anomic/data/SitemapParser.java | 10 +- source/de/anomic/data/WorkTables.java | 5 +- source/de/anomic/search/Switchboard.java | 69 +- source/de/anomic/yacy/yacyRelease.java | 3 - source/net/yacy/kelondro/blob/HeapReader.java | 1 + source/net/yacy/kelondro/blob/MapHeap.java | 5 +- .../net/yacy/repository/LoaderDispatcher.java | 4 +- 27 files changed, 532 insertions(+), 686 deletions(-) diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java index a0c3d6d49..c48ab6d23 100644 --- a/htroot/CrawlProfileEditor_p.java +++ b/htroot/CrawlProfileEditor_p.java @@ -28,15 +28,15 @@ import java.text.DateFormat; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; +import java.util.Map; import java.util.Set; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; -import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.CrawlSwitchboard; -import de.anomic.crawler.CrawlProfile.entry; +import de.anomic.crawler.CrawlProfile; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -80,23 +80,23 @@ public class CrawlProfileEditor_p { private static final ArrayList labels = new ArrayList(); static { - labels.add(new eentry(entry.NAME, "Name", true, eentry.STRING)); - labels.add(new eentry(entry.START_URL, "Start URL", true, eentry.STRING)); - labels.add(new eentry(entry.FILTER_MUSTMATCH, "Must-Match Filter", false, eentry.STRING)); - labels.add(new eentry(entry.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING)); - labels.add(new eentry(entry.DEPTH, "Crawl Depth", false, eentry.INTEGER)); - labels.add(new eentry(entry.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER)); - labels.add(new eentry(entry.DOM_FILTER_DEPTH, "Domain Filter Depth", false, eentry.INTEGER)); - labels.add(new eentry(entry.DOM_MAX_PAGES, "Domain Max. 
Pages", false, eentry.INTEGER)); - labels.add(new eentry(entry.CRAWLING_Q, "CrawlingQ / '?'-URLs", false, eentry.BOOLEAN)); - labels.add(new eentry(entry.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN)); - labels.add(new eentry(entry.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN)); - labels.add(new eentry(entry.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN)); - labels.add(new eentry(entry.STORE_TXCACHE, "Store in TXCache", false, eentry.BOOLEAN)); - labels.add(new eentry(entry.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN)); - labels.add(new eentry(entry.XSSTOPW, "Static stop-words", false, eentry.BOOLEAN)); - labels.add(new eentry(entry.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN)); - labels.add(new eentry(entry.XPSTOPW, "Parent stop-words", false, eentry.BOOLEAN)); + labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING)); + labels.add(new eentry(CrawlProfile.START_URL, "Start URL", true, eentry.STRING)); + labels.add(new eentry(CrawlProfile.FILTER_MUSTMATCH, "Must-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER)); + labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER)); + labels.add(new eentry(CrawlProfile.DOM_FILTER_DEPTH, "Domain Filter Depth", false, eentry.INTEGER)); + labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER)); + labels.add(new eentry(CrawlProfile.CRAWLING_Q, "CrawlingQ / '?'-URLs", false, eentry.BOOLEAN)); + labels.add(new eentry(CrawlProfile.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN)); + labels.add(new eentry(CrawlProfile.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN)); + labels.add(new eentry(CrawlProfile.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN)); + labels.add(new eentry(CrawlProfile.STORE_TXCACHE, "Store in TXCache", false, eentry.BOOLEAN)); + labels.add(new eentry(CrawlProfile.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN)); + labels.add(new eentry(CrawlProfile.XSSTOPW, "Static stop-words", false, eentry.BOOLEAN)); + labels.add(new eentry(CrawlProfile.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN)); + labels.add(new eentry(CrawlProfile.XPSTOPW, "Parent stop-words", false, eentry.BOOLEAN)); } public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { @@ -106,40 +106,32 @@ public class CrawlProfileEditor_p { // read post for handle final String handle = (post == null) ? 
"" : post.get("handle", ""); if (post != null) { - if (post.containsKey("terminate")) { + if (post.containsKey("terminate")) try { // termination of a crawl: shift the crawl from active to passive - final CrawlProfile.entry entry = sb.crawler.profilesActiveCrawls.getEntry(handle); - if (entry != null) { - sb.crawler.profilesPassiveCrawls.newEntry(entry.map()); - } - sb.crawler.profilesActiveCrawls.removeEntry(handle.getBytes()); + final Map mp = sb.crawler.profilesActiveCrawls.get(handle.getBytes()); + if (mp != null) sb.crawler.profilesPassiveCrawls.put(handle.getBytes(), new CrawlProfile(mp)); // delete all entries from the crawl queue that are deleted here - try { - sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000); - } catch (RowSpaceExceededException e) { - Log.logException(e); - } + sb.crawler.profilesActiveCrawls.remove(handle.getBytes()); + sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000); + } catch (RowSpaceExceededException e) { + Log.logException(e); } if (post.containsKey("delete")) { // deletion of a terminated crawl profile - sb.crawler.profilesPassiveCrawls.removeEntry(handle.getBytes()); + sb.crawler.profilesPassiveCrawls.remove(handle.getBytes()); } if (post.containsKey("deleteTerminatedProfiles")) { - Iterator profiles = sb.crawler.profilesPassiveCrawls.profiles(false); - while (profiles.hasNext()) { - profiles.next(); - profiles.remove(); - profiles = sb.crawler.profilesPassiveCrawls.profiles(false); + for (byte[] h: sb.crawler.profilesPassiveCrawls.keySet()) { + sb.crawler.profilesPassiveCrawls.remove(h); } } } // generate handle list int count = 0; - Iterator it = sb.crawler.profilesActiveCrawls.profiles(true); - entry selentry; - while (it.hasNext()) { - selentry = it.next(); + CrawlProfile selentry; + for (byte[] h: sb.crawler.profilesActiveCrawls.keySet()) { + selentry = new CrawlProfile(sb.crawler.profilesActiveCrawls.get(h)); if (ignoreNames.contains(selentry.name())) { continue; } @@ -151,7 +143,8 @@ public class CrawlProfileEditor_p { count++; } prop.put("profiles", count); - selentry = sb.crawler.profilesActiveCrawls.getEntry(handle); + final Map mp = sb.crawler.profilesActiveCrawls.get(handle.getBytes()); + selentry = mp == null ? null : new CrawlProfile(mp); assert selentry == null || selentry.handle() != null; // read post for change submit if ((post != null) && (selentry != null)) { @@ -161,10 +154,11 @@ public class CrawlProfileEditor_p { eentry tee; while (lit.hasNext()) { tee = lit.next(); - final String cval = selentry.map().get(tee.name); + final String cval = selentry.get(tee.name); final String val = (tee.type == eentry.BOOLEAN) ? Boolean.toString(post.containsKey(tee.name)) : post.get(tee.name, cval); if (!cval.equals(val)) { - sb.crawler.profilesActiveCrawls.changeEntry(selentry, tee.name, val); + selentry.put(tee.name, val); + sb.crawler.profilesActiveCrawls.put(selentry.handle().getBytes(), selentry); } } } catch (final Exception ex) { @@ -179,20 +173,18 @@ public class CrawlProfileEditor_p { count = 0; boolean dark = true; final int domlistlength = (post == null) ? 
160 : post.getInt("domlistlength", 160); - CrawlProfile.entry profile; + CrawlProfile profile; // put active crawls into list - it = sb.crawler.profilesActiveCrawls.profiles(true); - while (it.hasNext()) { - profile = it.next(); + for (byte[] h: sb.crawler.profilesActiveCrawls.keySet()) { + profile = new CrawlProfile(sb.crawler.profilesActiveCrawls.get(h)); putProfileEntry(prop, profile, true, dark, count, domlistlength); dark = !dark; count++; } // put passive crawls into list boolean existPassiveCrawls = false; - it = sb.crawler.profilesPassiveCrawls.profiles(true); - while (it.hasNext()) { - profile = it.next(); + for (byte[] h: sb.crawler.profilesPassiveCrawls.keySet()) { + profile = new CrawlProfile(sb.crawler.profilesPassiveCrawls.get(h)); putProfileEntry(prop, profile, false, dark, count, domlistlength); dark = !dark; count++; @@ -217,7 +209,7 @@ public class CrawlProfileEditor_p { count = 0; while (lit.hasNext()) { final eentry ee = lit.next(); - final String val = selentry.map().get(ee.name); + final String val = selentry.get(ee.name); prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly", ee.readonly ? "1" : "0"); prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_name", ee.name); prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_label", ee.label); @@ -235,7 +227,7 @@ public class CrawlProfileEditor_p { return prop; } - private static void putProfileEntry(final servletProperties prop, final CrawlProfile.entry profile, final boolean active, final boolean dark, final int count, final int domlistlength) { + private static void putProfileEntry(final servletProperties prop, final CrawlProfile profile, final boolean active, final boolean dark, final int count, final int domlistlength) { prop.put(CRAWL_PROFILE_PREFIX + count + "_dark", dark ? "1" : "0"); prop.put(CRAWL_PROFILE_PREFIX + count + "_name", profile.name()); diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 1d1aae168..a6cc74311 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -254,8 +254,8 @@ public class Crawler_p { sb.crawlQueues.errorURL.remove(urlhash); // stack url - sb.crawler.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it - final CrawlProfile.entry pe = sb.crawler.profilesActiveCrawls.newEntry( + sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it + final CrawlProfile pe = new CrawlProfile( (crawlingStartURL.getHost() == null) ? 
Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(), crawlingStartURL, newcrawlingMustMatch, @@ -265,6 +265,7 @@ public class Crawler_p { crawlingQ, indexText, indexMedia, storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy); + sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe); final String reasonString = sb.crawlStacker.stackCrawl(new Request( sb.peers.mySeed().hash.getBytes(), url, @@ -297,7 +298,7 @@ public class Crawler_p { // generate a YaCyNews if the global flag was set if (crawlOrder) { - final Map m = new HashMap(pe.map()); // must be cloned + final Map m = new HashMap(pe); // must be cloned m.remove("specificDepth"); m.remove("indexText"); m.remove("indexMedia"); @@ -371,7 +372,7 @@ public class Crawler_p { // creating a crawler profile final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null); - final CrawlProfile.entry profile = sb.crawler.profilesActiveCrawls.newEntry( + final CrawlProfile profile = new CrawlProfile( fileName, crawlURL, newcrawlingMustMatch, CrawlProfile.MATCH_NEVER, @@ -387,6 +388,7 @@ public class Crawler_p { crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy); + sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile); // pause local crawl here sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); @@ -435,7 +437,7 @@ public class Crawler_p { final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null); // create a new profile - final CrawlProfile.entry pe = sb.crawler.profilesActiveCrawls.newEntry( + final CrawlProfile pe = new CrawlProfile( sitemapURLStr, sitemapURL, newcrawlingMustMatch, CrawlProfile.MATCH_NEVER, @@ -446,6 +448,7 @@ public class Crawler_p { storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy); + sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe); // create a new sitemap importer final SitemapImporter importerThread = new SitemapImporter(sb, sb.dbImportManager, new DigestURI(sitemapURLStr, null), pe); diff --git a/htroot/IndexCreateWWWGlobalQueue_p.java b/htroot/IndexCreateWWWGlobalQueue_p.java index 27a998d10..a328a83ba 100644 --- a/htroot/IndexCreateWWWGlobalQueue_p.java +++ b/htroot/IndexCreateWWWGlobalQueue_p.java @@ -31,6 +31,7 @@ import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.Locale; +import java.util.Map; import net.yacy.cora.protocol.RequestHeader; @@ -95,14 +96,15 @@ public class IndexCreateWWWGlobalQueue_p { boolean dark = true; yacySeed initiator; String profileHandle; - CrawlProfile.entry profileEntry; + CrawlProfile profileEntry; int i, showNum = 0; for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) { urle = crawlerList.get(i); if (urle != null && urle.url() != null) { initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : new String(urle.initiator())); profileHandle = urle.profileHandle(); - profileEntry = (profileHandle == null) ? null : sb.crawler.profilesActiveCrawls.getEntry(profileHandle); + final Map mp = profileHandle == null ? null : sb.crawler.profilesActiveCrawls.get(profileHandle.getBytes()); + profileEntry = mp == null ? null : new CrawlProfile(mp); prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0"); prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) ); prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? 
"unknown" : profileEntry.name())); diff --git a/htroot/IndexCreateWWWLocalQueue_p.java b/htroot/IndexCreateWWWLocalQueue_p.java index 6ad6218c7..0de9c6918 100644 --- a/htroot/IndexCreateWWWLocalQueue_p.java +++ b/htroot/IndexCreateWWWLocalQueue_p.java @@ -32,6 +32,7 @@ import java.util.ArrayList; import java.util.Date; import java.util.Iterator; import java.util.Locale; +import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; @@ -95,10 +96,9 @@ public class IndexCreateWWWLocalQueue_p { if (option == PROFILE) { // search and delete the crawl profile (_much_ faster, independant of queue size) // XXX: what to do about the annoying LOST PROFILE messages in the log? - final Iterator it = sb.crawler.profilesActiveCrawls.profiles(true); - CrawlProfile.entry entry; - while (it.hasNext()) { - entry = it.next(); + CrawlProfile entry; + for (byte[] handle: sb.crawler.profilesActiveCrawls.keySet()) { + entry = new CrawlProfile(sb.crawler.profilesActiveCrawls.get(handle)); final String name = entry.name(); if (name.equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) || name.equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) || @@ -108,9 +108,7 @@ public class IndexCreateWWWLocalQueue_p { name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) || name.equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE)) continue; - if (compiledPattern.matcher(name).find()) { - sb.crawler.profilesActiveCrawls.removeEntry(entry.handle().getBytes()); - } + if (compiledPattern.matcher(name).find()) sb.crawler.profilesActiveCrawls.remove(entry.handle().getBytes()); } } else { // iterating through the list of URLs @@ -165,14 +163,15 @@ public class IndexCreateWWWLocalQueue_p { boolean dark = true; yacySeed initiator; String profileHandle; - CrawlProfile.entry profileEntry; + CrawlProfile profileEntry; int i; for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) { urle = crawlerList.get(i); if ((urle != null)&&(urle.url()!=null)) { initiator = sb.peers.getConnected(urle.initiator() == null ? "" : new String(urle.initiator())); profileHandle = urle.profileHandle(); - profileEntry = (profileHandle == null) ? null : sb.crawler.profilesActiveCrawls.getEntry(profileHandle); + final Map mp = profileHandle == null ? null : sb.crawler.profilesActiveCrawls.get(profileHandle.getBytes()); + profileEntry = mp == null ? null : new CrawlProfile(mp); prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0"); prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) ); prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name())); diff --git a/htroot/IndexCreateWWWRemoteQueue_p.java b/htroot/IndexCreateWWWRemoteQueue_p.java index dfeae983b..6fb7980c7 100644 --- a/htroot/IndexCreateWWWRemoteQueue_p.java +++ b/htroot/IndexCreateWWWRemoteQueue_p.java @@ -28,6 +28,7 @@ import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.Locale; +import java.util.Map; import net.yacy.cora.protocol.RequestHeader; @@ -92,14 +93,15 @@ public class IndexCreateWWWRemoteQueue_p { boolean dark = true; yacySeed initiator; String profileHandle; - CrawlProfile.entry profileEntry; + CrawlProfile profileEntry; int i, showNum = 0; for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) { urle = crawlerList.get(i); if (urle != null && urle.url() != null) { initiator = sb.peers.getConnected((urle.initiator() == null) ? 
"" : new String(urle.initiator())); profileHandle = urle.profileHandle(); - profileEntry = (profileHandle == null) ? null : sb.crawler.profilesActiveCrawls.getEntry(profileHandle); + final Map mp = profileHandle == null ? null : sb.crawler.profilesActiveCrawls.get(profileHandle.getBytes()); + profileEntry = mp == null ? null : new CrawlProfile(mp); prop.put("crawler-queue_list_" + showNum + "_dark", dark ? "1" : "0"); prop.putHTML("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName())); prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name())); diff --git a/htroot/ProxyIndexingMonitor_p.java b/htroot/ProxyIndexingMonitor_p.java index b603e2874..51e5c4d6c 100644 --- a/htroot/ProxyIndexingMonitor_p.java +++ b/htroot/ProxyIndexingMonitor_p.java @@ -28,7 +28,6 @@ // if the shell's current path is HTROOT import java.io.File; -import java.io.IOException; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.logging.Log; @@ -102,46 +101,41 @@ public class ProxyIndexingMonitor_p { if (sb.crawler.defaultProxyProfile == null) { prop.put("info", "1"); //delete DATA/PLASMADB/crawlProfiles0.db } else { - try { - assert sb.crawler.defaultProxyProfile.handle() != null; - sb.crawler.profilesActiveCrawls.changeEntry(sb.crawler.defaultProxyProfile, "generalDepth", Integer.toString(newProxyPrefetchDepth)); - sb.crawler.profilesActiveCrawls.changeEntry(sb.crawler.defaultProxyProfile, "storeHTCache", (proxyStoreHTCache) ? "true": "false"); - sb.crawler.profilesActiveCrawls.changeEntry(sb.crawler.defaultProxyProfile, "remoteIndexing",proxyIndexingRemote ? "true":"false"); - sb.crawler.profilesActiveCrawls.changeEntry(sb.crawler.defaultProxyProfile, "indexText",proxyIndexingLocalText ? "true":"false"); - sb.crawler.profilesActiveCrawls.changeEntry(sb.crawler.defaultProxyProfile, "indexMedia",proxyIndexingLocalMedia ? "true":"false"); - - prop.put("info", "2");//new proxyPrefetchdepth - prop.put("info_message", newProxyPrefetchDepth); - prop.put("info_caching", proxyStoreHTCache ? "1" : "0"); - prop.put("info_indexingLocalText", proxyIndexingLocalText ? "1" : "0"); - prop.put("info_indexingLocalMedia", proxyIndexingLocalMedia ? "1" : "0"); - prop.put("info_indexingRemote", proxyIndexingRemote ? "1" : "0"); - - // proxyCache - only display on change - if (oldProxyCachePath.equals(newProxyCachePath)) { - prop.put("info_path", "0"); - prop.putHTML("info_path_return", oldProxyCachePath); - } else { - prop.put("info_path", "1"); - prop.putHTML("info_path_return", newProxyCachePath); - } - // proxyCacheSize - only display on change - if (oldProxyCacheSize.equals(newProxyCacheSize)) { - prop.put("info_size", "0"); - prop.put("info_size_return", oldProxyCacheSize); - } else { - prop.put("info_size", "1"); - prop.put("info_size_return", newProxyCacheSize); - } - // proxyCache, proxyCacheSize we need a restart - prop.put("info_restart", "0"); - prop.put("info_restart_return", "0"); - if (!oldProxyCachePath.equals(newProxyCachePath)) prop.put("info_restart", "1"); - - } catch (final IOException e) { - prop.put("info", "3"); //Error: errmsg - prop.putHTML("info_error", e.getMessage()); + assert sb.crawler.defaultProxyProfile.handle() != null; + sb.crawler.defaultProxyProfile.put("generalDepth", Integer.toString(newProxyPrefetchDepth)); + sb.crawler.defaultProxyProfile.put("storeHTCache", (proxyStoreHTCache) ? "true": "false"); + sb.crawler.defaultProxyProfile.put("remoteIndexing",proxyIndexingRemote ? 
"true":"false"); + sb.crawler.defaultProxyProfile.put("indexText",proxyIndexingLocalText ? "true":"false"); + sb.crawler.defaultProxyProfile.put("indexMedia",proxyIndexingLocalMedia ? "true":"false"); + sb.crawler.profilesActiveCrawls.put(sb.crawler.defaultProxyProfile.handle().getBytes(), sb.crawler.defaultProxyProfile); + + prop.put("info", "2");//new proxyPrefetchdepth + prop.put("info_message", newProxyPrefetchDepth); + prop.put("info_caching", proxyStoreHTCache ? "1" : "0"); + prop.put("info_indexingLocalText", proxyIndexingLocalText ? "1" : "0"); + prop.put("info_indexingLocalMedia", proxyIndexingLocalMedia ? "1" : "0"); + prop.put("info_indexingRemote", proxyIndexingRemote ? "1" : "0"); + + // proxyCache - only display on change + if (oldProxyCachePath.equals(newProxyCachePath)) { + prop.put("info_path", "0"); + prop.putHTML("info_path_return", oldProxyCachePath); + } else { + prop.put("info_path", "1"); + prop.putHTML("info_path_return", newProxyCachePath); } + // proxyCacheSize - only display on change + if (oldProxyCacheSize.equals(newProxyCacheSize)) { + prop.put("info_size", "0"); + prop.put("info_size_return", oldProxyCacheSize); + } else { + prop.put("info_size", "1"); + prop.put("info_size_return", newProxyCacheSize); + } + // proxyCache, proxyCacheSize we need a restart + prop.put("info_restart", "0"); + prop.put("info_restart_return", "0"); + if (!oldProxyCachePath.equals(newProxyCachePath)) prop.put("info_restart", "1"); } } catch (final Exception e) { diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 3d9da8f5a..c951e7d10 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -143,9 +143,9 @@ public class QuickCrawlLink_p { sb.crawlQueues.errorURL.remove(urlhash); // create crawling profile - CrawlProfile.entry pe = null; + CrawlProfile pe = null; try { - pe = sb.crawler.profilesActiveCrawls.newEntry( + pe = new CrawlProfile( crawlingStartURL.getHost(), crawlingStartURL, crawlingMustMatch, @@ -163,8 +163,8 @@ public class QuickCrawlLink_p { xsstopw, xdstopw, xpstopw, - CrawlProfile.CacheStrategy.IFFRESH - ); + CrawlProfile.CacheStrategy.IFFRESH); + sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe); } catch (final Exception e) { // mist prop.put("mode_status", "2");//Error with url diff --git a/htroot/WatchWebStructure_p.java b/htroot/WatchWebStructure_p.java index 4f79c0c3c..50dbc0e26 100644 --- a/htroot/WatchWebStructure_p.java +++ b/htroot/WatchWebStructure_p.java @@ -4,11 +4,9 @@ //$LastChangedBy$ // -import java.util.Iterator; - import net.yacy.cora.protocol.RequestHeader; -import de.anomic.crawler.CrawlProfile.entry; +import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.CrawlSwitchboard; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; @@ -50,10 +48,9 @@ public class WatchWebStructure_p { if (host.equals("auto")) { // try to find the host from the crawl profiles - final Iterator it = sb.crawler.profilesActiveCrawls.profiles(true); - entry e; - while (it.hasNext()) { - e = it.next(); + CrawlProfile e; + for (byte[] handle: sb.crawler.profilesActiveCrawls.keySet()) { + e = new CrawlProfile(sb.crawler.profilesActiveCrawls.get(handle)); if (e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) || e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) || e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) || diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index 6e8a229bc..e109339e6 100644 --- 
a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -320,7 +320,7 @@ public class Balancer { * @throws IOException * @throws RowSpaceExceededException */ - public Request pop(final boolean delay, final CrawlProfile profile) throws IOException { + public Request pop(final boolean delay, final Map> profiles) throws IOException { // returns a crawl entry from the stack and ensures minimum delta times try { @@ -384,7 +384,8 @@ public class Balancer { // at this point we must check if the crawlEntry has relevance because the crawl profile still exists // if not: return null. A calling method must handle the null value and try again - final CrawlProfile.entry profileEntry = (profile == null) ? null : profile.getEntry(crawlEntry.profileHandle()); + final Map mp = profiles == null ? null : profiles.get(crawlEntry.profileHandle()); + final CrawlProfile profileEntry = mp == null ? null : new CrawlProfile(mp); if (profileEntry == null) { Log.logWarning("Balancer", "no profile entry for handle " + crawlEntry.profileHandle()); return null; diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index cccc1b65d..8dc5e13fc 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ b/source/de/anomic/crawler/CrawlProfile.java @@ -4,7 +4,7 @@ // (C) by Michael Peter Christen; mc@yacy.net // first published on http://www.anomic.de // Frankfurt, Germany, 2004 -// last major change: 25.02.2004 +// last major change: 31.08.2010 // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by @@ -22,210 +22,300 @@ package de.anomic.crawler; -import java.io.File; -import java.io.IOException; -import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Pattern; -import net.yacy.kelondro.blob.MapHeap; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; -import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; -import net.yacy.kelondro.order.CloneableIterator; import net.yacy.kelondro.order.Digest; -import net.yacy.kelondro.order.NaturalOrder; -import net.yacy.kelondro.util.FileUtils; -import net.yacy.kelondro.util.kelondroException; -public class CrawlProfile { +public class CrawlProfile extends ConcurrentHashMap implements Map { + + private static final long serialVersionUID = 5527325718810703504L; public static final String MATCH_ALL = ".*"; public static final String MATCH_NEVER = ""; - static ConcurrentHashMap> domsCache = new ConcurrentHashMap>(); + // this is a simple record structure that hold all properties of a single crawl start + public static final String HANDLE = "handle"; + public static final String NAME = "name"; + public static final String START_URL = "startURL"; + public static final String FILTER_MUSTMATCH = "generalFilter"; + public static final String FILTER_MUSTNOTMATCH = "nevermatch"; + public static final String DEPTH = "generalDepth"; + public static final String RECRAWL_IF_OLDER = "recrawlIfOlder"; + public static final String DOM_FILTER_DEPTH = "domFilterDepth"; + public static final String DOM_MAX_PAGES = "domMaxPages"; + public static final String CRAWLING_Q = "crawlingQ"; + public static final String INDEX_TEXT = "indexText"; + public static final String INDEX_MEDIA = "indexMedia"; + public static final String 
STORE_HTCACHE = "storeHTCache"; + public static final String STORE_TXCACHE = "storeTXCache"; + public static final String REMOTE_INDEXING = "remoteIndexing"; + public static final String XSSTOPW = "xsstopw"; + public static final String XDSTOPW = "xdstopw"; + public static final String XPSTOPW = "xpstopw"; + public static final String CACHE_STRAGEGY = "cacheStrategy"; - MapHeap profileTable; - private final File profileTableFile; + private Map doms; + private Pattern mustmatch = null, mustnotmatch = null; - public CrawlProfile(final File file) throws IOException { - //System.out.println("loading crawl profile from " + file); - this.profileTableFile = file; - profileTableFile.getParentFile().mkdirs(); - profileTable = new MapHeap(profileTableFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_'); - profileIterator pi = new profileIterator(true); - entry e; - while (pi.hasNext()) { - e = pi.next(); - if (e == null) continue; - Log.logInfo("CrawlProfiles", "loaded Profile " + e.handle() + ": " + e.name()); - } + + public CrawlProfile(final String name, final DigestURI startURL, + final String mustmatch, + final String mustnotmatch, + final int depth, + final long recrawlIfOlder /*date*/, + final int domFilterDepth, final int domMaxPages, + final boolean crawlingQ, + final boolean indexText, final boolean indexMedia, + final boolean storeHTCache, final boolean storeTXCache, + final boolean remoteIndexing, + final boolean xsstopw, final boolean xdstopw, final boolean xpstopw, + final CacheStrategy cacheStrategy) { + super(40); + if (name == null || name.length() == 0) throw new NullPointerException("name must not be null"); + final String handle = (startURL == null) ? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength) : new String(startURL.hash()); + put(HANDLE, handle); + put(NAME, name); + put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false)); + put(FILTER_MUSTMATCH, (mustmatch == null) ? CrawlProfile.MATCH_ALL : mustmatch); + put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? CrawlProfile.MATCH_NEVER : mustnotmatch); + put(DEPTH, depth); + put(RECRAWL_IF_OLDER, recrawlIfOlder); + put(DOM_FILTER_DEPTH, domFilterDepth); + put(DOM_MAX_PAGES, domMaxPages); + put(CRAWLING_Q, crawlingQ); // crawling of urls with '?' + put(INDEX_TEXT, indexText); + put(INDEX_MEDIA, indexMedia); + put(STORE_HTCACHE, storeHTCache); + put(STORE_TXCACHE, storeTXCache); + put(REMOTE_INDEXING, remoteIndexing); + put(XSSTOPW, xsstopw); // exclude static stop-words + put(XDSTOPW, xdstopw); // exclude dynamic stop-word + put(XPSTOPW, xpstopw); // exclude parent stop-words + put(CACHE_STRAGEGY, cacheStrategy.toString()); + doms = new ConcurrentHashMap(); } - public void clear() { - // deletes the profile database and creates a new one - if (profileTable != null) profileTable.close(); - FileUtils.deletedelete(profileTableFile); - profileTableFile.getParentFile().mkdirs(); - try { - profileTable = new MapHeap(profileTableFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_'); - } catch (IOException e) { - Log.logException(e); - } + public CrawlProfile(Map ext) { + super(ext == null ? 
1 : ext.size()); + if (ext != null) this.putAll(ext); + doms = new ConcurrentHashMap(); } - public void close() { - if (profileTable != null) profileTable.close(); - this.profileTable = null; + public void put(String key, boolean value) { + super.put(key, Boolean.toString(value)); } - public int size() { - return profileTable.size(); + public void put(String key, int value) { + super.put(key, Integer.toString(value)); } - public Iterator profiles(final boolean up) { - // enumerates profile entries - try { - return new profileIterator(up); - } catch (final IOException e) { - Log.logException(e); - return new HashSet().iterator(); - } + public void put(String key, long value) { + super.put(key, Long.toString(value)); } - public class profileIterator implements Iterator { - // the iterator iterates all keys, which are byte[] objects - CloneableIterator handleIterator; - String lastkey; - public profileIterator(final boolean up) throws IOException { - handleIterator = profileTable.keys(up, false); - lastkey = null; - } - public boolean hasNext() { - try { - return handleIterator.hasNext(); - } catch (final kelondroException e) { - Log.logException(e); - clear(); - return false; - } - } - public entry next() { - try { - lastkey = new String(handleIterator.next()); - return getEntry(lastkey); - } catch (final kelondroException e) { - Log.logException(e); - clear(); - return null; - } + public String handle() { + final String r = get(HANDLE); + //if (r == null) return null; + return r; + } + public String name() { + final String r = get(NAME); + if (r == null) return ""; + return r; + } + public String startURL() { + final String r = get(START_URL); + return r; + } + public Pattern mustMatchPattern() { + if (this.mustmatch == null) { + String r = get(FILTER_MUSTMATCH); + if (r == null) r = CrawlProfile.MATCH_ALL; + this.mustmatch = Pattern.compile(r); } - public void remove() { - if (lastkey != null) try { - removeEntry(lastkey.getBytes()); - } catch (final kelondroException e) { - Log.logException(e); - clear(); - } + return this.mustmatch; + } + public Pattern mustNotMatchPattern() { + if (this.mustnotmatch == null) { + String r = get(FILTER_MUSTNOTMATCH); + if (r == null) r = CrawlProfile.MATCH_NEVER; + this.mustnotmatch = Pattern.compile(r); } + return this.mustnotmatch; } - - public void removeEntry(final byte[] handle) { + public int depth() { + final String r = get(DEPTH); + if (r == null) return 0; try { - profileTable.delete(handle); - } catch (final IOException e) { + return Integer.parseInt(r); + } catch (final NumberFormatException e) { Log.logException(e); + return 0; } } - - public entry newEntry(final Map mem) { - final entry ne = new entry(mem); + public CacheStrategy cacheStrategy() { + final String r = get(CACHE_STRAGEGY); + if (r == null) return CacheStrategy.IFFRESH; try { - profileTable.insert(ne.handle().getBytes(), ne.map()); - } catch (final Exception e) { - clear(); - try { - profileTable.insert(ne.handle().getBytes(), ne.map()); - } catch (final Exception ee) { - Log.logException(e); - System.exit(0); - } + return CacheStrategy.decode(Integer.parseInt(r)); + } catch (final NumberFormatException e) { + Log.logException(e); + return CacheStrategy.IFFRESH; } - return ne; } - - public entry newEntry( final String name, - final DigestURI startURL, - final String mustmatch, final String mustnotmatch, - final int generalDepth, - final long recrawlIfOlder /*date*/, final int domFilterDepth, final int domMaxPages, - final boolean crawlingQ, - final boolean indexText, final 
boolean indexMedia, - final boolean storeHTCache, final boolean storeTXCache, - final boolean remoteIndexing, - final boolean xsstopw, final boolean xdstopw, final boolean xpstopw, - final CacheStrategy cacheStrategy) { - - final entry ne = new entry( - name, startURL, - mustmatch, mustnotmatch, - generalDepth, - recrawlIfOlder, domFilterDepth, domMaxPages, - crawlingQ, - indexText, indexMedia, - storeHTCache, storeTXCache, - remoteIndexing, - xsstopw, xdstopw, xpstopw, - cacheStrategy); + public void setCacheStrategy(CacheStrategy newStrategy) { + put(CACHE_STRAGEGY, newStrategy.toString()); + } + public long recrawlIfOlder() { + // returns a long (millis) that is the minimum age that + // an entry must have to be re-crawled + final String r = get(RECRAWL_IF_OLDER); + if (r == null) return 0L; try { - profileTable.insert(ne.handle().getBytes(), ne.map()); - } catch (final Exception e) { - clear(); - try { - profileTable.insert(ne.handle().getBytes(), ne.map()); - } catch (final Exception ee) { - Log.logException(e); - System.exit(0); - } + final long l = Long.parseLong(r); + return (l < 0) ? 0L : l; + } catch (final NumberFormatException e) { + Log.logException(e); + return 0L; } - return ne; } - - public boolean hasEntry(final String handle) { - return profileTable.containsKey(handle.getBytes()); - } - - public entry getEntry(final String handle) { - if (profileTable == null) return null; - Map m; + public int domFilterDepth() { + // if the depth is equal or less to this depth, + // then the current url feeds with its domain the crawl filter + // if this is -1, all domains are feeded + final String r = get(DOM_FILTER_DEPTH); + if (r == null) return Integer.MAX_VALUE; try { - m = profileTable.get(handle.getBytes()); - } catch (final IOException e) { + final int i = Integer.parseInt(r); + if (i < 0) return Integer.MAX_VALUE; + return i; + } catch (final NumberFormatException e) { Log.logException(e); - return null; - } catch (RowSpaceExceededException e) { + return Integer.MAX_VALUE; + } + } + public int domMaxPages() { + // this is the maximum number of pages that are crawled for a single domain + // if -1, this means no limit + final String r = get(DOM_MAX_PAGES); + if (r == null) return Integer.MAX_VALUE; + try { + final int i = Integer.parseInt(r); + if (i < 0) return Integer.MAX_VALUE; + return i; + } catch (final NumberFormatException e) { Log.logException(e); - return null; + return Integer.MAX_VALUE; } - if (m == null) return null; - return new entry(m); + } + public boolean crawlingQ() { + final String r = get(CRAWLING_Q); + if (r == null) return false; + return (r.equals(Boolean.TRUE.toString())); + } + public boolean indexText() { + final String r = get(INDEX_TEXT); + if (r == null) return true; + return (r.equals(Boolean.TRUE.toString())); + } + public boolean indexMedia() { + final String r = get(INDEX_MEDIA); + if (r == null) return true; + return (r.equals(Boolean.TRUE.toString())); + } + public boolean storeHTCache() { + final String r = get(STORE_HTCACHE); + if (r == null) return false; + return (r.equals(Boolean.TRUE.toString())); + } + public boolean storeTXCache() { + final String r = get(STORE_TXCACHE); + if (r == null) return false; + return (r.equals(Boolean.TRUE.toString())); + } + public boolean remoteIndexing() { + final String r = get(REMOTE_INDEXING); + if (r == null) return false; + return (r.equals(Boolean.TRUE.toString())); + } + public boolean excludeStaticStopwords() { + final String r = get(XSSTOPW); + if (r == null) return false; + return 
(r.equals(Boolean.TRUE.toString())); + } + public boolean excludeDynamicStopwords() { + final String r = get(XDSTOPW); + if (r == null) return false; + return (r.equals(Boolean.TRUE.toString())); + } + public boolean excludeParentStopwords() { + final String r = get(XPSTOPW); + if (r == null) return false; + return (r.equals(Boolean.TRUE.toString())); + } + public void domInc(final String domain, final String referrer, final int depth) { + final DomProfile dp = doms.get(domain); + if (dp == null) { + // new domain + doms.put(domain, new DomProfile(referrer, depth)); + } else { + // increase counter + dp.inc(); + } + } + public boolean grantedDomAppearance(final String domain) { + final int max = domFilterDepth(); + if (max == Integer.MAX_VALUE) return true; + final DomProfile dp = doms.get(domain); + if (dp == null) { + return 0 < max; + } + return dp.depth <= max; } - public void changeEntry(final entry e, final String propName, final String newValue) throws IOException, RowSpaceExceededException { - e.mem.put(propName, newValue); - assert e.handle() != null; - profileTable.insert(e.handle().getBytes(), e.mem); + public boolean grantedDomCount(final String domain) { + final int max = domMaxPages(); + if (max == Integer.MAX_VALUE) return true; + final DomProfile dp = doms.get(domain); + if (dp == null) { + return 0 < max; + } + return dp.count <= max; } - - public long getRecrawlDate(final long oldTimeMinutes) { - return System.currentTimeMillis() - (60000L * oldTimeMinutes); + public int domSize() { + return doms.size(); + } + public boolean domExists(final String domain) { + if (domFilterDepth() == Integer.MAX_VALUE) return true; + return doms.containsKey(domain); + } + + public String domName(final boolean attr, final int index){ + final Iterator> domnamesi = doms.entrySet().iterator(); + String domname=""; + Map.Entry ey; + DomProfile dp; + int i = 0; + while ((domnamesi.hasNext()) && (i < index)) { + ey = domnamesi.next(); + i++; + } + if (domnamesi.hasNext()) { + ey = domnamesi.next(); + dp = ey.getValue(); + domname = ey.getKey() + ((attr) ? 
("/r=" + dp.referrer + ", d=" + dp.depth + ", c=" + dp.count) : " "); + } + return domname; } - public static class DomProfile { + public final static class DomProfile { public String referrer; public int depth, count; @@ -275,287 +365,8 @@ public class CrawlProfile { return this.code == 3; } } - - public static class entry { - // this is a simple record structure that hold all properties of a single crawl start - - public static final String HANDLE = "handle"; - public static final String NAME = "name"; - public static final String START_URL = "startURL"; - public static final String FILTER_MUSTMATCH = "generalFilter"; - public static final String FILTER_MUSTNOTMATCH = "nevermatch"; - public static final String DEPTH = "generalDepth"; - public static final String RECRAWL_IF_OLDER = "recrawlIfOlder"; - public static final String DOM_FILTER_DEPTH = "domFilterDepth"; - public static final String DOM_MAX_PAGES = "domMaxPages"; - public static final String CRAWLING_Q = "crawlingQ"; - public static final String INDEX_TEXT = "indexText"; - public static final String INDEX_MEDIA = "indexMedia"; - public static final String STORE_HTCACHE = "storeHTCache"; - public static final String STORE_TXCACHE = "storeTXCache"; - public static final String REMOTE_INDEXING = "remoteIndexing"; - public static final String XSSTOPW = "xsstopw"; - public static final String XDSTOPW = "xdstopw"; - public static final String XPSTOPW = "xpstopw"; - public static final String CACHE_STRAGEGY = "cacheStrategy"; - - private Map mem; - private Map doms; - private Pattern mustmatch = null, mustnotmatch = null; - - - public entry(final String name, final DigestURI startURL, - final String mustmatch, - final String mustnotmatch, - final int depth, - final long recrawlIfOlder /*date*/, - final int domFilterDepth, final int domMaxPages, - final boolean crawlingQ, - final boolean indexText, final boolean indexMedia, - final boolean storeHTCache, final boolean storeTXCache, - final boolean remoteIndexing, - final boolean xsstopw, final boolean xdstopw, final boolean xpstopw, - final CacheStrategy cacheStrategy) { - if (name == null || name.length() == 0) throw new NullPointerException("name must not be null"); - final String handle = (startURL == null) ? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength) : new String(startURL.hash()); - mem = new ConcurrentHashMap(40); - mem.put(HANDLE, handle); - mem.put(NAME, name); - mem.put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false)); - mem.put(FILTER_MUSTMATCH, (mustmatch == null) ? MATCH_ALL : mustmatch); - mem.put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? MATCH_NEVER : mustnotmatch); - mem.put(DEPTH, Integer.toString(depth)); - mem.put(RECRAWL_IF_OLDER, Long.toString(recrawlIfOlder)); - mem.put(DOM_FILTER_DEPTH, Integer.toString(domFilterDepth)); - mem.put(DOM_MAX_PAGES, Integer.toString(domMaxPages)); - mem.put(CRAWLING_Q, Boolean.toString(crawlingQ)); // crawling of urls with '?' 
- mem.put(INDEX_TEXT, Boolean.toString(indexText)); - mem.put(INDEX_MEDIA, Boolean.toString(indexMedia)); - mem.put(STORE_HTCACHE, Boolean.toString(storeHTCache)); - mem.put(STORE_TXCACHE, Boolean.toString(storeTXCache)); - mem.put(REMOTE_INDEXING, Boolean.toString(remoteIndexing)); - mem.put(XSSTOPW, Boolean.toString(xsstopw)); // exclude static stop-words - mem.put(XDSTOPW, Boolean.toString(xdstopw)); // exclude dynamic stop-word - mem.put(XPSTOPW, Boolean.toString(xpstopw)); // exclude parent stop-words - mem.put(CACHE_STRAGEGY, cacheStrategy.toString()); - doms = new ConcurrentHashMap(); - } - - @Override - public String toString() { - final StringBuilder str = new StringBuilder(); - - if (this.mem != null) { - str.append(this.mem.toString()); - } - - return str.toString(); - } - - public entry(final Map mem) { - this.mem = mem; - this.doms = domsCache.get(this.mem.get(HANDLE)); - if (this.doms == null) this.doms = new ConcurrentHashMap(); - } - - public Map map() { - return mem; - } - public String handle() { - final String r = mem.get(HANDLE); - //if (r == null) return null; - return r; - } - public String name() { - final String r = mem.get(NAME); - if (r == null) return ""; - return r; - } - public String startURL() { - final String r = mem.get(START_URL); - return r; - } - public Pattern mustMatchPattern() { - if (this.mustmatch == null) { - String r = mem.get(FILTER_MUSTMATCH); - if (r == null) r = MATCH_ALL; - this.mustmatch = Pattern.compile(r); - } - return this.mustmatch; - } - public Pattern mustNotMatchPattern() { - if (this.mustnotmatch == null) { - String r = mem.get(FILTER_MUSTNOTMATCH); - if (r == null) r = MATCH_NEVER; - this.mustnotmatch = Pattern.compile(r); - } - return this.mustnotmatch; - } - public int depth() { - final String r = mem.get(DEPTH); - if (r == null) return 0; - try { - return Integer.parseInt(r); - } catch (final NumberFormatException e) { - Log.logException(e); - return 0; - } - } - public CacheStrategy cacheStrategy() { - final String r = mem.get(CACHE_STRAGEGY); - if (r == null) return CacheStrategy.IFFRESH; - try { - return CacheStrategy.decode(Integer.parseInt(r)); - } catch (final NumberFormatException e) { - Log.logException(e); - return CacheStrategy.IFFRESH; - } - } - public void setCacheStrategy(CacheStrategy newStrategy) { - mem.put(CACHE_STRAGEGY, newStrategy.toString()); - } - public long recrawlIfOlder() { - // returns a long (millis) that is the minimum age that - // an entry must have to be re-crawled - final String r = mem.get(RECRAWL_IF_OLDER); - if (r == null) return 0L; - try { - final long l = Long.parseLong(r); - return (l < 0) ? 
0L : l; - } catch (final NumberFormatException e) { - Log.logException(e); - return 0L; - } - } - public int domFilterDepth() { - // if the depth is equal or less to this depth, - // then the current url feeds with its domain the crawl filter - // if this is -1, all domains are feeded - final String r = mem.get(DOM_FILTER_DEPTH); - if (r == null) return Integer.MAX_VALUE; - try { - final int i = Integer.parseInt(r); - if (i < 0) return Integer.MAX_VALUE; - return i; - } catch (final NumberFormatException e) { - Log.logException(e); - return Integer.MAX_VALUE; - } - } - public int domMaxPages() { - // this is the maximum number of pages that are crawled for a single domain - // if -1, this means no limit - final String r = mem.get(DOM_MAX_PAGES); - if (r == null) return Integer.MAX_VALUE; - try { - final int i = Integer.parseInt(r); - if (i < 0) return Integer.MAX_VALUE; - return i; - } catch (final NumberFormatException e) { - Log.logException(e); - return Integer.MAX_VALUE; - } - } - public boolean crawlingQ() { - final String r = mem.get(CRAWLING_Q); - if (r == null) return false; - return (r.equals(Boolean.TRUE.toString())); - } - public boolean indexText() { - final String r = mem.get(INDEX_TEXT); - if (r == null) return true; - return (r.equals(Boolean.TRUE.toString())); - } - public boolean indexMedia() { - final String r = mem.get(INDEX_MEDIA); - if (r == null) return true; - return (r.equals(Boolean.TRUE.toString())); - } - public boolean storeHTCache() { - final String r = mem.get(STORE_HTCACHE); - if (r == null) return false; - return (r.equals(Boolean.TRUE.toString())); - } - public boolean storeTXCache() { - final String r = mem.get(STORE_TXCACHE); - if (r == null) return false; - return (r.equals(Boolean.TRUE.toString())); - } - public boolean remoteIndexing() { - final String r = mem.get(REMOTE_INDEXING); - if (r == null) return false; - return (r.equals(Boolean.TRUE.toString())); - } - public boolean excludeStaticStopwords() { - final String r = mem.get(XSSTOPW); - if (r == null) return false; - return (r.equals(Boolean.TRUE.toString())); - } - public boolean excludeDynamicStopwords() { - final String r = mem.get(XDSTOPW); - if (r == null) return false; - return (r.equals(Boolean.TRUE.toString())); - } - public boolean excludeParentStopwords() { - final String r = mem.get(XPSTOPW); - if (r == null) return false; - return (r.equals(Boolean.TRUE.toString())); - } - public void domInc(final String domain, final String referrer, final int depth) { - final DomProfile dp = doms.get(domain); - if (dp == null) { - // new domain - doms.put(domain, new DomProfile(referrer, depth)); - } else { - // increase counter - dp.inc(); - } - domsCache.put(this.mem.get(HANDLE), doms); - } - public boolean grantedDomAppearance(final String domain) { - final int max = domFilterDepth(); - if (max == Integer.MAX_VALUE) return true; - final DomProfile dp = doms.get(domain); - if (dp == null) { - return 0 < max; - } - return dp.depth <= max; - } - - public boolean grantedDomCount(final String domain) { - final int max = domMaxPages(); - if (max == Integer.MAX_VALUE) return true; - final DomProfile dp = doms.get(domain); - if (dp == null) { - return 0 < max; - } - return dp.count <= max; - } - public int domSize() { - return doms.size(); - } - public boolean domExists(final String domain) { - if (domFilterDepth() == Integer.MAX_VALUE) return true; - return doms.containsKey(domain); - } - public String domName(final boolean attr, final int index){ - final Iterator> domnamesi = 
doms.entrySet().iterator(); - String domname=""; - Map.Entry ey; - DomProfile dp; - int i = 0; - while ((domnamesi.hasNext()) && (i < index)) { - ey = domnamesi.next(); - i++; - } - if (domnamesi.hasNext()) { - ey = domnamesi.next(); - dp = ey.getValue(); - domname = ey.getKey() + ((attr) ? ("/r=" + dp.referrer + ", d=" + dp.depth + ", c=" + dp.count) : " "); - } - return domname; - } + public static long getRecrawlDate(final long oldTimeMinutes) { + return System.currentTimeMillis() - (60000L * oldTimeMinutes); } - } diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index 2199ac4e6..a7c0b3829 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -47,7 +47,6 @@ import net.yacy.kelondro.workflow.WorkflowJob; import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Response; -//import de.anomic.http.client.Client; import de.anomic.search.Switchboard; import de.anomic.search.SwitchboardConstants; import de.anomic.yacy.yacyClient; @@ -252,14 +251,14 @@ public class CrawlQueues { * @return */ private void generateCrawl(Request urlEntry, final String stats, final String profileHandle) { - final CrawlProfile.entry profile = sb.crawler.profilesActiveCrawls.getEntry(profileHandle); - if (profile != null) { + final Map mp = sb.crawler.profilesActiveCrawls.get(profileHandle.getBytes()); + if (mp != null) { // check if the protocol is supported final DigestURI url = urlEntry.url(); final String urlProtocol = url.getProtocol(); if (sb.loader.isSupportedProtocol(urlProtocol)) { - + CrawlProfile profile = new CrawlProfile(mp); if (this.log.isFine()) log.logFine(stats + ": URL=" + urlEntry.url() + ", initiator=" + ((urlEntry.initiator() == null) ? "" : new String(urlEntry.initiator())) @@ -556,7 +555,8 @@ public class CrawlQueues { try { request.setStatus("loading", WorkflowJob.STATUS_RUNNING); final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); - CrawlProfile.entry e = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()); + final Map mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes()); + CrawlProfile e = mp == null ? null : new CrawlProfile(mp); Response response = sb.loader.load(request, e == null ? CrawlProfile.CacheStrategy.IFEXIST : e.cacheStrategy(), maxFileSize); if (response == null) { request.setStatus("error", WorkflowJob.STATUS_FINISHED); diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index 75591b83b..82c955440 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -31,6 +31,7 @@ package de.anomic.crawler; import java.net.InetAddress; import java.net.UnknownHostException; import java.util.Date; +import java.util.Map; import net.yacy.cora.protocol.Domains; import net.yacy.kelondro.data.meta.DigestURI; @@ -180,7 +181,8 @@ public final class CrawlStacker { // returns null if successful, a reason string if not successful //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'"); - final CrawlProfile.entry profile = crawler.profilesActiveCrawls.getEntry(entry.profileHandle()); + final Map mp = crawler.profilesActiveCrawls.get(entry.profileHandle().getBytes()); + CrawlProfile profile = mp == null ? 
null : new CrawlProfile(mp); String error; if (profile == null) { error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url(); @@ -248,7 +250,7 @@ public final class CrawlStacker { return null; } - public String checkAcceptance(final DigestURI url, final CrawlProfile.entry profile, int depth) { + public String checkAcceptance(final DigestURI url, final CrawlProfile profile, int depth) { // check if the protocol is supported final String urlProtocol = url.getProtocol(); diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java index 313d56980..2b74c91b6 100644 --- a/source/de/anomic/crawler/CrawlSwitchboard.java +++ b/source/de/anomic/crawler/CrawlSwitchboard.java @@ -28,11 +28,12 @@ package de.anomic.crawler; import java.io.File; import java.io.IOException; -import java.util.Iterator; - -import de.anomic.crawler.CrawlProfile.CacheStrategy; +import java.util.Map; +import net.yacy.kelondro.blob.MapHeap; +import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.order.NaturalOrder; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.kelondroException; @@ -56,14 +57,14 @@ public final class CrawlSwitchboard { public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L; public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L; - private final Log log; - public CrawlProfile profilesActiveCrawls, profilesPassiveCrawls; - public CrawlProfile.entry defaultProxyProfile; - public CrawlProfile.entry defaultRemoteProfile; - public CrawlProfile.entry defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile; - public CrawlProfile.entry defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile; - public CrawlProfile.entry defaultSurrogateProfile; - private final File queuesRoot; + private final Log log; + public Map> profilesActiveCrawls, profilesPassiveCrawls; + public CrawlProfile defaultProxyProfile; + public CrawlProfile defaultRemoteProfile; + public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile; + public CrawlProfile defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile; + public CrawlProfile defaultSurrogateProfile; + private final File queuesRoot; public CrawlSwitchboard( final String networkName, @@ -82,43 +83,44 @@ public final class CrawlSwitchboard { this.queuesRoot = queuesRoot; this.queuesRoot.mkdirs(); this.log.logConfig("Initializing Crawl Profiles"); + final File profilesActiveFile = new File(queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES); - if (!profilesActiveFile.exists()) { - // migrate old file - final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), "PLASMADB"), "crawlProfilesActive1.db"); - if (oldFile.exists()) oldFile.renameTo(profilesActiveFile); - } try { - this.profilesActiveCrawls = new CrawlProfile(profilesActiveFile); + this.profilesActiveCrawls = new MapHeap(profilesActiveFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_'); } catch (IOException e) { Log.logException(e);Log.logException(e); FileUtils.deletedelete(profilesActiveFile); try { - this.profilesActiveCrawls = new CrawlProfile(profilesActiveFile); + this.profilesActiveCrawls = new MapHeap(profilesActiveFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_'); } catch (IOException e1) { Log.logException(e1); this.profilesActiveCrawls = null; } } + for (byte[] handle: 
+ final File profilesPassiveFile = new File(queuesRoot, DBFILE_PASSIVE_CRAWL_PROFILES); - if (!profilesPassiveFile.exists()) { - // migrate old file - final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), "PLASMADB"), "crawlProfilesPassive1.db"); - if (oldFile.exists()) oldFile.renameTo(profilesPassiveFile); - } try { - this.profilesPassiveCrawls = new CrawlProfile(profilesPassiveFile); + this.profilesPassiveCrawls = new MapHeap(profilesPassiveFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_'); } catch (IOException e) { - FileUtils.deletedelete(profilesPassiveFile); + Log.logException(e); + FileUtils.deletedelete(profilesPassiveFile); try { - this.profilesPassiveCrawls = new CrawlProfile(profilesPassiveFile); + this.profilesPassiveCrawls = new MapHeap(profilesPassiveFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_'); } catch (IOException e1) { Log.logException(e1); this.profilesPassiveCrawls = null; } } + for (byte[] handle: this.profilesPassiveCrawls.keySet()) { + CrawlProfile p = new CrawlProfile(this.profilesPassiveCrawls.get(handle)); + Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name()); + } log.logInfo("Loaded passive crawl profiles from file " + profilesPassiveFile.getName() + ", " + this.profilesPassiveCrawls.size() + " entries" + ", " + profilesPassiveFile.length()/1024); @@ -135,12 +137,11 @@ public final class CrawlSwitchboard { this.defaultMediaSnippetLocalProfile = null; this.defaultMediaSnippetGlobalProfile = null; this.defaultSurrogateProfile = null; - final Iterator<CrawlProfile.entry> i = this.profilesActiveCrawls.profiles(true); - CrawlProfile.entry profile; + CrawlProfile profile; String name; try { - while (i.hasNext()) { - profile = i.next(); + for (byte[] handle: this.profilesActiveCrawls.keySet()) { + profile = new CrawlProfile(this.profilesActiveCrawls.get(handle)); name = profile.name(); if (name.equals(CRAWL_PROFILE_PROXY)) this.defaultProxyProfile = profile; if (name.equals(CRAWL_PROFILE_REMOTE)) this.defaultRemoteProfile = profile; @@ -163,45 +164,52 @@ if (this.defaultProxyProfile == null) { // generate new default entry for proxy crawling - this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, + this.defaultProxyProfile = new CrawlProfile("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/, - this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false, + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false, true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/, true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/, true, true, false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, true, true, true, CrawlProfile.CacheStrategy.IFFRESH); + this.profilesActiveCrawls.put(this.defaultProxyProfile.handle().getBytes(), this.defaultProxyProfile); } if (this.defaultRemoteProfile == null) { // generate new default entry for remote crawling - 
defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, + this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, -1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); + this.profilesActiveCrawls.put(this.defaultRemoteProfile.handle().getBytes(), this.defaultRemoteProfile); } if (this.defaultTextSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling - defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); + this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); + this.profilesActiveCrawls.put(this.defaultTextSnippetLocalProfile.handle().getBytes(), this.defaultTextSnippetLocalProfile); } if (this.defaultTextSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling - defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); + this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); + this.profilesActiveCrawls.put(this.defaultTextSnippetGlobalProfile.handle().getBytes(), this.defaultTextSnippetGlobalProfile); } - this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST); + this.defaultTextSnippetGlobalProfile.setCacheStrategy(CrawlProfile.CacheStrategy.IFEXIST); if (this.defaultMediaSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling - defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); + this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); + this.profilesActiveCrawls.put(this.defaultMediaSnippetLocalProfile.handle().getBytes(), this.defaultMediaSnippetLocalProfile); } if (this.defaultMediaSnippetGlobalProfile == null) { // 
generate new default entry for snippet fetch and optional crawling - defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); + this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); + this.profilesActiveCrawls.put(this.defaultMediaSnippetGlobalProfile.handle().getBytes(), this.defaultMediaSnippetGlobalProfile); } if (this.defaultSurrogateProfile == null) { // generate new default entry for surrogate parsing - defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE); + this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE); + this.profilesActiveCrawls.put(this.defaultSurrogateProfile.handle().getBytes(), this.defaultSurrogateProfile); } } @@ -209,24 +217,24 @@ final File pdb = new File(this.queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES); if (pdb.exists()) FileUtils.deletedelete(pdb); try { - profilesActiveCrawls = new CrawlProfile(pdb); - } catch (IOException e) { - Log.logException(e); + this.profilesActiveCrawls = new MapHeap(pdb, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_'); + } catch (IOException e1) { + Log.logException(e1); + this.profilesActiveCrawls = null; } initActiveCrawlProfiles(); } public boolean cleanProfiles() throws InterruptedException { - final Iterator<CrawlProfile.entry> iter = profilesActiveCrawls.profiles(true); - CrawlProfile.entry entry; + CrawlProfile entry; boolean hasDoneSomething = false; try { - while (iter.hasNext()) { + for (byte[] handle: profilesActiveCrawls.keySet()) { // check for interruption if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress"); // getting next profile - entry = iter.next(); + entry = new CrawlProfile(profilesActiveCrawls.get(handle)); if (!((entry.name().equals(CRAWL_PROFILE_PROXY)) || (entry.name().equals(CRAWL_PROFILE_REMOTE)) || (entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) || @@ -234,8 +242,9 @@ (entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) || (entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) || (entry.name().equals(CRAWL_PROFILE_SURROGATE)))) { - profilesPassiveCrawls.newEntry(entry.map()); - iter.remove(); + CrawlProfile p = new CrawlProfile(entry); + profilesPassiveCrawls.put(p.handle().getBytes(), p); + profilesActiveCrawls.remove(handle); hasDoneSomething = true; } } @@ -248,8 +257,8 @@ public void close() { - this.profilesActiveCrawls.close(); - this.profilesPassiveCrawls.close(); + ((MapHeap) this.profilesActiveCrawls).close(); + ((MapHeap) this.profilesPassiveCrawls).close(); } }
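cleanProfiles() now archives a profile by copying it into the passive heap and removing it from the active one, instead of mutating a profile store in place. A condensed sketch of that step, assuming both stores are the MapHeap-backed maps declared above; the moveToPassive name and wrapper class are illustrative:

    import java.util.Map;
    import de.anomic.crawler.CrawlProfile;

    final class ProfileArchiver {
        static void moveToPassive(final Map<byte[], Map<String, String>> active,
                                  final Map<byte[], Map<String, String>> passive,
                                  final byte[] handle) {
            final Map<String, String> mp = active.get(handle);
            if (mp == null) return;                // handle vanished concurrently
            final CrawlProfile p = new CrawlProfile(mp);
            passive.put(p.handle().getBytes(), p); // a CrawlProfile doubles as its own map form
            active.remove(handle);
        }
    }

Note that the loop above removes entries from profilesActiveCrawls while iterating its keySet(); this relies on the MapHeap key view tolerating concurrent removal.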
diff --git a/source/de/anomic/crawler/NoticedURL.java b/source/de/anomic/crawler/NoticedURL.java index 36434ebd3..13cf3dbb1 100755 --- a/source/de/anomic/crawler/NoticedURL.java +++ b/source/de/anomic/crawler/NoticedURL.java @@ -29,6 +29,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; +import java.util.Map; import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.RowSpaceExceededException; @@ -213,18 +214,18 @@ public class NoticedURL { } } - public Request pop(final int stackType, final boolean delay, CrawlProfile profile) throws IOException { + public Request pop(final int stackType, final boolean delay, Map<byte[], Map<String, String>> profiles) throws IOException { switch (stackType) { - case STACK_TYPE_CORE: return pop(coreStack, delay, profile); - case STACK_TYPE_LIMIT: return pop(limitStack, delay, profile); - case STACK_TYPE_REMOTE: return pop(remoteStack, delay, profile); + case STACK_TYPE_CORE: return pop(coreStack, delay, profiles); + case STACK_TYPE_LIMIT: return pop(limitStack, delay, profiles); + case STACK_TYPE_REMOTE: return pop(remoteStack, delay, profiles); default: return null; } } - public void shift(final int fromStack, final int toStack, CrawlProfile profile) { + public void shift(final int fromStack, final int toStack, Map<byte[], Map<String, String>> profiles) { try { - final Request entry = pop(fromStack, false, profile); + final Request entry = pop(fromStack, false, profiles); if (entry != null) push(toStack, entry); } catch (final IOException e) { return; @@ -241,14 +242,14 @@ } } - private Request pop(final Balancer balancer, final boolean delay, CrawlProfile profile) throws IOException { + private Request pop(final Balancer balancer, final boolean delay, Map<byte[], Map<String, String>> profiles) throws IOException { // this is a filo - pop int s; Request entry; int errors = 0; synchronized (balancer) { while ((s = balancer.size()) > 0) { - entry = balancer.pop(delay, profile); + entry = balancer.pop(delay, profiles); if (entry == null) { if (s > balancer.size()) continue; errors++; diff --git a/source/de/anomic/crawler/SitemapImporter.java b/source/de/anomic/crawler/SitemapImporter.java index 9aa04ac4f..ebe502952 100644 --- a/source/de/anomic/crawler/SitemapImporter.java +++ b/source/de/anomic/crawler/SitemapImporter.java @@ -35,7 +35,7 @@ public class SitemapImporter extends AbstractImporter implements Importer { private final DigestURI sitemapURL; private final ImporterManager superviser; - public SitemapImporter(final Switchboard sb, final ImporterManager importManager, final DigestURI sitemapURL, final CrawlProfile.entry profileEntry) throws ImporterException { + public SitemapImporter(final Switchboard sb, final ImporterManager importManager, final DigestURI sitemapURL, final CrawlProfile profileEntry) throws ImporterException { super("sitemap"); this.superviser = importManager; try {
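With no CrawlProfile store object left, NoticedURL (and the Balancer beneath it) now receive the raw handle-to-map store and resolve profiles on demand; callers simply pass the switchboard's active-profiles map through. A call-site sketch, assuming an existing NoticedURL instance; the wrapper method is illustrative:

    import java.io.IOException;
    import java.util.Map;
    import de.anomic.crawler.NoticedURL;
    import de.anomic.crawler.retrieval.Request;

    final class QueueDemo {
        static Request nextCoreUrl(final NoticedURL queue, final Map<byte[], Map<String, String>> profiles) throws IOException {
            // delay=true honors per-host politeness; the profiles map lets the Balancer consult each entry's profile
            return queue.pop(NoticedURL.STACK_TYPE_CORE, true, profiles);
        }
    }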
diff --git a/source/de/anomic/crawler/retrieval/FTPLoader.java b/source/de/anomic/crawler/retrieval/FTPLoader.java index 773ff8c44..6d25d5c16 100644 --- a/source/de/anomic/crawler/retrieval/FTPLoader.java +++ b/source/de/anomic/crawler/retrieval/FTPLoader.java @@ -31,6 +31,7 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.PrintStream; import java.util.Date; +import java.util.Map; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.HeaderFramework; @@ -40,6 +41,7 @@ import net.yacy.cora.protocol.ftp.FTPClient; import net.yacy.document.TextParser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; +import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.Latency; import de.anomic.search.Segments; import de.anomic.search.Switchboard; @@ -124,12 +126,13 @@ public class FTPLoader { ResponseHeader responseHeader = new ResponseHeader(); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date())); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); + final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes()); response = new Response( request, requestHeader, responseHeader, "200", - sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), + mp == null ? null : new CrawlProfile(mp), dirList.toString().getBytes()); } } else { @@ -237,12 +240,13 @@ public class FTPLoader { // create response with metadata only responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain"); + final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes()); Response response = new Response( request, requestHeader, responseHeader, "200", - sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), + mp == null ? null : new CrawlProfile(mp), url.toNormalform(true, true).getBytes()); return response; } @@ -254,12 +258,13 @@ public class FTPLoader { byte[] b = ftpClient.get(path); // create a response + final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes()); Response response = new Response( request, requestHeader, responseHeader, "200", - sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), + mp == null ? null : new CrawlProfile(mp), b); return response; }
diff --git a/source/de/anomic/crawler/retrieval/FileLoader.java b/source/de/anomic/crawler/retrieval/FileLoader.java index ad8a3d649..785356a55 100644 --- a/source/de/anomic/crawler/retrieval/FileLoader.java +++ b/source/de/anomic/crawler/retrieval/FileLoader.java @@ -25,9 +25,11 @@ import java.io.InputStream; import java.util.ArrayList; import java.util.Date; import java.util.List; +import java.util.Map; import de.anomic.search.Segments; import de.anomic.search.Switchboard; +import de.anomic.crawler.CrawlProfile; import de.anomic.data.MimeTable; import net.yacy.cora.protocol.HeaderFramework; @@ -81,12 +83,13 @@ public class FileLoader { ResponseHeader responseHeader = new ResponseHeader(); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date())); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); + final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes()); Response response = new Response( request, requestHeader, responseHeader, "200", - sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), + mp == null ? null : new CrawlProfile(mp), content.toString().getBytes()); return response; @@ -115,12 +118,13 @@ public class FileLoader { // create response with metadata only responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain"); + final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes()); Response response = new Response( request, requestHeader, responseHeader, "200", - sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), + mp == null ? null : new CrawlProfile(mp), url.toNormalform(true, true).getBytes()); return response; } @@ -131,12 +135,13 @@ public class FileLoader { is.close(); // create response with loaded content + final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes()); Response response = new Response( request, requestHeader, responseHeader, "200", - sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), + mp == null ? null : new CrawlProfile(mp), b); return response; } diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index db507e4e1..fdee31934 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -26,6 +26,7 @@ package de.anomic.crawler.retrieval; import java.io.IOException; import java.util.Date; +import java.util.Map; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.HeaderFramework; @@ -36,6 +37,7 @@ import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.repository.Blacklist; +import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.Latency; import de.anomic.search.Segments; import de.anomic.search.Switchboard; @@ -146,6 +148,7 @@ public final class HTTPLoader { } // create a new cache entry + final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes()); response = new Response( request, requestHeader, @@ -153,7 +156,7 @@ // res.getStatusLine(), header, Integer.toString(code), - sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), + mp == null ? null : new CrawlProfile(mp), responseBody ); diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java index 73ba3eb80..fc05ee9e3 100755 --- a/source/de/anomic/crawler/retrieval/Response.java +++ b/source/de/anomic/crawler/retrieval/Response.java @@ -61,7 +61,7 @@ public class Response { private final RequestHeader requestHeader; private final ResponseHeader responseHeader; private final String responseStatus; - private final CrawlProfile.entry profile; + private final CrawlProfile profile; private byte[] content; private int status; // tracker indexing status, see status defs below @@ -148,7 +148,7 @@ public class Response { final RequestHeader requestHeader, final ResponseHeader responseHeader, final String responseStatus, - final CrawlProfile.entry profile, + final CrawlProfile profile, final byte[] content) { this.request = request; // request and response headers may be zero in case that we process surrogates @@ -165,7 +165,7 @@ public class Response { final RequestHeader requestHeader, final ResponseHeader responseHeader, final String responseStatus, - final CrawlProfile.entry profile) { + final CrawlProfile profile) { this(request, requestHeader, responseHeader, responseStatus, profile, null); } @@ -216,7 +216,7 @@ public class Response { return this.url().language(); } - public CrawlProfile.entry profile() { + public CrawlProfile profile() { return this.profile; }
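A Response now carries the rehydrated CrawlProfile itself, and a null profile remains legal (the constructor comment notes that headers and profile may be absent for surrogate processing), so consumers keep guarding. An illustrative consumer, mirroring the Switchboard guard further below; the helper name is hypothetical:

    import de.anomic.crawler.CrawlProfile;
    import de.anomic.crawler.retrieval.Response;

    final class ResponseProfileDemo {
        static boolean allowImageRegistration(final Response response) {
            final CrawlProfile profile = response.profile();
            // default to true when no profile is attached, as in ResultImages.registerImages below
            return (profile == null) ? true : !profile.remoteIndexing();
        }
    }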
diff --git a/source/de/anomic/crawler/retrieval/SMBLoader.java b/source/de/anomic/crawler/retrieval/SMBLoader.java index ab34a63c8..e755b90bb 100644 --- a/source/de/anomic/crawler/retrieval/SMBLoader.java +++ b/source/de/anomic/crawler/retrieval/SMBLoader.java @@ -34,6 +34,7 @@ import java.net.UnknownHostException; import java.util.ArrayList; import java.util.Date; import java.util.List; +import java.util.Map; import jcifs.smb.SmbException; import jcifs.smb.SmbFile; @@ -41,6 +42,7 @@ import jcifs.smb.SmbFileInputStream; import de.anomic.search.Segments; import de.anomic.search.Switchboard; +import de.anomic.crawler.CrawlProfile; import de.anomic.data.MimeTable; import net.yacy.cora.protocol.HeaderFramework; @@ -100,12 +102,13 @@ public class SMBLoader { ResponseHeader responseHeader = new ResponseHeader(); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date())); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); + final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes()); Response response = new Response( request, requestHeader, responseHeader, "200", - sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), + mp == null ? null : new CrawlProfile(mp), content.toString().getBytes()); return response; @@ -134,12 +137,13 @@ public class SMBLoader { // create response with metadata only responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain"); + final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes()); Response response = new Response( request, requestHeader, responseHeader, "200", - sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), + mp == null ? null : new CrawlProfile(mp), url.toNormalform(true, true).getBytes()); return response; } @@ -150,12 +154,13 @@ public class SMBLoader { is.close(); // create response with loaded content + final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes()); Response response = new Response( request, requestHeader, responseHeader, "200", - sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), + mp == null ? null : new CrawlProfile(mp), b); return response; } diff --git a/source/de/anomic/data/SitemapParser.java b/source/de/anomic/data/SitemapParser.java index f3f8e57f1..172eb1a57 100644 --- a/source/de/anomic/data/SitemapParser.java +++ b/source/de/anomic/data/SitemapParser.java @@ -94,7 +94,7 @@ public class SitemapParser extends DefaultHandler { /** * The crawling profile used to parse the URLs contained in the sitemap file */ - private CrawlProfile.entry crawlingProfile = null; + private CrawlProfile crawlingProfile = null; /** * Name of the current XML element @@ -137,7 +137,7 @@ public class SitemapParser extends DefaultHandler { private Date lastMod = null; private final Switchboard sb; - public SitemapParser(final Switchboard sb, final DigestURI sitemap, final CrawlProfile.entry theCrawlingProfile) { + public SitemapParser(final Switchboard sb, final DigestURI sitemap, final CrawlProfile theCrawlingProfile) { assert sitemap != null; this.sb = sb; this.siteMapURL = sitemap; @@ -328,8 +328,8 @@ public class SitemapParser extends DefaultHandler { } } - private CrawlProfile.entry createProfile(final String domainName, final DigestURI sitemapURL) { - return this.sb.crawler.profilesActiveCrawls.newEntry( + private CrawlProfile createProfile(final String domainName, final DigestURI sitemapURL) { + CrawlProfile p = new CrawlProfile( domainName, sitemapURL, // crawling Filter CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, @@ -352,5 +352,7 @@ public class SitemapParser extends DefaultHandler { // exclude stop-words true, true, true, CrawlProfile.CacheStrategy.IFFRESH); + this.sb.crawler.profilesActiveCrawls.put(p.handle().getBytes(), p); + return p; } }
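createProfile() above shows the new two-step pattern that replaces the old newEntry() factory: construct the CrawlProfile, then register it under its handle, which the constructor apparently generates (p.handle() is usable immediately). A condensed sketch of that pattern; the register name and wrapper class are illustrative:

    import java.util.Map;
    import de.anomic.crawler.CrawlProfile;

    final class ProfileRegistry {
        static CrawlProfile register(final Map<byte[], Map<String, String>> active, final CrawlProfile p) {
            active.put(p.handle().getBytes(), p); // persist under the constructor-generated handle
            return p;
        }
    }

The same construct-then-put sequence appears for every default profile in CrawlSwitchboard above.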
diff --git a/source/de/anomic/data/WorkTables.java b/source/de/anomic/data/WorkTables.java index e9adc3fc1..1f8d9f943 100644 --- a/source/de/anomic/data/WorkTables.java +++ b/source/de/anomic/data/WorkTables.java @@ -59,9 +59,12 @@ public class WorkTables extends Tables { public final static String TABLE_API_COL_APICALL_COUNT = "apicall_count"; // counts how often the API was called (starts with 1) public final static String TABLE_API_COL_APICALL_SCHEDULE_TIME = "apicall_schedule_time"; // factor for SCHEDULE_UNIT time units public final static String TABLE_API_COL_APICALL_SCHEDULE_UNIT = "apicall_schedule_unit"; // may be 'minutes', 'hours', 'days' - + public final static String TABLE_ROBOTS_NAME = "robots"; + public final static String TABLE_ACTIVECRAWLS_NAME = "crawljobsActive"; + public final static String TABLE_PASSIVECRAWLS_NAME = "crawljobsPassive"; + public WorkTables(final File workPath) { super(workPath, 12); diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index d09665479..5e26ec014 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -123,7 +123,6 @@ import de.anomic.crawler.ResultImages; import de.anomic.crawler.ResultURLs; import de.anomic.crawler.RobotsTxt; import de.anomic.crawler.CrawlProfile.CacheStrategy; -import de.anomic.crawler.CrawlProfile.entry; import de.anomic.crawler.retrieval.EventOrigin; import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.Request; @@ -1102,12 +1101,12 @@ public final class Switchboard extends serverSwitch { } /** - * {@link CrawlProfile Crawl Profiles} are saved independently from the queues themselves + * {@link CrawlProfiles Crawl Profiles} are saved independently from the queues themselves * and therefore have to be cleaned up from time to time. This method only performs the clean-up * if - and only if - the {@link IndexingStack switchboard}, * {@link LoaderDispatcher loader} and {@link plasmaCrawlNURL local crawl} queues are all empty. *

- * Then it iterates through all existing {@link CrawlProfile crawl profiles} and removes + * Then it iterates through all existing {@link CrawlProfiles crawl profiles} and removes * all profiles which are not hard-coded. *

*

@@ -1442,34 +1441,47 @@ public final class Switchboard extends serverSwitch { // refresh recrawl dates try{ - Iterator<CrawlProfile.entry> it = crawler.profilesActiveCrawls.profiles(true); - entry selentry; - while (it.hasNext()) { - selentry = it.next(); + CrawlProfile selentry; + for (byte[] handle: crawler.profilesActiveCrawls.keySet()) { + selentry = new CrawlProfile(crawler.profilesActiveCrawls.get(handle)); assert selentry.handle() != null : "profile.name = " + selentry.name(); if (selentry.handle() == null) { - it.remove(); + crawler.profilesActiveCrawls.remove(handle); continue; } - if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY)) - crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER, - Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE))); + boolean insert = false; + if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY)) { + selentry.put(CrawlProfile.RECRAWL_IF_OLDER, + Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE))); + insert = true; + } // if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE)); - if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) - crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER, - Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE))); - if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) - crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER, - Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE))); - if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) - crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER, - Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE))); - if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) - crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER, - Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE))); - if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE)) - crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER, - Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE))); + if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) { + selentry.put(CrawlProfile.RECRAWL_IF_OLDER, + Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE))); + insert = true; + } + if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) { + selentry.put(CrawlProfile.RECRAWL_IF_OLDER, + Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE))); + insert = true; + } + if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) { + selentry.put(CrawlProfile.RECRAWL_IF_OLDER, + Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE))); + insert = true; + } + if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) { + selentry.put(CrawlProfile.RECRAWL_IF_OLDER, + Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE))); + insert = true; + } + if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE)) { + selentry.put(CrawlProfile.RECRAWL_IF_OLDER, + Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE))); + insert = true; + } + if (insert) crawler.profilesActiveCrawls.put(selentry.handle().getBytes(), selentry); } } catch (final Exception e) { Log.logException(e);
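The refresh loop above highlights the key consequence of the redesign: a rehydrated CrawlProfile is a detached copy of the stored map, so mutating it no longer changes the on-disk heap — hence the insert flag and the explicit put() that replace the old changeEntry() call. A condensed sketch of that write-back, using only names from this patch; the cycle value and helper name are illustrative:

    import java.util.Map;
    import de.anomic.crawler.CrawlProfile;

    final class RecrawlRefresh {
        static void refreshRecrawl(final Map<byte[], Map<String, String>> active, final byte[] handle, final long cycleMinutes) {
            final Map<String, String> mp = active.get(handle);
            if (mp == null) return;
            final CrawlProfile p = new CrawlProfile(mp);
            p.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(cycleMinutes)));
            active.put(handle, p); // without this put() the change would be lost
        }
    }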
@@ -1827,7 +1839,7 @@ public final class Switchboard extends serverSwitch { // update image result list statistics // its good to do this concurrently here, because it needs a DNS lookup // to compute a URL hash which is necessary for a double-check - final CrawlProfile.entry profile = in.queueEntry.profile(); + final CrawlProfile profile = in.queueEntry.profile(); ResultImages.registerImages(in.queueEntry.url(), in.documents[i], (profile == null) ? true : !profile.remoteIndexing()); } catch (final UnsupportedEncodingException e) { @@ -1987,7 +1999,8 @@ public final class Switchboard extends serverSwitch { if (searchEvent != null) searchEvent.addHeuristic(url.hash(), heuristicName, true); if (indexSegments.segment(process).urlMetadata.exists(url.hash())) return; // don't do double-work final Request request = loader.request(url, true, true); - String acceptedError = this.crawlStacker.checkAcceptance(url, this.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), 0); + final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes()); + String acceptedError = this.crawlStacker.checkAcceptance(url, mp == null ? null : new CrawlProfile(mp), 0); if (acceptedError != null) { log.logWarning("addToIndex: cannot load " + url.toNormalform(false, false) + ": " + acceptedError); return; diff --git a/source/de/anomic/yacy/yacyRelease.java b/source/de/anomic/yacy/yacyRelease.java index d005e0cb6..59ed3188b 100644 --- a/source/de/anomic/yacy/yacyRelease.java +++ b/source/de/anomic/yacy/yacyRelease.java @@ -27,7 +27,6 @@ package de.anomic.yacy; -//import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileInputStream; @@ -59,8 +58,6 @@ import net.yacy.kelondro.util.OS; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.retrieval.HTTPLoader; -//import de.anomic.http.client.Client; -//import de.anomic.http.server.ResponseContainer; import de.anomic.search.Switchboard; import de.anomic.server.serverCore; import de.anomic.tools.CryptoLib; diff --git a/source/net/yacy/kelondro/blob/HeapReader.java b/source/net/yacy/kelondro/blob/HeapReader.java index 33e317840..ef0d9592d 100644 --- a/source/net/yacy/kelondro/blob/HeapReader.java +++ b/source/net/yacy/kelondro/blob/HeapReader.java @@ -71,6 +71,7 @@ public class HeapReader { this.keylength = keylength; this.index = null; // will be created as result of initialization process this.free = null; // will be initialized later depending on existing idx/gap file + this.heapFile.getParentFile().mkdirs(); this.file = new CachedFileWriter(this.heapFile); // read or initialize the index
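The one-line HeapReader change above makes heap creation robust when the target directory does not exist yet, which matters for newly introduced heap files such as the crawl-profile stores under the queues root. The guard boils down to the following; the path is illustrative:

    import java.io.File;

    public final class EnsureParentDemo {
        public static void main(final String[] args) {
            final File heapFile = new File("DATA/QUEUES/crawlProfilesActive.heap"); // hypothetical heap location
            heapFile.getParentFile().mkdirs(); // no-op if the directory already exists
            System.out.println(heapFile.getParentFile().isDirectory());
        }
    }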
diff --git a/source/net/yacy/kelondro/blob/MapHeap.java b/source/net/yacy/kelondro/blob/MapHeap.java index 426a13c01..a2127a83e 100644 --- a/source/net/yacy/kelondro/blob/MapHeap.java +++ b/source/net/yacy/kelondro/blob/MapHeap.java @@ -52,8 +52,6 @@ import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.kelondroException; - - public class MapHeap implements Map<byte[], Map<String, String>> { private BLOB blob; @@ -229,7 +227,8 @@ public class MapHeap implements Map<byte[], Map<String, String>> { public Map<String, String> get(final Object key) { if (key == null) return null; try { - return get((byte[]) key); + if (key instanceof byte[]) return get((byte[]) key); + if (key instanceof String) return get(((String) key).getBytes()); } catch (IOException e) { Log.logException(e); } catch (RowSpaceExceededException e) { diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 6c8f33179..a16b6c0bb 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -159,8 +159,8 @@ public final class LoaderDispatcher { if (url.isLocal() && sb.getConfigBool("adminAccountForLocalhost", false)) throw new IOException("access to localhost not granted for url " + url); // check if we have the page in the cache - - CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()); + final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes()); + CrawlProfile crawlProfile = mp == null ? null : new CrawlProfile(mp); if (crawlProfile != null && cacheStrategy != CrawlProfile.CacheStrategy.NOCACHE) { // we have passed a first test if caching is allowed // now see if there is a cache entry