Redesign of the crawl profile data structure. The targets are:

- permanent storage of auto-dom statistics in profile
- storage of profiles in WorkTable data structure
Not finished yet; no functional change yet.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7088 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 15 years ago
parent 3f1d5a061f
commit 65eaf30f77
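
A minimal, self-contained sketch of the design the diff below moves towards, using hypothetical names (ProfileSketch, ProfileStoreDemo) and a String-keyed in-memory map in place of the byte[]-keyed kelondro MapHeap the real code uses: a crawl profile becomes a plain String-to-String map that is written into a handle-keyed store and re-wrapped on read.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

// Simplified model of the new design: a crawl profile is just a String-to-String
// map, so it can be written into and read back from a generic handle-keyed store.
// The real CrawlProfile extends ConcurrentHashMap<String, String> and lives in a
// kelondro MapHeap keyed by byte[] handles; this sketch uses String keys instead.
class ProfileSketch extends ConcurrentHashMap<String, String> {
    static final String HANDLE = "handle";
    static final String NAME = "name";

    ProfileSketch(final String handle, final String name) {
        super(8);
        put(HANDLE, handle);
        put(NAME, name);
    }

    // re-wrap a raw map that was read back from the store
    ProfileSketch(final Map<String, String> ext) {
        super(ext == null ? 1 : ext.size());
        if (ext != null) putAll(ext);
    }

    String handle() { return get(HANDLE); }

    String name() {
        final String r = get(NAME);
        return r == null ? "" : r;
    }
}

public class ProfileStoreDemo {
    public static void main(final String[] args) {
        // stand-in for profilesActiveCrawls (a MapHeap in the real code)
        final Map<String, Map<String, String>> store = new ConcurrentHashMap<>();

        final ProfileSketch pe = new ProfileSketch("abc123", "example-crawl");
        store.put(pe.handle(), pe);               // the profile is stored directly

        final Map<String, String> mp = store.get("abc123");
        final ProfileSketch read = mp == null ? null : new ProfileSketch(mp);
        System.out.println(read == null ? "lost" : read.name()); // -> example-crawl
    }
}

This mirrors the calls that replace profilesActiveCrawls.getEntry(handle) and newEntry(...) throughout the diff: a plain get on the handle map followed by new CrawlProfile(mp), and a plain put of the profile itself.
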

@@ -28,15 +28,15 @@ import java.text.DateFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.crawler.CrawlProfile.entry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -80,23 +80,23 @@ public class CrawlProfileEditor_p {
private static final ArrayList <eentry> labels = new ArrayList<eentry>();
static {
labels.add(new eentry(entry.NAME, "Name", true, eentry.STRING));
labels.add(new eentry(entry.START_URL, "Start URL", true, eentry.STRING));
labels.add(new eentry(entry.FILTER_MUSTMATCH, "Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(entry.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(entry.DEPTH, "Crawl Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
labels.add(new eentry(entry.DOM_FILTER_DEPTH, "Domain Filter Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
labels.add(new eentry(entry.CRAWLING_Q, "CrawlingQ / '?'-URLs", false, eentry.BOOLEAN));
labels.add(new eentry(entry.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN));
labels.add(new eentry(entry.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN));
labels.add(new eentry(entry.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN));
labels.add(new eentry(entry.STORE_TXCACHE, "Store in TXCache", false, eentry.BOOLEAN));
labels.add(new eentry(entry.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN));
labels.add(new eentry(entry.XSSTOPW, "Static stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(entry.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(entry.XPSTOPW, "Parent stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING));
labels.add(new eentry(CrawlProfile.START_URL, "Start URL", true, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_MUSTMATCH, "Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.DOM_FILTER_DEPTH, "Domain Filter Depth", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.CRAWLING_Q, "CrawlingQ / '?'-URLs", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.STORE_TXCACHE, "Store in TXCache", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.XSSTOPW, "Static stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.XPSTOPW, "Parent stop-words", false, eentry.BOOLEAN));
}
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
@@ -106,40 +106,32 @@ public class CrawlProfileEditor_p {
// read post for handle
final String handle = (post == null) ? "" : post.get("handle", "");
if (post != null) {
if (post.containsKey("terminate")) {
if (post.containsKey("terminate")) try {
// termination of a crawl: shift the crawl from active to passive
final CrawlProfile.entry entry = sb.crawler.profilesActiveCrawls.getEntry(handle);
if (entry != null) {
sb.crawler.profilesPassiveCrawls.newEntry(entry.map());
}
sb.crawler.profilesActiveCrawls.removeEntry(handle.getBytes());
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(handle.getBytes());
if (mp != null) sb.crawler.profilesPassiveCrawls.put(handle.getBytes(), new CrawlProfile(mp));
// delete all entries from the crawl queue that are deleted here
try {
sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000);
} catch (RowSpaceExceededException e) {
Log.logException(e);
}
sb.crawler.profilesActiveCrawls.remove(handle.getBytes());
sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000);
} catch (RowSpaceExceededException e) {
Log.logException(e);
}
if (post.containsKey("delete")) {
// deletion of a terminated crawl profile
sb.crawler.profilesPassiveCrawls.removeEntry(handle.getBytes());
sb.crawler.profilesPassiveCrawls.remove(handle.getBytes());
}
if (post.containsKey("deleteTerminatedProfiles")) {
Iterator<CrawlProfile.entry> profiles = sb.crawler.profilesPassiveCrawls.profiles(false);
while (profiles.hasNext()) {
profiles.next();
profiles.remove();
profiles = sb.crawler.profilesPassiveCrawls.profiles(false);
for (byte[] h: sb.crawler.profilesPassiveCrawls.keySet()) {
sb.crawler.profilesPassiveCrawls.remove(h);
}
}
}
// generate handle list
int count = 0;
Iterator<CrawlProfile.entry> it = sb.crawler.profilesActiveCrawls.profiles(true);
entry selentry;
while (it.hasNext()) {
selentry = it.next();
CrawlProfile selentry;
for (byte[] h: sb.crawler.profilesActiveCrawls.keySet()) {
selentry = new CrawlProfile(sb.crawler.profilesActiveCrawls.get(h));
if (ignoreNames.contains(selentry.name())) {
continue;
}
@@ -151,7 +143,8 @@ public class CrawlProfileEditor_p {
count++;
}
prop.put("profiles", count);
selentry = sb.crawler.profilesActiveCrawls.getEntry(handle);
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(handle.getBytes());
selentry = mp == null ? null : new CrawlProfile(mp);
assert selentry == null || selentry.handle() != null;
// read post for change submit
if ((post != null) && (selentry != null)) {
@@ -161,10 +154,11 @@ public class CrawlProfileEditor_p {
eentry tee;
while (lit.hasNext()) {
tee = lit.next();
final String cval = selentry.map().get(tee.name);
final String cval = selentry.get(tee.name);
final String val = (tee.type == eentry.BOOLEAN) ? Boolean.toString(post.containsKey(tee.name)) : post.get(tee.name, cval);
if (!cval.equals(val)) {
sb.crawler.profilesActiveCrawls.changeEntry(selentry, tee.name, val);
selentry.put(tee.name, val);
sb.crawler.profilesActiveCrawls.put(selentry.handle().getBytes(), selentry);
}
}
} catch (final Exception ex) {
@@ -179,20 +173,18 @@ public class CrawlProfileEditor_p {
count = 0;
boolean dark = true;
final int domlistlength = (post == null) ? 160 : post.getInt("domlistlength", 160);
CrawlProfile.entry profile;
CrawlProfile profile;
// put active crawls into list
it = sb.crawler.profilesActiveCrawls.profiles(true);
while (it.hasNext()) {
profile = it.next();
for (byte[] h: sb.crawler.profilesActiveCrawls.keySet()) {
profile = new CrawlProfile(sb.crawler.profilesActiveCrawls.get(h));
putProfileEntry(prop, profile, true, dark, count, domlistlength);
dark = !dark;
count++;
}
// put passive crawls into list
boolean existPassiveCrawls = false;
it = sb.crawler.profilesPassiveCrawls.profiles(true);
while (it.hasNext()) {
profile = it.next();
for (byte[] h: sb.crawler.profilesPassiveCrawls.keySet()) {
profile = new CrawlProfile(sb.crawler.profilesPassiveCrawls.get(h));
putProfileEntry(prop, profile, false, dark, count, domlistlength);
dark = !dark;
count++;
@@ -217,7 +209,7 @@ public class CrawlProfileEditor_p {
count = 0;
while (lit.hasNext()) {
final eentry ee = lit.next();
final String val = selentry.map().get(ee.name);
final String val = selentry.get(ee.name);
prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly", ee.readonly ? "1" : "0");
prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_name", ee.name);
prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_label", ee.label);
@@ -235,7 +227,7 @@ public class CrawlProfileEditor_p {
return prop;
}
private static void putProfileEntry(final servletProperties prop, final CrawlProfile.entry profile, final boolean active, final boolean dark, final int count, final int domlistlength) {
private static void putProfileEntry(final servletProperties prop, final CrawlProfile profile, final boolean active, final boolean dark, final int count, final int domlistlength) {
prop.put(CRAWL_PROFILE_PREFIX + count + "_dark", dark ? "1" : "0");
prop.put(CRAWL_PROFILE_PREFIX + count + "_name", profile.name());

@@ -254,8 +254,8 @@ public class Crawler_p {
sb.crawlQueues.errorURL.remove(urlhash);
// stack url
sb.crawler.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile.entry pe = sb.crawler.profilesActiveCrawls.newEntry(
sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile pe = new CrawlProfile(
(crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(),
crawlingStartURL,
newcrawlingMustMatch,
@@ -265,6 +265,7 @@ public class Crawler_p {
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
final String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),
url,
@@ -297,7 +298,7 @@ public class Crawler_p {
// generate a YaCyNews if the global flag was set
if (crawlOrder) {
final Map<String, String> m = new HashMap<String, String>(pe.map()); // must be cloned
final Map<String, String> m = new HashMap<String, String>(pe); // must be cloned
m.remove("specificDepth");
m.remove("indexText");
m.remove("indexMedia");
@@ -371,7 +372,7 @@ public class Crawler_p {
// creating a crawler profile
final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null);
final CrawlProfile.entry profile = sb.crawler.profilesActiveCrawls.newEntry(
final CrawlProfile profile = new CrawlProfile(
fileName, crawlURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
@@ -387,6 +388,7 @@ public class Crawler_p {
crawlOrder,
xsstopw, xdstopw, xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
// pause local crawl here
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
@@ -435,7 +437,7 @@ public class Crawler_p {
final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null);
// create a new profile
final CrawlProfile.entry pe = sb.crawler.profilesActiveCrawls.newEntry(
final CrawlProfile pe = new CrawlProfile(
sitemapURLStr, sitemapURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
@@ -446,6 +448,7 @@ public class Crawler_p {
storeHTCache, true, crawlOrder,
xsstopw, xdstopw, xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
// create a new sitemap importer
final SitemapImporter importerThread = new SitemapImporter(sb, sb.dbImportManager, new DigestURI(sitemapURLStr, null), pe);

@@ -31,6 +31,7 @@ import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;
import java.util.Map;
import net.yacy.cora.protocol.RequestHeader;
@@ -95,14 +96,15 @@ public class IndexCreateWWWGlobalQueue_p {
boolean dark = true;
yacySeed initiator;
String profileHandle;
CrawlProfile.entry profileEntry;
CrawlProfile profileEntry;
int i, showNum = 0;
for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
urle = crawlerList.get(i);
if (urle != null && urle.url() != null) {
initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : new String(urle.initiator()));
profileHandle = urle.profileHandle();
profileEntry = (profileHandle == null) ? null : sb.crawler.profilesActiveCrawls.getEntry(profileHandle);
final Map<String, String> mp = profileHandle == null ? null : sb.crawler.profilesActiveCrawls.get(profileHandle.getBytes());
profileEntry = mp == null ? null : new CrawlProfile(mp);
prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));

@@ -32,6 +32,7 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
@@ -95,10 +96,9 @@ public class IndexCreateWWWLocalQueue_p {
if (option == PROFILE) {
// search and delete the crawl profile (_much_ faster, independant of queue size)
// XXX: what to do about the annoying LOST PROFILE messages in the log?
final Iterator<CrawlProfile.entry> it = sb.crawler.profilesActiveCrawls.profiles(true);
CrawlProfile.entry entry;
while (it.hasNext()) {
entry = it.next();
CrawlProfile entry;
for (byte[] handle: sb.crawler.profilesActiveCrawls.keySet()) {
entry = new CrawlProfile(sb.crawler.profilesActiveCrawls.get(handle));
final String name = entry.name();
if (name.equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) ||
@@ -108,9 +108,7 @@ public class IndexCreateWWWLocalQueue_p {
name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE))
continue;
if (compiledPattern.matcher(name).find()) {
sb.crawler.profilesActiveCrawls.removeEntry(entry.handle().getBytes());
}
if (compiledPattern.matcher(name).find()) sb.crawler.profilesActiveCrawls.remove(entry.handle().getBytes());
}
} else {
// iterating through the list of URLs
@@ -165,14 +163,15 @@ public class IndexCreateWWWLocalQueue_p {
boolean dark = true;
yacySeed initiator;
String profileHandle;
CrawlProfile.entry profileEntry;
CrawlProfile profileEntry;
int i;
for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
urle = crawlerList.get(i);
if ((urle != null)&&(urle.url()!=null)) {
initiator = sb.peers.getConnected(urle.initiator() == null ? "" : new String(urle.initiator()));
profileHandle = urle.profileHandle();
profileEntry = (profileHandle == null) ? null : sb.crawler.profilesActiveCrawls.getEntry(profileHandle);
final Map<String, String> mp = profileHandle == null ? null : sb.crawler.profilesActiveCrawls.get(profileHandle.getBytes());
profileEntry = mp == null ? null : new CrawlProfile(mp);
prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));

@@ -28,6 +28,7 @@ import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;
import java.util.Map;
import net.yacy.cora.protocol.RequestHeader;
@@ -92,14 +93,15 @@ public class IndexCreateWWWRemoteQueue_p {
boolean dark = true;
yacySeed initiator;
String profileHandle;
CrawlProfile.entry profileEntry;
CrawlProfile profileEntry;
int i, showNum = 0;
for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
urle = crawlerList.get(i);
if (urle != null && urle.url() != null) {
initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : new String(urle.initiator()));
profileHandle = urle.profileHandle();
profileEntry = (profileHandle == null) ? null : sb.crawler.profilesActiveCrawls.getEntry(profileHandle);
final Map<String, String> mp = profileHandle == null ? null : sb.crawler.profilesActiveCrawls.get(profileHandle.getBytes());
profileEntry = mp == null ? null : new CrawlProfile(mp);
prop.put("crawler-queue_list_" + showNum + "_dark", dark ? "1" : "0");
prop.putHTML("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));

@@ -28,7 +28,6 @@
// if the shell's current path is HTROOT
import java.io.File;
import java.io.IOException;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.logging.Log;
@@ -102,46 +101,41 @@ public class ProxyIndexingMonitor_p {
if (sb.crawler.defaultProxyProfile == null) {
prop.put("info", "1"); //delete DATA/PLASMADB/crawlProfiles0.db
} else {
try {
assert sb.crawler.defaultProxyProfile.handle() != null;
sb.crawler.profilesActiveCrawls.changeEntry(sb.crawler.defaultProxyProfile, "generalDepth", Integer.toString(newProxyPrefetchDepth));
sb.crawler.profilesActiveCrawls.changeEntry(sb.crawler.defaultProxyProfile, "storeHTCache", (proxyStoreHTCache) ? "true": "false");
sb.crawler.profilesActiveCrawls.changeEntry(sb.crawler.defaultProxyProfile, "remoteIndexing",proxyIndexingRemote ? "true":"false");
sb.crawler.profilesActiveCrawls.changeEntry(sb.crawler.defaultProxyProfile, "indexText",proxyIndexingLocalText ? "true":"false");
sb.crawler.profilesActiveCrawls.changeEntry(sb.crawler.defaultProxyProfile, "indexMedia",proxyIndexingLocalMedia ? "true":"false");
prop.put("info", "2");//new proxyPrefetchdepth
prop.put("info_message", newProxyPrefetchDepth);
prop.put("info_caching", proxyStoreHTCache ? "1" : "0");
prop.put("info_indexingLocalText", proxyIndexingLocalText ? "1" : "0");
prop.put("info_indexingLocalMedia", proxyIndexingLocalMedia ? "1" : "0");
prop.put("info_indexingRemote", proxyIndexingRemote ? "1" : "0");
// proxyCache - only display on change
if (oldProxyCachePath.equals(newProxyCachePath)) {
prop.put("info_path", "0");
prop.putHTML("info_path_return", oldProxyCachePath);
} else {
prop.put("info_path", "1");
prop.putHTML("info_path_return", newProxyCachePath);
}
// proxyCacheSize - only display on change
if (oldProxyCacheSize.equals(newProxyCacheSize)) {
prop.put("info_size", "0");
prop.put("info_size_return", oldProxyCacheSize);
} else {
prop.put("info_size", "1");
prop.put("info_size_return", newProxyCacheSize);
}
// proxyCache, proxyCacheSize we need a restart
prop.put("info_restart", "0");
prop.put("info_restart_return", "0");
if (!oldProxyCachePath.equals(newProxyCachePath)) prop.put("info_restart", "1");
} catch (final IOException e) {
prop.put("info", "3"); //Error: errmsg
prop.putHTML("info_error", e.getMessage());
assert sb.crawler.defaultProxyProfile.handle() != null;
sb.crawler.defaultProxyProfile.put("generalDepth", Integer.toString(newProxyPrefetchDepth));
sb.crawler.defaultProxyProfile.put("storeHTCache", (proxyStoreHTCache) ? "true": "false");
sb.crawler.defaultProxyProfile.put("remoteIndexing",proxyIndexingRemote ? "true":"false");
sb.crawler.defaultProxyProfile.put("indexText",proxyIndexingLocalText ? "true":"false");
sb.crawler.defaultProxyProfile.put("indexMedia",proxyIndexingLocalMedia ? "true":"false");
sb.crawler.profilesActiveCrawls.put(sb.crawler.defaultProxyProfile.handle().getBytes(), sb.crawler.defaultProxyProfile);
prop.put("info", "2");//new proxyPrefetchdepth
prop.put("info_message", newProxyPrefetchDepth);
prop.put("info_caching", proxyStoreHTCache ? "1" : "0");
prop.put("info_indexingLocalText", proxyIndexingLocalText ? "1" : "0");
prop.put("info_indexingLocalMedia", proxyIndexingLocalMedia ? "1" : "0");
prop.put("info_indexingRemote", proxyIndexingRemote ? "1" : "0");
// proxyCache - only display on change
if (oldProxyCachePath.equals(newProxyCachePath)) {
prop.put("info_path", "0");
prop.putHTML("info_path_return", oldProxyCachePath);
} else {
prop.put("info_path", "1");
prop.putHTML("info_path_return", newProxyCachePath);
}
// proxyCacheSize - only display on change
if (oldProxyCacheSize.equals(newProxyCacheSize)) {
prop.put("info_size", "0");
prop.put("info_size_return", oldProxyCacheSize);
} else {
prop.put("info_size", "1");
prop.put("info_size_return", newProxyCacheSize);
}
// proxyCache, proxyCacheSize we need a restart
prop.put("info_restart", "0");
prop.put("info_restart_return", "0");
if (!oldProxyCachePath.equals(newProxyCachePath)) prop.put("info_restart", "1");
}
} catch (final Exception e) {

@@ -143,9 +143,9 @@ public class QuickCrawlLink_p {
sb.crawlQueues.errorURL.remove(urlhash);
// create crawling profile
CrawlProfile.entry pe = null;
CrawlProfile pe = null;
try {
pe = sb.crawler.profilesActiveCrawls.newEntry(
pe = new CrawlProfile(
crawlingStartURL.getHost(),
crawlingStartURL,
crawlingMustMatch,
@@ -163,8 +163,8 @@ public class QuickCrawlLink_p {
xsstopw,
xdstopw,
xpstopw,
CrawlProfile.CacheStrategy.IFFRESH
);
CrawlProfile.CacheStrategy.IFFRESH);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
} catch (final Exception e) {
// mist
prop.put("mode_status", "2");//Error with url

@@ -4,11 +4,9 @@
//$LastChangedBy$
//
import java.util.Iterator;
import net.yacy.cora.protocol.RequestHeader;
import de.anomic.crawler.CrawlProfile.entry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
@@ -50,10 +48,9 @@ public class WatchWebStructure_p {
if (host.equals("auto")) {
// try to find the host from the crawl profiles
final Iterator<entry> it = sb.crawler.profilesActiveCrawls.profiles(true);
entry e;
while (it.hasNext()) {
e = it.next();
CrawlProfile e;
for (byte[] handle: sb.crawler.profilesActiveCrawls.keySet()) {
e = new CrawlProfile(sb.crawler.profilesActiveCrawls.get(handle));
if (e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) ||
e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) ||
e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ||

@@ -320,7 +320,7 @@ public class Balancer {
* @throws IOException
* @throws RowSpaceExceededException
*/
public Request pop(final boolean delay, final CrawlProfile profile) throws IOException {
public Request pop(final boolean delay, final Map<byte[], Map<String, String>> profiles) throws IOException {
// returns a crawl entry from the stack and ensures minimum delta times
try {
@@ -384,7 +384,8 @@ public class Balancer {
// at this point we must check if the crawlEntry has relevance because the crawl profile still exists
// if not: return null. A calling method must handle the null value and try again
final CrawlProfile.entry profileEntry = (profile == null) ? null : profile.getEntry(crawlEntry.profileHandle());
final Map<String, String> mp = profiles == null ? null : profiles.get(crawlEntry.profileHandle());
final CrawlProfile profileEntry = mp == null ? null : new CrawlProfile(mp);
if (profileEntry == null) {
Log.logWarning("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
return null;

@@ -4,7 +4,7 @@
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
// last major change: 25.02.2004
// last major change: 31.08.2010
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
@@ -22,210 +22,300 @@
package de.anomic.crawler;
import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import net.yacy.kelondro.blob.MapHeap;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.kelondro.order.Digest;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.kelondroException;
public class CrawlProfile {
public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
private static final long serialVersionUID = 5527325718810703504L;
public static final String MATCH_ALL = ".*";
public static final String MATCH_NEVER = "";
static ConcurrentHashMap<String, Map<String, DomProfile>> domsCache = new ConcurrentHashMap<String, Map<String, DomProfile>>();
// this is a simple record structure that hold all properties of a single crawl start
public static final String HANDLE = "handle";
public static final String NAME = "name";
public static final String START_URL = "startURL";
public static final String FILTER_MUSTMATCH = "generalFilter";
public static final String FILTER_MUSTNOTMATCH = "nevermatch";
public static final String DEPTH = "generalDepth";
public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
public static final String DOM_FILTER_DEPTH = "domFilterDepth";
public static final String DOM_MAX_PAGES = "domMaxPages";
public static final String CRAWLING_Q = "crawlingQ";
public static final String INDEX_TEXT = "indexText";
public static final String INDEX_MEDIA = "indexMedia";
public static final String STORE_HTCACHE = "storeHTCache";
public static final String STORE_TXCACHE = "storeTXCache";
public static final String REMOTE_INDEXING = "remoteIndexing";
public static final String XSSTOPW = "xsstopw";
public static final String XDSTOPW = "xdstopw";
public static final String XPSTOPW = "xpstopw";
public static final String CACHE_STRAGEGY = "cacheStrategy";
MapHeap profileTable;
private final File profileTableFile;
private Map<String, DomProfile> doms;
private Pattern mustmatch = null, mustnotmatch = null;
public CrawlProfile(final File file) throws IOException {
//System.out.println("loading crawl profile from " + file);
this.profileTableFile = file;
profileTableFile.getParentFile().mkdirs();
profileTable = new MapHeap(profileTableFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
profileIterator pi = new profileIterator(true);
entry e;
while (pi.hasNext()) {
e = pi.next();
if (e == null) continue;
Log.logInfo("CrawlProfiles", "loaded Profile " + e.handle() + ": " + e.name());
}
public CrawlProfile(final String name, final DigestURI startURL,
final String mustmatch,
final String mustnotmatch,
final int depth,
final long recrawlIfOlder /*date*/,
final int domFilterDepth, final int domMaxPages,
final boolean crawlingQ,
final boolean indexText, final boolean indexMedia,
final boolean storeHTCache, final boolean storeTXCache,
final boolean remoteIndexing,
final boolean xsstopw, final boolean xdstopw, final boolean xpstopw,
final CacheStrategy cacheStrategy) {
super(40);
if (name == null || name.length() == 0) throw new NullPointerException("name must not be null");
final String handle = (startURL == null) ? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength) : new String(startURL.hash());
put(HANDLE, handle);
put(NAME, name);
put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false));
put(FILTER_MUSTMATCH, (mustmatch == null) ? CrawlProfile.MATCH_ALL : mustmatch);
put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? CrawlProfile.MATCH_NEVER : mustnotmatch);
put(DEPTH, depth);
put(RECRAWL_IF_OLDER, recrawlIfOlder);
put(DOM_FILTER_DEPTH, domFilterDepth);
put(DOM_MAX_PAGES, domMaxPages);
put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
put(INDEX_TEXT, indexText);
put(INDEX_MEDIA, indexMedia);
put(STORE_HTCACHE, storeHTCache);
put(STORE_TXCACHE, storeTXCache);
put(REMOTE_INDEXING, remoteIndexing);
put(XSSTOPW, xsstopw); // exclude static stop-words
put(XDSTOPW, xdstopw); // exclude dynamic stop-word
put(XPSTOPW, xpstopw); // exclude parent stop-words
put(CACHE_STRAGEGY, cacheStrategy.toString());
doms = new ConcurrentHashMap<String, DomProfile>();
}
public void clear() {
// deletes the profile database and creates a new one
if (profileTable != null) profileTable.close();
FileUtils.deletedelete(profileTableFile);
profileTableFile.getParentFile().mkdirs();
try {
profileTable = new MapHeap(profileTableFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
} catch (IOException e) {
Log.logException(e);
}
public CrawlProfile(Map<String, String> ext) {
super(ext == null ? 1 : ext.size());
if (ext != null) this.putAll(ext);
doms = new ConcurrentHashMap<String, DomProfile>();
}
public void close() {
if (profileTable != null) profileTable.close();
this.profileTable = null;
public void put(String key, boolean value) {
super.put(key, Boolean.toString(value));
}
public int size() {
return profileTable.size();
public void put(String key, int value) {
super.put(key, Integer.toString(value));
}
public Iterator<entry> profiles(final boolean up) {
// enumerates profile entries
try {
return new profileIterator(up);
} catch (final IOException e) {
Log.logException(e);
return new HashSet<entry>().iterator();
}
public void put(String key, long value) {
super.put(key, Long.toString(value));
}
public class profileIterator implements Iterator<entry> {
// the iterator iterates all keys, which are byte[] objects
CloneableIterator<byte[]> handleIterator;
String lastkey;
public profileIterator(final boolean up) throws IOException {
handleIterator = profileTable.keys(up, false);
lastkey = null;
}
public boolean hasNext() {
try {
return handleIterator.hasNext();
} catch (final kelondroException e) {
Log.logException(e);
clear();
return false;
}
}
public entry next() {
try {
lastkey = new String(handleIterator.next());
return getEntry(lastkey);
} catch (final kelondroException e) {
Log.logException(e);
clear();
return null;
}
public String handle() {
final String r = get(HANDLE);
//if (r == null) return null;
return r;
}
public String name() {
final String r = get(NAME);
if (r == null) return "";
return r;
}
public String startURL() {
final String r = get(START_URL);
return r;
}
public Pattern mustMatchPattern() {
if (this.mustmatch == null) {
String r = get(FILTER_MUSTMATCH);
if (r == null) r = CrawlProfile.MATCH_ALL;
this.mustmatch = Pattern.compile(r);
}
public void remove() {
if (lastkey != null) try {
removeEntry(lastkey.getBytes());
} catch (final kelondroException e) {
Log.logException(e);
clear();
}
return this.mustmatch;
}
public Pattern mustNotMatchPattern() {
if (this.mustnotmatch == null) {
String r = get(FILTER_MUSTNOTMATCH);
if (r == null) r = CrawlProfile.MATCH_NEVER;
this.mustnotmatch = Pattern.compile(r);
}
return this.mustnotmatch;
}
public void removeEntry(final byte[] handle) {
public int depth() {
final String r = get(DEPTH);
if (r == null) return 0;
try {
profileTable.delete(handle);
} catch (final IOException e) {
return Integer.parseInt(r);
} catch (final NumberFormatException e) {
Log.logException(e);
return 0;
}
}
public entry newEntry(final Map<String, String> mem) {
final entry ne = new entry(mem);
public CacheStrategy cacheStrategy() {
final String r = get(CACHE_STRAGEGY);
if (r == null) return CacheStrategy.IFFRESH;
try {
profileTable.insert(ne.handle().getBytes(), ne.map());
} catch (final Exception e) {
clear();
try {
profileTable.insert(ne.handle().getBytes(), ne.map());
} catch (final Exception ee) {
Log.logException(e);
System.exit(0);
}
return CacheStrategy.decode(Integer.parseInt(r));
} catch (final NumberFormatException e) {
Log.logException(e);
return CacheStrategy.IFFRESH;
}
return ne;
}
public entry newEntry( final String name,
final DigestURI startURL,
final String mustmatch, final String mustnotmatch,
final int generalDepth,
final long recrawlIfOlder /*date*/, final int domFilterDepth, final int domMaxPages,
final boolean crawlingQ,
final boolean indexText, final boolean indexMedia,
final boolean storeHTCache, final boolean storeTXCache,
final boolean remoteIndexing,
final boolean xsstopw, final boolean xdstopw, final boolean xpstopw,
final CacheStrategy cacheStrategy) {
final entry ne = new entry(
name, startURL,
mustmatch, mustnotmatch,
generalDepth,
recrawlIfOlder, domFilterDepth, domMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, storeTXCache,
remoteIndexing,
xsstopw, xdstopw, xpstopw,
cacheStrategy);
public void setCacheStrategy(CacheStrategy newStrategy) {
put(CACHE_STRAGEGY, newStrategy.toString());
}
public long recrawlIfOlder() {
// returns a long (millis) that is the minimum age that
// an entry must have to be re-crawled
final String r = get(RECRAWL_IF_OLDER);
if (r == null) return 0L;
try {
profileTable.insert(ne.handle().getBytes(), ne.map());
} catch (final Exception e) {
clear();
try {
profileTable.insert(ne.handle().getBytes(), ne.map());
} catch (final Exception ee) {
Log.logException(e);
System.exit(0);
}
final long l = Long.parseLong(r);
return (l < 0) ? 0L : l;
} catch (final NumberFormatException e) {
Log.logException(e);
return 0L;
}
return ne;
}
public boolean hasEntry(final String handle) {
return profileTable.containsKey(handle.getBytes());
}
public entry getEntry(final String handle) {
if (profileTable == null) return null;
Map<String, String> m;
public int domFilterDepth() {
// if the depth is equal or less to this depth,
// then the current url feeds with its domain the crawl filter
// if this is -1, all domains are feeded
final String r = get(DOM_FILTER_DEPTH);
if (r == null) return Integer.MAX_VALUE;
try {
m = profileTable.get(handle.getBytes());
} catch (final IOException e) {
final int i = Integer.parseInt(r);
if (i < 0) return Integer.MAX_VALUE;
return i;
} catch (final NumberFormatException e) {
Log.logException(e);
return null;
} catch (RowSpaceExceededException e) {
return Integer.MAX_VALUE;
}
}
public int domMaxPages() {
// this is the maximum number of pages that are crawled for a single domain
// if -1, this means no limit
final String r = get(DOM_MAX_PAGES);
if (r == null) return Integer.MAX_VALUE;
try {
final int i = Integer.parseInt(r);
if (i < 0) return Integer.MAX_VALUE;
return i;
} catch (final NumberFormatException e) {
Log.logException(e);
return null;
return Integer.MAX_VALUE;
}
if (m == null) return null;
return new entry(m);
}
public boolean crawlingQ() {
final String r = get(CRAWLING_Q);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean indexText() {
final String r = get(INDEX_TEXT);
if (r == null) return true;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean indexMedia() {
final String r = get(INDEX_MEDIA);
if (r == null) return true;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean storeHTCache() {
final String r = get(STORE_HTCACHE);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean storeTXCache() {
final String r = get(STORE_TXCACHE);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean remoteIndexing() {
final String r = get(REMOTE_INDEXING);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean excludeStaticStopwords() {
final String r = get(XSSTOPW);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean excludeDynamicStopwords() {
final String r = get(XDSTOPW);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean excludeParentStopwords() {
final String r = get(XPSTOPW);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public void domInc(final String domain, final String referrer, final int depth) {
final DomProfile dp = doms.get(domain);
if (dp == null) {
// new domain
doms.put(domain, new DomProfile(referrer, depth));
} else {
// increase counter
dp.inc();
}
}
public boolean grantedDomAppearance(final String domain) {
final int max = domFilterDepth();
if (max == Integer.MAX_VALUE) return true;
final DomProfile dp = doms.get(domain);
if (dp == null) {
return 0 < max;
}
return dp.depth <= max;
}
public void changeEntry(final entry e, final String propName, final String newValue) throws IOException, RowSpaceExceededException {
e.mem.put(propName, newValue);
assert e.handle() != null;
profileTable.insert(e.handle().getBytes(), e.mem);
public boolean grantedDomCount(final String domain) {
final int max = domMaxPages();
if (max == Integer.MAX_VALUE) return true;
final DomProfile dp = doms.get(domain);
if (dp == null) {
return 0 < max;
}
return dp.count <= max;
}
public long getRecrawlDate(final long oldTimeMinutes) {
return System.currentTimeMillis() - (60000L * oldTimeMinutes);
public int domSize() {
return doms.size();
}
public boolean domExists(final String domain) {
if (domFilterDepth() == Integer.MAX_VALUE) return true;
return doms.containsKey(domain);
}
public String domName(final boolean attr, final int index){
final Iterator<Map.Entry<String, DomProfile>> domnamesi = doms.entrySet().iterator();
String domname="";
Map.Entry<String, DomProfile> ey;
DomProfile dp;
int i = 0;
while ((domnamesi.hasNext()) && (i < index)) {
ey = domnamesi.next();
i++;
}
if (domnamesi.hasNext()) {
ey = domnamesi.next();
dp = ey.getValue();
domname = ey.getKey() + ((attr) ? ("/r=" + dp.referrer + ", d=" + dp.depth + ", c=" + dp.count) : " ");
}
return domname;
}
public static class DomProfile {
public final static class DomProfile {
public String referrer;
public int depth, count;
@@ -275,287 +365,8 @@ public class CrawlProfile {
return this.code == 3;
}
}
public static class entry {
// this is a simple record structure that hold all properties of a single crawl start
public static final String HANDLE = "handle";
public static final String NAME = "name";
public static final String START_URL = "startURL";
public static final String FILTER_MUSTMATCH = "generalFilter";
public static final String FILTER_MUSTNOTMATCH = "nevermatch";
public static final String DEPTH = "generalDepth";
public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
public static final String DOM_FILTER_DEPTH = "domFilterDepth";
public static final String DOM_MAX_PAGES = "domMaxPages";
public static final String CRAWLING_Q = "crawlingQ";
public static final String INDEX_TEXT = "indexText";
public static final String INDEX_MEDIA = "indexMedia";
public static final String STORE_HTCACHE = "storeHTCache";
public static final String STORE_TXCACHE = "storeTXCache";
public static final String REMOTE_INDEXING = "remoteIndexing";
public static final String XSSTOPW = "xsstopw";
public static final String XDSTOPW = "xdstopw";
public static final String XPSTOPW = "xpstopw";
public static final String CACHE_STRAGEGY = "cacheStrategy";
private Map<String, String> mem;
private Map<String, DomProfile> doms;
private Pattern mustmatch = null, mustnotmatch = null;
public entry(final String name, final DigestURI startURL,
final String mustmatch,
final String mustnotmatch,
final int depth,
final long recrawlIfOlder /*date*/,
final int domFilterDepth, final int domMaxPages,
final boolean crawlingQ,
final boolean indexText, final boolean indexMedia,
final boolean storeHTCache, final boolean storeTXCache,
final boolean remoteIndexing,
final boolean xsstopw, final boolean xdstopw, final boolean xpstopw,
final CacheStrategy cacheStrategy) {
if (name == null || name.length() == 0) throw new NullPointerException("name must not be null");
final String handle = (startURL == null) ? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength) : new String(startURL.hash());
mem = new ConcurrentHashMap<String, String>(40);
mem.put(HANDLE, handle);
mem.put(NAME, name);
mem.put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false));
mem.put(FILTER_MUSTMATCH, (mustmatch == null) ? MATCH_ALL : mustmatch);
mem.put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? MATCH_NEVER : mustnotmatch);
mem.put(DEPTH, Integer.toString(depth));
mem.put(RECRAWL_IF_OLDER, Long.toString(recrawlIfOlder));
mem.put(DOM_FILTER_DEPTH, Integer.toString(domFilterDepth));
mem.put(DOM_MAX_PAGES, Integer.toString(domMaxPages));
mem.put(CRAWLING_Q, Boolean.toString(crawlingQ)); // crawling of urls with '?'
mem.put(INDEX_TEXT, Boolean.toString(indexText));
mem.put(INDEX_MEDIA, Boolean.toString(indexMedia));
mem.put(STORE_HTCACHE, Boolean.toString(storeHTCache));
mem.put(STORE_TXCACHE, Boolean.toString(storeTXCache));
mem.put(REMOTE_INDEXING, Boolean.toString(remoteIndexing));
mem.put(XSSTOPW, Boolean.toString(xsstopw)); // exclude static stop-words
mem.put(XDSTOPW, Boolean.toString(xdstopw)); // exclude dynamic stop-word
mem.put(XPSTOPW, Boolean.toString(xpstopw)); // exclude parent stop-words
mem.put(CACHE_STRAGEGY, cacheStrategy.toString());
doms = new ConcurrentHashMap<String, DomProfile>();
}
@Override
public String toString() {
final StringBuilder str = new StringBuilder();
if (this.mem != null) {
str.append(this.mem.toString());
}
return str.toString();
}
public entry(final Map<String, String> mem) {
this.mem = mem;
this.doms = domsCache.get(this.mem.get(HANDLE));
if (this.doms == null) this.doms = new ConcurrentHashMap<String, DomProfile>();
}
public Map<String, String> map() {
return mem;
}
public String handle() {
final String r = mem.get(HANDLE);
//if (r == null) return null;
return r;
}
public String name() {
final String r = mem.get(NAME);
if (r == null) return "";
return r;
}
public String startURL() {
final String r = mem.get(START_URL);
return r;
}
public Pattern mustMatchPattern() {
if (this.mustmatch == null) {
String r = mem.get(FILTER_MUSTMATCH);
if (r == null) r = MATCH_ALL;
this.mustmatch = Pattern.compile(r);
}
return this.mustmatch;
}
public Pattern mustNotMatchPattern() {
if (this.mustnotmatch == null) {
String r = mem.get(FILTER_MUSTNOTMATCH);
if (r == null) r = MATCH_NEVER;
this.mustnotmatch = Pattern.compile(r);
}
return this.mustnotmatch;
}
public int depth() {
final String r = mem.get(DEPTH);
if (r == null) return 0;
try {
return Integer.parseInt(r);
} catch (final NumberFormatException e) {
Log.logException(e);
return 0;
}
}
public CacheStrategy cacheStrategy() {
final String r = mem.get(CACHE_STRAGEGY);
if (r == null) return CacheStrategy.IFFRESH;
try {
return CacheStrategy.decode(Integer.parseInt(r));
} catch (final NumberFormatException e) {
Log.logException(e);
return CacheStrategy.IFFRESH;
}
}
public void setCacheStrategy(CacheStrategy newStrategy) {
mem.put(CACHE_STRAGEGY, newStrategy.toString());
}
public long recrawlIfOlder() {
// returns a long (millis) that is the minimum age that
// an entry must have to be re-crawled
final String r = mem.get(RECRAWL_IF_OLDER);
if (r == null) return 0L;
try {
final long l = Long.parseLong(r);
return (l < 0) ? 0L : l;
} catch (final NumberFormatException e) {
Log.logException(e);
return 0L;
}
}
public int domFilterDepth() {
// if the depth is equal or less to this depth,
// then the current url feeds with its domain the crawl filter
// if this is -1, all domains are feeded
final String r = mem.get(DOM_FILTER_DEPTH);
if (r == null) return Integer.MAX_VALUE;
try {
final int i = Integer.parseInt(r);
if (i < 0) return Integer.MAX_VALUE;
return i;
} catch (final NumberFormatException e) {
Log.logException(e);
return Integer.MAX_VALUE;
}
}
public int domMaxPages() {
// this is the maximum number of pages that are crawled for a single domain
// if -1, this means no limit
final String r = mem.get(DOM_MAX_PAGES);
if (r == null) return Integer.MAX_VALUE;
try {
final int i = Integer.parseInt(r);
if (i < 0) return Integer.MAX_VALUE;
return i;
} catch (final NumberFormatException e) {
Log.logException(e);
return Integer.MAX_VALUE;
}
}
public boolean crawlingQ() {
final String r = mem.get(CRAWLING_Q);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean indexText() {
final String r = mem.get(INDEX_TEXT);
if (r == null) return true;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean indexMedia() {
final String r = mem.get(INDEX_MEDIA);
if (r == null) return true;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean storeHTCache() {
final String r = mem.get(STORE_HTCACHE);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean storeTXCache() {
final String r = mem.get(STORE_TXCACHE);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean remoteIndexing() {
final String r = mem.get(REMOTE_INDEXING);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean excludeStaticStopwords() {
final String r = mem.get(XSSTOPW);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean excludeDynamicStopwords() {
final String r = mem.get(XDSTOPW);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean excludeParentStopwords() {
final String r = mem.get(XPSTOPW);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public void domInc(final String domain, final String referrer, final int depth) {
final DomProfile dp = doms.get(domain);
if (dp == null) {
// new domain
doms.put(domain, new DomProfile(referrer, depth));
} else {
// increase counter
dp.inc();
}
domsCache.put(this.mem.get(HANDLE), doms);
}
public boolean grantedDomAppearance(final String domain) {
final int max = domFilterDepth();
if (max == Integer.MAX_VALUE) return true;
final DomProfile dp = doms.get(domain);
if (dp == null) {
return 0 < max;
}
return dp.depth <= max;
}
public boolean grantedDomCount(final String domain) {
final int max = domMaxPages();
if (max == Integer.MAX_VALUE) return true;
final DomProfile dp = doms.get(domain);
if (dp == null) {
return 0 < max;
}
return dp.count <= max;
}
public int domSize() {
return doms.size();
}
public boolean domExists(final String domain) {
if (domFilterDepth() == Integer.MAX_VALUE) return true;
return doms.containsKey(domain);
}
public String domName(final boolean attr, final int index){
final Iterator<Map.Entry<String, DomProfile>> domnamesi = doms.entrySet().iterator();
String domname="";
Map.Entry<String, DomProfile> ey;
DomProfile dp;
int i = 0;
while ((domnamesi.hasNext()) && (i < index)) {
ey = domnamesi.next();
i++;
}
if (domnamesi.hasNext()) {
ey = domnamesi.next();
dp = ey.getValue();
domname = ey.getKey() + ((attr) ? ("/r=" + dp.referrer + ", d=" + dp.depth + ", c=" + dp.count) : " ");
}
return domname;
}
public static long getRecrawlDate(final long oldTimeMinutes) {
return System.currentTimeMillis() - (60000L * oldTimeMinutes);
}
}

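Because every field in the new map-backed profile is stored as a String, the class above pairs typed put overloads with accessors that fall back to a default when a value is missing or unparsable. A minimal, self-contained sketch of that pattern (hypothetical class names TypedMapSketch / TypedAccessorDemo; the defaults mirror those in the diff, the logging is omitted):

import java.util.concurrent.ConcurrentHashMap;

// Sketch of the typed-accessor pattern of the map-backed profile above:
// all values are stored as Strings, writes go through typed put overloads,
// and reads parse back with a safe default when a value is missing or garbled.
class TypedMapSketch extends ConcurrentHashMap<String, String> {
    static final String DEPTH = "generalDepth";
    static final String RECRAWL_IF_OLDER = "recrawlIfOlder";

    void put(final String key, final int value)     { super.put(key, Integer.toString(value)); }
    void put(final String key, final long value)    { super.put(key, Long.toString(value)); }
    void put(final String key, final boolean value) { super.put(key, Boolean.toString(value)); }

    int depth() {
        final String r = get(DEPTH);
        if (r == null) return 0;
        try {
            return Integer.parseInt(r);
        } catch (final NumberFormatException e) {
            return 0; // the real accessors also log the exception
        }
    }

    long recrawlIfOlder() {
        final String r = get(RECRAWL_IF_OLDER);
        if (r == null) return 0L;
        try {
            final long l = Long.parseLong(r);
            return l < 0 ? 0L : l;
        } catch (final NumberFormatException e) {
            return 0L;
        }
    }
}

public class TypedAccessorDemo {
    public static void main(final String[] args) {
        final TypedMapSketch p = new TypedMapSketch();
        p.put(TypedMapSketch.DEPTH, 3);
        p.put(TypedMapSketch.RECRAWL_IF_OLDER, 60000L);
        System.out.println(p.depth() + " / " + p.recrawlIfOlder()); // -> 3 / 60000
    }
}
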
@@ -47,7 +47,6 @@ import net.yacy.kelondro.workflow.WorkflowJob;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
//import de.anomic.http.client.Client;
import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants;
import de.anomic.yacy.yacyClient;
@@ -252,14 +251,14 @@ public class CrawlQueues {
* @return
*/
private void generateCrawl(Request urlEntry, final String stats, final String profileHandle) {
final CrawlProfile.entry profile = sb.crawler.profilesActiveCrawls.getEntry(profileHandle);
if (profile != null) {
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(profileHandle.getBytes());
if (mp != null) {
// check if the protocol is supported
final DigestURI url = urlEntry.url();
final String urlProtocol = url.getProtocol();
if (sb.loader.isSupportedProtocol(urlProtocol)) {
CrawlProfile profile = new CrawlProfile(mp);
if (this.log.isFine())
log.logFine(stats + ": URL=" + urlEntry.url()
+ ", initiator=" + ((urlEntry.initiator() == null) ? "" : new String(urlEntry.initiator()))
@@ -556,7 +555,8 @@ public class CrawlQueues {
try {
request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
CrawlProfile.entry e = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
CrawlProfile e = mp == null ? null : new CrawlProfile(mp);
Response response = sb.loader.load(request, e == null ? CrawlProfile.CacheStrategy.IFEXIST : e.cacheStrategy(), maxFileSize);
if (response == null) {
request.setStatus("error", WorkflowJob.STATUS_FINISHED);

@@ -31,6 +31,7 @@ package de.anomic.crawler;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.Date;
import java.util.Map;
import net.yacy.cora.protocol.Domains;
import net.yacy.kelondro.data.meta.DigestURI;
@@ -180,7 +181,8 @@ public final class CrawlStacker {
// returns null if successful, a reason string if not successful
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
final CrawlProfile.entry profile = crawler.profilesActiveCrawls.getEntry(entry.profileHandle());
final Map<String, String> mp = crawler.profilesActiveCrawls.get(entry.profileHandle().getBytes());
CrawlProfile profile = mp == null ? null : new CrawlProfile(mp);
String error;
if (profile == null) {
error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
@@ -248,7 +250,7 @@ public final class CrawlStacker {
return null;
}
public String checkAcceptance(final DigestURI url, final CrawlProfile.entry profile, int depth) {
public String checkAcceptance(final DigestURI url, final CrawlProfile profile, int depth) {
// check if the protocol is supported
final String urlProtocol = url.getProtocol();

@@ -28,11 +28,12 @@ package de.anomic.crawler;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import de.anomic.crawler.CrawlProfile.CacheStrategy;
import java.util.Map;
import net.yacy.kelondro.blob.MapHeap;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.kelondroException;
@@ -56,14 +57,14 @@ public final class CrawlSwitchboard {
public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;
public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L;
private final Log log;
public CrawlProfile profilesActiveCrawls, profilesPassiveCrawls;
public CrawlProfile.entry defaultProxyProfile;
public CrawlProfile.entry defaultRemoteProfile;
public CrawlProfile.entry defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
public CrawlProfile.entry defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
public CrawlProfile.entry defaultSurrogateProfile;
private final File queuesRoot;
private final Log log;
public Map<byte[], Map<String, String>> profilesActiveCrawls, profilesPassiveCrawls;
public CrawlProfile defaultProxyProfile;
public CrawlProfile defaultRemoteProfile;
public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
public CrawlProfile defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
public CrawlProfile defaultSurrogateProfile;
private final File queuesRoot;
public CrawlSwitchboard(
final String networkName,
@@ -82,43 +83,44 @@ public final class CrawlSwitchboard {
this.queuesRoot = queuesRoot;
this.queuesRoot.mkdirs();
this.log.logConfig("Initializing Crawl Profiles");
final File profilesActiveFile = new File(queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
if (!profilesActiveFile.exists()) {
// migrate old file
final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), "PLASMADB"), "crawlProfilesActive1.db");
if (oldFile.exists()) oldFile.renameTo(profilesActiveFile);
}
try {
this.profilesActiveCrawls = new CrawlProfile(profilesActiveFile);
this.profilesActiveCrawls = new MapHeap(profilesActiveFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
} catch (IOException e) {
Log.logException(e);Log.logException(e);
FileUtils.deletedelete(profilesActiveFile);
try {
this.profilesActiveCrawls = new CrawlProfile(profilesActiveFile);
this.profilesActiveCrawls = new MapHeap(profilesActiveFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
} catch (IOException e1) {
Log.logException(e1);
this.profilesActiveCrawls = null;
}
}
for (byte[] handle: this.profilesActiveCrawls.keySet()) {
CrawlProfile p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
}
initActiveCrawlProfiles();
log.logInfo("Loaded active crawl profiles from file " + profilesActiveFile.getName() + ", " + this.profilesActiveCrawls.size() + " entries");
final File profilesPassiveFile = new File(queuesRoot, DBFILE_PASSIVE_CRAWL_PROFILES);
if (!profilesPassiveFile.exists()) {
// migrate old file
final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), "PLASMADB"), "crawlProfilesPassive1.db");
if (oldFile.exists()) oldFile.renameTo(profilesPassiveFile);
}
try {
this.profilesPassiveCrawls = new CrawlProfile(profilesPassiveFile);
this.profilesPassiveCrawls = new MapHeap(profilesPassiveFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
} catch (IOException e) {
FileUtils.deletedelete(profilesPassiveFile);
Log.logException(e);Log.logException(e);
FileUtils.deletedelete(profilesActiveFile);
try {
this.profilesPassiveCrawls = new CrawlProfile(profilesPassiveFile);
this.profilesPassiveCrawls = new MapHeap(profilesPassiveFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
} catch (IOException e1) {
Log.logException(e1);
this.profilesPassiveCrawls = null;
}
}
for (byte[] handle: this.profilesPassiveCrawls.keySet()) {
CrawlProfile p = new CrawlProfile(this.profilesPassiveCrawls.get(handle));
Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
}
log.logInfo("Loaded passive crawl profiles from file " + profilesPassiveFile.getName() +
", " + this.profilesPassiveCrawls.size() + " entries" +
", " + profilesPassiveFile.length()/1024);
@@ -135,12 +137,11 @@ public final class CrawlSwitchboard {
this.defaultMediaSnippetLocalProfile = null;
this.defaultMediaSnippetGlobalProfile = null;
this.defaultSurrogateProfile = null;
final Iterator<CrawlProfile.entry> i = this.profilesActiveCrawls.profiles(true);
CrawlProfile.entry profile;
CrawlProfile profile;
String name;
try {
while (i.hasNext()) {
profile = i.next();
for (byte[] handle: this.profilesActiveCrawls.keySet()) {
profile = new CrawlProfile(this.profilesActiveCrawls.get(handle));
name = profile.name();
if (name.equals(CRAWL_PROFILE_PROXY)) this.defaultProxyProfile = profile;
if (name.equals(CRAWL_PROFILE_REMOTE)) this.defaultRemoteProfile = profile;
@@ -163,45 +164,52 @@ public final class CrawlSwitchboard {
if (this.defaultProxyProfile == null) {
// generate new default entry for proxy crawling
this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
this.defaultProxyProfile = new CrawlProfile("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
true, true,
false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, true, true, true,
CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultProxyProfile.handle().getBytes(), this.defaultProxyProfile);
}
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
-1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultRemoteProfile.handle().getBytes(), this.defaultRemoteProfile);
}
if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultTextSnippetLocalProfile.handle().getBytes(), this.defaultTextSnippetLocalProfile);
}
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultTextSnippetGlobalProfile.handle().getBytes(), this.defaultTextSnippetGlobalProfile);
}
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CrawlProfile.CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultMediaSnippetLocalProfile.handle().getBytes(), this.defaultMediaSnippetLocalProfile);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultMediaSnippetGlobalProfile.handle().getBytes(), this.defaultMediaSnippetGlobalProfile);
}
if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing
defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE);
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE);
this.profilesActiveCrawls.put(this.defaultSurrogateProfile.handle().getBytes(), this.defaultSurrogateProfile);
}
}
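All default profiles above now follow the same two steps: construct the CrawlProfile, then store it in the active MapHeap under its handle bytes. A hypothetical helper (not part of the patch) that names this pattern, using only calls that already appear in the hunk:

// register a freshly built default profile in the active store, keyed by its handle
private void registerActiveProfile(final CrawlProfile p) {
    this.profilesActiveCrawls.put(p.handle().getBytes(), p);
}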
@ -209,24 +217,24 @@ public final class CrawlSwitchboard {
final File pdb = new File(this.queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
if (pdb.exists()) FileUtils.deletedelete(pdb);
try {
profilesActiveCrawls = new CrawlProfile(pdb);
} catch (IOException e) {
Log.logException(e);
this.profilesActiveCrawls = new MapHeap(pdb, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
} catch (IOException e1) {
Log.logException(e1);
this.profilesActiveCrawls = null;
}
initActiveCrawlProfiles();
}
public boolean cleanProfiles() throws InterruptedException {
final Iterator<CrawlProfile.entry> iter = profilesActiveCrawls.profiles(true);
CrawlProfile.entry entry;
CrawlProfile entry;
boolean hasDoneSomething = false;
try {
while (iter.hasNext()) {
for (byte[] handle: profilesActiveCrawls.keySet()) {
// check for interruption
if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress");
// getting next profile
entry = iter.next();
entry = new CrawlProfile(profilesActiveCrawls.get(handle));
if (!((entry.name().equals(CRAWL_PROFILE_PROXY)) ||
(entry.name().equals(CRAWL_PROFILE_REMOTE)) ||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) ||
@ -234,8 +242,9 @@ public final class CrawlSwitchboard {
(entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) ||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) ||
(entry.name().equals(CRAWL_PROFILE_SURROGATE)))) {
profilesPassiveCrawls.newEntry(entry.map());
iter.remove();
CrawlProfile p = new CrawlProfile(entry);
profilesPassiveCrawls.put(p.handle().getBytes(), p);
profilesActiveCrawls.remove(handle);
hasDoneSomething = true;
}
}
@ -248,8 +257,8 @@ public final class CrawlSwitchboard {
public void close() {
this.profilesActiveCrawls.close();
this.profilesPassiveCrawls.close();
((MapHeap) this.profilesActiveCrawls).close();
((MapHeap) this.profilesPassiveCrawls).close();
}
}
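The casts in close() only compile if the two profile stores are declared through their Map interface while the concrete instance is a MapHeap (which implements Map<byte[], Map<String, String>>, see the MapHeap hunk further below). A sketch of the assumed field declarations, which are not shown in this diff:

// assumed CrawlSwitchboard fields after this change: Map views backed by MapHeap instances
public Map<byte[], Map<String, String>> profilesActiveCrawls;
public Map<byte[], Map<String, String>> profilesPassiveCrawls;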

@ -29,6 +29,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
@ -213,18 +214,18 @@ public class NoticedURL {
}
}
public Request pop(final int stackType, final boolean delay, CrawlProfile profile) throws IOException {
public Request pop(final int stackType, final boolean delay, Map<byte[], Map<String, String>> profiles) throws IOException {
switch (stackType) {
case STACK_TYPE_CORE: return pop(coreStack, delay, profile);
case STACK_TYPE_LIMIT: return pop(limitStack, delay, profile);
case STACK_TYPE_REMOTE: return pop(remoteStack, delay, profile);
case STACK_TYPE_CORE: return pop(coreStack, delay, profiles);
case STACK_TYPE_LIMIT: return pop(limitStack, delay, profiles);
case STACK_TYPE_REMOTE: return pop(remoteStack, delay, profiles);
default: return null;
}
}
public void shift(final int fromStack, final int toStack, CrawlProfile profile) {
public void shift(final int fromStack, final int toStack, Map<byte[], Map<String, String>> profiles) {
try {
final Request entry = pop(fromStack, false, profile);
final Request entry = pop(fromStack, false, profiles);
if (entry != null) push(toStack, entry);
} catch (final IOException e) {
return;
@ -241,14 +242,14 @@ public class NoticedURL {
}
}
private Request pop(final Balancer balancer, final boolean delay, CrawlProfile profile) throws IOException {
private Request pop(final Balancer balancer, final boolean delay, Map<byte[], Map<String, String>> profiles) throws IOException {
// this is a filo - pop
int s;
Request entry;
int errors = 0;
synchronized (balancer) {
while ((s = balancer.size()) > 0) {
entry = balancer.pop(delay, profile);
entry = balancer.pop(delay, profiles);
if (entry == null) {
if (s > balancer.size()) continue;
errors++;
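With the signature change above, callers hand the whole active-profile map to the stacks instead of a single resolved profile. A usage sketch; noticedURL and sb are placeholders for the NoticedURL instance and the Switchboard:

// callers now pass the shared active-profile map; the balancer resolves profiles itself
final Request next = noticedURL.pop(NoticedURL.STACK_TYPE_CORE, true, sb.crawler.profilesActiveCrawls);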

@ -35,7 +35,7 @@ public class SitemapImporter extends AbstractImporter implements Importer {
private final DigestURI sitemapURL;
private final ImporterManager superviser;
public SitemapImporter(final Switchboard sb, final ImporterManager importManager, final DigestURI sitemapURL, final CrawlProfile.entry profileEntry) throws ImporterException {
public SitemapImporter(final Switchboard sb, final ImporterManager importManager, final DigestURI sitemapURL, final CrawlProfile profileEntry) throws ImporterException {
super("sitemap");
this.superviser = importManager;
try {

@ -31,6 +31,7 @@ import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Date;
import java.util.Map;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.HeaderFramework;
@ -40,6 +41,7 @@ import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.Latency;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
@ -124,12 +126,13 @@ public class FTPLoader {
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
mp == null ? null : new CrawlProfile(mp),
dirList.toString().getBytes());
}
} else {
@ -237,12 +240,13 @@ public class FTPLoader {
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
mp == null ? null : new CrawlProfile(mp),
url.toNormalform(true, true).getBytes());
return response;
}
@ -254,12 +258,13 @@ public class FTPLoader {
byte[] b = ftpClient.get(path);
// create a response
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
mp == null ? null : new CrawlProfile(mp),
b);
return response;
}
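FTPLoader, FileLoader, HTTPLoader and SMBLoader all repeat the same lookup-and-rebuild lines. A hypothetical helper (not in the patch) that captures the pattern, composed only of calls that appear in the hunks:

// resolve the request's CrawlProfile from the active store; null when the handle is unknown
private CrawlProfile profileFor(final Request request) {
    final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
    return (mp == null) ? null : new CrawlProfile(mp);
}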

@ -25,9 +25,11 @@ import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.MimeTable;
import net.yacy.cora.protocol.HeaderFramework;
@ -81,12 +83,13 @@ public class FileLoader {
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
mp == null ? null : new CrawlProfile(mp),
content.toString().getBytes());
return response;
@ -115,12 +118,13 @@ public class FileLoader {
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
mp == null ? null : new CrawlProfile(mp),
url.toNormalform(true, true).getBytes());
return response;
}
@ -131,12 +135,13 @@ public class FileLoader {
is.close();
// create response with loaded content
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
mp == null ? null : new CrawlProfile(mp),
b);
return response;
}

@ -26,6 +26,7 @@ package de.anomic.crawler.retrieval;
import java.io.IOException;
import java.util.Date;
import java.util.Map;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.HeaderFramework;
@ -36,6 +37,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.repository.Blacklist;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.Latency;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
@ -146,6 +148,7 @@ public final class HTTPLoader {
}
// create a new cache entry
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
response = new Response(
request,
requestHeader,
@ -153,7 +156,7 @@ public final class HTTPLoader {
// res.getStatusLine(),
header,
Integer.toString(code),
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
mp == null ? null : new CrawlProfile(mp),
responseBody
);

@ -61,7 +61,7 @@ public class Response {
private final RequestHeader requestHeader;
private final ResponseHeader responseHeader;
private final String responseStatus;
private final CrawlProfile.entry profile;
private final CrawlProfile profile;
private byte[] content;
private int status; // tracker indexing status, see status defs below
@ -148,7 +148,7 @@ public class Response {
final RequestHeader requestHeader,
final ResponseHeader responseHeader,
final String responseStatus,
final CrawlProfile.entry profile,
final CrawlProfile profile,
final byte[] content) {
this.request = request;
// request and response headers may be zero in case that we process surrogates
@ -165,7 +165,7 @@ public class Response {
final RequestHeader requestHeader,
final ResponseHeader responseHeader,
final String responseStatus,
final CrawlProfile.entry profile) {
final CrawlProfile profile) {
this(request, requestHeader, responseHeader, responseStatus, profile, null);
}
@ -216,7 +216,7 @@ public class Response {
return this.url().language();
}
public CrawlProfile.entry profile() {
public CrawlProfile profile() {
return this.profile;
}
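Since Response now carries the full CrawlProfile, consumers can read crawl flags directly from it. A small usage sketch; response stands for any Response built with the new constructor:

// example consumer: treat a missing profile as 'no remote indexing'
final CrawlProfile p = response.profile();
final boolean useRemoteIndexing = (p != null) && p.remoteIndexing();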

@ -34,6 +34,7 @@ import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import jcifs.smb.SmbException;
import jcifs.smb.SmbFile;
@ -41,6 +42,7 @@ import jcifs.smb.SmbFileInputStream;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.MimeTable;
import net.yacy.cora.protocol.HeaderFramework;
@ -100,12 +102,13 @@ public class SMBLoader {
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
mp == null ? null : new CrawlProfile(mp),
content.toString().getBytes());
return response;
@ -134,12 +137,13 @@ public class SMBLoader {
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
mp == null ? null : new CrawlProfile(mp),
url.toNormalform(true, true).getBytes());
return response;
}
@ -150,12 +154,13 @@ public class SMBLoader {
is.close();
// create response with loaded content
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
mp == null ? null : new CrawlProfile(mp),
b);
return response;
}

@ -94,7 +94,7 @@ public class SitemapParser extends DefaultHandler {
/**
* The crawling profile used to parse the URLs contained in the sitemap file
*/
private CrawlProfile.entry crawlingProfile = null;
private CrawlProfile crawlingProfile = null;
/**
* Name of the current XML element
@ -137,7 +137,7 @@ public class SitemapParser extends DefaultHandler {
private Date lastMod = null;
private final Switchboard sb;
public SitemapParser(final Switchboard sb, final DigestURI sitemap, final CrawlProfile.entry theCrawlingProfile) {
public SitemapParser(final Switchboard sb, final DigestURI sitemap, final CrawlProfile theCrawlingProfile) {
assert sitemap != null;
this.sb = sb;
this.siteMapURL = sitemap;
@ -328,8 +328,8 @@ public class SitemapParser extends DefaultHandler {
}
}
private CrawlProfile.entry createProfile(final String domainName, final DigestURI sitemapURL) {
return this.sb.crawler.profilesActiveCrawls.newEntry(
private CrawlProfile createProfile(final String domainName, final DigestURI sitemapURL) {
CrawlProfile p = new CrawlProfile(
domainName, sitemapURL,
// crawling Filter
CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
@ -352,5 +352,7 @@ public class SitemapParser extends DefaultHandler {
// exclude stop-words
true, true, true,
CrawlProfile.CacheStrategy.IFFRESH);
this.sb.crawler.profilesActiveCrawls.put(p.handle().getBytes(), p);
return p;
}
}

@ -59,9 +59,12 @@ public class WorkTables extends Tables {
public final static String TABLE_API_COL_APICALL_COUNT = "apicall_count"; // counts how often the API was called (starts with 1)
public final static String TABLE_API_COL_APICALL_SCHEDULE_TIME = "apicall_schedule_time"; // factor for SCHEDULE_UNIT time units
public final static String TABLE_API_COL_APICALL_SCHEDULE_UNIT= "apicall_schedule_unit"; // may be 'minutes', 'hours', 'days'
public final static String TABLE_ROBOTS_NAME = "robots";
public final static String TABLE_ACTIVECRAWLS_NAME = "crawljobsActive";
public final static String TABLE_PASSIVECRAWLS_NAME = "crawljobsPassive";
public WorkTables(final File workPath) {
super(workPath, 12);

@ -123,7 +123,6 @@ import de.anomic.crawler.ResultImages;
import de.anomic.crawler.ResultURLs;
import de.anomic.crawler.RobotsTxt;
import de.anomic.crawler.CrawlProfile.CacheStrategy;
import de.anomic.crawler.CrawlProfile.entry;
import de.anomic.crawler.retrieval.EventOrigin;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
@ -1102,12 +1101,12 @@ public final class Switchboard extends serverSwitch {
}
/**
* {@link CrawlProfile Crawl Profiles} are saved independently from the queues themselves
* {@link CrawlProfiles Crawl Profiles} are saved independently from the queues themselves
* and therefore have to be cleaned up from time to time. This method only performs the clean-up
* if - and only if - the {@link IndexingStack switchboard},
* {@link LoaderDispatcher loader} and {@link plasmaCrawlNURL local crawl} queues are all empty.
* <p>
* Then it iterates through all existing {@link CrawlProfile crawl profiles} and removes
* Then it iterates through all existing {@link CrawlProfiles crawl profiles} and removes
* all profiles which are not hard-coded.
* </p>
* <p>
@ -1442,34 +1441,47 @@ public final class Switchboard extends serverSwitch {
// refresh recrawl dates
try{
Iterator<CrawlProfile.entry> it = crawler.profilesActiveCrawls.profiles(true);
entry selentry;
while (it.hasNext()) {
selentry = it.next();
CrawlProfile selentry;
for (byte[] handle: crawler.profilesActiveCrawls.keySet()) {
selentry = new CrawlProfile(crawler.profilesActiveCrawls.get(handle));
assert selentry.handle() != null : "profile.name = " + selentry.name();
if (selentry.handle() == null) {
it.remove();
crawler.profilesActiveCrawls.remove(handle);
continue;
}
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY))
crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER,
Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE)));
boolean insert = false;
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY)) {
selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE)));
insert = true;
}
// if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE));
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT))
crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER,
Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE)));
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT))
crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER,
Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE)));
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA))
crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER,
Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE)));
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA))
crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER,
Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE)));
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE))
crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER,
Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE)));
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) {
selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE)));
insert = true;
}
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) {
selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE)));
insert = true;
}
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) {
selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE)));
insert = true;
}
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) {
selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE)));
insert = true;
}
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE)) {
selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE)));
insert = true;
}
if (insert) crawler.profilesActiveCrawls.put(selentry.handle().getBytes(), selentry);
}
} catch (final Exception e) {
Log.logException(e);
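The insert flag and the final put() in the loop above are needed because the CrawlProfile is rebuilt from the stored map and so is apparently detached from the MapHeap: edits only persist once the object is written back under its handle. A minimal sketch of that round trip; recrawlCycle stands for one of the CRAWL_PROFILE_*_RECRAWL_CYCLE constants:

// detached read-modify-write: changes to p become visible in the heap only after put()
CrawlProfile p = new CrawlProfile(crawler.profilesActiveCrawls.get(handle));
p.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(recrawlCycle)));
crawler.profilesActiveCrawls.put(p.handle().getBytes(), p);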
@ -1827,7 +1839,7 @@ public final class Switchboard extends serverSwitch {
// update image result list statistics
// its good to do this concurrently here, because it needs a DNS lookup
// to compute a URL hash which is necessary for a double-check
final CrawlProfile.entry profile = in.queueEntry.profile();
final CrawlProfile profile = in.queueEntry.profile();
ResultImages.registerImages(in.queueEntry.url(), in.documents[i], (profile == null) ? true : !profile.remoteIndexing());
} catch (final UnsupportedEncodingException e) {
@ -1987,7 +1999,8 @@ public final class Switchboard extends serverSwitch {
if (searchEvent != null) searchEvent.addHeuristic(url.hash(), heuristicName, true);
if (indexSegments.segment(process).urlMetadata.exists(url.hash())) return; // don't do double-work
final Request request = loader.request(url, true, true);
String acceptedError = this.crawlStacker.checkAcceptance(url, this.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), 0);
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
String acceptedError = this.crawlStacker.checkAcceptance(url, mp == null ? null : new CrawlProfile(mp), 0);
if (acceptedError != null) {
log.logWarning("addToIndex: cannot load " + url.toNormalform(false, false) + ": " + acceptedError);
return;

@ -27,7 +27,6 @@
package de.anomic.yacy;
//import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
@ -59,8 +58,6 @@ import net.yacy.kelondro.util.OS;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.HTTPLoader;
//import de.anomic.http.client.Client;
//import de.anomic.http.server.ResponseContainer;
import de.anomic.search.Switchboard;
import de.anomic.server.serverCore;
import de.anomic.tools.CryptoLib;

@ -71,6 +71,7 @@ public class HeapReader {
this.keylength = keylength;
this.index = null; // will be created as result of initialization process
this.free = null; // will be initialized later depending on existing idx/gap file
this.heapFile.getParentFile().mkdirs();
this.file = new CachedFileWriter(this.heapFile);
// read or initialize the index

@ -52,8 +52,6 @@ import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.kelondroException;
public class MapHeap implements Map<byte[], Map<String, String>> {
private BLOB blob;
@ -229,7 +227,8 @@ public class MapHeap implements Map<byte[], Map<String, String>> {
public Map<String, String> get(final Object key) {
if (key == null) return null;
try {
return get((byte[]) key);
if (key instanceof byte[]) return get((byte[]) key);
if (key instanceof String) return get(((String) key).getBytes());
} catch (IOException e) {
Log.logException(e);
} catch (RowSpaceExceededException e) {
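The widened get(Object) above accepts both byte[] and String keys, so callers that only have a textual handle can skip the getBytes() conversion. A usage sketch with a placeholder handle, assuming the profile store is declared through the Map interface as elsewhere in this patch:

// both lookups hit the same entry after this change; "0123456789ab" is a placeholder handle
final Map<byte[], Map<String, String>> heap = sb.crawler.profilesActiveCrawls;
final Map<String, String> viaBytes  = heap.get("0123456789ab".getBytes());
final Map<String, String> viaString = heap.get("0123456789ab");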

@ -159,8 +159,8 @@ public final class LoaderDispatcher {
if (url.isLocal() && sb.getConfigBool("adminAccountForLocalhost", false)) throw new IOException("access to localhost not granted for url " + url);
// check if we have the page in the cache
CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
CrawlProfile crawlProfile = mp == null ? null : new CrawlProfile(mp);
if (crawlProfile != null && cacheStrategy != CrawlProfile.CacheStrategy.NOCACHE) {
// we have passed a first test if caching is allowed
// now see if there is a cache entry
