Redesign of the crawl profiles data structure. The target is:

- permanent storage of auto-dom statistics in the profile
- storage of profiles in a WorkTable data structure

Not finished yet; no functional change yet.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7088 6c8d7289-2bf4-0310-a012-ef5d649a1542
author orbiter
parent 3f1d5a061f
commit 65eaf30f77
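
The direction of the redesign is visible in the diff below: CrawlProfile no longer wraps a nested entry record around an internal map, it now is the map (it extends ConcurrentHashMap<String, String>), and the profile stores profilesActiveCrawls / profilesPassiveCrawls become plain Map<byte[], Map<String, String>> instances backed by a kelondro MapHeap, with a WorkTable as the eventual target. The following is a minimal, self-contained sketch of that pattern only; the class and member names are illustrative stand-ins, not the actual YaCy API, and the in-memory store uses String keys where the real store is keyed by byte[] handles.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

// Illustrative stand-in for the reworked CrawlProfile: the profile object itself is the
// String-to-String map that gets persisted, so storing and loading needs no extra record type.
class ProfileSketch extends ConcurrentHashMap<String, String> {
    private static final long serialVersionUID = 1L;
    static final String HANDLE = "handle", NAME = "name", DEPTH = "generalDepth";

    ProfileSketch(final String handle, final String name, final int depth) {
        super(8);
        put(HANDLE, handle);
        put(NAME, name);
        put(DEPTH, Integer.toString(depth)); // every value is stored as a String
    }

    // reconstruct a profile object from a previously stored generic map
    ProfileSketch(final Map<String, String> ext) {
        super(ext == null ? 1 : ext.size());
        if (ext != null) putAll(ext);
    }

    String handle() { return get(HANDLE); }
    String name()   { return get(NAME); }
    int depth()     { final String r = get(DEPTH); return r == null ? 0 : Integer.parseInt(r); }

    public static void main(final String[] args) {
        // stand-in for the on-disk MapHeap; the real store is Map<byte[], Map<String, String>>
        final Map<String, Map<String, String>> activeProfiles = new ConcurrentHashMap<>();

        final ProfileSketch pe = new ProfileSketch("abc123", "example.org", 3);
        activeProfiles.put(pe.handle(), pe);          // store: the profile IS the stored map

        final Map<String, String> mp = activeProfiles.get("abc123");
        final ProfileSketch loaded = mp == null ? null : new ProfileSketch(mp);
        System.out.println(loaded == null ? "not found" : loaded.name() + " depth=" + loaded.depth());
    }
}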

@ -28,15 +28,15 @@ import java.text.DateFormat;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map;
import java.util.Set; import java.util.Set;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlSwitchboard; import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.crawler.CrawlProfile.entry; import de.anomic.crawler.CrawlProfile;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
@ -80,23 +80,23 @@ public class CrawlProfileEditor_p {
private static final ArrayList <eentry> labels = new ArrayList<eentry>(); private static final ArrayList <eentry> labels = new ArrayList<eentry>();
static { static {
labels.add(new eentry(entry.NAME, "Name", true, eentry.STRING)); labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING));
labels.add(new eentry(entry.START_URL, "Start URL", true, eentry.STRING)); labels.add(new eentry(CrawlProfile.START_URL, "Start URL", true, eentry.STRING));
labels.add(new eentry(entry.FILTER_MUSTMATCH, "Must-Match Filter", false, eentry.STRING)); labels.add(new eentry(CrawlProfile.FILTER_MUSTMATCH, "Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(entry.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING)); labels.add(new eentry(CrawlProfile.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(entry.DEPTH, "Crawl Depth", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
labels.add(new eentry(entry.DOM_FILTER_DEPTH, "Domain Filter Depth", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.DOM_FILTER_DEPTH, "Domain Filter Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
labels.add(new eentry(entry.CRAWLING_Q, "CrawlingQ / '?'-URLs", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.CRAWLING_Q, "CrawlingQ / '?'-URLs", false, eentry.BOOLEAN));
labels.add(new eentry(entry.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN));
labels.add(new eentry(entry.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN));
labels.add(new eentry(entry.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN));
labels.add(new eentry(entry.STORE_TXCACHE, "Store in TXCache", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.STORE_TXCACHE, "Store in TXCache", false, eentry.BOOLEAN));
labels.add(new eentry(entry.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN));
labels.add(new eentry(entry.XSSTOPW, "Static stop-words", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.XSSTOPW, "Static stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(entry.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(entry.XPSTOPW, "Parent stop-words", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.XPSTOPW, "Parent stop-words", false, eentry.BOOLEAN));
} }
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
@ -106,40 +106,32 @@ public class CrawlProfileEditor_p {
// read post for handle // read post for handle
final String handle = (post == null) ? "" : post.get("handle", ""); final String handle = (post == null) ? "" : post.get("handle", "");
if (post != null) { if (post != null) {
if (post.containsKey("terminate")) { if (post.containsKey("terminate")) try {
// termination of a crawl: shift the crawl from active to passive // termination of a crawl: shift the crawl from active to passive
final CrawlProfile.entry entry = sb.crawler.profilesActiveCrawls.getEntry(handle); final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(handle.getBytes());
if (entry != null) { if (mp != null) sb.crawler.profilesPassiveCrawls.put(handle.getBytes(), new CrawlProfile(mp));
sb.crawler.profilesPassiveCrawls.newEntry(entry.map());
}
sb.crawler.profilesActiveCrawls.removeEntry(handle.getBytes());
// delete all entries from the crawl queue that are deleted here // delete all entries from the crawl queue that are deleted here
try { sb.crawler.profilesActiveCrawls.remove(handle.getBytes());
sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000); sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000);
} catch (RowSpaceExceededException e) { } catch (RowSpaceExceededException e) {
Log.logException(e); Log.logException(e);
} }
}
if (post.containsKey("delete")) { if (post.containsKey("delete")) {
// deletion of a terminated crawl profile // deletion of a terminated crawl profile
sb.crawler.profilesPassiveCrawls.removeEntry(handle.getBytes()); sb.crawler.profilesPassiveCrawls.remove(handle.getBytes());
} }
if (post.containsKey("deleteTerminatedProfiles")) { if (post.containsKey("deleteTerminatedProfiles")) {
Iterator<CrawlProfile.entry> profiles = sb.crawler.profilesPassiveCrawls.profiles(false); for (byte[] h: sb.crawler.profilesPassiveCrawls.keySet()) {
while (profiles.hasNext()) { sb.crawler.profilesPassiveCrawls.remove(h);
profiles.next();
profiles.remove();
profiles = sb.crawler.profilesPassiveCrawls.profiles(false);
} }
} }
} }
// generate handle list // generate handle list
int count = 0; int count = 0;
Iterator<CrawlProfile.entry> it = sb.crawler.profilesActiveCrawls.profiles(true); CrawlProfile selentry;
entry selentry; for (byte[] h: sb.crawler.profilesActiveCrawls.keySet()) {
while (it.hasNext()) { selentry = new CrawlProfile(sb.crawler.profilesActiveCrawls.get(h));
selentry = it.next();
if (ignoreNames.contains(selentry.name())) { if (ignoreNames.contains(selentry.name())) {
continue; continue;
} }
@ -151,7 +143,8 @@ public class CrawlProfileEditor_p {
count++; count++;
} }
prop.put("profiles", count); prop.put("profiles", count);
selentry = sb.crawler.profilesActiveCrawls.getEntry(handle); final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(handle.getBytes());
selentry = mp == null ? null : new CrawlProfile(mp);
assert selentry == null || selentry.handle() != null; assert selentry == null || selentry.handle() != null;
// read post for change submit // read post for change submit
if ((post != null) && (selentry != null)) { if ((post != null) && (selentry != null)) {
@ -161,10 +154,11 @@ public class CrawlProfileEditor_p {
eentry tee; eentry tee;
while (lit.hasNext()) { while (lit.hasNext()) {
tee = lit.next(); tee = lit.next();
final String cval = selentry.map().get(tee.name); final String cval = selentry.get(tee.name);
final String val = (tee.type == eentry.BOOLEAN) ? Boolean.toString(post.containsKey(tee.name)) : post.get(tee.name, cval); final String val = (tee.type == eentry.BOOLEAN) ? Boolean.toString(post.containsKey(tee.name)) : post.get(tee.name, cval);
if (!cval.equals(val)) { if (!cval.equals(val)) {
sb.crawler.profilesActiveCrawls.changeEntry(selentry, tee.name, val); selentry.put(tee.name, val);
sb.crawler.profilesActiveCrawls.put(selentry.handle().getBytes(), selentry);
} }
} }
} catch (final Exception ex) { } catch (final Exception ex) {
@ -179,20 +173,18 @@ public class CrawlProfileEditor_p {
count = 0; count = 0;
boolean dark = true; boolean dark = true;
final int domlistlength = (post == null) ? 160 : post.getInt("domlistlength", 160); final int domlistlength = (post == null) ? 160 : post.getInt("domlistlength", 160);
CrawlProfile.entry profile; CrawlProfile profile;
// put active crawls into list // put active crawls into list
it = sb.crawler.profilesActiveCrawls.profiles(true); for (byte[] h: sb.crawler.profilesActiveCrawls.keySet()) {
while (it.hasNext()) { profile = new CrawlProfile(sb.crawler.profilesActiveCrawls.get(h));
profile = it.next();
putProfileEntry(prop, profile, true, dark, count, domlistlength); putProfileEntry(prop, profile, true, dark, count, domlistlength);
dark = !dark; dark = !dark;
count++; count++;
} }
// put passive crawls into list // put passive crawls into list
boolean existPassiveCrawls = false; boolean existPassiveCrawls = false;
it = sb.crawler.profilesPassiveCrawls.profiles(true); for (byte[] h: sb.crawler.profilesPassiveCrawls.keySet()) {
while (it.hasNext()) { profile = new CrawlProfile(sb.crawler.profilesPassiveCrawls.get(h));
profile = it.next();
putProfileEntry(prop, profile, false, dark, count, domlistlength); putProfileEntry(prop, profile, false, dark, count, domlistlength);
dark = !dark; dark = !dark;
count++; count++;
@ -217,7 +209,7 @@ public class CrawlProfileEditor_p {
count = 0; count = 0;
while (lit.hasNext()) { while (lit.hasNext()) {
final eentry ee = lit.next(); final eentry ee = lit.next();
final String val = selentry.map().get(ee.name); final String val = selentry.get(ee.name);
prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly", ee.readonly ? "1" : "0"); prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly", ee.readonly ? "1" : "0");
prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_name", ee.name); prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_name", ee.name);
prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_label", ee.label); prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_label", ee.label);
@ -235,7 +227,7 @@ public class CrawlProfileEditor_p {
return prop; return prop;
} }
private static void putProfileEntry(final servletProperties prop, final CrawlProfile.entry profile, final boolean active, final boolean dark, final int count, final int domlistlength) { private static void putProfileEntry(final servletProperties prop, final CrawlProfile profile, final boolean active, final boolean dark, final int count, final int domlistlength) {
prop.put(CRAWL_PROFILE_PREFIX + count + "_dark", dark ? "1" : "0"); prop.put(CRAWL_PROFILE_PREFIX + count + "_dark", dark ? "1" : "0");
prop.put(CRAWL_PROFILE_PREFIX + count + "_name", profile.name()); prop.put(CRAWL_PROFILE_PREFIX + count + "_name", profile.name());

@ -254,8 +254,8 @@ public class Crawler_p {
sb.crawlQueues.errorURL.remove(urlhash); sb.crawlQueues.errorURL.remove(urlhash);
// stack url // stack url
sb.crawler.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile.entry pe = sb.crawler.profilesActiveCrawls.newEntry( final CrawlProfile pe = new CrawlProfile(
(crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(), (crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(),
crawlingStartURL, crawlingStartURL,
newcrawlingMustMatch, newcrawlingMustMatch,
@ -265,6 +265,7 @@ public class Crawler_p {
crawlingQ, crawlingQ,
indexText, indexMedia, indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy); storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
final String reasonString = sb.crawlStacker.stackCrawl(new Request( final String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(), sb.peers.mySeed().hash.getBytes(),
url, url,
@ -297,7 +298,7 @@ public class Crawler_p {
// generate a YaCyNews if the global flag was set // generate a YaCyNews if the global flag was set
if (crawlOrder) { if (crawlOrder) {
final Map<String, String> m = new HashMap<String, String>(pe.map()); // must be cloned final Map<String, String> m = new HashMap<String, String>(pe); // must be cloned
m.remove("specificDepth"); m.remove("specificDepth");
m.remove("indexText"); m.remove("indexText");
m.remove("indexMedia"); m.remove("indexMedia");
@ -371,7 +372,7 @@ public class Crawler_p {
// creating a crawler profile // creating a crawler profile
final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null); final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null);
final CrawlProfile.entry profile = sb.crawler.profilesActiveCrawls.newEntry( final CrawlProfile profile = new CrawlProfile(
fileName, crawlURL, fileName, crawlURL,
newcrawlingMustMatch, newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_NEVER,
@ -387,6 +388,7 @@ public class Crawler_p {
crawlOrder, crawlOrder,
xsstopw, xdstopw, xpstopw, xsstopw, xdstopw, xpstopw,
cachePolicy); cachePolicy);
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
// pause local crawl here // pause local crawl here
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
@ -435,7 +437,7 @@ public class Crawler_p {
final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null); final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null);
// create a new profile // create a new profile
final CrawlProfile.entry pe = sb.crawler.profilesActiveCrawls.newEntry( final CrawlProfile pe = new CrawlProfile(
sitemapURLStr, sitemapURL, sitemapURLStr, sitemapURL,
newcrawlingMustMatch, newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_NEVER,
@ -446,6 +448,7 @@ public class Crawler_p {
storeHTCache, true, crawlOrder, storeHTCache, true, crawlOrder,
xsstopw, xdstopw, xpstopw, xsstopw, xdstopw, xpstopw,
cachePolicy); cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
// create a new sitemap importer // create a new sitemap importer
final SitemapImporter importerThread = new SitemapImporter(sb, sb.dbImportManager, new DigestURI(sitemapURLStr, null), pe); final SitemapImporter importerThread = new SitemapImporter(sb, sb.dbImportManager, new DigestURI(sitemapURLStr, null), pe);

@ -31,6 +31,7 @@ import java.text.SimpleDateFormat;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.Locale; import java.util.Locale;
import java.util.Map;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
@ -95,14 +96,15 @@ public class IndexCreateWWWGlobalQueue_p {
boolean dark = true; boolean dark = true;
yacySeed initiator; yacySeed initiator;
String profileHandle; String profileHandle;
CrawlProfile.entry profileEntry; CrawlProfile profileEntry;
int i, showNum = 0; int i, showNum = 0;
for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) { for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
urle = crawlerList.get(i); urle = crawlerList.get(i);
if (urle != null && urle.url() != null) { if (urle != null && urle.url() != null) {
initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : new String(urle.initiator())); initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : new String(urle.initiator()));
profileHandle = urle.profileHandle(); profileHandle = urle.profileHandle();
profileEntry = (profileHandle == null) ? null : sb.crawler.profilesActiveCrawls.getEntry(profileHandle); final Map<String, String> mp = profileHandle == null ? null : sb.crawler.profilesActiveCrawls.get(profileHandle.getBytes());
profileEntry = mp == null ? null : new CrawlProfile(mp);
prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0"); prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) ); prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name())); prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));

@ -32,6 +32,7 @@ import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.Iterator; import java.util.Iterator;
import java.util.Locale; import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException; import java.util.regex.PatternSyntaxException;
@ -95,10 +96,9 @@ public class IndexCreateWWWLocalQueue_p {
if (option == PROFILE) { if (option == PROFILE) {
// search and delete the crawl profile (_much_ faster, independant of queue size) // search and delete the crawl profile (_much_ faster, independant of queue size)
// XXX: what to do about the annoying LOST PROFILE messages in the log? // XXX: what to do about the annoying LOST PROFILE messages in the log?
final Iterator<CrawlProfile.entry> it = sb.crawler.profilesActiveCrawls.profiles(true); CrawlProfile entry;
CrawlProfile.entry entry; for (byte[] handle: sb.crawler.profilesActiveCrawls.keySet()) {
while (it.hasNext()) { entry = new CrawlProfile(sb.crawler.profilesActiveCrawls.get(handle));
entry = it.next();
final String name = entry.name(); final String name = entry.name();
if (name.equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) || if (name.equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) || name.equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) ||
@ -108,9 +108,7 @@ public class IndexCreateWWWLocalQueue_p {
name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) || name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE)) name.equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE))
continue; continue;
if (compiledPattern.matcher(name).find()) { if (compiledPattern.matcher(name).find()) sb.crawler.profilesActiveCrawls.remove(entry.handle().getBytes());
sb.crawler.profilesActiveCrawls.removeEntry(entry.handle().getBytes());
}
} }
} else { } else {
// iterating through the list of URLs // iterating through the list of URLs
@ -165,14 +163,15 @@ public class IndexCreateWWWLocalQueue_p {
boolean dark = true; boolean dark = true;
yacySeed initiator; yacySeed initiator;
String profileHandle; String profileHandle;
CrawlProfile.entry profileEntry; CrawlProfile profileEntry;
int i; int i;
for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) { for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
urle = crawlerList.get(i); urle = crawlerList.get(i);
if ((urle != null)&&(urle.url()!=null)) { if ((urle != null)&&(urle.url()!=null)) {
initiator = sb.peers.getConnected(urle.initiator() == null ? "" : new String(urle.initiator())); initiator = sb.peers.getConnected(urle.initiator() == null ? "" : new String(urle.initiator()));
profileHandle = urle.profileHandle(); profileHandle = urle.profileHandle();
profileEntry = (profileHandle == null) ? null : sb.crawler.profilesActiveCrawls.getEntry(profileHandle); final Map<String, String> mp = profileHandle == null ? null : sb.crawler.profilesActiveCrawls.get(profileHandle.getBytes());
profileEntry = mp == null ? null : new CrawlProfile(mp);
prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0"); prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) ); prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name())); prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));

@ -28,6 +28,7 @@ import java.text.SimpleDateFormat;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.Locale; import java.util.Locale;
import java.util.Map;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
@ -92,14 +93,15 @@ public class IndexCreateWWWRemoteQueue_p {
boolean dark = true; boolean dark = true;
yacySeed initiator; yacySeed initiator;
String profileHandle; String profileHandle;
CrawlProfile.entry profileEntry; CrawlProfile profileEntry;
int i, showNum = 0; int i, showNum = 0;
for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) { for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
urle = crawlerList.get(i); urle = crawlerList.get(i);
if (urle != null && urle.url() != null) { if (urle != null && urle.url() != null) {
initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : new String(urle.initiator())); initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : new String(urle.initiator()));
profileHandle = urle.profileHandle(); profileHandle = urle.profileHandle();
profileEntry = (profileHandle == null) ? null : sb.crawler.profilesActiveCrawls.getEntry(profileHandle); final Map<String, String> mp = profileHandle == null ? null : sb.crawler.profilesActiveCrawls.get(profileHandle.getBytes());
profileEntry = mp == null ? null : new CrawlProfile(mp);
prop.put("crawler-queue_list_" + showNum + "_dark", dark ? "1" : "0"); prop.put("crawler-queue_list_" + showNum + "_dark", dark ? "1" : "0");
prop.putHTML("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName())); prop.putHTML("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name())); prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));

@ -28,7 +28,6 @@
// if the shell's current path is HTROOT // if the shell's current path is HTROOT
import java.io.File; import java.io.File;
import java.io.IOException;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
@ -102,13 +101,13 @@ public class ProxyIndexingMonitor_p {
if (sb.crawler.defaultProxyProfile == null) { if (sb.crawler.defaultProxyProfile == null) {
prop.put("info", "1"); //delete DATA/PLASMADB/crawlProfiles0.db prop.put("info", "1"); //delete DATA/PLASMADB/crawlProfiles0.db
} else { } else {
try {
assert sb.crawler.defaultProxyProfile.handle() != null; assert sb.crawler.defaultProxyProfile.handle() != null;
sb.crawler.profilesActiveCrawls.changeEntry(sb.crawler.defaultProxyProfile, "generalDepth", Integer.toString(newProxyPrefetchDepth)); sb.crawler.defaultProxyProfile.put("generalDepth", Integer.toString(newProxyPrefetchDepth));
sb.crawler.profilesActiveCrawls.changeEntry(sb.crawler.defaultProxyProfile, "storeHTCache", (proxyStoreHTCache) ? "true": "false"); sb.crawler.defaultProxyProfile.put("storeHTCache", (proxyStoreHTCache) ? "true": "false");
sb.crawler.profilesActiveCrawls.changeEntry(sb.crawler.defaultProxyProfile, "remoteIndexing",proxyIndexingRemote ? "true":"false"); sb.crawler.defaultProxyProfile.put("remoteIndexing",proxyIndexingRemote ? "true":"false");
sb.crawler.profilesActiveCrawls.changeEntry(sb.crawler.defaultProxyProfile, "indexText",proxyIndexingLocalText ? "true":"false"); sb.crawler.defaultProxyProfile.put("indexText",proxyIndexingLocalText ? "true":"false");
sb.crawler.profilesActiveCrawls.changeEntry(sb.crawler.defaultProxyProfile, "indexMedia",proxyIndexingLocalMedia ? "true":"false"); sb.crawler.defaultProxyProfile.put("indexMedia",proxyIndexingLocalMedia ? "true":"false");
sb.crawler.profilesActiveCrawls.put(sb.crawler.defaultProxyProfile.handle().getBytes(), sb.crawler.defaultProxyProfile);
prop.put("info", "2");//new proxyPrefetchdepth prop.put("info", "2");//new proxyPrefetchdepth
prop.put("info_message", newProxyPrefetchDepth); prop.put("info_message", newProxyPrefetchDepth);
@ -137,11 +136,6 @@ public class ProxyIndexingMonitor_p {
prop.put("info_restart", "0"); prop.put("info_restart", "0");
prop.put("info_restart_return", "0"); prop.put("info_restart_return", "0");
if (!oldProxyCachePath.equals(newProxyCachePath)) prop.put("info_restart", "1"); if (!oldProxyCachePath.equals(newProxyCachePath)) prop.put("info_restart", "1");
} catch (final IOException e) {
prop.put("info", "3"); //Error: errmsg
prop.putHTML("info_error", e.getMessage());
}
} }
} catch (final Exception e) { } catch (final Exception e) {

@ -143,9 +143,9 @@ public class QuickCrawlLink_p {
sb.crawlQueues.errorURL.remove(urlhash); sb.crawlQueues.errorURL.remove(urlhash);
// create crawling profile // create crawling profile
CrawlProfile.entry pe = null; CrawlProfile pe = null;
try { try {
pe = sb.crawler.profilesActiveCrawls.newEntry( pe = new CrawlProfile(
crawlingStartURL.getHost(), crawlingStartURL.getHost(),
crawlingStartURL, crawlingStartURL,
crawlingMustMatch, crawlingMustMatch,
@ -163,8 +163,8 @@ public class QuickCrawlLink_p {
xsstopw, xsstopw,
xdstopw, xdstopw,
xpstopw, xpstopw,
CrawlProfile.CacheStrategy.IFFRESH CrawlProfile.CacheStrategy.IFFRESH);
); sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
} catch (final Exception e) { } catch (final Exception e) {
// mist // mist
prop.put("mode_status", "2");//Error with url prop.put("mode_status", "2");//Error with url

@ -4,11 +4,9 @@
//$LastChangedBy$ //$LastChangedBy$
// //
import java.util.Iterator;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import de.anomic.crawler.CrawlProfile.entry; import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlSwitchboard; import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
@ -50,10 +48,9 @@ public class WatchWebStructure_p {
if (host.equals("auto")) { if (host.equals("auto")) {
// try to find the host from the crawl profiles // try to find the host from the crawl profiles
final Iterator<entry> it = sb.crawler.profilesActiveCrawls.profiles(true); CrawlProfile e;
entry e; for (byte[] handle: sb.crawler.profilesActiveCrawls.keySet()) {
while (it.hasNext()) { e = new CrawlProfile(sb.crawler.profilesActiveCrawls.get(handle));
e = it.next();
if (e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) || if (e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) ||
e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) || e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) ||
e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) || e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ||

@ -320,7 +320,7 @@ public class Balancer {
* @throws IOException * @throws IOException
* @throws RowSpaceExceededException * @throws RowSpaceExceededException
*/ */
public Request pop(final boolean delay, final CrawlProfile profile) throws IOException { public Request pop(final boolean delay, final Map<byte[], Map<String, String>> profiles) throws IOException {
// returns a crawl entry from the stack and ensures minimum delta times // returns a crawl entry from the stack and ensures minimum delta times
try { try {
@ -384,7 +384,8 @@ public class Balancer {
// at this point we must check if the crawlEntry has relevance because the crawl profile still exists // at this point we must check if the crawlEntry has relevance because the crawl profile still exists
// if not: return null. A calling method must handle the null value and try again // if not: return null. A calling method must handle the null value and try again
final CrawlProfile.entry profileEntry = (profile == null) ? null : profile.getEntry(crawlEntry.profileHandle()); final Map<String, String> mp = profiles == null ? null : profiles.get(crawlEntry.profileHandle());
final CrawlProfile profileEntry = mp == null ? null : new CrawlProfile(mp);
if (profileEntry == null) { if (profileEntry == null) {
Log.logWarning("Balancer", "no profile entry for handle " + crawlEntry.profileHandle()); Log.logWarning("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
return null; return null;

@ -4,7 +4,7 @@
// (C) by Michael Peter Christen; mc@yacy.net // (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de // first published on http://www.anomic.de
// Frankfurt, Germany, 2004 // Frankfurt, Germany, 2004
// last major change: 25.02.2004 // last major change: 31.08.2010
// //
// This program is free software; you can redistribute it and/or modify // This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by // it under the terms of the GNU General Public License as published by
@ -22,263 +22,25 @@
package de.anomic.crawler; package de.anomic.crawler;
import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import net.yacy.kelondro.blob.MapHeap;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.kelondro.order.Digest; import net.yacy.kelondro.order.Digest;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.kelondroException;
public class CrawlProfile { public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
private static final long serialVersionUID = 5527325718810703504L;
public static final String MATCH_ALL = ".*"; public static final String MATCH_ALL = ".*";
public static final String MATCH_NEVER = ""; public static final String MATCH_NEVER = "";
static ConcurrentHashMap<String, Map<String, DomProfile>> domsCache = new ConcurrentHashMap<String, Map<String, DomProfile>>();
MapHeap profileTable;
private final File profileTableFile;
public CrawlProfile(final File file) throws IOException {
//System.out.println("loading crawl profile from " + file);
this.profileTableFile = file;
profileTableFile.getParentFile().mkdirs();
profileTable = new MapHeap(profileTableFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
profileIterator pi = new profileIterator(true);
entry e;
while (pi.hasNext()) {
e = pi.next();
if (e == null) continue;
Log.logInfo("CrawlProfiles", "loaded Profile " + e.handle() + ": " + e.name());
}
}
public void clear() {
// deletes the profile database and creates a new one
if (profileTable != null) profileTable.close();
FileUtils.deletedelete(profileTableFile);
profileTableFile.getParentFile().mkdirs();
try {
profileTable = new MapHeap(profileTableFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
} catch (IOException e) {
Log.logException(e);
}
}
public void close() {
if (profileTable != null) profileTable.close();
this.profileTable = null;
}
public int size() {
return profileTable.size();
}
public Iterator<entry> profiles(final boolean up) {
// enumerates profile entries
try {
return new profileIterator(up);
} catch (final IOException e) {
Log.logException(e);
return new HashSet<entry>().iterator();
}
}
public class profileIterator implements Iterator<entry> {
// the iterator iterates all keys, which are byte[] objects
CloneableIterator<byte[]> handleIterator;
String lastkey;
public profileIterator(final boolean up) throws IOException {
handleIterator = profileTable.keys(up, false);
lastkey = null;
}
public boolean hasNext() {
try {
return handleIterator.hasNext();
} catch (final kelondroException e) {
Log.logException(e);
clear();
return false;
}
}
public entry next() {
try {
lastkey = new String(handleIterator.next());
return getEntry(lastkey);
} catch (final kelondroException e) {
Log.logException(e);
clear();
return null;
}
}
public void remove() {
if (lastkey != null) try {
removeEntry(lastkey.getBytes());
} catch (final kelondroException e) {
Log.logException(e);
clear();
}
}
}
public void removeEntry(final byte[] handle) {
try {
profileTable.delete(handle);
} catch (final IOException e) {
Log.logException(e);
}
}
public entry newEntry(final Map<String, String> mem) {
final entry ne = new entry(mem);
try {
profileTable.insert(ne.handle().getBytes(), ne.map());
} catch (final Exception e) {
clear();
try {
profileTable.insert(ne.handle().getBytes(), ne.map());
} catch (final Exception ee) {
Log.logException(e);
System.exit(0);
}
}
return ne;
}
public entry newEntry( final String name,
final DigestURI startURL,
final String mustmatch, final String mustnotmatch,
final int generalDepth,
final long recrawlIfOlder /*date*/, final int domFilterDepth, final int domMaxPages,
final boolean crawlingQ,
final boolean indexText, final boolean indexMedia,
final boolean storeHTCache, final boolean storeTXCache,
final boolean remoteIndexing,
final boolean xsstopw, final boolean xdstopw, final boolean xpstopw,
final CacheStrategy cacheStrategy) {
final entry ne = new entry(
name, startURL,
mustmatch, mustnotmatch,
generalDepth,
recrawlIfOlder, domFilterDepth, domMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, storeTXCache,
remoteIndexing,
xsstopw, xdstopw, xpstopw,
cacheStrategy);
try {
profileTable.insert(ne.handle().getBytes(), ne.map());
} catch (final Exception e) {
clear();
try {
profileTable.insert(ne.handle().getBytes(), ne.map());
} catch (final Exception ee) {
Log.logException(e);
System.exit(0);
}
}
return ne;
}
public boolean hasEntry(final String handle) {
return profileTable.containsKey(handle.getBytes());
}
public entry getEntry(final String handle) {
if (profileTable == null) return null;
Map<String, String> m;
try {
m = profileTable.get(handle.getBytes());
} catch (final IOException e) {
Log.logException(e);
return null;
} catch (RowSpaceExceededException e) {
Log.logException(e);
return null;
}
if (m == null) return null;
return new entry(m);
}
public void changeEntry(final entry e, final String propName, final String newValue) throws IOException, RowSpaceExceededException {
e.mem.put(propName, newValue);
assert e.handle() != null;
profileTable.insert(e.handle().getBytes(), e.mem);
}
public long getRecrawlDate(final long oldTimeMinutes) {
return System.currentTimeMillis() - (60000L * oldTimeMinutes);
}
public static class DomProfile {
public String referrer;
public int depth, count;
public DomProfile(final String ref, final int d) {
this.referrer = ref;
this.depth = d;
this.count = 1;
}
public void inc() {
this.count++;
}
}
public static enum CacheStrategy {
NOCACHE(0), // never use the cache, all content from fresh internet source
IFFRESH(1), // use the cache if the cache exists and is fresh using the proxy-fresh rules
IFEXIST(2), // use the cache if the cache exist. Do no check freshness. Otherwise use online source.
CACHEONLY(3); // never go online, use all content from cache. If no cache exist, treat content as unavailable
public int code;
private CacheStrategy(int code) {
this.code = code;
}
public String toString() {
return Integer.toString(this.code);
}
public static CacheStrategy decode(int code) {
for (CacheStrategy strategy: CacheStrategy.values()) if (strategy.code == code) return strategy;
return NOCACHE;
}
public static CacheStrategy parse(String name) {
if (name.equals("nocache")) return NOCACHE;
if (name.equals("iffresh")) return IFFRESH;
if (name.equals("ifexist")) return IFEXIST;
if (name.equals("cacheonly")) return CACHEONLY;
return null;
}
public String toName() {
return this.name().toLowerCase();
}
public boolean isAllowedToFetchOnline() {
return this.code < 3;
}
public boolean mustBeOffline() {
return this.code == 3;
}
}
public static class entry {
// this is a simple record structure that hold all properties of a single crawl start // this is a simple record structure that hold all properties of a single crawl start
public static final String HANDLE = "handle"; public static final String HANDLE = "handle";
public static final String NAME = "name"; public static final String NAME = "name";
public static final String START_URL = "startURL"; public static final String START_URL = "startURL";
@ -299,12 +61,11 @@ public class CrawlProfile {
public static final String XPSTOPW = "xpstopw"; public static final String XPSTOPW = "xpstopw";
public static final String CACHE_STRAGEGY = "cacheStrategy"; public static final String CACHE_STRAGEGY = "cacheStrategy";
private Map<String, String> mem;
private Map<String, DomProfile> doms; private Map<String, DomProfile> doms;
private Pattern mustmatch = null, mustnotmatch = null; private Pattern mustmatch = null, mustnotmatch = null;
public entry(final String name, final DigestURI startURL, public CrawlProfile(final String name, final DigestURI startURL,
final String mustmatch, final String mustmatch,
final String mustnotmatch, final String mustnotmatch,
final int depth, final int depth,
@ -316,83 +77,81 @@ public class CrawlProfile {
final boolean remoteIndexing, final boolean remoteIndexing,
final boolean xsstopw, final boolean xdstopw, final boolean xpstopw, final boolean xsstopw, final boolean xdstopw, final boolean xpstopw,
final CacheStrategy cacheStrategy) { final CacheStrategy cacheStrategy) {
super(40);
if (name == null || name.length() == 0) throw new NullPointerException("name must not be null"); if (name == null || name.length() == 0) throw new NullPointerException("name must not be null");
final String handle = (startURL == null) ? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength) : new String(startURL.hash()); final String handle = (startURL == null) ? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength) : new String(startURL.hash());
mem = new ConcurrentHashMap<String, String>(40); put(HANDLE, handle);
mem.put(HANDLE, handle); put(NAME, name);
mem.put(NAME, name); put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false));
mem.put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false)); put(FILTER_MUSTMATCH, (mustmatch == null) ? CrawlProfile.MATCH_ALL : mustmatch);
mem.put(FILTER_MUSTMATCH, (mustmatch == null) ? MATCH_ALL : mustmatch); put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? CrawlProfile.MATCH_NEVER : mustnotmatch);
mem.put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? MATCH_NEVER : mustnotmatch); put(DEPTH, depth);
mem.put(DEPTH, Integer.toString(depth)); put(RECRAWL_IF_OLDER, recrawlIfOlder);
mem.put(RECRAWL_IF_OLDER, Long.toString(recrawlIfOlder)); put(DOM_FILTER_DEPTH, domFilterDepth);
mem.put(DOM_FILTER_DEPTH, Integer.toString(domFilterDepth)); put(DOM_MAX_PAGES, domMaxPages);
mem.put(DOM_MAX_PAGES, Integer.toString(domMaxPages)); put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
mem.put(CRAWLING_Q, Boolean.toString(crawlingQ)); // crawling of urls with '?' put(INDEX_TEXT, indexText);
mem.put(INDEX_TEXT, Boolean.toString(indexText)); put(INDEX_MEDIA, indexMedia);
mem.put(INDEX_MEDIA, Boolean.toString(indexMedia)); put(STORE_HTCACHE, storeHTCache);
mem.put(STORE_HTCACHE, Boolean.toString(storeHTCache)); put(STORE_TXCACHE, storeTXCache);
mem.put(STORE_TXCACHE, Boolean.toString(storeTXCache)); put(REMOTE_INDEXING, remoteIndexing);
mem.put(REMOTE_INDEXING, Boolean.toString(remoteIndexing)); put(XSSTOPW, xsstopw); // exclude static stop-words
mem.put(XSSTOPW, Boolean.toString(xsstopw)); // exclude static stop-words put(XDSTOPW, xdstopw); // exclude dynamic stop-word
mem.put(XDSTOPW, Boolean.toString(xdstopw)); // exclude dynamic stop-word put(XPSTOPW, xpstopw); // exclude parent stop-words
mem.put(XPSTOPW, Boolean.toString(xpstopw)); // exclude parent stop-words put(CACHE_STRAGEGY, cacheStrategy.toString());
mem.put(CACHE_STRAGEGY, cacheStrategy.toString());
doms = new ConcurrentHashMap<String, DomProfile>(); doms = new ConcurrentHashMap<String, DomProfile>();
} }
@Override public CrawlProfile(Map<String, String> ext) {
public String toString() { super(ext == null ? 1 : ext.size());
final StringBuilder str = new StringBuilder(); if (ext != null) this.putAll(ext);
doms = new ConcurrentHashMap<String, DomProfile>();
if (this.mem != null) {
str.append(this.mem.toString());
} }
return str.toString(); public void put(String key, boolean value) {
super.put(key, Boolean.toString(value));
} }
public entry(final Map<String, String> mem) { public void put(String key, int value) {
this.mem = mem; super.put(key, Integer.toString(value));
this.doms = domsCache.get(this.mem.get(HANDLE));
if (this.doms == null) this.doms = new ConcurrentHashMap<String, DomProfile>();
} }
public Map<String, String> map() { public void put(String key, long value) {
return mem; super.put(key, Long.toString(value));
} }
public String handle() { public String handle() {
final String r = mem.get(HANDLE); final String r = get(HANDLE);
//if (r == null) return null; //if (r == null) return null;
return r; return r;
} }
public String name() { public String name() {
final String r = mem.get(NAME); final String r = get(NAME);
if (r == null) return ""; if (r == null) return "";
return r; return r;
} }
public String startURL() { public String startURL() {
final String r = mem.get(START_URL); final String r = get(START_URL);
return r; return r;
} }
public Pattern mustMatchPattern() { public Pattern mustMatchPattern() {
if (this.mustmatch == null) { if (this.mustmatch == null) {
String r = mem.get(FILTER_MUSTMATCH); String r = get(FILTER_MUSTMATCH);
if (r == null) r = MATCH_ALL; if (r == null) r = CrawlProfile.MATCH_ALL;
this.mustmatch = Pattern.compile(r); this.mustmatch = Pattern.compile(r);
} }
return this.mustmatch; return this.mustmatch;
} }
public Pattern mustNotMatchPattern() { public Pattern mustNotMatchPattern() {
if (this.mustnotmatch == null) { if (this.mustnotmatch == null) {
String r = mem.get(FILTER_MUSTNOTMATCH); String r = get(FILTER_MUSTNOTMATCH);
if (r == null) r = MATCH_NEVER; if (r == null) r = CrawlProfile.MATCH_NEVER;
this.mustnotmatch = Pattern.compile(r); this.mustnotmatch = Pattern.compile(r);
} }
return this.mustnotmatch; return this.mustnotmatch;
} }
public int depth() { public int depth() {
final String r = mem.get(DEPTH); final String r = get(DEPTH);
if (r == null) return 0; if (r == null) return 0;
try { try {
return Integer.parseInt(r); return Integer.parseInt(r);
@ -402,7 +161,7 @@ public class CrawlProfile {
} }
} }
public CacheStrategy cacheStrategy() { public CacheStrategy cacheStrategy() {
final String r = mem.get(CACHE_STRAGEGY); final String r = get(CACHE_STRAGEGY);
if (r == null) return CacheStrategy.IFFRESH; if (r == null) return CacheStrategy.IFFRESH;
try { try {
return CacheStrategy.decode(Integer.parseInt(r)); return CacheStrategy.decode(Integer.parseInt(r));
@ -412,12 +171,12 @@ public class CrawlProfile {
} }
} }
public void setCacheStrategy(CacheStrategy newStrategy) { public void setCacheStrategy(CacheStrategy newStrategy) {
mem.put(CACHE_STRAGEGY, newStrategy.toString()); put(CACHE_STRAGEGY, newStrategy.toString());
} }
public long recrawlIfOlder() { public long recrawlIfOlder() {
// returns a long (millis) that is the minimum age that // returns a long (millis) that is the minimum age that
// an entry must have to be re-crawled // an entry must have to be re-crawled
final String r = mem.get(RECRAWL_IF_OLDER); final String r = get(RECRAWL_IF_OLDER);
if (r == null) return 0L; if (r == null) return 0L;
try { try {
final long l = Long.parseLong(r); final long l = Long.parseLong(r);
@ -431,7 +190,7 @@ public class CrawlProfile {
// if the depth is equal or less to this depth, // if the depth is equal or less to this depth,
// then the current url feeds with its domain the crawl filter // then the current url feeds with its domain the crawl filter
// if this is -1, all domains are feeded // if this is -1, all domains are feeded
final String r = mem.get(DOM_FILTER_DEPTH); final String r = get(DOM_FILTER_DEPTH);
if (r == null) return Integer.MAX_VALUE; if (r == null) return Integer.MAX_VALUE;
try { try {
final int i = Integer.parseInt(r); final int i = Integer.parseInt(r);
@ -445,7 +204,7 @@ public class CrawlProfile {
public int domMaxPages() { public int domMaxPages() {
// this is the maximum number of pages that are crawled for a single domain // this is the maximum number of pages that are crawled for a single domain
// if -1, this means no limit // if -1, this means no limit
final String r = mem.get(DOM_MAX_PAGES); final String r = get(DOM_MAX_PAGES);
if (r == null) return Integer.MAX_VALUE; if (r == null) return Integer.MAX_VALUE;
try { try {
final int i = Integer.parseInt(r); final int i = Integer.parseInt(r);
@ -457,47 +216,47 @@ public class CrawlProfile {
} }
} }
public boolean crawlingQ() { public boolean crawlingQ() {
final String r = mem.get(CRAWLING_Q); final String r = get(CRAWLING_Q);
if (r == null) return false; if (r == null) return false;
return (r.equals(Boolean.TRUE.toString())); return (r.equals(Boolean.TRUE.toString()));
} }
public boolean indexText() { public boolean indexText() {
final String r = mem.get(INDEX_TEXT); final String r = get(INDEX_TEXT);
if (r == null) return true; if (r == null) return true;
return (r.equals(Boolean.TRUE.toString())); return (r.equals(Boolean.TRUE.toString()));
} }
public boolean indexMedia() { public boolean indexMedia() {
final String r = mem.get(INDEX_MEDIA); final String r = get(INDEX_MEDIA);
if (r == null) return true; if (r == null) return true;
return (r.equals(Boolean.TRUE.toString())); return (r.equals(Boolean.TRUE.toString()));
} }
public boolean storeHTCache() { public boolean storeHTCache() {
final String r = mem.get(STORE_HTCACHE); final String r = get(STORE_HTCACHE);
if (r == null) return false; if (r == null) return false;
return (r.equals(Boolean.TRUE.toString())); return (r.equals(Boolean.TRUE.toString()));
} }
public boolean storeTXCache() { public boolean storeTXCache() {
final String r = mem.get(STORE_TXCACHE); final String r = get(STORE_TXCACHE);
if (r == null) return false; if (r == null) return false;
return (r.equals(Boolean.TRUE.toString())); return (r.equals(Boolean.TRUE.toString()));
} }
public boolean remoteIndexing() { public boolean remoteIndexing() {
final String r = mem.get(REMOTE_INDEXING); final String r = get(REMOTE_INDEXING);
if (r == null) return false; if (r == null) return false;
return (r.equals(Boolean.TRUE.toString())); return (r.equals(Boolean.TRUE.toString()));
} }
public boolean excludeStaticStopwords() { public boolean excludeStaticStopwords() {
final String r = mem.get(XSSTOPW); final String r = get(XSSTOPW);
if (r == null) return false; if (r == null) return false;
return (r.equals(Boolean.TRUE.toString())); return (r.equals(Boolean.TRUE.toString()));
} }
public boolean excludeDynamicStopwords() { public boolean excludeDynamicStopwords() {
final String r = mem.get(XDSTOPW); final String r = get(XDSTOPW);
if (r == null) return false; if (r == null) return false;
return (r.equals(Boolean.TRUE.toString())); return (r.equals(Boolean.TRUE.toString()));
} }
public boolean excludeParentStopwords() { public boolean excludeParentStopwords() {
final String r = mem.get(XPSTOPW); final String r = get(XPSTOPW);
if (r == null) return false; if (r == null) return false;
return (r.equals(Boolean.TRUE.toString())); return (r.equals(Boolean.TRUE.toString()));
} }
@ -510,7 +269,6 @@ public class CrawlProfile {
// increase counter // increase counter
dp.inc(); dp.inc();
} }
domsCache.put(this.mem.get(HANDLE), doms);
} }
public boolean grantedDomAppearance(final String domain) { public boolean grantedDomAppearance(final String domain) {
final int max = domFilterDepth(); final int max = domFilterDepth();
@ -556,6 +314,59 @@ public class CrawlProfile {
} }
return domname; return domname;
} }
public final static class DomProfile {
public String referrer;
public int depth, count;
public DomProfile(final String ref, final int d) {
this.referrer = ref;
this.depth = d;
this.count = 1;
}
public void inc() {
this.count++;
}
} }
public static enum CacheStrategy {
NOCACHE(0), // never use the cache, all content from fresh internet source
IFFRESH(1), // use the cache if the cache exists and is fresh using the proxy-fresh rules
IFEXIST(2), // use the cache if the cache exist. Do no check freshness. Otherwise use online source.
CACHEONLY(3); // never go online, use all content from cache. If no cache exist, treat content as unavailable
public int code;
private CacheStrategy(int code) {
this.code = code;
}
public String toString() {
return Integer.toString(this.code);
}
public static CacheStrategy decode(int code) {
for (CacheStrategy strategy: CacheStrategy.values()) if (strategy.code == code) return strategy;
return NOCACHE;
}
public static CacheStrategy parse(String name) {
if (name.equals("nocache")) return NOCACHE;
if (name.equals("iffresh")) return IFFRESH;
if (name.equals("ifexist")) return IFEXIST;
if (name.equals("cacheonly")) return CACHEONLY;
return null;
}
public String toName() {
return this.name().toLowerCase();
}
public boolean isAllowedToFetchOnline() {
return this.code < 3;
}
public boolean mustBeOffline() {
return this.code == 3;
}
}
public static long getRecrawlDate(final long oldTimeMinutes) {
return System.currentTimeMillis() - (60000L * oldTimeMinutes);
}
} }

@ -47,7 +47,6 @@ import net.yacy.kelondro.workflow.WorkflowJob;
import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response; import de.anomic.crawler.retrieval.Response;
//import de.anomic.http.client.Client;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants; import de.anomic.search.SwitchboardConstants;
import de.anomic.yacy.yacyClient; import de.anomic.yacy.yacyClient;
@ -252,14 +251,14 @@ public class CrawlQueues {
* @return * @return
*/ */
private void generateCrawl(Request urlEntry, final String stats, final String profileHandle) { private void generateCrawl(Request urlEntry, final String stats, final String profileHandle) {
final CrawlProfile.entry profile = sb.crawler.profilesActiveCrawls.getEntry(profileHandle); final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(profileHandle.getBytes());
if (profile != null) { if (mp != null) {
// check if the protocol is supported // check if the protocol is supported
final DigestURI url = urlEntry.url(); final DigestURI url = urlEntry.url();
final String urlProtocol = url.getProtocol(); final String urlProtocol = url.getProtocol();
if (sb.loader.isSupportedProtocol(urlProtocol)) { if (sb.loader.isSupportedProtocol(urlProtocol)) {
CrawlProfile profile = new CrawlProfile(mp);
if (this.log.isFine()) if (this.log.isFine())
log.logFine(stats + ": URL=" + urlEntry.url() log.logFine(stats + ": URL=" + urlEntry.url()
+ ", initiator=" + ((urlEntry.initiator() == null) ? "" : new String(urlEntry.initiator())) + ", initiator=" + ((urlEntry.initiator() == null) ? "" : new String(urlEntry.initiator()))
@ -556,7 +555,8 @@ public class CrawlQueues {
try { try {
request.setStatus("loading", WorkflowJob.STATUS_RUNNING); request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
CrawlProfile.entry e = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()); final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
CrawlProfile e = mp == null ? null : new CrawlProfile(mp);
Response response = sb.loader.load(request, e == null ? CrawlProfile.CacheStrategy.IFEXIST : e.cacheStrategy(), maxFileSize); Response response = sb.loader.load(request, e == null ? CrawlProfile.CacheStrategy.IFEXIST : e.cacheStrategy(), maxFileSize);
if (response == null) { if (response == null) {
request.setStatus("error", WorkflowJob.STATUS_FINISHED); request.setStatus("error", WorkflowJob.STATUS_FINISHED);

@ -31,6 +31,7 @@ package de.anomic.crawler;
import java.net.InetAddress; import java.net.InetAddress;
import java.net.UnknownHostException; import java.net.UnknownHostException;
import java.util.Date; import java.util.Date;
import java.util.Map;
import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.Domains;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
@ -180,7 +181,8 @@ public final class CrawlStacker {
// returns null if successful, a reason string if not successful // returns null if successful, a reason string if not successful
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'"); //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
final CrawlProfile.entry profile = crawler.profilesActiveCrawls.getEntry(entry.profileHandle()); final Map<String, String> mp = crawler.profilesActiveCrawls.get(entry.profileHandle().getBytes());
CrawlProfile profile = mp == null ? null : new CrawlProfile(mp);
String error; String error;
if (profile == null) { if (profile == null) {
error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url(); error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
@ -248,7 +250,7 @@ public final class CrawlStacker {
return null; return null;
} }
public String checkAcceptance(final DigestURI url, final CrawlProfile.entry profile, int depth) { public String checkAcceptance(final DigestURI url, final CrawlProfile profile, int depth) {
// check if the protocol is supported // check if the protocol is supported
final String urlProtocol = url.getProtocol(); final String urlProtocol = url.getProtocol();

@ -28,11 +28,12 @@ package de.anomic.crawler;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.Iterator; import java.util.Map;
import de.anomic.crawler.CrawlProfile.CacheStrategy;
import net.yacy.kelondro.blob.MapHeap;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.kelondroException; import net.yacy.kelondro.util.kelondroException;
@ -57,12 +58,12 @@ public final class CrawlSwitchboard {
public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L; public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L;
private final Log log; private final Log log;
public CrawlProfile profilesActiveCrawls, profilesPassiveCrawls; public Map<byte[], Map<String, String>> profilesActiveCrawls, profilesPassiveCrawls;
public CrawlProfile.entry defaultProxyProfile; public CrawlProfile defaultProxyProfile;
public CrawlProfile.entry defaultRemoteProfile; public CrawlProfile defaultRemoteProfile;
public CrawlProfile.entry defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile; public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
public CrawlProfile.entry defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile; public CrawlProfile defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
public CrawlProfile.entry defaultSurrogateProfile; public CrawlProfile defaultSurrogateProfile;
private final File queuesRoot; private final File queuesRoot;
public CrawlSwitchboard( public CrawlSwitchboard(
@ -82,43 +83,44 @@ public final class CrawlSwitchboard {
this.queuesRoot = queuesRoot; this.queuesRoot = queuesRoot;
this.queuesRoot.mkdirs(); this.queuesRoot.mkdirs();
this.log.logConfig("Initializing Crawl Profiles"); this.log.logConfig("Initializing Crawl Profiles");
final File profilesActiveFile = new File(queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES); final File profilesActiveFile = new File(queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
if (!profilesActiveFile.exists()) {
// migrate old file
final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), "PLASMADB"), "crawlProfilesActive1.db");
if (oldFile.exists()) oldFile.renameTo(profilesActiveFile);
}
try { try {
this.profilesActiveCrawls = new CrawlProfile(profilesActiveFile); this.profilesActiveCrawls = new MapHeap(profilesActiveFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
} catch (IOException e) { } catch (IOException e) {
Log.logException(e);Log.logException(e); Log.logException(e);Log.logException(e);
FileUtils.deletedelete(profilesActiveFile); FileUtils.deletedelete(profilesActiveFile);
try { try {
this.profilesActiveCrawls = new CrawlProfile(profilesActiveFile); this.profilesActiveCrawls = new MapHeap(profilesActiveFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
} catch (IOException e1) { } catch (IOException e1) {
Log.logException(e1); Log.logException(e1);
this.profilesActiveCrawls = null; this.profilesActiveCrawls = null;
} }
} }
for (byte[] handle: this.profilesActiveCrawls.keySet()) {
CrawlProfile p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
}
initActiveCrawlProfiles(); initActiveCrawlProfiles();
log.logInfo("Loaded active crawl profiles from file " + profilesActiveFile.getName() + ", " + this.profilesActiveCrawls.size() + " entries"); log.logInfo("Loaded active crawl profiles from file " + profilesActiveFile.getName() + ", " + this.profilesActiveCrawls.size() + " entries");
final File profilesPassiveFile = new File(queuesRoot, DBFILE_PASSIVE_CRAWL_PROFILES); final File profilesPassiveFile = new File(queuesRoot, DBFILE_PASSIVE_CRAWL_PROFILES);
if (!profilesPassiveFile.exists()) {
// migrate old file
final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), "PLASMADB"), "crawlProfilesPassive1.db");
if (oldFile.exists()) oldFile.renameTo(profilesPassiveFile);
}
try { try {
this.profilesPassiveCrawls = new CrawlProfile(profilesPassiveFile); this.profilesPassiveCrawls = new MapHeap(profilesPassiveFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
} catch (IOException e) { } catch (IOException e) {
FileUtils.deletedelete(profilesPassiveFile); Log.logException(e);Log.logException(e);
FileUtils.deletedelete(profilesActiveFile);
try { try {
this.profilesPassiveCrawls = new CrawlProfile(profilesPassiveFile); this.profilesPassiveCrawls = new MapHeap(profilesPassiveFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
} catch (IOException e1) { } catch (IOException e1) {
Log.logException(e1); Log.logException(e1);
this.profilesPassiveCrawls = null; this.profilesPassiveCrawls = null;
} }
} }
for (byte[] handle: this.profilesPassiveCrawls.keySet()) {
CrawlProfile p = new CrawlProfile(this.profilesPassiveCrawls.get(handle));
Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
}
log.logInfo("Loaded passive crawl profiles from file " + profilesPassiveFile.getName() + log.logInfo("Loaded passive crawl profiles from file " + profilesPassiveFile.getName() +
", " + this.profilesPassiveCrawls.size() + " entries" + ", " + this.profilesPassiveCrawls.size() + " entries" +
", " + profilesPassiveFile.length()/1024); ", " + profilesPassiveFile.length()/1024);
@ -135,12 +137,11 @@ public final class CrawlSwitchboard {
this.defaultMediaSnippetLocalProfile = null; this.defaultMediaSnippetLocalProfile = null;
this.defaultMediaSnippetGlobalProfile = null; this.defaultMediaSnippetGlobalProfile = null;
this.defaultSurrogateProfile = null; this.defaultSurrogateProfile = null;
final Iterator<CrawlProfile.entry> i = this.profilesActiveCrawls.profiles(true); CrawlProfile profile;
CrawlProfile.entry profile;
String name; String name;
try { try {
while (i.hasNext()) { for (byte[] handle: this.profilesActiveCrawls.keySet()) {
profile = i.next(); profile = new CrawlProfile(this.profilesActiveCrawls.get(handle));
name = profile.name(); name = profile.name();
if (name.equals(CRAWL_PROFILE_PROXY)) this.defaultProxyProfile = profile; if (name.equals(CRAWL_PROFILE_PROXY)) this.defaultProxyProfile = profile;
if (name.equals(CRAWL_PROFILE_REMOTE)) this.defaultRemoteProfile = profile; if (name.equals(CRAWL_PROFILE_REMOTE)) this.defaultRemoteProfile = profile;
@ -163,45 +164,52 @@ public final class CrawlSwitchboard {
if (this.defaultProxyProfile == null) { if (this.defaultProxyProfile == null) {
// generate new default entry for proxy crawling // generate new default entry for proxy crawling
this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, this.defaultProxyProfile = new CrawlProfile("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/, 0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/, true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/, true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
true, true, true, true,
false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, true, true, true, false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, true, true, true,
CrawlProfile.CacheStrategy.IFFRESH); CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultProxyProfile.handle().getBytes(), this.defaultProxyProfile);
} }
if (this.defaultRemoteProfile == null) { if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling // generate new default entry for remote crawling
defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
-1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); -1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultRemoteProfile.handle().getBytes(), this.defaultRemoteProfile);
} }
if (this.defaultTextSnippetLocalProfile == null) { if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultTextSnippetLocalProfile.handle().getBytes(), this.defaultTextSnippetLocalProfile);
} }
if (this.defaultTextSnippetGlobalProfile == null) { if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultTextSnippetGlobalProfile.handle().getBytes(), this.defaultTextSnippetGlobalProfile);
} }
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST); this.defaultTextSnippetGlobalProfile.setCacheStrategy(CrawlProfile.CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) { if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultMediaSnippetLocalProfile.handle().getBytes(), this.defaultMediaSnippetLocalProfile);
} }
if (this.defaultMediaSnippetGlobalProfile == null) { if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultMediaSnippetGlobalProfile.handle().getBytes(), this.defaultMediaSnippetGlobalProfile);
} }
if (this.defaultSurrogateProfile == null) { if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing // generate new default entry for surrogate parsing
defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE); CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE);
this.profilesActiveCrawls.put(this.defaultSurrogateProfile.handle().getBytes(), this.defaultSurrogateProfile);
} }
} }
@ -209,24 +217,24 @@ public final class CrawlSwitchboard {
final File pdb = new File(this.queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES); final File pdb = new File(this.queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
if (pdb.exists()) FileUtils.deletedelete(pdb); if (pdb.exists()) FileUtils.deletedelete(pdb);
try { try {
profilesActiveCrawls = new CrawlProfile(pdb); this.profilesActiveCrawls = new MapHeap(pdb, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
} catch (IOException e) { } catch (IOException e1) {
Log.logException(e); Log.logException(e1);
this.profilesActiveCrawls = null;
} }
initActiveCrawlProfiles(); initActiveCrawlProfiles();
} }
public boolean cleanProfiles() throws InterruptedException { public boolean cleanProfiles() throws InterruptedException {
final Iterator<CrawlProfile.entry> iter = profilesActiveCrawls.profiles(true); CrawlProfile entry;
CrawlProfile.entry entry;
boolean hasDoneSomething = false; boolean hasDoneSomething = false;
try { try {
while (iter.hasNext()) { for (byte[] handle: profilesActiveCrawls.keySet()) {
// check for interruption // check for interruption
if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress"); if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress");
// getting next profile // getting next profile
entry = iter.next(); entry = new CrawlProfile(profilesActiveCrawls.get(handle));
if (!((entry.name().equals(CRAWL_PROFILE_PROXY)) || if (!((entry.name().equals(CRAWL_PROFILE_PROXY)) ||
(entry.name().equals(CRAWL_PROFILE_REMOTE)) || (entry.name().equals(CRAWL_PROFILE_REMOTE)) ||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) || (entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) ||
@ -234,8 +242,9 @@ public final class CrawlSwitchboard {
(entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) || (entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) ||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) || (entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) ||
(entry.name().equals(CRAWL_PROFILE_SURROGATE)))) { (entry.name().equals(CRAWL_PROFILE_SURROGATE)))) {
profilesPassiveCrawls.newEntry(entry.map()); CrawlProfile p = new CrawlProfile(entry);
iter.remove(); profilesPassiveCrawls.put(p.handle().getBytes(), p);
profilesActiveCrawls.remove(handle);
hasDoneSomething = true; hasDoneSomething = true;
} }
} }
@ -248,8 +257,8 @@ public final class CrawlSwitchboard {
public void close() { public void close() {
this.profilesActiveCrawls.close(); ((MapHeap) this.profilesActiveCrawls).close();
this.profilesPassiveCrawls.close(); ((MapHeap) this.profilesPassiveCrawls).close();
} }
} }
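For orientation, the storage side of the CrawlSwitchboard change above: active and passive profiles now live in a plain MapHeap keyed by the profile handle, and typed CrawlProfile objects are rebuilt from the stored String maps on demand. A condensed sketch using the same constructor and iteration as the hunk; the heap file path is an illustrative placeholder, not the real DBFILE_ACTIVE_CRAWL_PROFILES value:

    import java.io.File;
    import java.io.IOException;
    import java.util.Map;

    import net.yacy.kelondro.blob.MapHeap;
    import net.yacy.kelondro.data.word.Word;
    import net.yacy.kelondro.order.NaturalOrder;

    import de.anomic.crawler.CrawlProfile;

    public final class ProfileStoreSketch {

        public static void main(final String[] args) throws IOException {
            // open (or create) the profile heap; parameters mirror the constructor call in the hunk above
            final Map<byte[], Map<String, String>> profiles = new MapHeap(
                    new File("DATA/QUEUES/crawlProfilesActive.heap"), // placeholder path
                    Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');

            // every stored entry is just a String->String map; rebuild a profile to read typed fields
            for (final byte[] handle : profiles.keySet()) {
                final CrawlProfile p = new CrawlProfile(profiles.get(handle));
                System.out.println("loaded profile " + p.handle() + ": " + p.name());
            }

            ((MapHeap) profiles).close();
        }
    }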

@ -29,6 +29,7 @@ import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map;
import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.index.RowSpaceExceededException;
@ -213,18 +214,18 @@ public class NoticedURL {
} }
} }
public Request pop(final int stackType, final boolean delay, CrawlProfile profile) throws IOException { public Request pop(final int stackType, final boolean delay, Map<byte[], Map<String, String>> profiles) throws IOException {
switch (stackType) { switch (stackType) {
case STACK_TYPE_CORE: return pop(coreStack, delay, profile); case STACK_TYPE_CORE: return pop(coreStack, delay, profiles);
case STACK_TYPE_LIMIT: return pop(limitStack, delay, profile); case STACK_TYPE_LIMIT: return pop(limitStack, delay, profiles);
case STACK_TYPE_REMOTE: return pop(remoteStack, delay, profile); case STACK_TYPE_REMOTE: return pop(remoteStack, delay, profiles);
default: return null; default: return null;
} }
} }
public void shift(final int fromStack, final int toStack, CrawlProfile profile) { public void shift(final int fromStack, final int toStack, Map<byte[], Map<String, String>> profiles) {
try { try {
final Request entry = pop(fromStack, false, profile); final Request entry = pop(fromStack, false, profiles);
if (entry != null) push(toStack, entry); if (entry != null) push(toStack, entry);
} catch (final IOException e) { } catch (final IOException e) {
return; return;
@ -241,14 +242,14 @@ public class NoticedURL {
} }
} }
private Request pop(final Balancer balancer, final boolean delay, CrawlProfile profile) throws IOException { private Request pop(final Balancer balancer, final boolean delay, Map<byte[], Map<String, String>> profiles) throws IOException {
// this is a filo - pop // this is a filo - pop
int s; int s;
Request entry; Request entry;
int errors = 0; int errors = 0;
synchronized (balancer) { synchronized (balancer) {
while ((s = balancer.size()) > 0) { while ((s = balancer.size()) > 0) {
entry = balancer.pop(delay, profile); entry = balancer.pop(delay, profiles);
if (entry == null) { if (entry == null) {
if (s > balancer.size()) continue; if (s > balancer.size()) continue;
errors++; errors++;

@ -35,7 +35,7 @@ public class SitemapImporter extends AbstractImporter implements Importer {
private final DigestURI sitemapURL; private final DigestURI sitemapURL;
private final ImporterManager superviser; private final ImporterManager superviser;
public SitemapImporter(final Switchboard sb, final ImporterManager importManager, final DigestURI sitemapURL, final CrawlProfile.entry profileEntry) throws ImporterException { public SitemapImporter(final Switchboard sb, final ImporterManager importManager, final DigestURI sitemapURL, final CrawlProfile profileEntry) throws ImporterException {
super("sitemap"); super("sitemap");
this.superviser = importManager; this.superviser = importManager;
try { try {

@ -31,6 +31,7 @@ import java.io.ByteArrayOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.PrintStream; import java.io.PrintStream;
import java.util.Date; import java.util.Date;
import java.util.Map;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.HeaderFramework;
@ -40,6 +41,7 @@ import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.Latency; import de.anomic.crawler.Latency;
import de.anomic.search.Segments; import de.anomic.search.Segments;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
@ -124,12 +126,13 @@ public class FTPLoader {
ResponseHeader responseHeader = new ResponseHeader(); ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date())); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
response = new Response( response = new Response(
request, request,
requestHeader, requestHeader,
responseHeader, responseHeader,
"200", "200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), mp == null ? null : new CrawlProfile(mp),
dirList.toString().getBytes()); dirList.toString().getBytes());
} }
} else { } else {
@ -237,12 +240,13 @@ public class FTPLoader {
// create response with metadata only // create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain"); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response( Response response = new Response(
request, request,
requestHeader, requestHeader,
responseHeader, responseHeader,
"200", "200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), mp == null ? null : new CrawlProfile(mp),
url.toNormalform(true, true).getBytes()); url.toNormalform(true, true).getBytes());
return response; return response;
} }
@ -254,12 +258,13 @@ public class FTPLoader {
byte[] b = ftpClient.get(path); byte[] b = ftpClient.get(path);
// create a response // create a response
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response( Response response = new Response(
request, request,
requestHeader, requestHeader,
responseHeader, responseHeader,
"200", "200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), mp == null ? null : new CrawlProfile(mp),
b); b);
return response; return response;
} }

@ -25,9 +25,11 @@ import java.io.InputStream;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import java.util.Map;
import de.anomic.search.Segments; import de.anomic.search.Segments;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.MimeTable; import de.anomic.data.MimeTable;
import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.HeaderFramework;
@ -81,12 +83,13 @@ public class FileLoader {
ResponseHeader responseHeader = new ResponseHeader(); ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date())); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response( Response response = new Response(
request, request,
requestHeader, requestHeader,
responseHeader, responseHeader,
"200", "200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), mp == null ? null : new CrawlProfile(mp),
content.toString().getBytes()); content.toString().getBytes());
return response; return response;
@ -115,12 +118,13 @@ public class FileLoader {
// create response with metadata only // create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain"); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response( Response response = new Response(
request, request,
requestHeader, requestHeader,
responseHeader, responseHeader,
"200", "200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), mp == null ? null : new CrawlProfile(mp),
url.toNormalform(true, true).getBytes()); url.toNormalform(true, true).getBytes());
return response; return response;
} }
@ -131,12 +135,13 @@ public class FileLoader {
is.close(); is.close();
// create response with loaded content // create response with loaded content
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response( Response response = new Response(
request, request,
requestHeader, requestHeader,
responseHeader, responseHeader,
"200", "200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), mp == null ? null : new CrawlProfile(mp),
b); b);
return response; return response;
} }

@ -26,6 +26,7 @@ package de.anomic.crawler.retrieval;
import java.io.IOException; import java.io.IOException;
import java.util.Date; import java.util.Date;
import java.util.Map;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.HeaderFramework;
@ -36,6 +37,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.repository.Blacklist; import net.yacy.repository.Blacklist;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.Latency; import de.anomic.crawler.Latency;
import de.anomic.search.Segments; import de.anomic.search.Segments;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
@ -146,6 +148,7 @@ public final class HTTPLoader {
} }
// create a new cache entry // create a new cache entry
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
response = new Response( response = new Response(
request, request,
requestHeader, requestHeader,
@ -153,7 +156,7 @@ public final class HTTPLoader {
// res.getStatusLine(), // res.getStatusLine(),
header, header,
Integer.toString(code), Integer.toString(code),
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), mp == null ? null : new CrawlProfile(mp),
responseBody responseBody
); );

@ -61,7 +61,7 @@ public class Response {
private final RequestHeader requestHeader; private final RequestHeader requestHeader;
private final ResponseHeader responseHeader; private final ResponseHeader responseHeader;
private final String responseStatus; private final String responseStatus;
private final CrawlProfile.entry profile; private final CrawlProfile profile;
private byte[] content; private byte[] content;
private int status; // tracker indexing status, see status defs below private int status; // tracker indexing status, see status defs below
@ -148,7 +148,7 @@ public class Response {
final RequestHeader requestHeader, final RequestHeader requestHeader,
final ResponseHeader responseHeader, final ResponseHeader responseHeader,
final String responseStatus, final String responseStatus,
final CrawlProfile.entry profile, final CrawlProfile profile,
final byte[] content) { final byte[] content) {
this.request = request; this.request = request;
// request and response headers may be zero in case that we process surrogates // request and response headers may be zero in case that we process surrogates
@ -165,7 +165,7 @@ public class Response {
final RequestHeader requestHeader, final RequestHeader requestHeader,
final ResponseHeader responseHeader, final ResponseHeader responseHeader,
final String responseStatus, final String responseStatus,
final CrawlProfile.entry profile) { final CrawlProfile profile) {
this(request, requestHeader, responseHeader, responseStatus, profile, null); this(request, requestHeader, responseHeader, responseStatus, profile, null);
} }
@ -216,7 +216,7 @@ public class Response {
return this.url().language(); return this.url().language();
} }
public CrawlProfile.entry profile() { public CrawlProfile profile() {
return this.profile; return this.profile;
} }

@ -34,6 +34,7 @@ import java.net.UnknownHostException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import java.util.Map;
import jcifs.smb.SmbException; import jcifs.smb.SmbException;
import jcifs.smb.SmbFile; import jcifs.smb.SmbFile;
@ -41,6 +42,7 @@ import jcifs.smb.SmbFileInputStream;
import de.anomic.search.Segments; import de.anomic.search.Segments;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.MimeTable; import de.anomic.data.MimeTable;
import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.HeaderFramework;
@ -100,12 +102,13 @@ public class SMBLoader {
ResponseHeader responseHeader = new ResponseHeader(); ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date())); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response( Response response = new Response(
request, request,
requestHeader, requestHeader,
responseHeader, responseHeader,
"200", "200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), mp == null ? null : new CrawlProfile(mp),
content.toString().getBytes()); content.toString().getBytes());
return response; return response;
@ -134,12 +137,13 @@ public class SMBLoader {
// create response with metadata only // create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain"); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response( Response response = new Response(
request, request,
requestHeader, requestHeader,
responseHeader, responseHeader,
"200", "200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), mp == null ? null : new CrawlProfile(mp),
url.toNormalform(true, true).getBytes()); url.toNormalform(true, true).getBytes());
return response; return response;
} }
@ -150,12 +154,13 @@ public class SMBLoader {
is.close(); is.close();
// create response with loaded content // create response with loaded content
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response( Response response = new Response(
request, request,
requestHeader, requestHeader,
responseHeader, responseHeader,
"200", "200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), mp == null ? null : new CrawlProfile(mp),
b); b);
return response; return response;
} }

@ -94,7 +94,7 @@ public class SitemapParser extends DefaultHandler {
/** /**
* The crawling profile used to parse the URLs contained in the sitemap file * The crawling profile used to parse the URLs contained in the sitemap file
*/ */
private CrawlProfile.entry crawlingProfile = null; private CrawlProfile crawlingProfile = null;
/** /**
* Name of the current XML element * Name of the current XML element
@ -137,7 +137,7 @@ public class SitemapParser extends DefaultHandler {
private Date lastMod = null; private Date lastMod = null;
private final Switchboard sb; private final Switchboard sb;
public SitemapParser(final Switchboard sb, final DigestURI sitemap, final CrawlProfile.entry theCrawlingProfile) { public SitemapParser(final Switchboard sb, final DigestURI sitemap, final CrawlProfile theCrawlingProfile) {
assert sitemap != null; assert sitemap != null;
this.sb = sb; this.sb = sb;
this.siteMapURL = sitemap; this.siteMapURL = sitemap;
@ -328,8 +328,8 @@ public class SitemapParser extends DefaultHandler {
} }
} }
private CrawlProfile.entry createProfile(final String domainName, final DigestURI sitemapURL) { private CrawlProfile createProfile(final String domainName, final DigestURI sitemapURL) {
return this.sb.crawler.profilesActiveCrawls.newEntry( CrawlProfile p = new CrawlProfile(
domainName, sitemapURL, domainName, sitemapURL,
// crawling Filter // crawling Filter
CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
@ -352,5 +352,7 @@ public class SitemapParser extends DefaultHandler {
// exclude stop-words // exclude stop-words
true, true, true, true, true, true,
CrawlProfile.CacheStrategy.IFFRESH); CrawlProfile.CacheStrategy.IFFRESH);
this.sb.crawler.profilesActiveCrawls.put(p.handle().getBytes(), p);
return p;
} }
} }

@ -62,6 +62,9 @@ public class WorkTables extends Tables {
public final static String TABLE_ROBOTS_NAME = "robots"; public final static String TABLE_ROBOTS_NAME = "robots";
public final static String TABLE_ACTIVECRAWLS_NAME = "crawljobsActive";
public final static String TABLE_PASSIVECRAWLS_NAME = "crawljobsPassive";
public WorkTables(final File workPath) { public WorkTables(final File workPath) {
super(workPath, 12); super(workPath, 12);

@ -123,7 +123,6 @@ import de.anomic.crawler.ResultImages;
import de.anomic.crawler.ResultURLs; import de.anomic.crawler.ResultURLs;
import de.anomic.crawler.RobotsTxt; import de.anomic.crawler.RobotsTxt;
import de.anomic.crawler.CrawlProfile.CacheStrategy; import de.anomic.crawler.CrawlProfile.CacheStrategy;
import de.anomic.crawler.CrawlProfile.entry;
import de.anomic.crawler.retrieval.EventOrigin; import de.anomic.crawler.retrieval.EventOrigin;
import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Request;
@ -1102,12 +1101,12 @@ public final class Switchboard extends serverSwitch {
} }
/** /**
* {@link CrawlProfile Crawl Profiles} are saved independently from the queues themselves * {@link CrawlProfiles Crawl Profiles} are saved independently from the queues themselves
* and therefore have to be cleaned up from time to time. This method only performs the clean-up * and therefore have to be cleaned up from time to time. This method only performs the clean-up
* if - and only if - the {@link IndexingStack switchboard}, * if - and only if - the {@link IndexingStack switchboard},
* {@link LoaderDispatcher loader} and {@link plasmaCrawlNURL local crawl} queues are all empty. * {@link LoaderDispatcher loader} and {@link plasmaCrawlNURL local crawl} queues are all empty.
* <p> * <p>
* Then it iterates through all existing {@link CrawlProfile crawl profiles} and removes * Then it iterates through all existing {@link CrawlProfiles crawl profiles} and removes
* all profiles which are not hard-coded. * all profiles which are not hard-coded.
* </p> * </p>
* <p> * <p>
@ -1442,34 +1441,47 @@ public final class Switchboard extends serverSwitch {
// refresh recrawl dates // refresh recrawl dates
try{ try{
Iterator<CrawlProfile.entry> it = crawler.profilesActiveCrawls.profiles(true); CrawlProfile selentry;
entry selentry; for (byte[] handle: crawler.profilesActiveCrawls.keySet()) {
while (it.hasNext()) { selentry = new CrawlProfile(crawler.profilesActiveCrawls.get(handle));
selentry = it.next();
assert selentry.handle() != null : "profile.name = " + selentry.name(); assert selentry.handle() != null : "profile.name = " + selentry.name();
if (selentry.handle() == null) { if (selentry.handle() == null) {
it.remove(); crawler.profilesActiveCrawls.remove(handle);
continue; continue;
} }
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY)) boolean insert = false;
crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER, if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY)) {
Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE))); selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE)));
insert = true;
}
// if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE)); // if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE));
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) {
crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER, selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE))); Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE)));
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) insert = true;
crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER, }
Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE))); if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) {
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE)));
Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE))); insert = true;
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) }
crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER, if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) {
Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE))); selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE)) Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE)));
crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER, insert = true;
Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE))); }
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) {
selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE)));
insert = true;
}
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE)) {
selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE)));
insert = true;
}
if (insert) crawler.profilesActiveCrawls.put(selentry.handle().getBytes(), selentry);
} }
} catch (final Exception e) { } catch (final Exception e) {
Log.logException(e); Log.logException(e);
@ -1827,7 +1839,7 @@ public final class Switchboard extends serverSwitch {
// update image result list statistics // update image result list statistics
// its good to do this concurrently here, because it needs a DNS lookup // its good to do this concurrently here, because it needs a DNS lookup
// to compute a URL hash which is necessary for a double-check // to compute a URL hash which is necessary for a double-check
final CrawlProfile.entry profile = in.queueEntry.profile(); final CrawlProfile profile = in.queueEntry.profile();
ResultImages.registerImages(in.queueEntry.url(), in.documents[i], (profile == null) ? true : !profile.remoteIndexing()); ResultImages.registerImages(in.queueEntry.url(), in.documents[i], (profile == null) ? true : !profile.remoteIndexing());
} catch (final UnsupportedEncodingException e) { } catch (final UnsupportedEncodingException e) {
@ -1987,7 +1999,8 @@ public final class Switchboard extends serverSwitch {
if (searchEvent != null) searchEvent.addHeuristic(url.hash(), heuristicName, true); if (searchEvent != null) searchEvent.addHeuristic(url.hash(), heuristicName, true);
if (indexSegments.segment(process).urlMetadata.exists(url.hash())) return; // don't do double-work if (indexSegments.segment(process).urlMetadata.exists(url.hash())) return; // don't do double-work
final Request request = loader.request(url, true, true); final Request request = loader.request(url, true, true);
String acceptedError = this.crawlStacker.checkAcceptance(url, this.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), 0); final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
String acceptedError = this.crawlStacker.checkAcceptance(url, mp == null ? null : new CrawlProfile(mp), 0);
if (acceptedError != null) { if (acceptedError != null) {
log.logWarning("addToIndex: cannot load " + url.toNormalform(false, false) + ": " + acceptedError); log.logWarning("addToIndex: cannot load " + url.toNormalform(false, false) + ": " + acceptedError);
return; return;

@ -27,7 +27,6 @@
package de.anomic.yacy; package de.anomic.yacy;
//import java.io.BufferedInputStream;
import java.io.BufferedOutputStream; import java.io.BufferedOutputStream;
import java.io.File; import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
@ -59,8 +58,6 @@ import net.yacy.kelondro.util.OS;
import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.HTTPLoader;
//import de.anomic.http.client.Client;
//import de.anomic.http.server.ResponseContainer;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
import de.anomic.server.serverCore; import de.anomic.server.serverCore;
import de.anomic.tools.CryptoLib; import de.anomic.tools.CryptoLib;

@ -71,6 +71,7 @@ public class HeapReader {
this.keylength = keylength; this.keylength = keylength;
this.index = null; // will be created as result of initialization process this.index = null; // will be created as result of initialization process
this.free = null; // will be initialized later depending on existing idx/gap file this.free = null; // will be initialized later depending on existing idx/gap file
this.heapFile.getParentFile().mkdirs();
this.file = new CachedFileWriter(this.heapFile); this.file = new CachedFileWriter(this.heapFile);
// read or initialize the index // read or initialize the index

@ -52,8 +52,6 @@ import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.kelondroException; import net.yacy.kelondro.util.kelondroException;
public class MapHeap implements Map<byte[], Map<String, String>> { public class MapHeap implements Map<byte[], Map<String, String>> {
private BLOB blob; private BLOB blob;
@ -229,7 +227,8 @@ public class MapHeap implements Map<byte[], Map<String, String>> {
public Map<String, String> get(final Object key) { public Map<String, String> get(final Object key) {
if (key == null) return null; if (key == null) return null;
try { try {
return get((byte[]) key); if (key instanceof byte[]) return get((byte[]) key);
if (key instanceof String) return get(((String) key).getBytes());
} catch (IOException e) { } catch (IOException e) {
Log.logException(e); Log.logException(e);
} catch (RowSpaceExceededException e) { } catch (RowSpaceExceededException e) {
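The MapHeap.get() change above makes the store tolerant of the key type, so a profile handle held as a String resolves to the same entry as its byte[] form; without it, callers of the new Map-typed store that pass String handles would silently miss entries. A small sketch of the behaviour, assuming any MapHeap instance such as the profile store above (the handle argument is illustrative):

    import java.util.Map;

    import net.yacy.kelondro.blob.MapHeap;

    public final class KeyFormSketch {

        // returns true when both key forms agree on whether the entry exists
        public static boolean sameEntry(final MapHeap profiles, final String handle) {
            final Map<String, String> viaBytes  = profiles.get(handle.getBytes());
            final Map<String, String> viaString = profiles.get(handle);
            return (viaBytes == null) == (viaString == null);
        }
    }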

@ -159,8 +159,8 @@ public final class LoaderDispatcher {
if (url.isLocal() && sb.getConfigBool("adminAccountForLocalhost", false)) throw new IOException("access to localhost not granted for url " + url); if (url.isLocal() && sb.getConfigBool("adminAccountForLocalhost", false)) throw new IOException("access to localhost not granted for url " + url);
// check if we have the page in the cache // check if we have the page in the cache
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()); CrawlProfile crawlProfile = mp == null ? null : new CrawlProfile(mp);
if (crawlProfile != null && cacheStrategy != CrawlProfile.CacheStrategy.NOCACHE) { if (crawlProfile != null && cacheStrategy != CrawlProfile.CacheStrategy.NOCACHE) {
// we have passed a first test if caching is allowed // we have passed a first test if caching is allowed
// now see if there is a cache entry // now see if there is a cache entry
