Redesign of the crawl profiles data structure. The targets are:

- permanent storage of auto-dom statistics in the profile
- storage of profiles in a WorkTable data structure

Not finished yet; no functional change yet.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7088 6c8d7289-2bf4-0310-a012-ef5d649a1542
Author: orbiter
Parent: 3f1d5a061f
Commit: 65eaf30f77
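
For orientation, a minimal sketch (not part of the commit; the class name and file path are illustrative) of the storage pattern this diff introduces: CrawlProfile now extends ConcurrentHashMap<String, String>, the profile collections become Map<byte[], Map<String, String>> backed by a kelondro MapHeap, and readers wrap the stored property map back into a CrawlProfile.

import java.io.File;
import java.io.IOException;
import java.util.Map;

import net.yacy.kelondro.blob.MapHeap;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.order.NaturalOrder;

import de.anomic.crawler.CrawlProfile;

public class ProfileStoreSketch {

    // stands in for e.g. sb.crawler.profilesActiveCrawls in this commit
    private final Map<byte[], Map<String, String>> profilesActiveCrawls;

    public ProfileStoreSketch(final File dbFile) throws IOException {
        // MapHeap implements Map<byte[], Map<String, String>> over an on-disk heap file;
        // the parameters mirror those used in CrawlSwitchboard below
        this.profilesActiveCrawls = new MapHeap(dbFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
    }

    // store: a CrawlProfile is itself a Map<String, String>, so it can be put directly under its handle
    public void store(final CrawlProfile profile) {
        this.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
    }

    // load: wrap the stored property map back into a CrawlProfile, or return null for unknown handles
    public CrawlProfile load(final String handle) {
        final Map<String, String> mp = this.profilesActiveCrawls.get(handle.getBytes());
        return mp == null ? null : new CrawlProfile(mp);
    }
}

The hunks below apply this put/get-and-wrap pattern at every former newEntry/getEntry call site.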

@@ -28,15 +28,15 @@ import java.text.DateFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.crawler.CrawlProfile.entry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -80,23 +80,23 @@ public class CrawlProfileEditor_p {
private static final ArrayList <eentry> labels = new ArrayList<eentry>();
static {
labels.add(new eentry(entry.NAME, "Name", true, eentry.STRING));
labels.add(new eentry(entry.START_URL, "Start URL", true, eentry.STRING));
labels.add(new eentry(entry.FILTER_MUSTMATCH, "Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(entry.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(entry.DEPTH, "Crawl Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
labels.add(new eentry(entry.DOM_FILTER_DEPTH, "Domain Filter Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
labels.add(new eentry(entry.CRAWLING_Q, "CrawlingQ / '?'-URLs", false, eentry.BOOLEAN));
labels.add(new eentry(entry.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN));
labels.add(new eentry(entry.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN));
labels.add(new eentry(entry.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN));
labels.add(new eentry(entry.STORE_TXCACHE, "Store in TXCache", false, eentry.BOOLEAN));
labels.add(new eentry(entry.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN));
labels.add(new eentry(entry.XSSTOPW, "Static stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(entry.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(entry.XPSTOPW, "Parent stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING));
labels.add(new eentry(CrawlProfile.START_URL, "Start URL", true, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_MUSTMATCH, "Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.DOM_FILTER_DEPTH, "Domain Filter Depth", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.CRAWLING_Q, "CrawlingQ / '?'-URLs", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.STORE_TXCACHE, "Store in TXCache", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.XSSTOPW, "Static stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.XPSTOPW, "Parent stop-words", false, eentry.BOOLEAN));
}
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
@@ -106,40 +106,32 @@ public class CrawlProfileEditor_p {
// read post for handle
final String handle = (post == null) ? "" : post.get("handle", "");
if (post != null) {
if (post.containsKey("terminate")) {
if (post.containsKey("terminate")) try {
// termination of a crawl: shift the crawl from active to passive
final CrawlProfile.entry entry = sb.crawler.profilesActiveCrawls.getEntry(handle);
if (entry != null) {
sb.crawler.profilesPassiveCrawls.newEntry(entry.map());
}
sb.crawler.profilesActiveCrawls.removeEntry(handle.getBytes());
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(handle.getBytes());
if (mp != null) sb.crawler.profilesPassiveCrawls.put(handle.getBytes(), new CrawlProfile(mp));
// delete all entries from the crawl queue that are deleted here
try {
sb.crawler.profilesActiveCrawls.remove(handle.getBytes());
sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000);
} catch (RowSpaceExceededException e) {
Log.logException(e);
}
}
if (post.containsKey("delete")) {
// deletion of a terminated crawl profile
sb.crawler.profilesPassiveCrawls.removeEntry(handle.getBytes());
sb.crawler.profilesPassiveCrawls.remove(handle.getBytes());
}
if (post.containsKey("deleteTerminatedProfiles")) {
Iterator<CrawlProfile.entry> profiles = sb.crawler.profilesPassiveCrawls.profiles(false);
while (profiles.hasNext()) {
profiles.next();
profiles.remove();
profiles = sb.crawler.profilesPassiveCrawls.profiles(false);
for (byte[] h: sb.crawler.profilesPassiveCrawls.keySet()) {
sb.crawler.profilesPassiveCrawls.remove(h);
}
}
}
// generate handle list
int count = 0;
Iterator<CrawlProfile.entry> it = sb.crawler.profilesActiveCrawls.profiles(true);
entry selentry;
while (it.hasNext()) {
selentry = it.next();
CrawlProfile selentry;
for (byte[] h: sb.crawler.profilesActiveCrawls.keySet()) {
selentry = new CrawlProfile(sb.crawler.profilesActiveCrawls.get(h));
if (ignoreNames.contains(selentry.name())) {
continue;
}
@@ -151,7 +143,8 @@ public class CrawlProfileEditor_p {
count++;
}
prop.put("profiles", count);
selentry = sb.crawler.profilesActiveCrawls.getEntry(handle);
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(handle.getBytes());
selentry = mp == null ? null : new CrawlProfile(mp);
assert selentry == null || selentry.handle() != null;
// read post for change submit
if ((post != null) && (selentry != null)) {
@@ -161,10 +154,11 @@ public class CrawlProfileEditor_p {
eentry tee;
while (lit.hasNext()) {
tee = lit.next();
final String cval = selentry.map().get(tee.name);
final String cval = selentry.get(tee.name);
final String val = (tee.type == eentry.BOOLEAN) ? Boolean.toString(post.containsKey(tee.name)) : post.get(tee.name, cval);
if (!cval.equals(val)) {
sb.crawler.profilesActiveCrawls.changeEntry(selentry, tee.name, val);
selentry.put(tee.name, val);
sb.crawler.profilesActiveCrawls.put(selentry.handle().getBytes(), selentry);
}
}
} catch (final Exception ex) {
@@ -179,20 +173,18 @@ public class CrawlProfileEditor_p {
count = 0;
boolean dark = true;
final int domlistlength = (post == null) ? 160 : post.getInt("domlistlength", 160);
CrawlProfile.entry profile;
CrawlProfile profile;
// put active crawls into list
it = sb.crawler.profilesActiveCrawls.profiles(true);
while (it.hasNext()) {
profile = it.next();
for (byte[] h: sb.crawler.profilesActiveCrawls.keySet()) {
profile = new CrawlProfile(sb.crawler.profilesActiveCrawls.get(h));
putProfileEntry(prop, profile, true, dark, count, domlistlength);
dark = !dark;
count++;
}
// put passive crawls into list
boolean existPassiveCrawls = false;
it = sb.crawler.profilesPassiveCrawls.profiles(true);
while (it.hasNext()) {
profile = it.next();
for (byte[] h: sb.crawler.profilesPassiveCrawls.keySet()) {
profile = new CrawlProfile(sb.crawler.profilesPassiveCrawls.get(h));
putProfileEntry(prop, profile, false, dark, count, domlistlength);
dark = !dark;
count++;
@@ -217,7 +209,7 @@ public class CrawlProfileEditor_p {
count = 0;
while (lit.hasNext()) {
final eentry ee = lit.next();
final String val = selentry.map().get(ee.name);
final String val = selentry.get(ee.name);
prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly", ee.readonly ? "1" : "0");
prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_name", ee.name);
prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_label", ee.label);
@@ -235,7 +227,7 @@ public class CrawlProfileEditor_p {
return prop;
}
private static void putProfileEntry(final servletProperties prop, final CrawlProfile.entry profile, final boolean active, final boolean dark, final int count, final int domlistlength) {
private static void putProfileEntry(final servletProperties prop, final CrawlProfile profile, final boolean active, final boolean dark, final int count, final int domlistlength) {
prop.put(CRAWL_PROFILE_PREFIX + count + "_dark", dark ? "1" : "0");
prop.put(CRAWL_PROFILE_PREFIX + count + "_name", profile.name());

@@ -254,8 +254,8 @@ public class Crawler_p {
sb.crawlQueues.errorURL.remove(urlhash);
// stack url
sb.crawler.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile.entry pe = sb.crawler.profilesActiveCrawls.newEntry(
sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile pe = new CrawlProfile(
(crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(),
crawlingStartURL,
newcrawlingMustMatch,
@@ -265,6 +265,7 @@ public class Crawler_p {
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
final String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),
url,
@@ -297,7 +298,7 @@ public class Crawler_p {
// generate a YaCyNews if the global flag was set
if (crawlOrder) {
final Map<String, String> m = new HashMap<String, String>(pe.map()); // must be cloned
final Map<String, String> m = new HashMap<String, String>(pe); // must be cloned
m.remove("specificDepth");
m.remove("indexText");
m.remove("indexMedia");
@@ -371,7 +372,7 @@ public class Crawler_p {
// creating a crawler profile
final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null);
final CrawlProfile.entry profile = sb.crawler.profilesActiveCrawls.newEntry(
final CrawlProfile profile = new CrawlProfile(
fileName, crawlURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
@@ -387,6 +388,7 @@ public class Crawler_p {
crawlOrder,
xsstopw, xdstopw, xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
// pause local crawl here
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
@@ -435,7 +437,7 @@ public class Crawler_p {
final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null);
// create a new profile
final CrawlProfile.entry pe = sb.crawler.profilesActiveCrawls.newEntry(
final CrawlProfile pe = new CrawlProfile(
sitemapURLStr, sitemapURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
@@ -446,6 +448,7 @@ public class Crawler_p {
storeHTCache, true, crawlOrder,
xsstopw, xdstopw, xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
// create a new sitemap importer
final SitemapImporter importerThread = new SitemapImporter(sb, sb.dbImportManager, new DigestURI(sitemapURLStr, null), pe);

@@ -31,6 +31,7 @@ import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;
import java.util.Map;
import net.yacy.cora.protocol.RequestHeader;
@@ -95,14 +96,15 @@ public class IndexCreateWWWGlobalQueue_p {
boolean dark = true;
yacySeed initiator;
String profileHandle;
CrawlProfile.entry profileEntry;
CrawlProfile profileEntry;
int i, showNum = 0;
for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
urle = crawlerList.get(i);
if (urle != null && urle.url() != null) {
initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : new String(urle.initiator()));
profileHandle = urle.profileHandle();
profileEntry = (profileHandle == null) ? null : sb.crawler.profilesActiveCrawls.getEntry(profileHandle);
final Map<String, String> mp = profileHandle == null ? null : sb.crawler.profilesActiveCrawls.get(profileHandle.getBytes());
profileEntry = mp == null ? null : new CrawlProfile(mp);
prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));

@@ -32,6 +32,7 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
@@ -95,10 +96,9 @@ public class IndexCreateWWWLocalQueue_p {
if (option == PROFILE) {
// search and delete the crawl profile (_much_ faster, independant of queue size)
// XXX: what to do about the annoying LOST PROFILE messages in the log?
final Iterator<CrawlProfile.entry> it = sb.crawler.profilesActiveCrawls.profiles(true);
CrawlProfile.entry entry;
while (it.hasNext()) {
entry = it.next();
CrawlProfile entry;
for (byte[] handle: sb.crawler.profilesActiveCrawls.keySet()) {
entry = new CrawlProfile(sb.crawler.profilesActiveCrawls.get(handle));
final String name = entry.name();
if (name.equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) ||
@@ -108,9 +108,7 @@ public class IndexCreateWWWLocalQueue_p {
name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE))
continue;
if (compiledPattern.matcher(name).find()) {
sb.crawler.profilesActiveCrawls.removeEntry(entry.handle().getBytes());
}
if (compiledPattern.matcher(name).find()) sb.crawler.profilesActiveCrawls.remove(entry.handle().getBytes());
}
} else {
// iterating through the list of URLs
@@ -165,14 +163,15 @@ public class IndexCreateWWWLocalQueue_p {
boolean dark = true;
yacySeed initiator;
String profileHandle;
CrawlProfile.entry profileEntry;
CrawlProfile profileEntry;
int i;
for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
urle = crawlerList.get(i);
if ((urle != null)&&(urle.url()!=null)) {
initiator = sb.peers.getConnected(urle.initiator() == null ? "" : new String(urle.initiator()));
profileHandle = urle.profileHandle();
profileEntry = (profileHandle == null) ? null : sb.crawler.profilesActiveCrawls.getEntry(profileHandle);
final Map<String, String> mp = profileHandle == null ? null : sb.crawler.profilesActiveCrawls.get(profileHandle.getBytes());
profileEntry = mp == null ? null : new CrawlProfile(mp);
prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));

@@ -28,6 +28,7 @@ import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;
import java.util.Map;
import net.yacy.cora.protocol.RequestHeader;
@@ -92,14 +93,15 @@ public class IndexCreateWWWRemoteQueue_p {
boolean dark = true;
yacySeed initiator;
String profileHandle;
CrawlProfile.entry profileEntry;
CrawlProfile profileEntry;
int i, showNum = 0;
for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
urle = crawlerList.get(i);
if (urle != null && urle.url() != null) {
initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : new String(urle.initiator()));
profileHandle = urle.profileHandle();
profileEntry = (profileHandle == null) ? null : sb.crawler.profilesActiveCrawls.getEntry(profileHandle);
final Map<String, String> mp = profileHandle == null ? null : sb.crawler.profilesActiveCrawls.get(profileHandle.getBytes());
profileEntry = mp == null ? null : new CrawlProfile(mp);
prop.put("crawler-queue_list_" + showNum + "_dark", dark ? "1" : "0");
prop.putHTML("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));

@@ -28,7 +28,6 @@
// if the shell's current path is HTROOT
import java.io.File;
import java.io.IOException;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.logging.Log;
@@ -102,13 +101,13 @@ public class ProxyIndexingMonitor_p {
if (sb.crawler.defaultProxyProfile == null) {
prop.put("info", "1"); //delete DATA/PLASMADB/crawlProfiles0.db
} else {
try {
assert sb.crawler.defaultProxyProfile.handle() != null;
sb.crawler.profilesActiveCrawls.changeEntry(sb.crawler.defaultProxyProfile, "generalDepth", Integer.toString(newProxyPrefetchDepth));
sb.crawler.profilesActiveCrawls.changeEntry(sb.crawler.defaultProxyProfile, "storeHTCache", (proxyStoreHTCache) ? "true": "false");
sb.crawler.profilesActiveCrawls.changeEntry(sb.crawler.defaultProxyProfile, "remoteIndexing",proxyIndexingRemote ? "true":"false");
sb.crawler.profilesActiveCrawls.changeEntry(sb.crawler.defaultProxyProfile, "indexText",proxyIndexingLocalText ? "true":"false");
sb.crawler.profilesActiveCrawls.changeEntry(sb.crawler.defaultProxyProfile, "indexMedia",proxyIndexingLocalMedia ? "true":"false");
sb.crawler.defaultProxyProfile.put("generalDepth", Integer.toString(newProxyPrefetchDepth));
sb.crawler.defaultProxyProfile.put("storeHTCache", (proxyStoreHTCache) ? "true": "false");
sb.crawler.defaultProxyProfile.put("remoteIndexing",proxyIndexingRemote ? "true":"false");
sb.crawler.defaultProxyProfile.put("indexText",proxyIndexingLocalText ? "true":"false");
sb.crawler.defaultProxyProfile.put("indexMedia",proxyIndexingLocalMedia ? "true":"false");
sb.crawler.profilesActiveCrawls.put(sb.crawler.defaultProxyProfile.handle().getBytes(), sb.crawler.defaultProxyProfile);
prop.put("info", "2");//new proxyPrefetchdepth
prop.put("info_message", newProxyPrefetchDepth);
@@ -137,11 +136,6 @@ public class ProxyIndexingMonitor_p {
prop.put("info_restart", "0");
prop.put("info_restart_return", "0");
if (!oldProxyCachePath.equals(newProxyCachePath)) prop.put("info_restart", "1");
} catch (final IOException e) {
prop.put("info", "3"); //Error: errmsg
prop.putHTML("info_error", e.getMessage());
}
}
} catch (final Exception e) {

@@ -143,9 +143,9 @@ public class QuickCrawlLink_p {
sb.crawlQueues.errorURL.remove(urlhash);
// create crawling profile
CrawlProfile.entry pe = null;
CrawlProfile pe = null;
try {
pe = sb.crawler.profilesActiveCrawls.newEntry(
pe = new CrawlProfile(
crawlingStartURL.getHost(),
crawlingStartURL,
crawlingMustMatch,
@@ -163,8 +163,8 @@ public class QuickCrawlLink_p {
xsstopw,
xdstopw,
xpstopw,
CrawlProfile.CacheStrategy.IFFRESH
);
CrawlProfile.CacheStrategy.IFFRESH);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
} catch (final Exception e) {
// mist
prop.put("mode_status", "2");//Error with url

@@ -4,11 +4,9 @@
//$LastChangedBy$
//
import java.util.Iterator;
import net.yacy.cora.protocol.RequestHeader;
import de.anomic.crawler.CrawlProfile.entry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
@@ -50,10 +48,9 @@ public class WatchWebStructure_p {
if (host.equals("auto")) {
// try to find the host from the crawl profiles
final Iterator<entry> it = sb.crawler.profilesActiveCrawls.profiles(true);
entry e;
while (it.hasNext()) {
e = it.next();
CrawlProfile e;
for (byte[] handle: sb.crawler.profilesActiveCrawls.keySet()) {
e = new CrawlProfile(sb.crawler.profilesActiveCrawls.get(handle));
if (e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) ||
e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) ||
e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ||

@@ -320,7 +320,7 @@ public class Balancer {
* @throws IOException
* @throws RowSpaceExceededException
*/
public Request pop(final boolean delay, final CrawlProfile profile) throws IOException {
public Request pop(final boolean delay, final Map<byte[], Map<String, String>> profiles) throws IOException {
// returns a crawl entry from the stack and ensures minimum delta times
try {
@@ -384,7 +384,8 @@ public class Balancer {
// at this point we must check if the crawlEntry has relevance because the crawl profile still exists
// if not: return null. A calling method must handle the null value and try again
final CrawlProfile.entry profileEntry = (profile == null) ? null : profile.getEntry(crawlEntry.profileHandle());
final Map<String, String> mp = profiles == null ? null : profiles.get(crawlEntry.profileHandle());
final CrawlProfile profileEntry = mp == null ? null : new CrawlProfile(mp);
if (profileEntry == null) {
Log.logWarning("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
return null;

@@ -4,7 +4,7 @@
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
// last major change: 25.02.2004
// last major change: 31.08.2010
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
@@ -22,263 +22,25 @@
package de.anomic.crawler;
import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import net.yacy.kelondro.blob.MapHeap;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.kelondro.order.Digest;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.kelondroException;
public class CrawlProfile {
public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
private static final long serialVersionUID = 5527325718810703504L;
public static final String MATCH_ALL = ".*";
public static final String MATCH_NEVER = "";
static ConcurrentHashMap<String, Map<String, DomProfile>> domsCache = new ConcurrentHashMap<String, Map<String, DomProfile>>();
MapHeap profileTable;
private final File profileTableFile;
public CrawlProfile(final File file) throws IOException {
//System.out.println("loading crawl profile from " + file);
this.profileTableFile = file;
profileTableFile.getParentFile().mkdirs();
profileTable = new MapHeap(profileTableFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
profileIterator pi = new profileIterator(true);
entry e;
while (pi.hasNext()) {
e = pi.next();
if (e == null) continue;
Log.logInfo("CrawlProfiles", "loaded Profile " + e.handle() + ": " + e.name());
}
}
public void clear() {
// deletes the profile database and creates a new one
if (profileTable != null) profileTable.close();
FileUtils.deletedelete(profileTableFile);
profileTableFile.getParentFile().mkdirs();
try {
profileTable = new MapHeap(profileTableFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
} catch (IOException e) {
Log.logException(e);
}
}
public void close() {
if (profileTable != null) profileTable.close();
this.profileTable = null;
}
public int size() {
return profileTable.size();
}
public Iterator<entry> profiles(final boolean up) {
// enumerates profile entries
try {
return new profileIterator(up);
} catch (final IOException e) {
Log.logException(e);
return new HashSet<entry>().iterator();
}
}
public class profileIterator implements Iterator<entry> {
// the iterator iterates all keys, which are byte[] objects
CloneableIterator<byte[]> handleIterator;
String lastkey;
public profileIterator(final boolean up) throws IOException {
handleIterator = profileTable.keys(up, false);
lastkey = null;
}
public boolean hasNext() {
try {
return handleIterator.hasNext();
} catch (final kelondroException e) {
Log.logException(e);
clear();
return false;
}
}
public entry next() {
try {
lastkey = new String(handleIterator.next());
return getEntry(lastkey);
} catch (final kelondroException e) {
Log.logException(e);
clear();
return null;
}
}
public void remove() {
if (lastkey != null) try {
removeEntry(lastkey.getBytes());
} catch (final kelondroException e) {
Log.logException(e);
clear();
}
}
}
public void removeEntry(final byte[] handle) {
try {
profileTable.delete(handle);
} catch (final IOException e) {
Log.logException(e);
}
}
public entry newEntry(final Map<String, String> mem) {
final entry ne = new entry(mem);
try {
profileTable.insert(ne.handle().getBytes(), ne.map());
} catch (final Exception e) {
clear();
try {
profileTable.insert(ne.handle().getBytes(), ne.map());
} catch (final Exception ee) {
Log.logException(e);
System.exit(0);
}
}
return ne;
}
public entry newEntry( final String name,
final DigestURI startURL,
final String mustmatch, final String mustnotmatch,
final int generalDepth,
final long recrawlIfOlder /*date*/, final int domFilterDepth, final int domMaxPages,
final boolean crawlingQ,
final boolean indexText, final boolean indexMedia,
final boolean storeHTCache, final boolean storeTXCache,
final boolean remoteIndexing,
final boolean xsstopw, final boolean xdstopw, final boolean xpstopw,
final CacheStrategy cacheStrategy) {
final entry ne = new entry(
name, startURL,
mustmatch, mustnotmatch,
generalDepth,
recrawlIfOlder, domFilterDepth, domMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, storeTXCache,
remoteIndexing,
xsstopw, xdstopw, xpstopw,
cacheStrategy);
try {
profileTable.insert(ne.handle().getBytes(), ne.map());
} catch (final Exception e) {
clear();
try {
profileTable.insert(ne.handle().getBytes(), ne.map());
} catch (final Exception ee) {
Log.logException(e);
System.exit(0);
}
}
return ne;
}
public boolean hasEntry(final String handle) {
return profileTable.containsKey(handle.getBytes());
}
public entry getEntry(final String handle) {
if (profileTable == null) return null;
Map<String, String> m;
try {
m = profileTable.get(handle.getBytes());
} catch (final IOException e) {
Log.logException(e);
return null;
} catch (RowSpaceExceededException e) {
Log.logException(e);
return null;
}
if (m == null) return null;
return new entry(m);
}
public void changeEntry(final entry e, final String propName, final String newValue) throws IOException, RowSpaceExceededException {
e.mem.put(propName, newValue);
assert e.handle() != null;
profileTable.insert(e.handle().getBytes(), e.mem);
}
public long getRecrawlDate(final long oldTimeMinutes) {
return System.currentTimeMillis() - (60000L * oldTimeMinutes);
}
public static class DomProfile {
public String referrer;
public int depth, count;
public DomProfile(final String ref, final int d) {
this.referrer = ref;
this.depth = d;
this.count = 1;
}
public void inc() {
this.count++;
}
}
public static enum CacheStrategy {
NOCACHE(0), // never use the cache, all content from fresh internet source
IFFRESH(1), // use the cache if the cache exists and is fresh using the proxy-fresh rules
IFEXIST(2), // use the cache if the cache exist. Do no check freshness. Otherwise use online source.
CACHEONLY(3); // never go online, use all content from cache. If no cache exist, treat content as unavailable
public int code;
private CacheStrategy(int code) {
this.code = code;
}
public String toString() {
return Integer.toString(this.code);
}
public static CacheStrategy decode(int code) {
for (CacheStrategy strategy: CacheStrategy.values()) if (strategy.code == code) return strategy;
return NOCACHE;
}
public static CacheStrategy parse(String name) {
if (name.equals("nocache")) return NOCACHE;
if (name.equals("iffresh")) return IFFRESH;
if (name.equals("ifexist")) return IFEXIST;
if (name.equals("cacheonly")) return CACHEONLY;
return null;
}
public String toName() {
return this.name().toLowerCase();
}
public boolean isAllowedToFetchOnline() {
return this.code < 3;
}
public boolean mustBeOffline() {
return this.code == 3;
}
}
public static class entry {
// this is a simple record structure that hold all properties of a single crawl start
public static final String HANDLE = "handle";
public static final String NAME = "name";
public static final String START_URL = "startURL";
@@ -299,12 +61,11 @@ public class CrawlProfile {
public static final String XPSTOPW = "xpstopw";
public static final String CACHE_STRAGEGY = "cacheStrategy";
private Map<String, String> mem;
private Map<String, DomProfile> doms;
private Pattern mustmatch = null, mustnotmatch = null;
public entry(final String name, final DigestURI startURL,
public CrawlProfile(final String name, final DigestURI startURL,
final String mustmatch,
final String mustnotmatch,
final int depth,
@@ -316,83 +77,81 @@ public class CrawlProfile {
final boolean remoteIndexing,
final boolean xsstopw, final boolean xdstopw, final boolean xpstopw,
final CacheStrategy cacheStrategy) {
super(40);
if (name == null || name.length() == 0) throw new NullPointerException("name must not be null");
final String handle = (startURL == null) ? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength) : new String(startURL.hash());
mem = new ConcurrentHashMap<String, String>(40);
mem.put(HANDLE, handle);
mem.put(NAME, name);
mem.put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false));
mem.put(FILTER_MUSTMATCH, (mustmatch == null) ? MATCH_ALL : mustmatch);
mem.put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? MATCH_NEVER : mustnotmatch);
mem.put(DEPTH, Integer.toString(depth));
mem.put(RECRAWL_IF_OLDER, Long.toString(recrawlIfOlder));
mem.put(DOM_FILTER_DEPTH, Integer.toString(domFilterDepth));
mem.put(DOM_MAX_PAGES, Integer.toString(domMaxPages));
mem.put(CRAWLING_Q, Boolean.toString(crawlingQ)); // crawling of urls with '?'
mem.put(INDEX_TEXT, Boolean.toString(indexText));
mem.put(INDEX_MEDIA, Boolean.toString(indexMedia));
mem.put(STORE_HTCACHE, Boolean.toString(storeHTCache));
mem.put(STORE_TXCACHE, Boolean.toString(storeTXCache));
mem.put(REMOTE_INDEXING, Boolean.toString(remoteIndexing));
mem.put(XSSTOPW, Boolean.toString(xsstopw)); // exclude static stop-words
mem.put(XDSTOPW, Boolean.toString(xdstopw)); // exclude dynamic stop-word
mem.put(XPSTOPW, Boolean.toString(xpstopw)); // exclude parent stop-words
mem.put(CACHE_STRAGEGY, cacheStrategy.toString());
put(HANDLE, handle);
put(NAME, name);
put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false));
put(FILTER_MUSTMATCH, (mustmatch == null) ? CrawlProfile.MATCH_ALL : mustmatch);
put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? CrawlProfile.MATCH_NEVER : mustnotmatch);
put(DEPTH, depth);
put(RECRAWL_IF_OLDER, recrawlIfOlder);
put(DOM_FILTER_DEPTH, domFilterDepth);
put(DOM_MAX_PAGES, domMaxPages);
put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
put(INDEX_TEXT, indexText);
put(INDEX_MEDIA, indexMedia);
put(STORE_HTCACHE, storeHTCache);
put(STORE_TXCACHE, storeTXCache);
put(REMOTE_INDEXING, remoteIndexing);
put(XSSTOPW, xsstopw); // exclude static stop-words
put(XDSTOPW, xdstopw); // exclude dynamic stop-word
put(XPSTOPW, xpstopw); // exclude parent stop-words
put(CACHE_STRAGEGY, cacheStrategy.toString());
doms = new ConcurrentHashMap<String, DomProfile>();
}
@Override
public String toString() {
final StringBuilder str = new StringBuilder();
if (this.mem != null) {
str.append(this.mem.toString());
public CrawlProfile(Map<String, String> ext) {
super(ext == null ? 1 : ext.size());
if (ext != null) this.putAll(ext);
doms = new ConcurrentHashMap<String, DomProfile>();
}
return str.toString();
public void put(String key, boolean value) {
super.put(key, Boolean.toString(value));
}
public entry(final Map<String, String> mem) {
this.mem = mem;
this.doms = domsCache.get(this.mem.get(HANDLE));
if (this.doms == null) this.doms = new ConcurrentHashMap<String, DomProfile>();
public void put(String key, int value) {
super.put(key, Integer.toString(value));
}
public Map<String, String> map() {
return mem;
public void put(String key, long value) {
super.put(key, Long.toString(value));
}
public String handle() {
final String r = mem.get(HANDLE);
final String r = get(HANDLE);
//if (r == null) return null;
return r;
}
public String name() {
final String r = mem.get(NAME);
final String r = get(NAME);
if (r == null) return "";
return r;
}
public String startURL() {
final String r = mem.get(START_URL);
final String r = get(START_URL);
return r;
}
public Pattern mustMatchPattern() {
if (this.mustmatch == null) {
String r = mem.get(FILTER_MUSTMATCH);
if (r == null) r = MATCH_ALL;
String r = get(FILTER_MUSTMATCH);
if (r == null) r = CrawlProfile.MATCH_ALL;
this.mustmatch = Pattern.compile(r);
}
return this.mustmatch;
}
public Pattern mustNotMatchPattern() {
if (this.mustnotmatch == null) {
String r = mem.get(FILTER_MUSTNOTMATCH);
if (r == null) r = MATCH_NEVER;
String r = get(FILTER_MUSTNOTMATCH);
if (r == null) r = CrawlProfile.MATCH_NEVER;
this.mustnotmatch = Pattern.compile(r);
}
return this.mustnotmatch;
}
public int depth() {
final String r = mem.get(DEPTH);
final String r = get(DEPTH);
if (r == null) return 0;
try {
return Integer.parseInt(r);
@@ -402,7 +161,7 @@ public class CrawlProfile {
}
}
public CacheStrategy cacheStrategy() {
final String r = mem.get(CACHE_STRAGEGY);
final String r = get(CACHE_STRAGEGY);
if (r == null) return CacheStrategy.IFFRESH;
try {
return CacheStrategy.decode(Integer.parseInt(r));
@@ -412,12 +171,12 @@ public class CrawlProfile {
}
}
public void setCacheStrategy(CacheStrategy newStrategy) {
mem.put(CACHE_STRAGEGY, newStrategy.toString());
put(CACHE_STRAGEGY, newStrategy.toString());
}
public long recrawlIfOlder() {
// returns a long (millis) that is the minimum age that
// an entry must have to be re-crawled
final String r = mem.get(RECRAWL_IF_OLDER);
final String r = get(RECRAWL_IF_OLDER);
if (r == null) return 0L;
try {
final long l = Long.parseLong(r);
@@ -431,7 +190,7 @@ public class CrawlProfile {
// if the depth is equal or less to this depth,
// then the current url feeds with its domain the crawl filter
// if this is -1, all domains are feeded
final String r = mem.get(DOM_FILTER_DEPTH);
final String r = get(DOM_FILTER_DEPTH);
if (r == null) return Integer.MAX_VALUE;
try {
final int i = Integer.parseInt(r);
@@ -445,7 +204,7 @@ public class CrawlProfile {
public int domMaxPages() {
// this is the maximum number of pages that are crawled for a single domain
// if -1, this means no limit
final String r = mem.get(DOM_MAX_PAGES);
final String r = get(DOM_MAX_PAGES);
if (r == null) return Integer.MAX_VALUE;
try {
final int i = Integer.parseInt(r);
@@ -457,47 +216,47 @@ public class CrawlProfile {
}
}
public boolean crawlingQ() {
final String r = mem.get(CRAWLING_Q);
final String r = get(CRAWLING_Q);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean indexText() {
final String r = mem.get(INDEX_TEXT);
final String r = get(INDEX_TEXT);
if (r == null) return true;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean indexMedia() {
final String r = mem.get(INDEX_MEDIA);
final String r = get(INDEX_MEDIA);
if (r == null) return true;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean storeHTCache() {
final String r = mem.get(STORE_HTCACHE);
final String r = get(STORE_HTCACHE);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean storeTXCache() {
final String r = mem.get(STORE_TXCACHE);
final String r = get(STORE_TXCACHE);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean remoteIndexing() {
final String r = mem.get(REMOTE_INDEXING);
final String r = get(REMOTE_INDEXING);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean excludeStaticStopwords() {
final String r = mem.get(XSSTOPW);
final String r = get(XSSTOPW);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean excludeDynamicStopwords() {
final String r = mem.get(XDSTOPW);
final String r = get(XDSTOPW);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean excludeParentStopwords() {
final String r = mem.get(XPSTOPW);
final String r = get(XPSTOPW);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
@@ -510,7 +269,6 @@ public class CrawlProfile {
// increase counter
dp.inc();
}
domsCache.put(this.mem.get(HANDLE), doms);
}
public boolean grantedDomAppearance(final String domain) {
final int max = domFilterDepth();
@@ -556,6 +314,59 @@ public class CrawlProfile {
}
return domname;
}
public final static class DomProfile {
public String referrer;
public int depth, count;
public DomProfile(final String ref, final int d) {
this.referrer = ref;
this.depth = d;
this.count = 1;
}
public void inc() {
this.count++;
}
}
public static enum CacheStrategy {
NOCACHE(0), // never use the cache, all content from fresh internet source
IFFRESH(1), // use the cache if the cache exists and is fresh using the proxy-fresh rules
IFEXIST(2), // use the cache if the cache exist. Do no check freshness. Otherwise use online source.
CACHEONLY(3); // never go online, use all content from cache. If no cache exist, treat content as unavailable
public int code;
private CacheStrategy(int code) {
this.code = code;
}
public String toString() {
return Integer.toString(this.code);
}
public static CacheStrategy decode(int code) {
for (CacheStrategy strategy: CacheStrategy.values()) if (strategy.code == code) return strategy;
return NOCACHE;
}
public static CacheStrategy parse(String name) {
if (name.equals("nocache")) return NOCACHE;
if (name.equals("iffresh")) return IFFRESH;
if (name.equals("ifexist")) return IFEXIST;
if (name.equals("cacheonly")) return CACHEONLY;
return null;
}
public String toName() {
return this.name().toLowerCase();
}
public boolean isAllowedToFetchOnline() {
return this.code < 3;
}
public boolean mustBeOffline() {
return this.code == 3;
}
}
public static long getRecrawlDate(final long oldTimeMinutes) {
return System.currentTimeMillis() - (60000L * oldTimeMinutes);
}
}

@@ -47,7 +47,6 @@ import net.yacy.kelondro.workflow.WorkflowJob;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
//import de.anomic.http.client.Client;
import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants;
import de.anomic.yacy.yacyClient;
@@ -252,14 +251,14 @@ public class CrawlQueues {
* @return
*/
private void generateCrawl(Request urlEntry, final String stats, final String profileHandle) {
final CrawlProfile.entry profile = sb.crawler.profilesActiveCrawls.getEntry(profileHandle);
if (profile != null) {
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(profileHandle.getBytes());
if (mp != null) {
// check if the protocol is supported
final DigestURI url = urlEntry.url();
final String urlProtocol = url.getProtocol();
if (sb.loader.isSupportedProtocol(urlProtocol)) {
CrawlProfile profile = new CrawlProfile(mp);
if (this.log.isFine())
log.logFine(stats + ": URL=" + urlEntry.url()
+ ", initiator=" + ((urlEntry.initiator() == null) ? "" : new String(urlEntry.initiator()))
@@ -556,7 +555,8 @@ public class CrawlQueues {
try {
request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
CrawlProfile.entry e = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
CrawlProfile e = mp == null ? null : new CrawlProfile(mp);
Response response = sb.loader.load(request, e == null ? CrawlProfile.CacheStrategy.IFEXIST : e.cacheStrategy(), maxFileSize);
if (response == null) {
request.setStatus("error", WorkflowJob.STATUS_FINISHED);

@@ -31,6 +31,7 @@ package de.anomic.crawler;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.Date;
import java.util.Map;
import net.yacy.cora.protocol.Domains;
import net.yacy.kelondro.data.meta.DigestURI;
@@ -180,7 +181,8 @@ public final class CrawlStacker {
// returns null if successful, a reason string if not successful
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
final CrawlProfile.entry profile = crawler.profilesActiveCrawls.getEntry(entry.profileHandle());
final Map<String, String> mp = crawler.profilesActiveCrawls.get(entry.profileHandle().getBytes());
CrawlProfile profile = mp == null ? null : new CrawlProfile(mp);
String error;
if (profile == null) {
error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
@@ -248,7 +250,7 @@ public final class CrawlStacker {
return null;
}
public String checkAcceptance(final DigestURI url, final CrawlProfile.entry profile, int depth) {
public String checkAcceptance(final DigestURI url, final CrawlProfile profile, int depth) {
// check if the protocol is supported
final String urlProtocol = url.getProtocol();

@@ -28,11 +28,12 @@ package de.anomic.crawler;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import de.anomic.crawler.CrawlProfile.CacheStrategy;
import java.util.Map;
import net.yacy.kelondro.blob.MapHeap;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.kelondroException;
@@ -57,12 +58,12 @@ public final class CrawlSwitchboard {
public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L;
private final Log log;
public CrawlProfile profilesActiveCrawls, profilesPassiveCrawls;
public CrawlProfile.entry defaultProxyProfile;
public CrawlProfile.entry defaultRemoteProfile;
public CrawlProfile.entry defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
public CrawlProfile.entry defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
public CrawlProfile.entry defaultSurrogateProfile;
public Map<byte[], Map<String, String>> profilesActiveCrawls, profilesPassiveCrawls;
public CrawlProfile defaultProxyProfile;
public CrawlProfile defaultRemoteProfile;
public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
public CrawlProfile defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
public CrawlProfile defaultSurrogateProfile;
private final File queuesRoot;
public CrawlSwitchboard(
@@ -82,43 +83,44 @@ public final class CrawlSwitchboard {
this.queuesRoot = queuesRoot;
this.queuesRoot.mkdirs();
this.log.logConfig("Initializing Crawl Profiles");
final File profilesActiveFile = new File(queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
if (!profilesActiveFile.exists()) {
// migrate old file
final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), "PLASMADB"), "crawlProfilesActive1.db");
if (oldFile.exists()) oldFile.renameTo(profilesActiveFile);
}
try {
this.profilesActiveCrawls = new CrawlProfile(profilesActiveFile);
this.profilesActiveCrawls = new MapHeap(profilesActiveFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
} catch (IOException e) {
Log.logException(e);Log.logException(e);
FileUtils.deletedelete(profilesActiveFile);
try {
this.profilesActiveCrawls = new CrawlProfile(profilesActiveFile);
this.profilesActiveCrawls = new MapHeap(profilesActiveFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
} catch (IOException e1) {
Log.logException(e1);
this.profilesActiveCrawls = null;
}
}
for (byte[] handle: this.profilesActiveCrawls.keySet()) {
CrawlProfile p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
}
initActiveCrawlProfiles();
log.logInfo("Loaded active crawl profiles from file " + profilesActiveFile.getName() + ", " + this.profilesActiveCrawls.size() + " entries");
final File profilesPassiveFile = new File(queuesRoot, DBFILE_PASSIVE_CRAWL_PROFILES);
if (!profilesPassiveFile.exists()) {
// migrate old file
final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), "PLASMADB"), "crawlProfilesPassive1.db");
if (oldFile.exists()) oldFile.renameTo(profilesPassiveFile);
}
try {
this.profilesPassiveCrawls = new CrawlProfile(profilesPassiveFile);
this.profilesPassiveCrawls = new MapHeap(profilesPassiveFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
} catch (IOException e) {
FileUtils.deletedelete(profilesPassiveFile);
Log.logException(e);Log.logException(e);
FileUtils.deletedelete(profilesActiveFile);
try {
this.profilesPassiveCrawls = new CrawlProfile(profilesPassiveFile);
this.profilesPassiveCrawls = new MapHeap(profilesPassiveFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
} catch (IOException e1) {
Log.logException(e1);
this.profilesPassiveCrawls = null;
}
}
for (byte[] handle: this.profilesPassiveCrawls.keySet()) {
CrawlProfile p = new CrawlProfile(this.profilesPassiveCrawls.get(handle));
Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
}
log.logInfo("Loaded passive crawl profiles from file " + profilesPassiveFile.getName() +
", " + this.profilesPassiveCrawls.size() + " entries" +
", " + profilesPassiveFile.length()/1024);
@@ -135,12 +137,11 @@ public final class CrawlSwitchboard {
this.defaultMediaSnippetLocalProfile = null;
this.defaultMediaSnippetGlobalProfile = null;
this.defaultSurrogateProfile = null;
final Iterator<CrawlProfile.entry> i = this.profilesActiveCrawls.profiles(true);
CrawlProfile.entry profile;
CrawlProfile profile;
String name;
try {
while (i.hasNext()) {
profile = i.next();
for (byte[] handle: this.profilesActiveCrawls.keySet()) {
profile = new CrawlProfile(this.profilesActiveCrawls.get(handle));
name = profile.name();
if (name.equals(CRAWL_PROFILE_PROXY)) this.defaultProxyProfile = profile;
if (name.equals(CRAWL_PROFILE_REMOTE)) this.defaultRemoteProfile = profile;
@@ -163,45 +164,52 @@ public final class CrawlSwitchboard {
if (this.defaultProxyProfile == null) {
// generate new default entry for proxy crawling
this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
this.defaultProxyProfile = new CrawlProfile("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
true, true,
false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, true, true, true,
CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultProxyProfile.handle().getBytes(), this.defaultProxyProfile);
}
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
-1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultRemoteProfile.handle().getBytes(), this.defaultRemoteProfile);
}
if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultTextSnippetLocalProfile.handle().getBytes(), this.defaultTextSnippetLocalProfile);
}
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultTextSnippetGlobalProfile.handle().getBytes(), this.defaultTextSnippetGlobalProfile);
}
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CrawlProfile.CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultMediaSnippetLocalProfile.handle().getBytes(), this.defaultMediaSnippetLocalProfile);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultMediaSnippetGlobalProfile.handle().getBytes(), this.defaultMediaSnippetGlobalProfile);
}
if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing
defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE);
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE);
this.profilesActiveCrawls.put(this.defaultSurrogateProfile.handle().getBytes(), this.defaultSurrogateProfile);
}
}
@@ -209,24 +217,24 @@ public final class CrawlSwitchboard {
final File pdb = new File(this.queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
if (pdb.exists()) FileUtils.deletedelete(pdb);
try {
profilesActiveCrawls = new CrawlProfile(pdb);
} catch (IOException e) {
Log.logException(e);
this.profilesActiveCrawls = new MapHeap(pdb, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
} catch (IOException e1) {
Log.logException(e1);
this.profilesActiveCrawls = null;
}
initActiveCrawlProfiles();
}
public boolean cleanProfiles() throws InterruptedException {
final Iterator<CrawlProfile.entry> iter = profilesActiveCrawls.profiles(true);
CrawlProfile.entry entry;
CrawlProfile entry;
boolean hasDoneSomething = false;
try {
while (iter.hasNext()) {
for (byte[] handle: profilesActiveCrawls.keySet()) {
// check for interruption
if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress");
// getting next profile
entry = iter.next();
entry = new CrawlProfile(profilesActiveCrawls.get(handle));
if (!((entry.name().equals(CRAWL_PROFILE_PROXY)) ||
(entry.name().equals(CRAWL_PROFILE_REMOTE)) ||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) ||
@@ -234,8 +242,9 @@ public final class CrawlSwitchboard {
(entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) ||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) ||
(entry.name().equals(CRAWL_PROFILE_SURROGATE)))) {
profilesPassiveCrawls.newEntry(entry.map());
iter.remove();
CrawlProfile p = new CrawlProfile(entry);
profilesPassiveCrawls.put(p.handle().getBytes(), p);
profilesActiveCrawls.remove(handle);
hasDoneSomething = true;
}
}
@@ -248,8 +257,8 @@ public final class CrawlSwitchboard {
public void close() {
this.profilesActiveCrawls.close();
this.profilesPassiveCrawls.close();
((MapHeap) this.profilesActiveCrawls).close();
((MapHeap) this.profilesPassiveCrawls).close();
}
}

@@ -29,6 +29,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
@@ -213,18 +214,18 @@ public class NoticedURL {
}
}
public Request pop(final int stackType, final boolean delay, CrawlProfile profile) throws IOException {
public Request pop(final int stackType, final boolean delay, Map<byte[], Map<String, String>> profiles) throws IOException {
switch (stackType) {
case STACK_TYPE_CORE: return pop(coreStack, delay, profile);
case STACK_TYPE_LIMIT: return pop(limitStack, delay, profile);
case STACK_TYPE_REMOTE: return pop(remoteStack, delay, profile);
case STACK_TYPE_CORE: return pop(coreStack, delay, profiles);
case STACK_TYPE_LIMIT: return pop(limitStack, delay, profiles);
case STACK_TYPE_REMOTE: return pop(remoteStack, delay, profiles);
default: return null;
}
}
public void shift(final int fromStack, final int toStack, CrawlProfile profile) {
public void shift(final int fromStack, final int toStack, Map<byte[], Map<String, String>> profiles) {
try {
final Request entry = pop(fromStack, false, profile);
final Request entry = pop(fromStack, false, profiles);
if (entry != null) push(toStack, entry);
} catch (final IOException e) {
return;
@@ -241,14 +242,14 @@ public class NoticedURL {
}
}
private Request pop(final Balancer balancer, final boolean delay, CrawlProfile profile) throws IOException {
private Request pop(final Balancer balancer, final boolean delay, Map<byte[], Map<String, String>> profiles) throws IOException {
// this is a filo - pop
int s;
Request entry;
int errors = 0;
synchronized (balancer) {
while ((s = balancer.size()) > 0) {
entry = balancer.pop(delay, profile);
entry = balancer.pop(delay, profiles);
if (entry == null) {
if (s > balancer.size()) continue;
errors++;

@@ -35,7 +35,7 @@ public class SitemapImporter extends AbstractImporter implements Importer {
private final DigestURI sitemapURL;
private final ImporterManager superviser;
public SitemapImporter(final Switchboard sb, final ImporterManager importManager, final DigestURI sitemapURL, final CrawlProfile.entry profileEntry) throws ImporterException {
public SitemapImporter(final Switchboard sb, final ImporterManager importManager, final DigestURI sitemapURL, final CrawlProfile profileEntry) throws ImporterException {
super("sitemap");
this.superviser = importManager;
try {

@ -31,6 +31,7 @@ import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Date;
import java.util.Map;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.HeaderFramework;
@ -40,6 +41,7 @@ import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.Latency;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
@ -124,12 +126,13 @@ public class FTPLoader {
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
mp == null ? null : new CrawlProfile(mp),
dirList.toString().getBytes());
}
} else {
@ -237,12 +240,13 @@ public class FTPLoader {
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
mp == null ? null : new CrawlProfile(mp),
url.toNormalform(true, true).getBytes());
return response;
}
@ -254,12 +258,13 @@ public class FTPLoader {
byte[] b = ftpClient.get(path);
// create a response
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
mp == null ? null : new CrawlProfile(mp),
b);
return response;
}
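
The same lookup-and-wrap step (fetch the row via request.profileHandle().getBytes(), then wrap it in a CrawlProfile when non-null) recurs in every FTPLoader branch above and again in FileLoader, HTTPLoader and SMBLoader below. A small helper along these lines could centralize it; this is a sketch against the identifiers visible in this diff, not something the commit itself introduces, and the method name is made up.

    // Hypothetical convenience method; 'store' stands for sb.crawler.profilesActiveCrawls.
    static CrawlProfile activeProfileOrNull(final Map<byte[], Map<String, String>> store,
                                            final String profileHandle) {
        if (profileHandle == null) return null;
        final Map<String, String> mp = store.get(profileHandle.getBytes());
        return mp == null ? null : new CrawlProfile(mp);
    }

Each loader could then pass activeProfileOrNull(sb.crawler.profilesActiveCrawls, request.profileHandle()) straight into the Response constructor.
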

@ -25,9 +25,11 @@ import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.MimeTable;
import net.yacy.cora.protocol.HeaderFramework;
@ -81,12 +83,13 @@ public class FileLoader {
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
mp == null ? null : new CrawlProfile(mp),
content.toString().getBytes());
return response;
@ -115,12 +118,13 @@ public class FileLoader {
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
mp == null ? null : new CrawlProfile(mp),
url.toNormalform(true, true).getBytes());
return response;
}
@ -131,12 +135,13 @@ public class FileLoader {
is.close();
// create response with loaded content
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
mp == null ? null : new CrawlProfile(mp),
b);
return response;
}

@ -26,6 +26,7 @@ package de.anomic.crawler.retrieval;
import java.io.IOException;
import java.util.Date;
import java.util.Map;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.HeaderFramework;
@ -36,6 +37,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.repository.Blacklist;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.Latency;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
@ -146,6 +148,7 @@ public final class HTTPLoader {
}
// create a new cache entry
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
response = new Response(
request,
requestHeader,
@ -153,7 +156,7 @@ public final class HTTPLoader {
// res.getStatusLine(),
header,
Integer.toString(code),
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
mp == null ? null : new CrawlProfile(mp),
responseBody
);

@ -61,7 +61,7 @@ public class Response {
private final RequestHeader requestHeader;
private final ResponseHeader responseHeader;
private final String responseStatus;
private final CrawlProfile.entry profile;
private final CrawlProfile profile;
private byte[] content;
private int status; // tracker indexing status, see status defs below
@ -148,7 +148,7 @@ public class Response {
final RequestHeader requestHeader,
final ResponseHeader responseHeader,
final String responseStatus,
final CrawlProfile.entry profile,
final CrawlProfile profile,
final byte[] content) {
this.request = request;
// request and response headers may be zero in case that we process surrogates
@ -165,7 +165,7 @@ public class Response {
final RequestHeader requestHeader,
final ResponseHeader responseHeader,
final String responseStatus,
final CrawlProfile.entry profile) {
final CrawlProfile profile) {
this(request, requestHeader, responseHeader, responseStatus, profile, null);
}
@ -216,7 +216,7 @@ public class Response {
return this.url().language();
}
public CrawlProfile.entry profile() {
public CrawlProfile profile() {
return this.profile;
}

@ -34,6 +34,7 @@ import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import jcifs.smb.SmbException;
import jcifs.smb.SmbFile;
@ -41,6 +42,7 @@ import jcifs.smb.SmbFileInputStream;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.MimeTable;
import net.yacy.cora.protocol.HeaderFramework;
@ -100,12 +102,13 @@ public class SMBLoader {
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
mp == null ? null : new CrawlProfile(mp),
content.toString().getBytes());
return response;
@ -134,12 +137,13 @@ public class SMBLoader {
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
mp == null ? null : new CrawlProfile(mp),
url.toNormalform(true, true).getBytes());
return response;
}
@ -150,12 +154,13 @@ public class SMBLoader {
is.close();
// create response with loaded content
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
mp == null ? null : new CrawlProfile(mp),
b);
return response;
}

@ -94,7 +94,7 @@ public class SitemapParser extends DefaultHandler {
/**
* The crawling profile used to parse the URLs contained in the sitemap file
*/
private CrawlProfile.entry crawlingProfile = null;
private CrawlProfile crawlingProfile = null;
/**
* Name of the current XML element
@ -137,7 +137,7 @@ public class SitemapParser extends DefaultHandler {
private Date lastMod = null;
private final Switchboard sb;
public SitemapParser(final Switchboard sb, final DigestURI sitemap, final CrawlProfile.entry theCrawlingProfile) {
public SitemapParser(final Switchboard sb, final DigestURI sitemap, final CrawlProfile theCrawlingProfile) {
assert sitemap != null;
this.sb = sb;
this.siteMapURL = sitemap;
@ -328,8 +328,8 @@ public class SitemapParser extends DefaultHandler {
}
}
private CrawlProfile.entry createProfile(final String domainName, final DigestURI sitemapURL) {
return this.sb.crawler.profilesActiveCrawls.newEntry(
private CrawlProfile createProfile(final String domainName, final DigestURI sitemapURL) {
CrawlProfile p = new CrawlProfile(
domainName, sitemapURL,
// crawling Filter
CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
@ -352,5 +352,7 @@ public class SitemapParser extends DefaultHandler {
// exclude stop-words
true, true, true,
CrawlProfile.CacheStrategy.IFFRESH);
this.sb.crawler.profilesActiveCrawls.put(p.handle().getBytes(), p);
return p;
}
}
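
In the SitemapParser hunk above, createProfile() now builds the CrawlProfile itself and registers it under its handle; since the profile is stored directly as the map value, CrawlProfile is evidently usable as its own Map<String, String> row. A short round-trip sketch using only calls taken from this diff (p is the profile created above):

    // store the freshly created profile under its handle ...
    sb.crawler.profilesActiveCrawls.put(p.handle().getBytes(), p);
    // ... and reconstruct it later from the stored row, as the loaders do
    final Map<String, String> row = sb.crawler.profilesActiveCrawls.get(p.handle().getBytes());
    final CrawlProfile again = row == null ? null : new CrawlProfile(row);
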

@ -62,6 +62,9 @@ public class WorkTables extends Tables {
public final static String TABLE_ROBOTS_NAME = "robots";
public final static String TABLE_ACTIVECRAWLS_NAME = "crawljobsActive";
public final static String TABLE_PASSIVECRAWLS_NAME = "crawljobsPassive";
public WorkTables(final File workPath) {
super(workPath, 12);

@ -123,7 +123,6 @@ import de.anomic.crawler.ResultImages;
import de.anomic.crawler.ResultURLs;
import de.anomic.crawler.RobotsTxt;
import de.anomic.crawler.CrawlProfile.CacheStrategy;
import de.anomic.crawler.CrawlProfile.entry;
import de.anomic.crawler.retrieval.EventOrigin;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
@ -1102,12 +1101,12 @@ public final class Switchboard extends serverSwitch {
}
/**
* {@link CrawlProfile Crawl Profiles} are saved independently from the queues themselves
* {@link CrawlProfiles Crawl Profiles} are saved independently from the queues themselves
* and therefore have to be cleaned up from time to time. This method only performs the clean-up
* if - and only if - the {@link IndexingStack switchboard},
* {@link LoaderDispatcher loader} and {@link plasmaCrawlNURL local crawl} queues are all empty.
* <p>
* Then it iterates through all existing {@link CrawlProfile crawl profiles} and removes
* Then it iterates through all existing {@link CrawlProfiles crawl profiles} and removes
* all profiles which are not hard-coded.
* </p>
* <p>
@ -1442,34 +1441,47 @@ public final class Switchboard extends serverSwitch {
// refresh recrawl dates
try{
Iterator<CrawlProfile.entry> it = crawler.profilesActiveCrawls.profiles(true);
entry selentry;
while (it.hasNext()) {
selentry = it.next();
CrawlProfile selentry;
for (byte[] handle: crawler.profilesActiveCrawls.keySet()) {
selentry = new CrawlProfile(crawler.profilesActiveCrawls.get(handle));
assert selentry.handle() != null : "profile.name = " + selentry.name();
if (selentry.handle() == null) {
it.remove();
crawler.profilesActiveCrawls.remove(handle);
continue;
}
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY))
crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER,
Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE)));
boolean insert = false;
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY)) {
selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE)));
insert = true;
}
// if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE));
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT))
crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER,
Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE)));
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT))
crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER,
Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE)));
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA))
crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER,
Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE)));
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA))
crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER,
Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE)));
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE))
crawler.profilesActiveCrawls.changeEntry(selentry, CrawlProfile.entry.RECRAWL_IF_OLDER,
Long.toString(crawler.profilesActiveCrawls.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE)));
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) {
selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE)));
insert = true;
}
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) {
selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE)));
insert = true;
}
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) {
selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE)));
insert = true;
}
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) {
selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE)));
insert = true;
}
if (selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE)) {
selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE)));
insert = true;
}
if (insert) crawler.profilesActiveCrawls.put(selentry.handle().getBytes(), selentry);
}
} catch (final Exception e) {
Log.logException(e);
@ -1827,7 +1839,7 @@ public final class Switchboard extends serverSwitch {
// update image result list statistics
// its good to do this concurrently here, because it needs a DNS lookup
// to compute a URL hash which is necessary for a double-check
final CrawlProfile.entry profile = in.queueEntry.profile();
final CrawlProfile profile = in.queueEntry.profile();
ResultImages.registerImages(in.queueEntry.url(), in.documents[i], (profile == null) ? true : !profile.remoteIndexing());
} catch (final UnsupportedEncodingException e) {
@ -1987,7 +1999,8 @@ public final class Switchboard extends serverSwitch {
if (searchEvent != null) searchEvent.addHeuristic(url.hash(), heuristicName, true);
if (indexSegments.segment(process).urlMetadata.exists(url.hash())) return; // don't do double-work
final Request request = loader.request(url, true, true);
String acceptedError = this.crawlStacker.checkAcceptance(url, this.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), 0);
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
String acceptedError = this.crawlStacker.checkAcceptance(url, mp == null ? null : new CrawlProfile(mp), 0);
if (acceptedError != null) {
log.logWarning("addToIndex: cannot load " + url.toNormalform(false, false) + ": " + acceptedError);
return;
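
The recrawl-date refresh above repeats the same put-and-mark-insert block once per hard-coded profile. As a possible follow-up (not something this commit does), the per-name branches could be collapsed into a name-to-cycle table. The constants, CrawlProfile.getRecrawlDate() and the final put are taken from the diff; the table itself is hypothetical and assumes the *_RECRAWL_CYCLE constants are long values.

    final Map<String, Long> cycles = new HashMap<String, Long>();
    cycles.put(CrawlSwitchboard.CRAWL_PROFILE_PROXY,               CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE);
    cycles.put(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,  CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE);
    cycles.put(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE);
    // ... remaining hard-coded profiles registered the same way ...

    final Long cycle = cycles.get(selentry.name());
    if (cycle != null) {
        selentry.put(CrawlProfile.RECRAWL_IF_OLDER,
                Long.toString(CrawlProfile.getRecrawlDate(cycle)));
        crawler.profilesActiveCrawls.put(selentry.handle().getBytes(), selentry);
    }
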

@ -27,7 +27,6 @@
package de.anomic.yacy;
//import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
@ -59,8 +58,6 @@ import net.yacy.kelondro.util.OS;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.HTTPLoader;
//import de.anomic.http.client.Client;
//import de.anomic.http.server.ResponseContainer;
import de.anomic.search.Switchboard;
import de.anomic.server.serverCore;
import de.anomic.tools.CryptoLib;

@ -71,6 +71,7 @@ public class HeapReader {
this.keylength = keylength;
this.index = null; // will be created as result of initialization process
this.free = null; // will be initialized later depending on existing idx/gap file
this.heapFile.getParentFile().mkdirs();
this.file = new CachedFileWriter(this.heapFile);
// read or initialize the index

@ -52,8 +52,6 @@ import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.kelondroException;
public class MapHeap implements Map<byte[], Map<String, String>> {
private BLOB blob;
@ -229,7 +227,8 @@ public class MapHeap implements Map<byte[], Map<String, String>> {
public Map<String, String> get(final Object key) {
if (key == null) return null;
try {
return get((byte[]) key);
if (key instanceof byte[]) return get((byte[]) key);
if (key instanceof String) return get(((String) key).getBytes());
} catch (IOException e) {
Log.logException(e);
} catch (RowSpaceExceededException e) {
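
The broadened MapHeap.get(Object) above means a stored profile row can now be looked up with either key form; both calls below resolve to the same entry. profilesActiveCrawls and request.profileHandle() are as used elsewhere in this diff; everything else is illustrative.

    final String handle = request.profileHandle();                                     // any stored handle
    final Map<String, String> byBytes  = sb.crawler.profilesActiveCrawls.get(handle.getBytes());
    final Map<String, String> byString = sb.crawler.profilesActiveCrawls.get(handle);  // new String branch
    // both return the same row, or null if the handle is unknown
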

@ -159,8 +159,8 @@ public final class LoaderDispatcher {
if (url.isLocal() && sb.getConfigBool("adminAccountForLocalhost", false)) throw new IOException("access to localhost not granted for url " + url);
// check if we have the page in the cache
CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
CrawlProfile crawlProfile = mp == null ? null : new CrawlProfile(mp);
if (crawlProfile != null && cacheStrategy != CrawlProfile.CacheStrategy.NOCACHE) {
// we have passed a first test if caching is allowed
// now see if there is a cache entry
