All entities of crawl profiles are now editable in the crawl profile editor.

Branch: pull/8/head
Author: Michael Peter Christen, 10 years ago
Parent: 1d8e1e4bac
Commit: 197f7449e5

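The commit replaces the editor servlet's hand-maintained attribute list (the eentry class plus a static labels table) with a single self-describing enum in CrawlProfile, so every attribute automatically shows up in the editor. Below is a standalone sketch of that pattern; the names Attribute, applyForm, and the three sample attributes are illustrative stand-ins, not YaCy API.

import java.util.HashMap;
import java.util.Map;

public class AttributeSketch {

    // one enum carries key, readonly flag, value type, and UI label per attribute
    public enum Attribute {
        NAME      ("name",      true,  Attribute.STRING,  "Name"),
        DEPTH     ("depth",     false, Attribute.INTEGER, "Crawl Depth"),
        INDEX_TEXT("indexText", false, Attribute.BOOLEAN, "Index Text");

        public static final int BOOLEAN = 0;
        public static final int INTEGER = 1;
        public static final int STRING  = 2;

        public final String key, label;
        public final boolean readonly;
        public final int type;

        Attribute(final String key, final boolean readonly, final int type, final String label) {
            this.key = key; this.readonly = readonly; this.type = type; this.label = label;
        }
    }

    // Mirrors the rewritten submit loop in CrawlProfileEditor_p.respond():
    // a BOOLEAN is true iff its checkbox arrived in the form post; any other
    // type falls back to the currently stored value when the field is absent.
    static void applyForm(final Map<String, String> profile, final Map<String, String> post) {
        for (final Attribute a : Attribute.values()) {
            if (a.readonly) continue; // readonly entries are never overwritten
            final String cval = profile.get(a.key);
            final String val = (a.type == Attribute.BOOLEAN)
                    ? Boolean.toString(post.containsKey(a.key))
                    : post.getOrDefault(a.key, cval);
            if (cval != null && !cval.equals(val)) profile.put(a.key, val);
        }
    }

    public static void main(final String[] args) {
        final Map<String, String> profile = new HashMap<>();
        profile.put("name", "example"); profile.put("depth", "2"); profile.put("indexText", "true");
        final Map<String, String> post = new HashMap<>();
        post.put("depth", "4"); // "indexText" checkbox unchecked -> absent from post
        applyForm(profile, post);
        System.out.println(profile.get("depth"));     // 4
        System.out.println(profile.get("indexText")); // false
    }
}

Adding a new enum constant is now the whole cost of making an attribute editable; the old code also required a matching labels.add(...) line in the servlet.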
@@ -24,9 +24,6 @@
 // along with this program; if not, write to the Free Software
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
 import java.util.Map;
 import java.util.TreeMap;
@@ -45,50 +42,6 @@ public class CrawlProfileEditor_p {
     private final static String CRAWL_PROFILE_PREFIX = "crawlProfiles_";
     private static final String EDIT_ENTRIES_PREFIX = "edit_entries_";
 
-    public static class eentry {
-        public static final int BOOLEAN = 0;
-        public static final int INTEGER = 1;
-        public static final int STRING = 2;
-
-        public final String name;
-        public final String label;
-        public final boolean readonly;
-        public final int type;
-
-        public eentry(final String name, final String label, final boolean readonly, final int type) {
-            this.name = name;
-            this.label = label;
-            this.readonly = readonly;
-            this.type = type;
-        }
-    }
-
-    private static final List<eentry> labels = new ArrayList<eentry>();
-    static {
-        labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING));
-        labels.add(new eentry(CrawlProfile.COLLECTIONS, "Collections (comma-separated list)", false, eentry.STRING));
-        labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTMATCH, "URL Must-Match Filter", false, eentry.STRING));
-        labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTNOTMATCH, "URL Must-Not-Match Filter", false, eentry.STRING));
-        labels.add(new eentry(CrawlProfile.CRAWLER_IP_MUSTMATCH, "IP Must-Match Filter", false, eentry.STRING));
-        labels.add(new eentry(CrawlProfile.CRAWLER_IP_MUSTNOTMATCH, "IP Must-Not-Match Filter", false, eentry.STRING));
-        labels.add(new eentry(CrawlProfile.CRAWLER_COUNTRY_MUSTMATCH, "Country Must-Match Filter", false, eentry.STRING));
-        labels.add(new eentry(CrawlProfile.CRAWLER_URL_NODEPTHLIMITMATCH, "URL No-Depth-Limit Must-Match Filter", false, eentry.STRING));
-        labels.add(new eentry(CrawlProfile.INDEXING_URL_MUSTMATCH, "Indexing URL Must-Match Filter", false, eentry.STRING));
-        labels.add(new eentry(CrawlProfile.INDEXING_URL_MUSTNOTMATCH, "Indexing URL Must-Not-Match Filter", false, eentry.STRING));
-        labels.add(new eentry(CrawlProfile.INDEXING_CONTENT_MUSTMATCH, "Indexing Content Must-Match Filter", false, eentry.STRING));
-        labels.add(new eentry(CrawlProfile.INDEXING_CONTENT_MUSTNOTMATCH, "Indexing Content Must-Not-Match Filter",false, eentry.STRING));
-        labels.add(new eentry(CrawlProfile.CACHE_STRAGEGY, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)", false, eentry.STRING));
-        labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER));
-        labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
-        labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
-        labels.add(new eentry(CrawlProfile.CRAWLING_Q, "CrawlingQ / '?'-URLs", false, eentry.BOOLEAN));
-        labels.add(new eentry(CrawlProfile.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN));
-        labels.add(new eentry(CrawlProfile.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN));
-        labels.add(new eentry(CrawlProfile.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN));
-        labels.add(new eentry(CrawlProfile.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN));
-        labels.add(new eentry(CrawlProfile.DIRECT_DOC_BY_URL, "Put all linked urls into index without parsing", false, eentry.BOOLEAN));
-    }
 
     public static serverObjects respond(
             @SuppressWarnings("unused") final RequestHeader header,
             final serverObjects post,
@@ -148,14 +101,11 @@ public class CrawlProfileEditor_p {
         if ((post != null) && (selentry != null)) {
             if (post.containsKey("submit")) {
                 try {
-                    final Iterator<eentry> lit = labels.iterator();
-                    eentry tee;
-                    while (lit.hasNext()) {
-                        tee = lit.next();
-                        final String cval = selentry.get(tee.name);
-                        final String val = (tee.type == eentry.BOOLEAN) ? Boolean.toString(post.containsKey(tee.name)) : post.get(tee.name, cval);
+                    for (CrawlProfile.CrawlAttribute attribute: CrawlProfile.CrawlAttribute.values()) {
+                        final String cval = selentry.get(attribute.key);
+                        final String val = (attribute.type == CrawlProfile.CrawlAttribute.BOOLEAN) ? Boolean.toString(post.containsKey(attribute.key)) : post.get(attribute.key, cval);
                         if (!cval.equals(val)) {
-                            selentry.put(tee.name, val);
+                            selentry.put(attribute.key, val);
                             sb.crawler.putActive(selentry.handle().getBytes(), selentry);
                         }
                     }
@@ -199,16 +149,14 @@ public class CrawlProfileEditor_p {
         prop.put("edit", "1");
         prop.put("edit_name", selentry.collectionName());
         prop.put("edit_handle", selentry.handle());
-        final Iterator<eentry> lit = labels.iterator();
         count = 0;
-        while (lit.hasNext()) {
-            final eentry ee = lit.next();
-            final String val = selentry.get(ee.name);
-            prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly", ee.readonly ? "1" : "0");
-            prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_name", ee.name);
-            prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_label", ee.label);
-            prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_type", ee.type);
-            if (ee.type == eentry.BOOLEAN) {
+        for (CrawlProfile.CrawlAttribute attribute: CrawlProfile.CrawlAttribute.values()) {
+            final String val = selentry.get(attribute.key);
+            prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly", attribute.readonly ? "1" : "0");
+            prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_name", attribute.key);
+            prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_label", attribute.label);
+            prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_type", attribute.type);
+            if (attribute.type == CrawlProfile.CrawlAttribute.BOOLEAN) {
                 prop.put(EDIT_ENTRIES_PREFIX + count + "_readonly_type_checked",
                     Boolean.parseBoolean(val) ? "1" : "0");
             } else {

@@ -50,6 +50,9 @@ import net.yacy.kelondro.data.word.Word;
 import net.yacy.search.query.QueryParams;
 import net.yacy.server.serverObjects;
 
+/**
+ * this is a simple record structure that hold all properties of a single crawl start
+ */
 public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
 
     private static final long serialVersionUID = 5527325718810703504L;
@@ -60,41 +63,62 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
     public static final Pattern MATCH_NEVER_PATTERN = Pattern.compile(MATCH_NEVER_STRING);
     public static final String CRAWL_PROFILE_PUSH_STUB = "push_";
 
-    // this is a simple record structure that hold all properties of a single crawl start
-    private static final String HANDLE = "handle";
-    public static final String AGENT_NAME = "agentName";
-    public static final String NAME = "name";
-    public static final String DEPTH = "generalDepth";
-    public static final String DIRECT_DOC_BY_URL = "directDocByURL";
-    public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
-    public static final String DOM_MAX_PAGES = "domMaxPages";
-    public static final String CRAWLING_Q = "crawlingQ";
-    public static final String FOLLOW_FRAMES = "followFrames";
-    public static final String OBEY_HTML_ROBOTS_NOINDEX = "obeyHtmlRobotsNoindex";
-    public static final String OBEY_HTML_ROBOTS_NOFOLLOW = "obeyHtmlRobotsNofollow";
-    public static final String INDEX_TEXT = "indexText";
-    public static final String INDEX_MEDIA = "indexMedia";
-    public static final String STORE_HTCACHE = "storeHTCache";
-    public static final String REMOTE_INDEXING = "remoteIndexing";
-    public static final String CACHE_STRAGEGY = "cacheStrategy";
-    public static final String COLLECTIONS = "collections";
-    public static final String SCRAPER = "scraper";
-    public static final String TIMEZONEOFFSET = "timezoneOffset";
-    public static final String CRAWLER_URL_MUSTMATCH = "crawlerURLMustMatch";
-    public static final String CRAWLER_URL_MUSTNOTMATCH = "crawlerURLMustNotMatch";
-    public static final String CRAWLER_IP_MUSTMATCH = "crawlerIPMustMatch";
-    public static final String CRAWLER_IP_MUSTNOTMATCH = "crawlerIPMustNotMatch";
-    public static final String CRAWLER_COUNTRY_MUSTMATCH = "crawlerCountryMustMatch";
-    public static final String CRAWLER_URL_NODEPTHLIMITMATCH = "crawlerNoLimitURLMustMatch";
-    public static final String INDEXING_URL_MUSTMATCH = "indexURLMustMatch";
-    public static final String INDEXING_URL_MUSTNOTMATCH = "indexURLMustNotMatch";
-    public static final String INDEXING_CONTENT_MUSTMATCH = "indexContentMustMatch";
-    public static final String INDEXING_CONTENT_MUSTNOTMATCH = "indexContentMustNotMatch";
-    public static final String SNAPSHOTS_MAXDEPTH = "snapshotsMaxDepth"; // if previews shall be loaded, this is positive and denotes the maximum depth; if not this is -1
-    public static final String SNAPSHOTS_REPLACEOLD = "snapshotsReplaceOld"; // if this is set to true, only one version of a snapshot per day is stored, otherwise we store also different versions per day
-    public static final String SNAPSHOTS_LOADIMAGE = "snapshotsLoadImage"; // if true, an image is loaded
-    public static final String SNAPSHOTS_MUSTNOTMATCH = "snapshotsMustnotmatch";
+    public enum CrawlAttribute {
+        HANDLE                       ("handle",                     true,  CrawlAttribute.STRING,  "Profile Handle"),
+        NAME                         ("name",                       true,  CrawlAttribute.STRING,  "Name"), // corresponds to the start url in many cases (not all)
+        DEPTH                        ("generalDepth",               false, CrawlAttribute.INTEGER, "Crawl Depth"),
+        DIRECT_DOC_BY_URL            ("directDocByURL",             false, CrawlAttribute.BOOLEAN, "Put all linked urls into index without parsing"),
+        CRAWLER_URL_NODEPTHLIMITMATCH("crawlerNoLimitURLMustMatch", false, CrawlAttribute.STRING,  "URL No-Depth-Limit Must-Match Filter"),
+        DOM_MAX_PAGES                ("domMaxPages",                false, CrawlAttribute.INTEGER, "Domain Max. Pages"),
+        CRAWLING_Q                   ("crawlingQ",                  false, CrawlAttribute.BOOLEAN, "CrawlingQ / '?'-URLs"),
+        FOLLOW_FRAMES                ("followFrames",               false, CrawlAttribute.BOOLEAN, "Flag if frames shall be followed (no by default)"),
+        OBEY_HTML_ROBOTS_NOINDEX     ("obeyHtmlRobotsNoindex",      false, CrawlAttribute.BOOLEAN, "Obey html-robots-noindex"),
+        OBEY_HTML_ROBOTS_NOFOLLOW    ("obeyHtmlRobotsNofollow",     false, CrawlAttribute.BOOLEAN, "Obey html-robots-nofollow"),
+        CRAWLER_URL_MUSTMATCH        ("crawlerURLMustMatch",        false, CrawlAttribute.STRING,  "URL Must-Match Filter"),
+        CRAWLER_URL_MUSTNOTMATCH     ("crawlerURLMustNotMatch",     false, CrawlAttribute.STRING,  "URL Must-Not-Match Filter"),
+        CRAWLER_IP_MUSTMATCH         ("crawlerIPMustMatch",         false, CrawlAttribute.STRING,  "IP Must-Match Filter"),
+        CRAWLER_IP_MUSTNOTMATCH      ("crawlerIPMustNotMatch",      false, CrawlAttribute.STRING,  "IP Must-Not-Match Filter"),
+        CRAWLER_COUNTRY_MUSTMATCH    ("crawlerCountryMustMatch",    false, CrawlAttribute.STRING,  "Country Must-Match Filter"),
+        INDEXING_URL_MUSTMATCH       ("indexURLMustMatch",          false, CrawlAttribute.STRING,  "Indexing URL Must-Match Filter"),
+        INDEXING_URL_MUSTNOTMATCH    ("indexURLMustNotMatch",       false, CrawlAttribute.STRING,  "Indexing URL Must-Not-Match Filter"),
+        INDEXING_CONTENT_MUSTMATCH   ("indexContentMustMatch",      false, CrawlAttribute.STRING,  "Indexing Content Must-Match Filter"),
+        INDEXING_CONTENT_MUSTNOTMATCH("indexContentMustNotMatch",   false, CrawlAttribute.STRING,  "Indexing Content Must-Not-Match Filter"),
+        RECRAWL_IF_OLDER             ("recrawlIfOlder",             false, CrawlAttribute.INTEGER, "Recrawl If Older"),
+        STORE_HTCACHE                ("storeHTCache",               false, CrawlAttribute.BOOLEAN, "Store in HTCache"),
+        CACHE_STRAGEGY               ("cacheStrategy",              false, CrawlAttribute.STRING,  "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)"),
+        AGENT_NAME                   ("agentName",                  false, CrawlAttribute.STRING,  "User Agent Profile Name"),
+        SNAPSHOTS_MAXDEPTH           ("snapshotsMaxDepth",          false, CrawlAttribute.INTEGER, "Max Depth for Snapshots"),
+        SNAPSHOTS_REPLACEOLD         ("snapshotsReplaceOld",        false, CrawlAttribute.BOOLEAN, "Multiple Snapshot Versions - replace old with new"),
+        SNAPSHOTS_MUSTNOTMATCH       ("snapshotsMustnotmatch",      false, CrawlAttribute.STRING,  "must-not-match filter for snapshot generation"),
+        SNAPSHOTS_LOADIMAGE          ("snapshotsLoadImage",         false, CrawlAttribute.BOOLEAN, "Flag for Snapshot image generation"),
+        REMOTE_INDEXING              ("remoteIndexing",             false, CrawlAttribute.BOOLEAN, "Remote Indexing (only for p2p networks)"),
+        INDEX_TEXT                   ("indexText",                  false, CrawlAttribute.BOOLEAN, "Index Text"),
+        INDEX_MEDIA                  ("indexMedia",                 false, CrawlAttribute.BOOLEAN, "Index Media"),
+        COLLECTIONS                  ("collections",                false, CrawlAttribute.STRING,  "Collections (comma-separated list)"),
+        SCRAPER                      ("scraper",                    false, CrawlAttribute.STRING,  "Declaration for Vocabulary Scraper"),
+        TIMEZONEOFFSET               ("timezoneOffset",             true,  CrawlAttribute.INTEGER, "Time Zone of Crawl Start Agent");
+
+        public static final int BOOLEAN = 0;
+        public static final int INTEGER = 1;
+        public static final int STRING  = 2;
+
+        public final String key, label;
+        public final boolean readonly;
+        public final int type;
+
+        private CrawlAttribute(String key, final boolean readonly, final int type, final String label) {
+            this.key = key;
+            this.readonly = readonly;
+            this.type = type;
+            this.label = label;
+        }
+
+        @Override
+        public String toString() {
+            return this.key;
+        }
+    }
 
     private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
     private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
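A side note on the enum above: besides the key it also carries the readonly flag, the value type, and the UI label, which is exactly the metadata the deleted eentry class used to duplicate in the servlet. The getters rewritten in the hunks below all share one defensive read idiom: fetch the raw string by enum key, return a default on null, and degrade to the default on a parse failure. A condensed standalone version of that idiom (the helper names intAttr/boolAttr are hypothetical, not YaCy API):

import java.util.Map;

final class AttrRead {
    private AttrRead() {}

    // e.g. intAttr(profile, CrawlAttribute.DEPTH.key, 0) mirrors depth()
    static int intAttr(final Map<String, String> profile, final String key, final int dflt) {
        final String r = profile.get(key);
        if (r == null) return dflt;
        try {
            return Integer.parseInt(r);
        } catch (final NumberFormatException e) {
            return dflt; // a malformed stored value falls back to the default, never throws
        }
    }

    // e.g. boolAttr(profile, CrawlAttribute.INDEX_TEXT.key, true) mirrors indexText()
    static boolean boolAttr(final Map<String, String> profile, final String key, final boolean dflt) {
        final String r = profile.get(key);
        if (r == null) return dflt;
        return r.equals(Boolean.TRUE.toString());
    }
}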
@@ -175,43 +199,43 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
         if (name.length() > 256) name = name.substring(256);
         this.doms = new ConcurrentHashMap<String, AtomicInteger>();
         final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name + crawlerUrlMustMatch + depth + crawlerUrlMustNotMatch + domMaxPages + collections)).substring(0, Word.commonHashLength);
-        put(HANDLE, handle);
-        put(NAME, name);
-        put(AGENT_NAME, userAgentName);
-        put(CRAWLER_URL_MUSTMATCH, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch);
-        put(CRAWLER_URL_MUSTNOTMATCH, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
-        put(CRAWLER_IP_MUSTMATCH, (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch);
-        put(CRAWLER_IP_MUSTNOTMATCH, (crawlerIpMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerIpMustNotMatch);
-        put(CRAWLER_COUNTRY_MUSTMATCH, (crawlerCountryMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerCountryMustMatch);
-        put(CRAWLER_URL_NODEPTHLIMITMATCH, (crawlerNoDepthLimitMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerNoDepthLimitMatch);
-        put(INDEXING_URL_MUSTMATCH, (indexUrlMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustMatch);
-        put(INDEXING_URL_MUSTNOTMATCH, (indexUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustNotMatch);
-        put(INDEXING_CONTENT_MUSTMATCH, (indexContentMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustMatch);
-        put(INDEXING_CONTENT_MUSTNOTMATCH, (indexContentMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustNotMatch);
-        put(DEPTH, depth);
-        put(DIRECT_DOC_BY_URL, directDocByURL);
-        put(RECRAWL_IF_OLDER, recrawlIfOlder == null ? Long.MAX_VALUE : recrawlIfOlder.getTime());
-        put(DOM_MAX_PAGES, domMaxPages);
-        put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
-        put(FOLLOW_FRAMES, followFrames); // load pages contained in frames or ifames
-        put(OBEY_HTML_ROBOTS_NOINDEX, obeyHtmlRobotsNoindex); // if false, then a meta robots tag containing 'noindex' is ignored
-        put(OBEY_HTML_ROBOTS_NOFOLLOW, obeyHtmlRobotsNofollow);
-        put(INDEX_TEXT, indexText);
-        put(INDEX_MEDIA, indexMedia);
-        put(STORE_HTCACHE, storeHTCache);
-        put(REMOTE_INDEXING, remoteIndexing);
-        put(SNAPSHOTS_MAXDEPTH, snapshotsMaxDepth);
-        put(SNAPSHOTS_LOADIMAGE, snapshotsLoadImage);
-        put(SNAPSHOTS_REPLACEOLD, snapshotsReplaceOld);
-        put(SNAPSHOTS_MUSTNOTMATCH, snapshotsMustnotmatch);
-        put(CACHE_STRAGEGY, cacheStrategy.toString());
-        put(COLLECTIONS, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
+        put(CrawlAttribute.HANDLE.key, handle);
+        put(CrawlAttribute.NAME.key, name);
+        put(CrawlAttribute.AGENT_NAME.key, userAgentName);
+        put(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch);
+        put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
+        put(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key, (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch);
+        put(CrawlAttribute.CRAWLER_IP_MUSTNOTMATCH.key, (crawlerIpMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerIpMustNotMatch);
+        put(CrawlAttribute.CRAWLER_COUNTRY_MUSTMATCH.key, (crawlerCountryMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerCountryMustMatch);
+        put(CrawlAttribute.CRAWLER_URL_NODEPTHLIMITMATCH.key, (crawlerNoDepthLimitMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerNoDepthLimitMatch);
+        put(CrawlAttribute.INDEXING_URL_MUSTMATCH.key, (indexUrlMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustMatch);
+        put(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key, (indexUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustNotMatch);
+        put(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key, (indexContentMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustMatch);
+        put(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key, (indexContentMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustNotMatch);
+        put(CrawlAttribute.DEPTH.key, depth);
+        put(CrawlAttribute.DIRECT_DOC_BY_URL.key, directDocByURL);
+        put(CrawlAttribute.RECRAWL_IF_OLDER.key, recrawlIfOlder == null ? Long.MAX_VALUE : recrawlIfOlder.getTime());
+        put(CrawlAttribute.DOM_MAX_PAGES.key, domMaxPages);
+        put(CrawlAttribute.CRAWLING_Q.key, crawlingQ); // crawling of urls with '?'
+        put(CrawlAttribute.FOLLOW_FRAMES.key, followFrames); // load pages contained in frames or ifames
+        put(CrawlAttribute.OBEY_HTML_ROBOTS_NOINDEX.key, obeyHtmlRobotsNoindex); // if false, then a meta robots tag containing 'noindex' is ignored
+        put(CrawlAttribute.OBEY_HTML_ROBOTS_NOFOLLOW.key, obeyHtmlRobotsNofollow);
+        put(CrawlAttribute.INDEX_TEXT.key, indexText);
+        put(CrawlAttribute.INDEX_MEDIA.key, indexMedia);
+        put(CrawlAttribute.STORE_HTCACHE.key, storeHTCache);
+        put(CrawlAttribute.REMOTE_INDEXING.key, remoteIndexing);
+        put(CrawlAttribute.SNAPSHOTS_MAXDEPTH.key, snapshotsMaxDepth);
+        put(CrawlAttribute.SNAPSHOTS_LOADIMAGE.key, snapshotsLoadImage);
+        put(CrawlAttribute.SNAPSHOTS_REPLACEOLD.key, snapshotsReplaceOld);
+        put(CrawlAttribute.SNAPSHOTS_MUSTNOTMATCH.key, snapshotsMustnotmatch);
+        put(CrawlAttribute.CACHE_STRAGEGY.key, cacheStrategy.toString());
+        put(CrawlAttribute.COLLECTIONS.key, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
         // we transform the scraper information into a JSON Array
         this.scraper = scraper == null ? new VocabularyScraper() : scraper;
         String jsonString = this.scraper.toString();
         assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString;
-        put(SCRAPER, jsonString);
-        put(TIMEZONEOFFSET, timezoneOffset);
+        put(CrawlAttribute.SCRAPER.key, jsonString);
+        put(CrawlAttribute.TIMEZONEOFFSET.key, timezoneOffset);
     }
     /**
@@ -222,7 +246,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
         super(ext == null ? 1 : ext.size());
         if (ext != null) putAll(ext);
         this.doms = new ConcurrentHashMap<String, AtomicInteger>();
-        String jsonString = ext.get(SCRAPER);
+        String jsonString = ext.get(CrawlAttribute.SCRAPER.key);
         this.scraper = jsonString == null || jsonString.length() == 0 ? new VocabularyScraper() : new VocabularyScraper(jsonString);
     }
@@ -260,7 +284,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
     }
 
     public ClientIdentification.Agent getAgent() {
-        String agentName = this.get(AGENT_NAME);
+        String agentName = this.get(CrawlAttribute.AGENT_NAME.key);
         return ClientIdentification.getAgent(agentName);
     }
@@ -306,7 +330,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
      * @return handle of the profile
      */
     public String handle() {
-        final String r = get(HANDLE);
+        final String r = get(CrawlAttribute.HANDLE.key);
         assert r != null;
         //if (r == null) return null;
         return r;
@@ -320,7 +344,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
      */
     public Map<String, Pattern> collections() {
         if (cmap != null) return cmap;
-        final String r = get(COLLECTIONS);
+        final String r = get(CrawlAttribute.COLLECTIONS.key);
         this.cmap = collectionParser(r);
         return this.cmap;
     }
@@ -341,7 +365,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
      * @return name of the profile
      */
     public String name() {
-        final String r = get(NAME);
+        final String r = get(CrawlAttribute.NAME.key);
         if (r == null) return "";
         return r;
     }
@@ -351,7 +375,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
      * @return the name of the collection if that is not "user" or the name() otherwise;
      */
     public String collectionName() {
-        final String r = get(COLLECTIONS);
+        final String r = get(CrawlAttribute.COLLECTIONS.key);
         return r == null || r.length() == 0 || "user".equals(r) ? name() : r;
     }
@@ -361,7 +385,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
      */
     public Pattern urlMustMatchPattern() {
         if (this.crawlerurlmustmatch == null) {
-            final String r = get(CRAWLER_URL_MUSTMATCH);
+            final String r = get(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key);
             try {
                 this.crawlerurlmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
             } catch (final PatternSyntaxException e) { this.crawlerurlmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
@@ -375,7 +399,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
      */
     public Pattern urlMustNotMatchPattern() {
         if (this.crawlerurlmustnotmatch == null) {
-            final String r = get(CRAWLER_URL_MUSTNOTMATCH);
+            final String r = get(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key);
             try {
                 this.crawlerurlmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
             } catch (final PatternSyntaxException e) { this.crawlerurlmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
@@ -389,7 +413,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
      */
     public Pattern ipMustMatchPattern() {
         if (this.crawleripmustmatch == null) {
-            final String r = get(CRAWLER_IP_MUSTMATCH);
+            final String r = get(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key);
             try {
                 this.crawleripmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
             } catch (final PatternSyntaxException e) { this.crawleripmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
@@ -403,7 +427,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
      */
     public Pattern ipMustNotMatchPattern() {
         if (this.crawleripmustnotmatch == null) {
-            final String r = get(CRAWLER_IP_MUSTNOTMATCH);
+            final String r = get(CrawlAttribute.CRAWLER_IP_MUSTNOTMATCH.key);
             try {
                 this.crawleripmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
             } catch (final PatternSyntaxException e) { this.crawleripmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
@@ -416,7 +440,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
      * @return a list of country codes
      */
     public String[] countryMustMatchList() {
-        String countryMustMatch = get(CRAWLER_COUNTRY_MUSTMATCH);
+        String countryMustMatch = get(CrawlAttribute.CRAWLER_COUNTRY_MUSTMATCH.key);
         if (countryMustMatch == null) countryMustMatch = CrawlProfile.MATCH_NEVER_STRING;
         if (countryMustMatch.isEmpty()) return new String[0];
         String[] list = CommonPattern.COMMA.split(countryMustMatch);
@@ -430,7 +454,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
      */
     public Pattern crawlerNoDepthLimitMatchPattern() {
         if (this.crawlernodepthlimitmatch == null) {
-            final String r = get(CRAWLER_URL_NODEPTHLIMITMATCH);
+            final String r = get(CrawlAttribute.CRAWLER_URL_NODEPTHLIMITMATCH.key);
             try {
                 this.crawlernodepthlimitmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
             } catch (final PatternSyntaxException e) { this.crawlernodepthlimitmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
@@ -444,7 +468,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
      */
     public Pattern indexUrlMustMatchPattern() {
         if (this.indexurlmustmatch == null) {
-            final String r = get(INDEXING_URL_MUSTMATCH);
+            final String r = get(CrawlAttribute.INDEXING_URL_MUSTMATCH.key);
             try {
                 this.indexurlmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
             } catch (final PatternSyntaxException e) { this.indexurlmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
@@ -458,7 +482,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
      */
     public Pattern indexUrlMustNotMatchPattern() {
         if (this.indexurlmustnotmatch == null) {
-            final String r = get(INDEXING_URL_MUSTNOTMATCH);
+            final String r = get(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key);
             try {
                 this.indexurlmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
             } catch (final PatternSyntaxException e) { this.indexurlmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
@@ -472,7 +496,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
      */
     public Pattern indexContentMustMatchPattern() {
         if (this.indexcontentmustmatch == null) {
-            final String r = get(INDEXING_CONTENT_MUSTMATCH);
+            final String r = get(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key);
             try {
                 this.indexcontentmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
             } catch (final PatternSyntaxException e) { this.indexcontentmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
@@ -486,7 +510,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
      */
     public Pattern indexContentMustNotMatchPattern() {
         if (this.indexcontentmustnotmatch == null) {
-            final String r = get(INDEXING_CONTENT_MUSTNOTMATCH);
+            final String r = get(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key);
             try {
                 this.indexcontentmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
             } catch (final PatternSyntaxException e) { this.indexcontentmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
@@ -500,7 +524,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
      * @return depth of crawl job
      */
     public int depth() {
-        final String r = get(DEPTH);
+        final String r = get(CrawlAttribute.DEPTH.key);
         if (r == null) return 0;
         try {
             return Integer.parseInt(r);
@@ -511,13 +535,13 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
     }
 
     public boolean directDocByURL() {
-        final String r = get(DIRECT_DOC_BY_URL);
+        final String r = get(CrawlAttribute.DIRECT_DOC_BY_URL.key);
         if (r == null) return false;
         return (r.equals(Boolean.TRUE.toString()));
     }
 
     public CacheStrategy cacheStrategy() {
-        final String r = get(CACHE_STRAGEGY);
+        final String r = get(CrawlAttribute.CACHE_STRAGEGY.key);
         if (r == null) return CacheStrategy.IFEXIST;
         try {
             return CacheStrategy.decode(Integer.parseInt(r));
@@ -528,7 +552,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
     }
 
     public void setCacheStrategy(final CacheStrategy newStrategy) {
-        put(CACHE_STRAGEGY, newStrategy.toString());
+        put(CrawlAttribute.CACHE_STRAGEGY.key, newStrategy.toString());
     }
 
     /**
@@ -538,7 +562,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
     public long recrawlIfOlder() {
         // returns a long (millis) that is the minimum age that
         // an entry must have to be re-crawled
-        final String r = get(RECRAWL_IF_OLDER);
+        final String r = get(CrawlAttribute.RECRAWL_IF_OLDER.key);
         if (r == null) return 0L;
         try {
             final long l = Long.parseLong(r);
@@ -552,7 +576,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
     public int domMaxPages() {
         // this is the maximum number of pages that are crawled for a single domain
         // if -1, this means no limit
-        final String r = get(DOM_MAX_PAGES);
+        final String r = get(CrawlAttribute.DOM_MAX_PAGES.key);
         if (r == null) return Integer.MAX_VALUE;
         try {
             final int i = Integer.parseInt(r);
@@ -565,55 +589,55 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
     }
 
     public boolean crawlingQ() {
-        final String r = get(CRAWLING_Q);
+        final String r = get(CrawlAttribute.CRAWLING_Q.key);
         if (r == null) return false;
         return (r.equals(Boolean.TRUE.toString()));
     }
 
     public boolean followFrames() {
-        final String r = get(FOLLOW_FRAMES);
+        final String r = get(CrawlAttribute.FOLLOW_FRAMES.key);
         if (r == null) return false;
         return (r.equals(Boolean.TRUE.toString()));
     }
 
     public boolean obeyHtmlRobotsNoindex() {
-        final String r = get(OBEY_HTML_ROBOTS_NOINDEX);
+        final String r = get(CrawlAttribute.OBEY_HTML_ROBOTS_NOINDEX.key);
         if (r == null) return false;
         return (r.equals(Boolean.TRUE.toString()));
     }
 
     public boolean obeyHtmlRobotsNofollow() {
-        final String r = get(OBEY_HTML_ROBOTS_NOFOLLOW);
+        final String r = get(CrawlAttribute.OBEY_HTML_ROBOTS_NOFOLLOW.key);
         if (r == null) return false;
         return (r.equals(Boolean.TRUE.toString()));
     }
 
     public boolean indexText() {
-        final String r = get(INDEX_TEXT);
+        final String r = get(CrawlAttribute.INDEX_TEXT.key);
         if (r == null) return true;
         return (r.equals(Boolean.TRUE.toString()));
     }
 
     public boolean indexMedia() {
-        final String r = get(INDEX_MEDIA);
+        final String r = get(CrawlAttribute.INDEX_MEDIA.key);
         if (r == null) return true;
         return (r.equals(Boolean.TRUE.toString()));
     }
 
     public boolean storeHTCache() {
-        final String r = get(STORE_HTCACHE);
+        final String r = get(CrawlAttribute.STORE_HTCACHE.key);
         if (r == null) return false;
         return (r.equals(Boolean.TRUE.toString()));
     }
 
     public boolean remoteIndexing() {
-        final String r = get(REMOTE_INDEXING);
+        final String r = get(CrawlAttribute.REMOTE_INDEXING.key);
         if (r == null) return false;
         return (r.equals(Boolean.TRUE.toString()));
     }
 
     public int snapshotMaxdepth() {
-        final String r = get(SNAPSHOTS_MAXDEPTH);
+        final String r = get(CrawlAttribute.SNAPSHOTS_MAXDEPTH.key);
         if (r == null) return -1;
         try {
             final int i = Integer.parseInt(r);
@@ -626,20 +650,20 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
     }
 
     public boolean snapshotLoadImage() {
-        final String r = get(SNAPSHOTS_LOADIMAGE);
+        final String r = get(CrawlAttribute.SNAPSHOTS_LOADIMAGE.key);
         if (r == null) return false;
         return (r.equals(Boolean.TRUE.toString()));
     }
 
     public boolean snapshotReplaceold() {
-        final String r = get(SNAPSHOTS_REPLACEOLD);
+        final String r = get(CrawlAttribute.SNAPSHOTS_REPLACEOLD.key);
         if (r == null) return false;
         return (r.equals(Boolean.TRUE.toString()));
     }
 
     public Pattern snapshotsMustnotmatch() {
         if (this.snapshotsMustnotmatch == null) {
-            final String r = get(SNAPSHOTS_MUSTNOTMATCH);
+            final String r = get(CrawlAttribute.SNAPSHOTS_MUSTNOTMATCH.key);
             try {
                 this.snapshotsMustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
             } catch (final PatternSyntaxException e) { this.snapshotsMustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
@@ -648,7 +672,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
     }
 
     public int timezoneOffset() {
-        final String timezoneOffset = get(TIMEZONEOFFSET);
+        final String timezoneOffset = get(CrawlAttribute.TIMEZONEOFFSET.key);
         if (timezoneOffset == null) return 0;
         try {
             return Integer.parseInt(timezoneOffset);
@@ -717,8 +741,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_handle", this.handle());
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_name", this.name());
         //prop.putXML(CRAWL_PROFILE_PREFIX + count + "_collection", this.get(COLLECTIONS)); // TODO: remove, replace with 'collections'
-        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_collections", this.get(COLLECTIONS));
-        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_agentName", this.get(AGENT_NAME));
+        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_collections", this.get(CrawlAttribute.COLLECTIONS.key));
+        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_agentName", this.get(CrawlAttribute.AGENT_NAME.key));
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_userAgent", this.getAgent().userAgent);
         prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", this.depth());
         prop.put(CRAWL_PROFILE_PREFIX + count + "_directDocByURL", this.directDocByURL() ? 1 : 0);
@@ -734,17 +758,17 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
         //prop.put(CRAWL_PROFILE_PREFIX + count + "_storeCache", this.storeHTCache() ? 1 : 0); // TODO: remove, replace with 'storeHTCache'
         prop.put(CRAWL_PROFILE_PREFIX + count + "_storeHTCache", this.storeHTCache() ? 1 : 0);
         prop.put(CRAWL_PROFILE_PREFIX + count + "_remoteIndexing", this.remoteIndexing() ? 1 : 0);
-        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_cacheStrategy", this.get(CACHE_STRAGEGY));
-        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustMatch", this.get(CRAWLER_URL_MUSTMATCH));
-        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustNotMatch", this.get(CRAWLER_URL_MUSTNOTMATCH));
-        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustMatch", this.get(CRAWLER_IP_MUSTMATCH));
-        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustNotMatch", this.get(CRAWLER_IP_MUSTNOTMATCH));
-        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerCountryMustMatch", this.get(CRAWLER_COUNTRY_MUSTMATCH));
-        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerNoLimitURLMustMatch", this.get(CRAWLER_URL_NODEPTHLIMITMATCH));
-        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustMatch", this.get(INDEXING_URL_MUSTMATCH));
-        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustNotMatch", this.get(INDEXING_URL_MUSTNOTMATCH));
-        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustMatch", this.get(INDEXING_CONTENT_MUSTMATCH));
-        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(INDEXING_CONTENT_MUSTNOTMATCH));
+        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_cacheStrategy", this.get(CrawlAttribute.CACHE_STRAGEGY.key));
+        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key));
+        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustNotMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key));
+        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustMatch", this.get(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key));
+        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustNotMatch", this.get(CrawlAttribute.CRAWLER_IP_MUSTNOTMATCH.key));
+        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerCountryMustMatch", this.get(CrawlAttribute.CRAWLER_COUNTRY_MUSTMATCH.key));
+        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerNoLimitURLMustMatch", this.get(CrawlAttribute.CRAWLER_URL_NODEPTHLIMITMATCH.key));
+        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustMatch", this.get(CrawlAttribute.INDEXING_URL_MUSTMATCH.key));
+        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustNotMatch", this.get(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key));
+        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key));
+        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key));
         //prop.putXML(CRAWL_PROFILE_PREFIX + count + "_mustmatch", this.urlMustMatchPattern().toString()); // TODO: remove, replace with crawlerURLMustMatch
         //prop.putXML(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", this.urlMustNotMatchPattern().toString()); // TODO: remove, replace with crawlerURLMustNotMatch
         //prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (this.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(this.recrawlIfOlder())); // TODO: remove, replace with recrawlIfOlder

@@ -2249,31 +2249,31 @@ public final class Switchboard extends serverSwitch {
         }
         boolean insert = false;
         if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) ) {
-            selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE).getTime()));
+            selentry.put(CrawlProfile.CrawlAttribute.RECRAWL_IF_OLDER.key, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE).getTime()));
             insert = true;
         }
         if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ) {
-            selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE).getTime()));
+            selentry.put(CrawlProfile.CrawlAttribute.RECRAWL_IF_OLDER.key, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE).getTime()));
             insert = true;
         }
         if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ) {
-            selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE).getTime()));
+            selentry.put(CrawlProfile.CrawlAttribute.RECRAWL_IF_OLDER.key, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE).getTime()));
             insert = true;
         }
         if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT) ) {
-            selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE).getTime()));
+            selentry.put(CrawlProfile.CrawlAttribute.RECRAWL_IF_OLDER.key, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE).getTime()));
             insert = true;
         }
         if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ) {
-            selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE).getTime()));
+            selentry.put(CrawlProfile.CrawlAttribute.RECRAWL_IF_OLDER.key, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE).getTime()));
             insert = true;
         }
         if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ) {
-            selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE).getTime()));
+            selentry.put(CrawlProfile.CrawlAttribute.RECRAWL_IF_OLDER.key, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE).getTime()));
             insert = true;
         }
         if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE) ) {
-            selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE).getTime()));
+            selentry.put(CrawlProfile.CrawlAttribute.RECRAWL_IF_OLDER.key, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE).getTime()));
             insert = true;
         }
         if ( insert ) {
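Each branch in the Switchboard hunk stores the same thing: a recrawl reference date, encoded as epoch milliseconds in string form, under the RECRAWL_IF_OLDER key. A self-contained sketch of that round trip, assuming getRecrawlDate computes "now minus a cycle given in minutes" (its apparent shape from this usage; the cycle value below is invented):

import java.util.Date;

public class RecrawlSketch {
    // assumed shape of CrawlProfile.getRecrawlDate(cycleInMinutes)
    static Date getRecrawlDate(final long oldTimeMinutes) {
        return new Date(System.currentTimeMillis() - 60000L * oldTimeMinutes);
    }

    public static void main(final String[] args) {
        final long cycleMinutes = 60L * 24L; // hypothetical one-day recrawl cycle
        // write: the value Switchboard stores into the profile map
        final String stored = Long.toString(getRecrawlDate(cycleMinutes).getTime());
        // read: the value recrawlIfOlder() parses back out
        final long millis = Long.parseLong(stored);
        System.out.println(new Date(millis) + " -> entries older than this are re-crawled");
    }
}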
