implemented crawl restrictions for IP pattern and country lists

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7980 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent e207c41c8e
commit b250e6466d

@ -9,7 +9,7 @@
// $LastChangedBy$ // $LastChangedBy$
// //
// LICENSE // LICENSE
// //
// This program is free software; you can redistribute it and/or modify // This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by // it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or // the Free Software Foundation; either version 2 of the License, or
@ -38,10 +38,9 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlStacker; import de.anomic.crawler.CrawlStacker;
import de.anomic.crawler.CrawlSwitchboard; import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.crawler.CrawlProfile;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
import de.anomic.server.servletProperties; import de.anomic.server.servletProperties;
@ -63,7 +62,7 @@ public class CrawlProfileEditor_p {
ignoreNames.add(CrawlSwitchboard.DBFILE_ACTIVE_CRAWL_PROFILES); ignoreNames.add(CrawlSwitchboard.DBFILE_ACTIVE_CRAWL_PROFILES);
ignoreNames.add(CrawlSwitchboard.DBFILE_PASSIVE_CRAWL_PROFILES); ignoreNames.add(CrawlSwitchboard.DBFILE_PASSIVE_CRAWL_PROFILES);
} }
public static class eentry { public static class eentry {
public static final int BOOLEAN = 0; public static final int BOOLEAN = 0;
public static final int INTEGER = 1; public static final int INTEGER = 1;
@ -73,7 +72,7 @@ public class CrawlProfileEditor_p {
public final String label; public final String label;
public final boolean readonly; public final boolean readonly;
public final int type; public final int type;
public eentry(final String name, final String label, final boolean readonly, final int type) { public eentry(final String name, final String label, final boolean readonly, final int type) {
this.name = name; this.name = name;
this.label = label; this.label = label;
@ -81,7 +80,7 @@ public class CrawlProfileEditor_p {
this.type = type; this.type = type;
} }
} }
private static final List <eentry> labels = new ArrayList<eentry>(); private static final List <eentry> labels = new ArrayList<eentry>();
static { static {
labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING)); labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING));
@ -100,14 +99,14 @@ public class CrawlProfileEditor_p {
labels.add(new eentry(CrawlProfile.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.XPSTOPW, "Parent stop-words", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.XPSTOPW, "Parent stop-words", false, eentry.BOOLEAN));
} }
public static serverObjects respond( public static serverObjects respond(
final RequestHeader header, final RequestHeader header,
final serverObjects post, final serverObjects post,
final serverSwitch env) { final serverSwitch env) {
final servletProperties prop = new servletProperties(); final servletProperties prop = new servletProperties();
final Switchboard sb = (Switchboard)env; final Switchboard sb = (Switchboard)env;
// read post for handle // read post for handle
final String handle = (post == null) ? "" : post.get("handle", ""); final String handle = (post == null) ? "" : post.get("handle", "");
if (post != null) { if (post != null) {
@ -117,8 +116,8 @@ public class CrawlProfileEditor_p {
if (p != null) sb.crawler.putPassive(handle.getBytes(), p); if (p != null) sb.crawler.putPassive(handle.getBytes(), p);
// delete all entries from the crawl queue that are deleted here // delete all entries from the crawl queue that are deleted here
sb.crawler.removeActive(handle.getBytes()); sb.crawler.removeActive(handle.getBytes());
sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000); sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000);
} catch (RowSpaceExceededException e) { } catch (final RowSpaceExceededException e) {
Log.logException(e); Log.logException(e);
} }
if (post.containsKey("delete")) { if (post.containsKey("delete")) {
@ -131,7 +130,7 @@ public class CrawlProfileEditor_p {
} }
} }
} }
// generate handle list: first sort by handle name // generate handle list: first sort by handle name
CrawlProfile selentry; CrawlProfile selentry;
final Map<String, String> orderdHandles = new TreeMap<String, String>(); final Map<String, String> orderdHandles = new TreeMap<String, String>();
@ -141,7 +140,7 @@ public class CrawlProfileEditor_p {
orderdHandles.put(selentry.name(), selentry.handle()); orderdHandles.put(selentry.name(), selentry.handle());
} }
} }
// then write into pop-up menu list // then write into pop-up menu list
int count = 0; int count = 0;
for (final Map.Entry<String, String> NameHandle: orderdHandles.entrySet()) { for (final Map.Entry<String, String> NameHandle: orderdHandles.entrySet()) {
@ -159,8 +158,8 @@ public class CrawlProfileEditor_p {
if ((post != null) && (selentry != null)) { if ((post != null) && (selentry != null)) {
if (post.containsKey("submit")) { if (post.containsKey("submit")) {
try { try {
Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTMATCH, CrawlProfile.MATCH_ALL)); Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTMATCH, CrawlProfile.MATCH_ALL_STRING));
Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH, CrawlProfile.MATCH_NEVER)); Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH, CrawlProfile.MATCH_NEVER_STRING));
final Iterator<eentry> lit = labels.iterator(); final Iterator<eentry> lit = labels.iterator();
eentry tee; eentry tee;
while (lit.hasNext()) { while (lit.hasNext()) {
@ -179,7 +178,7 @@ public class CrawlProfileEditor_p {
} }
} }
} }
// generate crawl profile table // generate crawl profile table
count = 0; count = 0;
boolean dark = true; boolean dark = true;
@ -231,10 +230,10 @@ public class CrawlProfileEditor_p {
} }
prop.put("edit_entries", count); prop.put("edit_entries", count);
} }
return prop; return prop;
} }
private static void putProfileEntry( private static void putProfileEntry(
final servletProperties prop, final servletProperties prop,
final CrawlStacker crawlStacker, final CrawlStacker crawlStacker,
@ -253,8 +252,8 @@ public class CrawlProfileEditor_p {
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_startURL", profile.startURL()); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_startURL", profile.startURL());
prop.put(CRAWL_PROFILE_PREFIX + count + "_handle", profile.handle()); prop.put(CRAWL_PROFILE_PREFIX + count + "_handle", profile.handle());
prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", profile.depth()); prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", profile.depth());
prop.put(CRAWL_PROFILE_PREFIX + count + "_mustmatch", profile.mustMatchPattern().toString()); prop.put(CRAWL_PROFILE_PREFIX + count + "_mustmatch", profile.urlMustMatchPattern().toString());
prop.put(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", profile.mustNotMatchPattern().toString()); prop.put(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", profile.urlMustNotMatchPattern().toString());
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(profile.recrawlIfOlder())); prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(profile.recrawlIfOlder()));
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", "inactive"); prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", "inactive");
@ -270,7 +269,7 @@ public class CrawlProfileEditor_p {
i++; i++;
} }
} }
prop.put(CRAWL_PROFILE_PREFIX+count+"_crawlingDomFilterContent", i); prop.put(CRAWL_PROFILE_PREFIX+count+"_crawlingDomFilterContent", i);
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : Integer.toString(profile.domMaxPages())); prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : Integer.toString(profile.domMaxPages()));

@ -185,7 +185,7 @@
<td><label for="crawlingCountryMustMatch">Must-Match List for Country Codes</label>:</td> <td><label for="crawlingCountryMustMatch">Must-Match List for Country Codes</label>:</td>
<td> <td>
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="true" />Use filter&nbsp;&nbsp; <input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="true" />Use filter&nbsp;&nbsp;
<input name="crawlingCountryMustMatch" id="crawlingCountryMustMatch" type="text" size="60" maxlength="100" value="#[crawlingCountryMustMatch]#" /> <input name="countryMustMatchList" id="countryMustMatchList" type="text" size="60" maxlength="100" value="#[countryMustMatch]#" /><br />
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="false" checked="checked" />no country code restriction <input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="false" checked="checked" />no country code restriction
</td> </td>
<td> <td>

@ -42,11 +42,11 @@ public class CrawlStartExpert_p {
prop.put("starturl", /*(intranet) ? repository :*/ "http://"); prop.put("starturl", /*(intranet) ? repository :*/ "http://");
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0")); prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0))); prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0)));
prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL); prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL_STRING);
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER); prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL)); prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL_STRING));
prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER)); prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER_STRING));
prop.put("crawlingCountryMustMatch", sb.getConfig("crawlingCountryMustMatch", "")); prop.put("countryMustMatch", sb.getConfig("crawlingCountryMustMatch", ""));
prop.put("crawlingIfOlderCheck", "0"); prop.put("crawlingIfOlderCheck", "0");
prop.put("crawlingIfOlderUnitYearCheck", "0"); prop.put("crawlingIfOlderUnitYearCheck", "0");

@ -153,12 +153,12 @@ public class Crawler_p {
final boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start final boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start
// set the crawl filter // set the crawl filter
String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL); String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING);
final String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER); final String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // avoid that all urls are filtered out if bad value was submitted
String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL); String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING);
final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER); final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
if (ipMustMatch.length() < 2) ipMustMatch = CrawlProfile.MATCH_ALL; if (ipMustMatch.length() < 2) ipMustMatch = CrawlProfile.MATCH_ALL_STRING;
final String countryMustMatch = post.getBoolean("countryMustMatchSwitch", false) ? post.get("countryMustMatchList", "") : ""; final String countryMustMatch = post.getBoolean("countryMustMatchSwitch", false) ? post.get("countryMustMatchList", "") : "";
sb.setConfig("crawlingIPMustMatch", ipMustMatch); sb.setConfig("crawlingIPMustMatch", ipMustMatch);
sb.setConfig("crawlingIPMustNotMatch", ipMustNotMatch); sb.setConfig("crawlingIPMustNotMatch", ipMustNotMatch);
@ -439,7 +439,7 @@ public class Crawler_p {
crawlingFileName, crawlingFileName,
crawlURL, crawlURL,
newcrawlingMustMatch, newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_NEVER_STRING,
ipMustMatch, ipMustMatch,
ipMustNotMatch, ipMustNotMatch,
countryMustMatch, countryMustMatch,
@ -478,8 +478,8 @@ public class Crawler_p {
final CrawlProfile pe = new CrawlProfile( final CrawlProfile pe = new CrawlProfile(
sitemapURLStr, sitemapURLStr,
sitemapURL, sitemapURL,
CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_NEVER_STRING,
ipMustMatch, ipMustMatch,
ipMustNotMatch, ipMustNotMatch,
countryMustMatch, countryMustMatch,
@ -523,7 +523,7 @@ public class Crawler_p {
sitelistURL.getHost(), sitelistURL.getHost(),
sitelistURL, sitelistURL,
newcrawlingMustMatch, newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_NEVER_STRING,
ipMustMatch, ipMustMatch,
ipMustNotMatch, ipMustNotMatch,
countryMustMatch, countryMustMatch,

@ -108,8 +108,8 @@ public class QuickCrawlLink_p {
final String title = post.get("title",null); final String title = post.get("title",null);
// get other parameters if set // get other parameters if set
final String crawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL); final String crawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING);
final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER); final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
final int CrawlingDepth = post.getInt("crawlingDepth", 0); final int CrawlingDepth = post.getInt("crawlingDepth", 0);
final boolean crawlDynamic = post.get("crawlingQ", "").equals("on"); final boolean crawlDynamic = post.get("crawlingQ", "").equals("on");
final boolean indexText = post.get("indexText", "on").equals("on"); final boolean indexText = post.get("indexText", "on").equals("on");
@ -149,8 +149,8 @@ public class QuickCrawlLink_p {
crawlingStartURL.getHost(), crawlingStartURL.getHost(),
crawlingStartURL, crawlingStartURL,
crawlingMustMatch, crawlingMustMatch,
CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_NEVER_STRING,
"", "",
crawlingMustNotMatch, crawlingMustNotMatch,
CrawlingDepth, CrawlingDepth,

@ -41,8 +41,10 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
private static final long serialVersionUID = 5527325718810703504L; private static final long serialVersionUID = 5527325718810703504L;
public static final String MATCH_ALL = ".*"; public static final String MATCH_ALL_STRING = ".*";
public static final String MATCH_NEVER = ""; public static final String MATCH_NEVER_STRING = "";
public static final Pattern MATCH_ALL_PATTERN = Pattern.compile(MATCH_ALL_STRING);
public static final Pattern MATCH_NEVER_PATTERN = Pattern.compile(MATCH_NEVER_STRING);
// this is a simple record structure that hold all properties of a single crawl start // this is a simple record structure that hold all properties of a single crawl start
public static final String HANDLE = "handle"; public static final String HANDLE = "handle";
@ -67,7 +69,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String FILTER_IP_MUSTNOTMATCH = "crawlingIPMustNotMatch"; public static final String FILTER_IP_MUSTNOTMATCH = "crawlingIPMustNotMatch";
public static final String FILTER_COUNTRY_MUSTMATCH = "crawlingCountryMustMatch"; public static final String FILTER_COUNTRY_MUSTMATCH = "crawlingCountryMustMatch";
private Pattern mustmatch = null, mustnotmatch = null; private Pattern urlmustmatch = null, urlmustnotmatch = null, ipmustmatch = null, ipmustnotmatch = null;
/** /**
* Constructor which creates CrawlPofile from parameters. * Constructor which creates CrawlPofile from parameters.
@ -119,10 +121,10 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(HANDLE, handle); put(HANDLE, handle);
put(NAME, name); put(NAME, name);
put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false)); put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false));
put(FILTER_URL_MUSTMATCH, (urlMustMatch == null) ? CrawlProfile.MATCH_ALL : urlMustMatch); put(FILTER_URL_MUSTMATCH, (urlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : urlMustMatch);
put(FILTER_URL_MUSTNOTMATCH, (urlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER : urlMustNotMatch); put(FILTER_URL_MUSTNOTMATCH, (urlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : urlMustNotMatch);
put(FILTER_IP_MUSTMATCH, (ipMustMatch == null) ? CrawlProfile.MATCH_ALL : ipMustMatch); put(FILTER_IP_MUSTMATCH, (ipMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : ipMustMatch);
put(FILTER_IP_MUSTNOTMATCH, (ipMustNotMatch == null) ? CrawlProfile.MATCH_NEVER : ipMustNotMatch); put(FILTER_IP_MUSTNOTMATCH, (ipMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : ipMustNotMatch);
put(FILTER_COUNTRY_MUSTMATCH, (countryMustMatch == null) ? "" : countryMustMatch); put(FILTER_COUNTRY_MUSTMATCH, (countryMustMatch == null) ? "" : countryMustMatch);
put(DEPTH, depth); put(DEPTH, depth);
put(RECRAWL_IF_OLDER, recrawlIfOlder); put(RECRAWL_IF_OLDER, recrawlIfOlder);
@ -207,26 +209,77 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
* Gets the regex which must be matched by URLs in order to be crawled. * Gets the regex which must be matched by URLs in order to be crawled.
* @return regex which must be matched * @return regex which must be matched
*/ */
public Pattern mustMatchPattern() { public Pattern urlMustMatchPattern() {
if (this.mustmatch == null) { if (this.urlmustmatch == null) {
String r = get(FILTER_URL_MUSTMATCH); final String r = get(FILTER_URL_MUSTMATCH);
if (r == null) r = CrawlProfile.MATCH_ALL; if (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) {
this.mustmatch = Pattern.compile(r); this.urlmustmatch = CrawlProfile.MATCH_ALL_PATTERN;
} else {
this.urlmustmatch = Pattern.compile(r);
}
} }
return this.mustmatch; return this.urlmustmatch;
} }
/** /**
* Gets the regex which must not be matched by URLs in order to be crawled. * Gets the regex which must not be matched by URLs in order to be crawled.
* @return regex which must not be matched * @return regex which must not be matched
*/ */
public Pattern mustNotMatchPattern() { public Pattern urlMustNotMatchPattern() {
if (this.mustnotmatch == null) { if (this.urlmustnotmatch == null) {
String r = get(FILTER_URL_MUSTNOTMATCH); final String r = get(FILTER_URL_MUSTNOTMATCH);
if (r == null) r = CrawlProfile.MATCH_NEVER; if (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) {
this.mustnotmatch = Pattern.compile(r); this.urlmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN;
} else {
this.urlmustnotmatch = Pattern.compile(r);
}
} }
return this.mustnotmatch; return this.urlmustnotmatch;
}
/**
 * Gets the regex which must be matched by the IP of the URL's host in order
 * for the URL to be crawled. The compiled pattern is cached lazily on this
 * profile instance; when the stored filter is absent or the catch-all ".*",
 * the shared MATCH_ALL_PATTERN constant is reused so callers can detect the
 * default filter cheaply with an identity comparison (==).
 * @return compiled must-match IP pattern (never null)
 */
public Pattern ipMustMatchPattern() {
if (this.ipmustmatch == null) {
// lazy init: compile once, then cache for subsequent calls
final String r = get(FILTER_IP_MUSTMATCH);
if (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) {
// reuse the shared default so "filter is the default" can be tested with ==
this.ipmustmatch = CrawlProfile.MATCH_ALL_PATTERN;
} else {
this.ipmustmatch = Pattern.compile(r);
}
}
return this.ipmustmatch;
}
/**
 * Gets the regex which must NOT be matched by the IP of the URL's host in
 * order for the URL to be crawled. The compiled pattern is cached lazily on
 * this profile instance; when the stored filter is absent or the match-never
 * empty string, the shared MATCH_NEVER_PATTERN constant is reused so callers
 * can detect the default filter cheaply with an identity comparison (==).
 * @return compiled must-not-match IP pattern (never null)
 */
public Pattern ipMustNotMatchPattern() {
if (this.ipmustnotmatch == null) {
// lazy init: compile once, then cache for subsequent calls
final String r = get(FILTER_IP_MUSTNOTMATCH);
if (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) {
// reuse the shared default so "filter is the default" can be tested with ==
this.ipmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN;
} else {
this.ipmustnotmatch = Pattern.compile(r);
}
}
return this.ipmustnotmatch;
}
/**
* get the list of countries that must match for the locations of the URLs IPs
* @return a list of country codes
*/
public String[] countryMustMatchList() {
String countryMustMatch = get(FILTER_COUNTRY_MUSTMATCH);
if (countryMustMatch == null) countryMustMatch = "";
if (countryMustMatch.length() == 0) return new String[0];
String[] list = countryMustMatch.split(",");
if (list.length == 1 && list.length == 0) list = new String[0];
return list;
} }
/** /**

@ -299,8 +299,8 @@ public class CrawlQueues {
+ ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false")
+ ", depth=" + urlEntry.depth() + ", depth=" + urlEntry.depth()
+ ", crawlDepth=" + profile.depth() + ", crawlDepth=" + profile.depth()
+ ", must-match=" + profile.mustMatchPattern().toString() + ", must-match=" + profile.urlMustMatchPattern().toString()
+ ", must-not-match=" + profile.mustNotMatchPattern().toString() + ", must-not-match=" + profile.urlMustNotMatchPattern().toString()
+ ", permission=" + ((this.sb.peers == null) ? "undefined" : (((this.sb.peers.mySeed().isSenior()) || (this.sb.peers.mySeed().isPrincipal())) ? "true" : "false"))); + ", permission=" + ((this.sb.peers == null) ? "undefined" : (((this.sb.peers.mySeed().isSenior()) || (this.sb.peers.mySeed().isPrincipal())) ? "true" : "false")));
// work off one Crawl stack entry // work off one Crawl stack entry

@ -34,6 +34,7 @@ import java.net.MalformedURLException;
import java.net.UnknownHostException; import java.net.UnknownHostException;
import java.util.Date; import java.util.Date;
import java.util.Iterator; import java.util.Iterator;
import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.Properties; import java.util.Properties;
import java.util.concurrent.BlockingQueue; import java.util.concurrent.BlockingQueue;
@ -438,8 +439,9 @@ public final class CrawlStacker {
// check if the protocol is supported // check if the protocol is supported
final String urlProtocol = url.getProtocol(); final String urlProtocol = url.getProtocol();
final String urlstring = url.toString();
if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) { if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
this.log.logSevere("Unsupported protocol in URL '" + url.toString() + "'."); this.log.logSevere("Unsupported protocol in URL '" + urlstring + "'.");
return "unsupported protocol"; return "unsupported protocol";
} }
@ -452,31 +454,31 @@ public final class CrawlStacker {
// check blacklist // check blacklist
if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, url)) { if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, url)) {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is in blacklist."); if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' is in blacklist.");
return "url in blacklist"; return "url in blacklist";
} }
// filter with must-match // filter with must-match for URLs
if ((depth > 0) && !profile.mustMatchPattern().matcher(url.toString()).matches()) { if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' does not match must-match crawling filter '" + profile.mustMatchPattern().toString() + "'."); if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
return "url does not match must-match filter"; return "url does not match must-match filter";
} }
// filter with must-not-match // filter with must-not-match for URLs
if ((depth > 0) && profile.mustNotMatchPattern().matcher(url.toString()).matches()) { if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' does matches do-not-match crawling filter '" + profile.mustNotMatchPattern().toString() + "'."); if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'.");
return "url matches must-not-match filter"; return "url matches must-not-match filter";
} }
// deny cgi // deny cgi
if (url.isIndividual() && !(profile.crawlingQ())) { // TODO: make special property for crawlingIndividual if (url.isIndividual() && !(profile.crawlingQ())) { // TODO: make special property for crawlingIndividual
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is CGI URL."); if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' is CGI URL.");
return "individual url (sessionid etc) not wanted"; return "individual url (sessionid etc) not wanted";
} }
// deny post properties // deny post properties
if (url.isPOST() && !(profile.crawlingQ())) { if (url.isPOST() && !(profile.crawlingQ())) {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is post URL."); if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' is post URL.");
return "post url not allowed"; return "post url not allowed";
} }
@ -486,7 +488,7 @@ public final class CrawlStacker {
if (oldEntry == null) { if (oldEntry == null) {
if (dbocc != null) { if (dbocc != null) {
// do double-check // do double-check
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is double registered in '" + dbocc + "'."); if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' is double registered in '" + dbocc + "'.");
if (dbocc.equals("errors")) { if (dbocc.equals("errors")) {
final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash()); final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
return "double in: errors (" + errorEntry.anycause() + ")"; return "double in: errors (" + errorEntry.anycause() + ")";
@ -498,13 +500,13 @@ public final class CrawlStacker {
final boolean recrawl = profile.recrawlIfOlder() > oldEntry.loaddate().getTime(); final boolean recrawl = profile.recrawlIfOlder() > oldEntry.loaddate().getTime();
if (recrawl) { if (recrawl) {
if (this.log.isInfo()) if (this.log.isInfo())
this.log.logInfo("RE-CRAWL of URL '" + url.toString() + "': this url was crawled " + this.log.logInfo("RE-CRAWL of URL '" + urlstring + "': this url was crawled " +
((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago."); ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago.");
} else { } else {
if (dbocc == null) { if (dbocc == null) {
return "double in: LURL-DB"; return "double in: LURL-DB";
} else { } else {
if (this.log.isInfo()) this.log.logInfo("URL '" + url.toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time:"); if (this.log.isInfo()) this.log.logInfo("URL '" + urlstring + "' is double registered in '" + dbocc + "'. " + "Stack processing time:");
if (dbocc.equals("errors")) { if (dbocc.equals("errors")) {
final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash()); final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
return "double in: errors (" + errorEntry.anycause() + ")"; return "double in: errors (" + errorEntry.anycause() + ")";
@ -520,16 +522,51 @@ public final class CrawlStacker {
if (maxAllowedPagesPerDomain < Integer.MAX_VALUE) { if (maxAllowedPagesPerDomain < Integer.MAX_VALUE) {
final DomProfile dp = this.doms.get(url.getHost()); final DomProfile dp = this.doms.get(url.getHost());
if (dp != null && dp.count >= maxAllowedPagesPerDomain) { if (dp != null && dp.count >= maxAllowedPagesPerDomain) {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often in crawl stack, a maximum of " + profile.domMaxPages() + " is allowed."); if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' appeared too often in crawl stack, a maximum of " + profile.domMaxPages() + " is allowed.");
return "crawl stack domain counter exceeded"; return "crawl stack domain counter exceeded";
} }
if (ResultURLs.domainCount(EventOrigin.LOCAL_CRAWLING, url.getHost()) >= profile.domMaxPages()) { if (ResultURLs.domainCount(EventOrigin.LOCAL_CRAWLING, url.getHost()) >= profile.domMaxPages()) {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often in result stack, a maximum of " + profile.domMaxPages() + " is allowed."); if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' appeared too often in result stack, a maximum of " + profile.domMaxPages() + " is allowed.");
return "result stack domain counter exceeded"; return "result stack domain counter exceeded";
} }
} }
// The following filters require a DNS lookup to obtain the IP of the URL's host.
// This is expensive, so these filters are checked after all of the cheaper tests above.

// filter with must-match for IPs
// NOTE(review): url.getInetAddress() may be null if DNS resolution fails — TODO confirm and guard
if ((depth > 0) && profile.ipMustMatchPattern() != CrawlProfile.MATCH_ALL_PATTERN && !profile.ipMustMatchPattern().matcher(url.getInetAddress().getHostAddress()).matches()) {
    if (this.log.isFine()) this.log.logFine("IP " + url.getInetAddress().getHostAddress() + " of URL '" + urlstring + "' does not match must-match crawling filter '" + profile.ipMustMatchPattern().toString() + "'.");
    return "ip " + url.getInetAddress().getHostAddress() + " of url does not match must-match filter";
}

// filter with must-not-match for IPs
// bug fix: the default-pattern guard and the log message used ipMustMatchPattern()
// although this branch tests the must-NOT-match filter, so a configured
// must-not-match filter was skipped whenever the must-match filter was the default
if ((depth > 0) && profile.ipMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.ipMustNotMatchPattern().matcher(url.getInetAddress().getHostAddress()).matches()) {
    if (this.log.isFine()) this.log.logFine("IP " + url.getInetAddress().getHostAddress() + " of URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.ipMustNotMatchPattern().toString() + "'.");
    return "ip " + url.getInetAddress().getHostAddress() + " of url matches must-not-match filter";
}

// filter on the country code of the IP's location (must-match list of country codes)
final String[] countryMatchList = profile.countryMustMatchList();
if (depth > 0 && countryMatchList != null && countryMatchList.length > 0) {
    final Locale locale = url.getLocale();
    // if no locale can be determined for the host, the country filter is skipped
    if (locale != null) {
        final String c0 = locale.getCountry();
        boolean granted = false;
        for (final String c : countryMatchList) {
            if (c0.equals(c)) {
                granted = true;
                break;
            }
        }
        if (!granted) {
            // bug fix: the log message wrongly reported the IP must-match filter;
            // this rejection is caused by the country must-match list
            if (this.log.isFine()) this.log.logFine("country " + c0 + " of URL '" + urlstring + "' is not in the country must-match list.");
            return "country " + c0 + " of url does not match must-match filter for countries";
        }
    }
}
return null; return null;
} }

@ -229,8 +229,8 @@ public final class CrawlSwitchboard {
// generate new default entry for proxy crawling // generate new default entry for proxy crawling
this.defaultProxyProfile = new CrawlProfile( this.defaultProxyProfile = new CrawlProfile(
"proxy", null, "proxy", null,
CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING,
"", "",
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/, 0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false,
@ -243,38 +243,38 @@ public final class CrawlSwitchboard {
} }
if (this.defaultRemoteProfile == null) { if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling // generate new default entry for remote crawling
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", CrawlProfile.MATCH_NEVER, 0, this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", CrawlProfile.MATCH_NEVER_STRING, 0,
-1, -1, true, true, true, false, false, true, true, false, CacheStrategy.IFFRESH); -1, -1, true, true, true, false, false, true, true, false, CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultRemoteProfile.handle()), this.defaultRemoteProfile); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultRemoteProfile.handle()), this.defaultRemoteProfile);
} }
if (this.defaultTextSnippetLocalProfile == null) { if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0, this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST); CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), this.defaultTextSnippetLocalProfile); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), this.defaultTextSnippetLocalProfile);
} }
if (this.defaultTextSnippetGlobalProfile == null) { if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0, this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CacheStrategy.IFEXIST); CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile);
} }
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST); this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) { if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0, this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST); CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), this.defaultMediaSnippetLocalProfile); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), this.defaultMediaSnippetLocalProfile);
} }
if (this.defaultMediaSnippetGlobalProfile == null) { if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0, this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CacheStrategy.IFEXIST); CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), this.defaultMediaSnippetGlobalProfile); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), this.defaultMediaSnippetGlobalProfile);
} }
if (this.defaultSurrogateProfile == null) { if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing // generate new default entry for surrogate parsing
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0, this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CacheStrategy.NOCACHE); CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CacheStrategy.NOCACHE);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile);
} }

@ -88,6 +88,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
protected final String protocol, userInfo; protected final String protocol, userInfo;
protected String host, path, quest, ref; protected String host, path, quest, ref;
protected int port; protected int port;
private InetAddress hostAddress;
/** /**
* initialization of a MultiProtocolURI to produce poison pills for concurrent blocking queues * initialization of a MultiProtocolURI to produce poison pills for concurrent blocking queues
@ -95,6 +96,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
public MultiProtocolURI() { public MultiProtocolURI() {
this.protocol = null; this.protocol = null;
this.host = null; this.host = null;
this.hostAddress = null;
this.userInfo = null; this.userInfo = null;
this.path = null; this.path = null;
this.quest = null; this.quest = null;
@ -109,6 +111,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
protected MultiProtocolURI(final MultiProtocolURI url) { protected MultiProtocolURI(final MultiProtocolURI url) {
this.protocol = url.protocol; this.protocol = url.protocol;
this.host = url.host; this.host = url.host;
this.hostAddress = null;
this.userInfo = url.userInfo; this.userInfo = url.userInfo;
this.path = url.path; this.path = url.path;
this.quest = url.quest; this.quest = url.quest;
@ -119,6 +122,8 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
public MultiProtocolURI(String url) throws MalformedURLException { public MultiProtocolURI(String url) throws MalformedURLException {
if (url == null) throw new MalformedURLException("url string is null"); if (url == null) throw new MalformedURLException("url string is null");
this.hostAddress = null;
// identify protocol // identify protocol
assert (url != null); assert (url != null);
url = url.trim(); url = url.trim();
@ -688,6 +693,12 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
return this.host; return this.host;
} }
public InetAddress getInetAddress() {
if (this.hostAddress != null) return this.hostAddress;
this.hostAddress = Domains.dnsResolve(this.host.toLowerCase());
return this.hostAddress;
}
public int getPort() { public int getPort() {
return this.port; return this.port;
} }
@ -827,7 +838,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
} }
final String hl = getHost().toLowerCase(); final String hl = getHost().toLowerCase();
if (resolveHost) { if (resolveHost) {
final InetAddress r = Domains.dnsResolve(hl); final InetAddress r = getInetAddress();
u.append(r == null ? hl : r.getHostAddress()); u.append(r == null ? hl : r.getHostAddress());
} else { } else {
u.append(hl); u.append(hl);
@ -1119,8 +1130,11 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
return baos.toByteArray(); return baos.toByteArray();
} }
public Locale getLocale() { public Locale getLocale() {
if (this.hostAddress != null) {
final Locale locale = Domains.getLocale(this.hostAddress);
if (locale != null && locale.getCountry() != null && locale.getCountry().length() > 0) return locale;
}
return Domains.getLocale(this.host); return Domains.getLocale(this.host);
} }

@ -550,6 +550,11 @@ public class Domains {
cacheHit_Insert++; cacheHit_Insert++;
} }
/**
* resolve a host address using a local DNS cache and a DNS lookup if necessary
* @param host
* @return the hosts InetAddress or null if the address cannot be resolved
*/
public static InetAddress dnsResolve(String host) { public static InetAddress dnsResolve(String host) {
if ((host == null) || (host.length() == 0)) return null; if ((host == null) || (host.length() == 0)) return null;
host = host.toLowerCase().trim(); host = host.toLowerCase().trim();
@ -921,7 +926,7 @@ public class Domains {
public static Locale getLocale(final String host) { public static Locale getLocale(final String host) {
if (host == null) return null; if (host == null) return null;
final Locale locale = getLocale(dnsResolve(host)); final Locale locale = getLocale(dnsResolve(host));
if (locale != null) return locale; if (locale != null && locale.getCountry() != null && locale.getCountry().length() > 0) return locale;
final int p = host.lastIndexOf('.'); final int p = host.lastIndexOf('.');
if (p < 0) return null; if (p < 0) return null;
String tld = host.substring(p + 1).toUpperCase(); String tld = host.substring(p + 1).toUpperCase();

@ -11,12 +11,12 @@
* modify it under the terms of the GNU Lesser General Public * modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either * License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version. * version 2.1 of the License, or (at your option) any later version.
* *
* This library is distributed in the hope that it will be useful, * This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details. * Lesser General Public License for more details.
* *
* You should have received a copy of the GNU Lesser General Public License * You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt * along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>. * If not, see <http://www.gnu.org/licenses/>.
@ -53,22 +53,22 @@ public class Scanner extends Thread {
private static final Service POISONSERVICE = new Service(Protocol.http, null); private static final Service POISONSERVICE = new Service(Protocol.http, null);
private static final Object PRESENT = new Object(); private static final Object PRESENT = new Object();
public static enum Access {unknown, empty, granted, denied;} public static enum Access {unknown, empty, granted, denied;}
public static enum Protocol {http(80), https(443), ftp(21), smb(445); public static enum Protocol {http(80), https(443), ftp(21), smb(445);
public int port; public int port;
private Protocol(int port) {this.port = port;} private Protocol(final int port) {this.port = port;}
} }
public static class Service { public static class Service {
public Protocol protocol; public Protocol protocol;
public InetAddress inetAddress; public InetAddress inetAddress;
private String hostname; private String hostname;
public Service(Protocol protocol, InetAddress inetAddress) { public Service(final Protocol protocol, final InetAddress inetAddress) {
this.protocol = protocol; this.protocol = protocol;
this.inetAddress = inetAddress; this.inetAddress = inetAddress;
this.hostname = null; this.hostname = null;
} }
public Service(String protocol, InetAddress inetAddress) { public Service(final String protocol, final InetAddress inetAddress) {
this.protocol = protocol.equals("http") ? Protocol.http : protocol.equals("https") ? Protocol.https : protocol.equals("ftp") ? Protocol.ftp : Protocol.smb; this.protocol = protocol.equals("http") ? Protocol.http : protocol.equals("https") ? Protocol.https : protocol.equals("ftp") ? Protocol.ftp : Protocol.smb;
this.inetAddress = inetAddress; this.inetAddress = inetAddress;
this.hostname = null; this.hostname = null;
@ -92,7 +92,7 @@ public class Scanner extends Thread {
try { try {
this.hostname = TimeoutRequest.getHostName(this.inetAddress, 100); this.hostname = TimeoutRequest.getHostName(this.inetAddress, 100);
Domains.setHostName(this.inetAddress, this.hostname); Domains.setHostName(this.inetAddress, this.hostname);
} catch (ExecutionException e) { } catch (final ExecutionException e) {
this.hostname = this.inetAddress.getHostAddress(); this.hostname = this.inetAddress.getHostAddress();
} }
//this.hostname = Domains.getHostName(this.inetAddress); //this.hostname = Domains.getHostName(this.inetAddress);
@ -105,7 +105,7 @@ public class Scanner extends Thread {
public String toString() { public String toString() {
try { try {
return new MultiProtocolURI(this.protocol.name() + "://" + this.inetAddress.getHostAddress() + "/").toNormalform(true, false); return new MultiProtocolURI(this.protocol.name() + "://" + this.inetAddress.getHostAddress() + "/").toNormalform(true, false);
} catch (MalformedURLException e) { } catch (final MalformedURLException e) {
return ""; return "";
} }
} }
@ -114,11 +114,11 @@ public class Scanner extends Thread {
return this.inetAddress.hashCode(); return this.inetAddress.hashCode();
} }
@Override @Override
public boolean equals(Object o) { public boolean equals(final Object o) {
return (o instanceof Service) && ((Service) o).protocol == this.protocol && ((Service) o).inetAddress.equals(this.inetAddress); return (o instanceof Service) && ((Service) o).protocol == this.protocol && ((Service) o).inetAddress.equals(this.inetAddress);
} }
} }
private final static Map<Service, Access> scancache = new ConcurrentHashMap<Service, Access>(); private final static Map<Service, Access> scancache = new ConcurrentHashMap<Service, Access>();
//private static long scancacheUpdateTime = 0; //private static long scancacheUpdateTime = 0;
//private static long scancacheValidUntilTime = Long.MAX_VALUE; //private static long scancacheValidUntilTime = Long.MAX_VALUE;
@ -127,17 +127,17 @@ public class Scanner extends Thread {
public static int scancacheSize() { public static int scancacheSize() {
return scancache.size(); return scancache.size();
} }
public static void scancacheReplace(Scanner newScanner, long validTime) { public static void scancacheReplace(final Scanner newScanner, final long validTime) {
scancache.clear(); scancache.clear();
scancache.putAll(newScanner.services()); scancache.putAll(newScanner.services());
//scancacheUpdateTime = System.currentTimeMillis(); //scancacheUpdateTime = System.currentTimeMillis();
//scancacheValidUntilTime = validTime == Long.MAX_VALUE ? Long.MAX_VALUE : scancacheUpdateTime + validTime; //scancacheValidUntilTime = validTime == Long.MAX_VALUE ? Long.MAX_VALUE : scancacheUpdateTime + validTime;
scancacheScanrange = newScanner.scanrange; scancacheScanrange = newScanner.scanrange;
} }
public static void scancacheExtend(Scanner newScanner, long validTime) { public static void scancacheExtend(final Scanner newScanner, final long validTime) {
Iterator<Map.Entry<Service, Access>> i = Scanner.scancache.entrySet().iterator(); final Iterator<Map.Entry<Service, Access>> i = Scanner.scancache.entrySet().iterator();
Map.Entry<Service, Access> entry; Map.Entry<Service, Access> entry;
while (i.hasNext()) { while (i.hasNext()) {
entry = i.next(); entry = i.next();
@ -148,11 +148,11 @@ public class Scanner extends Thread {
//scancacheValidUntilTime = validTime == Long.MAX_VALUE ? Long.MAX_VALUE : scancacheUpdateTime + validTime; //scancacheValidUntilTime = validTime == Long.MAX_VALUE ? Long.MAX_VALUE : scancacheUpdateTime + validTime;
scancacheScanrange = newScanner.scanrange; scancacheScanrange = newScanner.scanrange;
} }
public static Iterator<Map.Entry<Service, Scanner.Access>> scancacheEntries() { public static Iterator<Map.Entry<Service, Scanner.Access>> scancacheEntries() {
return scancache.entrySet().iterator(); return scancache.entrySet().iterator();
} }
/** /**
* check if the url can be accepted by the scanner. the scanner accepts the url if: * check if the url can be accepted by the scanner. the scanner accepts the url if:
* - the host of the url is not supervised (it is not in the scan range), or * - the host of the url is not supervised (it is not in the scan range), or
@ -160,100 +160,100 @@ public class Scanner extends Thread {
* @param url * @param url
* @return true if the url shall be part of a search result * @return true if the url shall be part of a search result
*/ */
public static boolean acceptURL(MultiProtocolURI url) { public static boolean acceptURL(final MultiProtocolURI url) {
// if the scan range is empty, then all urls are accepted // if the scan range is empty, then all urls are accepted
if (scancacheScanrange == null || scancacheScanrange.isEmpty()) return true; if (scancacheScanrange == null || scancacheScanrange.isEmpty()) return true;
//if (System.currentTimeMillis() > scancacheValidUntilTime) return true; //if (System.currentTimeMillis() > scancacheValidUntilTime) return true;
InetAddress a = Domains.dnsResolve(url.getHost()); // try to avoid that! final InetAddress a = url.getInetAddress(); // try to avoid that!
if (a == null) return true; if (a == null) return true;
InetAddress n = normalize(a); final InetAddress n = normalize(a);
if (!scancacheScanrange.contains(n)) return true; if (!scancacheScanrange.contains(n)) return true;
Access access = scancache.get(new Service(url.getProtocol(), a)); final Access access = scancache.get(new Service(url.getProtocol(), a));
if (access == null) return false; if (access == null) return false;
return access == Access.granted; return access == Access.granted;
} }
private static InetAddress normalize(InetAddress a) { private static InetAddress normalize(final InetAddress a) {
if (a == null) return null; if (a == null) return null;
byte[] b = a.getAddress(); final byte[] b = a.getAddress();
if (b[3] == 1) return a; if (b[3] == 1) return a;
b[3] = 1; b[3] = 1;
try { try {
return InetAddress.getByAddress(b); return InetAddress.getByAddress(b);
} catch (UnknownHostException e) { } catch (final UnknownHostException e) {
return a; return a;
} }
} }
private int runnerCount;
private Set<InetAddress> scanrange;
private BlockingQueue<Service> scanqueue;
private Map<Service, Access> services;
private Map<Runner, Object> runner;
private int timeout;
public Scanner(Set<InetAddress> scanrange, int concurrentRunner, int timeout) { private final int runnerCount;
private final Set<InetAddress> scanrange;
private final BlockingQueue<Service> scanqueue;
private final Map<Service, Access> services;
private final Map<Runner, Object> runner;
private final int timeout;
public Scanner(final Set<InetAddress> scanrange, final int concurrentRunner, final int timeout) {
this.runnerCount = concurrentRunner; this.runnerCount = concurrentRunner;
this.scanrange = new HashSet<InetAddress>(); this.scanrange = new HashSet<InetAddress>();
for (InetAddress a: scanrange) this.scanrange.add(normalize(a)); for (final InetAddress a: scanrange) this.scanrange.add(normalize(a));
this.scanqueue = new LinkedBlockingQueue<Service>(); this.scanqueue = new LinkedBlockingQueue<Service>();
this.services = Collections.synchronizedMap(new HashMap<Service, Access>()); this.services = Collections.synchronizedMap(new HashMap<Service, Access>());
this.runner = new ConcurrentHashMap<Runner, Object>(); this.runner = new ConcurrentHashMap<Runner, Object>();
this.timeout = timeout; this.timeout = timeout;
} }
public Scanner(int concurrentRunner, int timeout) { public Scanner(final int concurrentRunner, final int timeout) {
this(Domains.myIntranetIPs(), concurrentRunner, timeout); this(Domains.myIntranetIPs(), concurrentRunner, timeout);
} }
@Override @Override
public void run() { public void run() {
Service uri; Service uri;
try { try {
while ((uri = scanqueue.take()) != POISONSERVICE) { while ((uri = this.scanqueue.take()) != POISONSERVICE) {
while (runner.size() >= this.runnerCount) { while (this.runner.size() >= this.runnerCount) {
/*for (Runner r: runner.keySet()) { /*for (Runner r: runner.keySet()) {
if (r.age() > 3000) synchronized(r) { r.interrupt(); } if (r.age() > 3000) synchronized(r) { r.interrupt(); }
}*/ }*/
if (runner.size() >= this.runnerCount) Thread.sleep(20); if (this.runner.size() >= this.runnerCount) Thread.sleep(20);
} }
Runner runner = new Runner(uri); final Runner runner = new Runner(uri);
this.runner.put(runner, PRESENT); this.runner.put(runner, PRESENT);
runner.start(); runner.start();
} }
} catch (InterruptedException e) { } catch (final InterruptedException e) {
} }
} }
public int pending() { public int pending() {
return this.scanqueue.size(); return this.scanqueue.size();
} }
public void terminate() { public void terminate() {
for (int i = 0; i < runnerCount; i++) try { for (int i = 0; i < this.runnerCount; i++) try {
this.scanqueue.put(POISONSERVICE); this.scanqueue.put(POISONSERVICE);
} catch (InterruptedException e) { } catch (final InterruptedException e) {
} }
try { try {
this.join(); this.join();
} catch (InterruptedException e) { } catch (final InterruptedException e) {
} }
} }
public class Runner extends Thread { public class Runner extends Thread {
private Service service; private final Service service;
private long starttime; private final long starttime;
public Runner(Service service) { public Runner(final Service service) {
this.service = service; this.service = service;
this.starttime = System.currentTimeMillis(); this.starttime = System.currentTimeMillis();
} }
@Override @Override
public void run() { public void run() {
try { try {
if (TimeoutRequest.ping(this.service.getInetAddress().getHostAddress(), this.service.getProtocol().port, timeout)) { if (TimeoutRequest.ping(this.service.getInetAddress().getHostAddress(), this.service.getProtocol().port, Scanner.this.timeout)) {
Access access = this.service.getProtocol() == Protocol.http || this.service.getProtocol() == Protocol.https ? Access.granted : Access.unknown; Access access = this.service.getProtocol() == Protocol.http || this.service.getProtocol() == Protocol.https ? Access.granted : Access.unknown;
services.put(service, access); Scanner.this.services.put(this.service, access);
if (access == Access.unknown) { if (access == Access.unknown) {
// ask the service if it lets us in // ask the service if it lets us in
if (this.service.getProtocol() == Protocol.ftp) { if (this.service.getProtocol() == Protocol.ftp) {
@ -261,35 +261,35 @@ public class Scanner extends Thread {
try { try {
ftpClient.open(this.service.getInetAddress().getHostAddress(), this.service.getProtocol().port); ftpClient.open(this.service.getInetAddress().getHostAddress(), this.service.getProtocol().port);
ftpClient.login("anonymous", "anomic@"); ftpClient.login("anonymous", "anomic@");
List<String> list = ftpClient.list("/", false); final List<String> list = ftpClient.list("/", false);
ftpClient.CLOSE(); ftpClient.CLOSE();
access = list == null || list.isEmpty() ? Access.empty : Access.granted; access = list == null || list.isEmpty() ? Access.empty : Access.granted;
} catch (IOException e) { } catch (final IOException e) {
access = Access.denied; access = Access.denied;
} }
} }
if (this.service.getProtocol() == Protocol.smb) { if (this.service.getProtocol() == Protocol.smb) {
try { try {
MultiProtocolURI uri = new MultiProtocolURI(this.service.toString()); final MultiProtocolURI uri = new MultiProtocolURI(this.service.toString());
String[] list = uri.list(); final String[] list = uri.list();
access = list == null || list.length == 0 ? Access.empty : Access.granted; access = list == null || list.length == 0 ? Access.empty : Access.granted;
} catch (IOException e) { } catch (final IOException e) {
access = Access.denied; access = Access.denied;
} }
} }
} }
if (access != Access.unknown) services.put(this.service, access); if (access != Access.unknown) Scanner.this.services.put(this.service, access);
} }
} catch (ExecutionException e) { } catch (final ExecutionException e) {
} }
Object r = runner.remove(this); final Object r = Scanner.this.runner.remove(this);
assert r != null; assert r != null;
} }
public long age() { public long age() {
return System.currentTimeMillis() - this.starttime; return System.currentTimeMillis() - this.starttime;
} }
@Override @Override
public boolean equals(Object o) { public boolean equals(final Object o) {
return (o instanceof Runner) && this.service.equals(((Runner) o).service); return (o instanceof Runner) && this.service.equals(((Runner) o).service);
} }
@Override @Override
@ -297,76 +297,76 @@ public class Scanner extends Thread {
return this.service.hashCode(); return this.service.hashCode();
} }
} }
public void addHTTP(boolean bigrange) { public void addHTTP(final boolean bigrange) {
addProtocol(Protocol.http, bigrange); addProtocol(Protocol.http, bigrange);
} }
public void addHTTPS(boolean bigrange) { public void addHTTPS(final boolean bigrange) {
addProtocol(Protocol.https, bigrange); addProtocol(Protocol.https, bigrange);
} }
public void addSMB(boolean bigrange) { public void addSMB(final boolean bigrange) {
addProtocol(Protocol.smb, bigrange); addProtocol(Protocol.smb, bigrange);
} }
public void addFTP(boolean bigrange) { public void addFTP(final boolean bigrange) {
addProtocol(Protocol.ftp, bigrange); addProtocol(Protocol.ftp, bigrange);
} }
private void addProtocol(Protocol protocol, boolean bigrange) { private void addProtocol(final Protocol protocol, final boolean bigrange) {
for (InetAddress i: genlist(bigrange)) { for (final InetAddress i: genlist(bigrange)) {
try { try {
this.scanqueue.put(new Service(protocol, i)); this.scanqueue.put(new Service(protocol, i));
} catch (InterruptedException e) { } catch (final InterruptedException e) {
} }
} }
} }
private final List<InetAddress> genlist(boolean bigrange) { private final List<InetAddress> genlist(final boolean bigrange) {
ArrayList<InetAddress> c = new ArrayList<InetAddress>(10); final ArrayList<InetAddress> c = new ArrayList<InetAddress>(10);
for (InetAddress i: scanrange) { for (final InetAddress i: this.scanrange) {
for (int br = bigrange ? 1 : i.getAddress()[2]; br < (bigrange ? 255 : i.getAddress()[2] + 1); br++) { for (int br = bigrange ? 1 : i.getAddress()[2]; br < (bigrange ? 255 : i.getAddress()[2] + 1); br++) {
for (int j = 1; j < 255; j++) { for (int j = 1; j < 255; j++) {
byte[] address = i.getAddress(); final byte[] address = i.getAddress();
address[2] = (byte) br; address[2] = (byte) br;
address[3] = (byte) j; address[3] = (byte) j;
try { try {
c.add(InetAddress.getByAddress(address)); c.add(InetAddress.getByAddress(address));
} catch (UnknownHostException e) { } catch (final UnknownHostException e) {
} }
} }
} }
} }
return c; return c;
} }
public Map<Service, Access> services() { public Map<Service, Access> services() {
return this.services; return this.services;
} }
public static byte[] inIndex(Map<byte[], String> commentCache, String url) { public static byte[] inIndex(final Map<byte[], String> commentCache, final String url) {
for (Map.Entry<byte[], String> comment: commentCache.entrySet()) { for (final Map.Entry<byte[], String> comment: commentCache.entrySet()) {
if (comment.getValue().contains(url)) return comment.getKey(); if (comment.getValue().contains(url)) return comment.getKey();
} }
return null; return null;
} }
public static void main(String[] args) { public static void main(final String[] args) {
//try {System.out.println("192.168.1.91: " + ping(new MultiProtocolURI("smb://192.168.1.91/"), 1000));} catch (MalformedURLException e) {} //try {System.out.println("192.168.1.91: " + ping(new MultiProtocolURI("smb://192.168.1.91/"), 1000));} catch (MalformedURLException e) {}
Scanner scanner = new Scanner(100, 10); final Scanner scanner = new Scanner(100, 10);
scanner.addFTP(false); scanner.addFTP(false);
scanner.addHTTP(false); scanner.addHTTP(false);
scanner.addHTTPS(false); scanner.addHTTPS(false);
scanner.addSMB(false); scanner.addSMB(false);
scanner.start(); scanner.start();
scanner.terminate(); scanner.terminate();
for (Service service: scanner.services().keySet()) { for (final Service service: scanner.services().keySet()) {
System.out.println(service.toString()); System.out.println(service.toString());
} }
try { try {
HTTPClient.closeConnectionManager(); HTTPClient.closeConnectionManager();
} catch (InterruptedException e) { } catch (final InterruptedException e) {
} }
} }
} }

@ -37,7 +37,6 @@ import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8; import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.storage.ConfigurationSet; import net.yacy.cora.storage.ConfigurationSet;
@ -103,7 +102,7 @@ public class SolrScheme extends ConfigurationSet {
addSolr(solrdoc, "failreason_t", ""); // overwrite a possible fail reason (in case that there was a fail reason before) addSolr(solrdoc, "failreason_t", ""); // overwrite a possible fail reason (in case that there was a fail reason before)
addSolr(solrdoc, "id", id); addSolr(solrdoc, "id", id);
addSolr(solrdoc, "sku", digestURI.toNormalform(true, false), 3.0f); addSolr(solrdoc, "sku", digestURI.toNormalform(true, false), 3.0f);
final InetAddress address = Domains.dnsResolve(digestURI.getHost()); final InetAddress address = digestURI.getInetAddress();
if (address != null) addSolr(solrdoc, "ip_s", address.getHostAddress()); if (address != null) addSolr(solrdoc, "ip_s", address.getHostAddress());
if (digestURI.getHost() != null) addSolr(solrdoc, "host_s", digestURI.getHost()); if (digestURI.getHost() != null) addSolr(solrdoc, "host_s", digestURI.getHost());
addSolr(solrdoc, "title", yacydoc.dc_title()); addSolr(solrdoc, "title", yacydoc.dc_title());
@ -354,16 +353,16 @@ public class SolrScheme extends ConfigurationSet {
return solrdoc; return solrdoc;
} }
private int relEval(String[] rel) { private int relEval(final String[] rel) {
int i = 0; int i = 0;
for (String s: rel) { for (final String s: rel) {
String s0 = s.toLowerCase().trim(); final String s0 = s.toLowerCase().trim();
if ("me".equals(s0)) i += 1; if ("me".equals(s0)) i += 1;
if ("nofollow".equals(s0)) i += 2; if ("nofollow".equals(s0)) i += 2;
} }
return i; return i;
} }
public String solrGetID(final SolrDocument solr) { public String solrGetID(final SolrDocument solr) {
return (String) solr.getFieldValue("id"); return (String) solr.getFieldValue("id");
} }

@ -279,7 +279,7 @@ public class SolrSingleConnector implements SolrConnector {
final SolrInputDocument solrdoc = new SolrInputDocument(); final SolrInputDocument solrdoc = new SolrInputDocument();
solrdoc.addField("id", ASCII.String(digestURI.hash())); solrdoc.addField("id", ASCII.String(digestURI.hash()));
solrdoc.addField("sku", digestURI.toNormalform(true, false), 3.0f); solrdoc.addField("sku", digestURI.toNormalform(true, false), 3.0f);
final InetAddress address = Domains.dnsResolve(digestURI.getHost()); final InetAddress address = digestURI.getInetAddress();
if (address != null) solrdoc.addField("ip_s", address.getHostAddress()); if (address != null) solrdoc.addField("ip_s", address.getHostAddress());
if (digestURI.getHost() != null) solrdoc.addField("host_s", digestURI.getHost()); if (digestURI.getHost() != null) solrdoc.addField("host_s", digestURI.getHost());

@ -232,6 +232,7 @@ public class ReferenceContainer<ReferenceType extends Reference> extends RowSet
int pos = 0; int pos = 0;
while (i.hasNext()) { while (i.hasNext()) {
r = i.next(); r = i.next();
if (r == null) continue;
mod = r.lastModified(); mod = r.lastModified();
positions = tm.get(mod); positions = tm.get(mod);
if (positions == null) positions = new ArrayList<Integer>(); if (positions == null) positions = new ArrayList<Integer>();

@ -1909,8 +1909,8 @@ public final class Switchboard extends serverSwitch {
this.log.logFine("processResourceStack processCase=" + processCase + this.log.logFine("processResourceStack processCase=" + processCase +
", depth=" + response.depth() + ", depth=" + response.depth() +
", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth())) + ", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth())) +
", must-match=" + ((response.profile() == null) ? "null" : response.profile().mustMatchPattern().toString()) + ", must-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustMatchPattern().toString()) +
", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().mustNotMatchPattern().toString()) + ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustNotMatchPattern().toString()) +
", initiatorHash=" + ((response.initiator() == null) ? "null" : ASCII.String(response.initiator())) + ", initiatorHash=" + ((response.initiator() == null) ? "null" : ASCII.String(response.initiator())) +
//", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) + //", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) +
", url=" + response.url()); // DEBUG ", url=" + response.url()); // DEBUG

Loading…
Cancel
Save