From b250e6466d541e0ac15d915b56055300b0eb84e1 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 29 Sep 2011 15:17:39 +0000 Subject: [PATCH] implemented crawl restrictions for IP pattern and country lists git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7980 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CrawlProfileEditor_p.java | 41 ++--- htroot/CrawlStartExpert_p.html | 2 +- htroot/CrawlStartExpert_p.java | 10 +- htroot/Crawler_p.java | 20 +- htroot/QuickCrawlLink_p.java | 8 +- source/de/anomic/crawler/CrawlProfile.java | 91 +++++++-- source/de/anomic/crawler/CrawlQueues.java | 4 +- source/de/anomic/crawler/CrawlStacker.java | 67 +++++-- .../de/anomic/crawler/CrawlSwitchboard.java | 16 +- .../yacy/cora/document/MultiProtocolURI.java | 18 +- source/net/yacy/cora/protocol/Domains.java | 7 +- source/net/yacy/cora/protocol/Scanner.java | 172 +++++++++--------- .../services/federated/solr/SolrScheme.java | 11 +- .../federated/solr/SolrSingleConnector.java | 2 +- .../yacy/kelondro/rwi/ReferenceContainer.java | 1 + source/net/yacy/search/Switchboard.java | 4 +- 16 files changed, 291 insertions(+), 183 deletions(-) diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java index d1b0e7e00..660970d8b 100644 --- a/htroot/CrawlProfileEditor_p.java +++ b/htroot/CrawlProfileEditor_p.java @@ -9,7 +9,7 @@ // $LastChangedBy$ // // LICENSE -// +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -38,10 +38,9 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; - +import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.CrawlStacker; import de.anomic.crawler.CrawlSwitchboard; -import de.anomic.crawler.CrawlProfile; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.server.servletProperties; @@ -63,7 +62,7 @@ public class CrawlProfileEditor_p { ignoreNames.add(CrawlSwitchboard.DBFILE_ACTIVE_CRAWL_PROFILES); ignoreNames.add(CrawlSwitchboard.DBFILE_PASSIVE_CRAWL_PROFILES); } - + public static class eentry { public static final int BOOLEAN = 0; public static final int INTEGER = 1; @@ -73,7 +72,7 @@ public class CrawlProfileEditor_p { public final String label; public final boolean readonly; public final int type; - + public eentry(final String name, final String label, final boolean readonly, final int type) { this.name = name; this.label = label; @@ -81,7 +80,7 @@ public class CrawlProfileEditor_p { this.type = type; } } - + private static final List labels = new ArrayList(); static { labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING)); @@ -100,14 +99,14 @@ public class CrawlProfileEditor_p { labels.add(new eentry(CrawlProfile.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.XPSTOPW, "Parent stop-words", false, eentry.BOOLEAN)); } - + public static serverObjects respond( - final RequestHeader header, + final RequestHeader header, final serverObjects post, final serverSwitch env) { final servletProperties prop = new servletProperties(); final Switchboard sb = (Switchboard)env; - + // read post for handle final String handle = (post == null) ? 
"" : post.get("handle", ""); if (post != null) { @@ -117,8 +116,8 @@ public class CrawlProfileEditor_p { if (p != null) sb.crawler.putPassive(handle.getBytes(), p); // delete all entries from the crawl queue that are deleted here sb.crawler.removeActive(handle.getBytes()); - sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000); - } catch (RowSpaceExceededException e) { + sb.crawlQueues.noticeURL.removeByProfileHandle(handle, 10000); + } catch (final RowSpaceExceededException e) { Log.logException(e); } if (post.containsKey("delete")) { @@ -131,7 +130,7 @@ public class CrawlProfileEditor_p { } } } - + // generate handle list: first sort by handle name CrawlProfile selentry; final Map orderdHandles = new TreeMap(); @@ -141,7 +140,7 @@ public class CrawlProfileEditor_p { orderdHandles.put(selentry.name(), selentry.handle()); } } - + // then write into pop-up menu list int count = 0; for (final Map.Entry NameHandle: orderdHandles.entrySet()) { @@ -159,8 +158,8 @@ public class CrawlProfileEditor_p { if ((post != null) && (selentry != null)) { if (post.containsKey("submit")) { try { - Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTMATCH, CrawlProfile.MATCH_ALL)); - Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH, CrawlProfile.MATCH_NEVER)); + Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTMATCH, CrawlProfile.MATCH_ALL_STRING)); + Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH, CrawlProfile.MATCH_NEVER_STRING)); final Iterator lit = labels.iterator(); eentry tee; while (lit.hasNext()) { @@ -179,7 +178,7 @@ public class CrawlProfileEditor_p { } } } - + // generate crawl profile table count = 0; boolean dark = true; @@ -231,10 +230,10 @@ public class CrawlProfileEditor_p { } prop.put("edit_entries", count); } - + return prop; } - + private static void putProfileEntry( final servletProperties prop, final CrawlStacker crawlStacker, @@ -253,8 +252,8 @@ public class CrawlProfileEditor_p { prop.putXML(CRAWL_PROFILE_PREFIX + count + "_startURL", profile.startURL()); prop.put(CRAWL_PROFILE_PREFIX + count + "_handle", profile.handle()); prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", profile.depth()); - prop.put(CRAWL_PROFILE_PREFIX + count + "_mustmatch", profile.mustMatchPattern().toString()); - prop.put(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", profile.mustNotMatchPattern().toString()); + prop.put(CRAWL_PROFILE_PREFIX + count + "_mustmatch", profile.urlMustMatchPattern().toString()); + prop.put(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", profile.urlMustNotMatchPattern().toString()); prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(profile.recrawlIfOlder())); prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", "inactive"); @@ -270,7 +269,7 @@ public class CrawlProfileEditor_p { i++; } } - + prop.put(CRAWL_PROFILE_PREFIX+count+"_crawlingDomFilterContent", i); prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : Integer.toString(profile.domMaxPages())); diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html index 91ffbb23f..b91ea7b52 100644 --- a/htroot/CrawlStartExpert_p.html +++ b/htroot/CrawlStartExpert_p.html @@ -185,7 +185,7 @@ : Use filter   - +
no country code restriction diff --git a/htroot/CrawlStartExpert_p.java b/htroot/CrawlStartExpert_p.java index 812f17f59..4b1793e68 100644 --- a/htroot/CrawlStartExpert_p.java +++ b/htroot/CrawlStartExpert_p.java @@ -42,11 +42,11 @@ public class CrawlStartExpert_p { prop.put("starturl", /*(intranet) ? repository :*/ "http://"); prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0")); prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0))); - prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL); - prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER); - prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL)); - prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER)); - prop.put("crawlingCountryMustMatch", sb.getConfig("crawlingCountryMustMatch", "")); + prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL_STRING); + prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING); + prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL_STRING)); + prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER_STRING)); + prop.put("countryMustMatch", sb.getConfig("crawlingCountryMustMatch", "")); prop.put("crawlingIfOlderCheck", "0"); prop.put("crawlingIfOlderUnitYearCheck", "0"); diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 0dadc3ce4..fbef760bf 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -153,12 +153,12 @@ public class Crawler_p { final boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start // set the crawl filter - String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL); - final String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER); - if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted - String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL); - final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER); - if (ipMustMatch.length() < 2) ipMustMatch = CrawlProfile.MATCH_ALL; + String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING); + final String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING); + if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // avoid that all urls are filtered out if bad value was submitted + String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING); + final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER_STRING); + if (ipMustMatch.length() < 2) ipMustMatch = CrawlProfile.MATCH_ALL_STRING; final String countryMustMatch = post.getBoolean("countryMustMatchSwitch", false) ? 
post.get("countryMustMatchList", "") : ""; sb.setConfig("crawlingIPMustMatch", ipMustMatch); sb.setConfig("crawlingIPMustNotMatch", ipMustNotMatch); @@ -439,7 +439,7 @@ public class Crawler_p { crawlingFileName, crawlURL, newcrawlingMustMatch, - CrawlProfile.MATCH_NEVER, + CrawlProfile.MATCH_NEVER_STRING, ipMustMatch, ipMustNotMatch, countryMustMatch, @@ -478,8 +478,8 @@ public class Crawler_p { final CrawlProfile pe = new CrawlProfile( sitemapURLStr, sitemapURL, - CrawlProfile.MATCH_ALL, - CrawlProfile.MATCH_NEVER, + CrawlProfile.MATCH_ALL_STRING, + CrawlProfile.MATCH_NEVER_STRING, ipMustMatch, ipMustNotMatch, countryMustMatch, @@ -523,7 +523,7 @@ public class Crawler_p { sitelistURL.getHost(), sitelistURL, newcrawlingMustMatch, - CrawlProfile.MATCH_NEVER, + CrawlProfile.MATCH_NEVER_STRING, ipMustMatch, ipMustNotMatch, countryMustMatch, diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index a61d07de2..7de24d99a 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -108,8 +108,8 @@ public class QuickCrawlLink_p { final String title = post.get("title",null); // get other parameters if set - final String crawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL); - final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER); + final String crawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING); + final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING); final int CrawlingDepth = post.getInt("crawlingDepth", 0); final boolean crawlDynamic = post.get("crawlingQ", "").equals("on"); final boolean indexText = post.get("indexText", "on").equals("on"); @@ -149,8 +149,8 @@ public class QuickCrawlLink_p { crawlingStartURL.getHost(), crawlingStartURL, crawlingMustMatch, - CrawlProfile.MATCH_ALL, - CrawlProfile.MATCH_NEVER, + CrawlProfile.MATCH_ALL_STRING, + CrawlProfile.MATCH_NEVER_STRING, "", crawlingMustNotMatch, CrawlingDepth, diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index 4705fa7c2..cce5e2688 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ b/source/de/anomic/crawler/CrawlProfile.java @@ -41,8 +41,10 @@ public class CrawlProfile extends ConcurrentHashMap implements M private static final long serialVersionUID = 5527325718810703504L; - public static final String MATCH_ALL = ".*"; - public static final String MATCH_NEVER = ""; + public static final String MATCH_ALL_STRING = ".*"; + public static final String MATCH_NEVER_STRING = ""; + public static final Pattern MATCH_ALL_PATTERN = Pattern.compile(MATCH_ALL_STRING); + public static final Pattern MATCH_NEVER_PATTERN = Pattern.compile(MATCH_NEVER_STRING); // this is a simple record structure that hold all properties of a single crawl start public static final String HANDLE = "handle"; @@ -67,7 +69,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M public static final String FILTER_IP_MUSTNOTMATCH = "crawlingIPMustNotMatch"; public static final String FILTER_COUNTRY_MUSTMATCH = "crawlingCountryMustMatch"; - private Pattern mustmatch = null, mustnotmatch = null; + private Pattern urlmustmatch = null, urlmustnotmatch = null, ipmustmatch = null, ipmustnotmatch = null; /** * Constructor which creates CrawlPofile from parameters. @@ -119,10 +121,10 @@ public class CrawlProfile extends ConcurrentHashMap implements M put(HANDLE, handle); put(NAME, name); put(START_URL, (startURL == null) ? 
"" : startURL.toNormalform(true, false)); - put(FILTER_URL_MUSTMATCH, (urlMustMatch == null) ? CrawlProfile.MATCH_ALL : urlMustMatch); - put(FILTER_URL_MUSTNOTMATCH, (urlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER : urlMustNotMatch); - put(FILTER_IP_MUSTMATCH, (ipMustMatch == null) ? CrawlProfile.MATCH_ALL : ipMustMatch); - put(FILTER_IP_MUSTNOTMATCH, (ipMustNotMatch == null) ? CrawlProfile.MATCH_NEVER : ipMustNotMatch); + put(FILTER_URL_MUSTMATCH, (urlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : urlMustMatch); + put(FILTER_URL_MUSTNOTMATCH, (urlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : urlMustNotMatch); + put(FILTER_IP_MUSTMATCH, (ipMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : ipMustMatch); + put(FILTER_IP_MUSTNOTMATCH, (ipMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : ipMustNotMatch); put(FILTER_COUNTRY_MUSTMATCH, (countryMustMatch == null) ? "" : countryMustMatch); put(DEPTH, depth); put(RECRAWL_IF_OLDER, recrawlIfOlder); @@ -207,26 +209,77 @@ public class CrawlProfile extends ConcurrentHashMap implements M * Gets the regex which must be matched by URLs in order to be crawled. * @return regex which must be matched */ - public Pattern mustMatchPattern() { - if (this.mustmatch == null) { - String r = get(FILTER_URL_MUSTMATCH); - if (r == null) r = CrawlProfile.MATCH_ALL; - this.mustmatch = Pattern.compile(r); + public Pattern urlMustMatchPattern() { + if (this.urlmustmatch == null) { + final String r = get(FILTER_URL_MUSTMATCH); + if (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) { + this.urlmustmatch = CrawlProfile.MATCH_ALL_PATTERN; + } else { + this.urlmustmatch = Pattern.compile(r); + } } - return this.mustmatch; + return this.urlmustmatch; } /** * Gets the regex which must not be matched by URLs in order to be crawled. * @return regex which must not be matched */ - public Pattern mustNotMatchPattern() { - if (this.mustnotmatch == null) { - String r = get(FILTER_URL_MUSTNOTMATCH); - if (r == null) r = CrawlProfile.MATCH_NEVER; - this.mustnotmatch = Pattern.compile(r); + public Pattern urlMustNotMatchPattern() { + if (this.urlmustnotmatch == null) { + final String r = get(FILTER_URL_MUSTNOTMATCH); + if (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) { + this.urlmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; + } else { + this.urlmustnotmatch = Pattern.compile(r); + } } - return this.mustnotmatch; + return this.urlmustnotmatch; + } + + /** + * Gets the regex which must be matched by IPs in order to be crawled. + * @return regex which must be matched + */ + public Pattern ipMustMatchPattern() { + if (this.ipmustmatch == null) { + final String r = get(FILTER_IP_MUSTMATCH); + if (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) { + this.ipmustmatch = CrawlProfile.MATCH_ALL_PATTERN; + } else { + this.ipmustmatch = Pattern.compile(r); + } + } + return this.ipmustmatch; + } + + /** + * Gets the regex which must not be matched by IPs in order to be crawled. 
+ * @return regex which must not be matched */ + public Pattern ipMustNotMatchPattern() { + if (this.ipmustnotmatch == null) { + final String r = get(FILTER_IP_MUSTNOTMATCH); + if (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) { + this.ipmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; + } else { + this.ipmustnotmatch = Pattern.compile(r); + } + } + return this.ipmustnotmatch; + } + + /** + * get the list of countries that must match for the locations of the URLs' IPs + * @return a list of country codes + */ + public String[] countryMustMatchList() { + String countryMustMatch = get(FILTER_COUNTRY_MUSTMATCH); + if (countryMustMatch == null) countryMustMatch = ""; + if (countryMustMatch.length() == 0) return new String[0]; + String[] list = countryMustMatch.split(","); + if (list.length == 1 && list[0].length() == 0) list = new String[0]; + return list; } /** diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index 6413ea782..2bdce5c45 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -299,8 +299,8 @@ public class CrawlQueues { + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.depth() - + ", must-match=" + profile.mustMatchPattern().toString() - + ", must-not-match=" + profile.mustNotMatchPattern().toString() + + ", must-match=" + profile.urlMustMatchPattern().toString() + + ", must-not-match=" + profile.urlMustNotMatchPattern().toString() + ", permission=" + ((this.sb.peers == null) ? "undefined" : (((this.sb.peers.mySeed().isSenior()) || (this.sb.peers.mySeed().isPrincipal())) ? "true" : "false"))); // work off one Crawl stack entry diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index fc6f2a283..149e01632 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -34,6 +34,7 @@ import java.net.MalformedURLException; import java.net.UnknownHostException; import java.util.Date; import java.util.Iterator; +import java.util.Locale; import java.util.Map; import java.util.Properties; import java.util.concurrent.BlockingQueue; @@ -438,8 +439,9 @@ public final class CrawlStacker { // check if the protocol is supported final String urlProtocol = url.getProtocol(); + final String urlstring = url.toString(); if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) { - this.log.logSevere("Unsupported protocol in URL '" + url.toString() + "'."); + this.log.logSevere("Unsupported protocol in URL '" + urlstring + "'."); return "unsupported protocol"; } @@ -452,31 +454,31 @@ public final class CrawlStacker { // check blacklist if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, url)) { - if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is in blacklist."); + if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' is in blacklist."); return "url in blacklist"; } - // filter with must-match - if ((depth > 0) && !profile.mustMatchPattern().matcher(url.toString()).matches()) { - if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' does not match must-match crawling filter '" + profile.mustMatchPattern().toString() + "'."); + // filter with must-match for URLs + if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) { + if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' does not match must-match crawling
filter '" + profile.urlMustMatchPattern().toString() + "'."); return "url does not match must-match filter"; } - // filter with must-not-match - if ((depth > 0) && profile.mustNotMatchPattern().matcher(url.toString()).matches()) { - if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' does matches do-not-match crawling filter '" + profile.mustNotMatchPattern().toString() + "'."); + // filter with must-not-match for URLs + if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) { + if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'."); return "url matches must-not-match filter"; } // deny cgi if (url.isIndividual() && !(profile.crawlingQ())) { // TODO: make special property for crawlingIndividual - if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is CGI URL."); + if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' is CGI URL."); return "individual url (sessionid etc) not wanted"; } // deny post properties if (url.isPOST() && !(profile.crawlingQ())) { - if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is post URL."); + if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' is post URL."); return "post url not allowed"; } @@ -486,7 +488,7 @@ public final class CrawlStacker { if (oldEntry == null) { if (dbocc != null) { // do double-check - if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is double registered in '" + dbocc + "'."); + if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' is double registered in '" + dbocc + "'."); if (dbocc.equals("errors")) { final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash()); return "double in: errors (" + errorEntry.anycause() + ")"; @@ -498,13 +500,13 @@ public final class CrawlStacker { final boolean recrawl = profile.recrawlIfOlder() > oldEntry.loaddate().getTime(); if (recrawl) { if (this.log.isInfo()) - this.log.logInfo("RE-CRAWL of URL '" + url.toString() + "': this url was crawled " + + this.log.logInfo("RE-CRAWL of URL '" + urlstring + "': this url was crawled " + ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago."); } else { if (dbocc == null) { return "double in: LURL-DB"; } else { - if (this.log.isInfo()) this.log.logInfo("URL '" + url.toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time:"); + if (this.log.isInfo()) this.log.logInfo("URL '" + urlstring + "' is double registered in '" + dbocc + "'. 
" + "Stack processing time:"); if (dbocc.equals("errors")) { final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash()); return "double in: errors (" + errorEntry.anycause() + ")"; @@ -520,16 +522,51 @@ public final class CrawlStacker { if (maxAllowedPagesPerDomain < Integer.MAX_VALUE) { final DomProfile dp = this.doms.get(url.getHost()); if (dp != null && dp.count >= maxAllowedPagesPerDomain) { - if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often in crawl stack, a maximum of " + profile.domMaxPages() + " is allowed."); + if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' appeared too often in crawl stack, a maximum of " + profile.domMaxPages() + " is allowed."); return "crawl stack domain counter exceeded"; } if (ResultURLs.domainCount(EventOrigin.LOCAL_CRAWLING, url.getHost()) >= profile.domMaxPages()) { - if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often in result stack, a maximum of " + profile.domMaxPages() + " is allowed."); + if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' appeared too often in result stack, a maximum of " + profile.domMaxPages() + " is allowed."); return "result stack domain counter exceeded"; } } + // the following filters use a DNS lookup to check if the url matches with IP filter + // this is expensive and those filters are check at the end of all other tests + + // filter with must-match for IPs + if ((depth > 0) && profile.ipMustMatchPattern() != CrawlProfile.MATCH_ALL_PATTERN && !profile.ipMustMatchPattern().matcher(url.getInetAddress().getHostAddress()).matches()) { + if (this.log.isFine()) this.log.logFine("IP " + url.getInetAddress().getHostAddress() + " of URL '" + urlstring + "' does not match must-match crawling filter '" + profile.ipMustMatchPattern().toString() + "'."); + return "ip " + url.getInetAddress().getHostAddress() + " of url does not match must-match filter"; + } + + // filter with must-not-match for IPs + if ((depth > 0) && profile.ipMustMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.ipMustNotMatchPattern().matcher(url.getInetAddress().getHostAddress()).matches()) { + if (this.log.isFine()) this.log.logFine("IP " + url.getInetAddress().getHostAddress() + " of URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.ipMustMatchPattern().toString() + "'."); + return "ip " + url.getInetAddress().getHostAddress() + " of url matches must-not-match filter"; + } + + // filter with must-match for IPs + final String[] countryMatchList = profile.countryMustMatchList(); + if (depth > 0 && countryMatchList != null && countryMatchList.length > 0) { + final Locale locale = url.getLocale(); + if (locale != null) { + final String c0 = locale.getCountry(); + boolean granted = false; + matchloop: for (final String c: countryMatchList) { + if (c0.equals(c)) { + granted = true; + break matchloop; + } + } + if (!granted) { + if (this.log.isFine()) this.log.logFine("IP " + url.getInetAddress().getHostAddress() + " of URL '" + urlstring + "' does not match must-match crawling filter '" + profile.ipMustMatchPattern().toString() + "'."); + return "country " + c0 + " of url does not match must-match filter for countries"; + } + } + } + return null; } diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java index fe2013edf..ea2fb73d4 100644 --- a/source/de/anomic/crawler/CrawlSwitchboard.java +++ b/source/de/anomic/crawler/CrawlSwitchboard.java @@ -229,8 +229,8 @@ 
public final class CrawlSwitchboard { // generate new default entry for proxy crawling this.defaultProxyProfile = new CrawlProfile( "proxy", null, - CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, - CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, + CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, + CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false, @@ -243,38 +243,38 @@ public final class CrawlSwitchboard { } if (this.defaultRemoteProfile == null) { // generate new default entry for remote crawling - this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", CrawlProfile.MATCH_NEVER, 0, + this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, -1, -1, true, true, true, false, false, true, true, false, CacheStrategy.IFFRESH); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultRemoteProfile.handle()), this.defaultRemoteProfile); } if (this.defaultTextSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling - this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0, + this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), this.defaultTextSnippetLocalProfile); } if (this.defaultTextSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling - this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0, + this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile); } this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST); if (this.defaultMediaSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling - this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0, + this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), this.defaultMediaSnippetLocalProfile); } if (this.defaultMediaSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling - this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0, + this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), this.defaultMediaSnippetGlobalProfile); } if (this.defaultSurrogateProfile == null) { // generate new default entry for surrogate parsing - this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0, + this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CacheStrategy.NOCACHE); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile); } diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index 9eab711b5..1847de5f4 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -88,6 +88,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolURI> { [most of this file's hunks were lost in extraction; they add a cached getInetAddress() lookup and extend getLocale(), whose surviving tail follows] + if (locale != null && locale.getCountry() != null && locale.getCountry().length() > 0) return locale; + } return Domains.getLocale(this.host); } diff --git a/source/net/yacy/cora/protocol/Domains.java b/source/net/yacy/cora/protocol/Domains.java index 193b263e3..cddd090e5 100644 --- a/source/net/yacy/cora/protocol/Domains.java +++ b/source/net/yacy/cora/protocol/Domains.java @@ -550,6 +550,11 @@ public class Domains { cacheHit_Insert++; } + /** + * resolve a host address using a local DNS cache and a DNS lookup if necessary + * @param host + * @return the host's InetAddress or null if the address cannot be resolved + */ public static InetAddress dnsResolve(String host) { if ((host == null) || (host.length() == 0)) return null; host = host.toLowerCase().trim(); @@ -921,7 +926,7 @@ public class Domains { public static Locale getLocale(final String host) { if (host == null) return null; final Locale locale = getLocale(dnsResolve(host)); - if (locale != null) return locale; + if (locale != null && locale.getCountry() != null && locale.getCountry().length() > 0) return locale; final int p = host.lastIndexOf('.'); if (p < 0) return null; String tld = host.substring(p + 1).toUpperCase(); diff --git a/source/net/yacy/cora/protocol/Scanner.java
b/source/net/yacy/cora/protocol/Scanner.java index 7e6954d37..8c508e1a0 100644 --- a/source/net/yacy/cora/protocol/Scanner.java +++ b/source/net/yacy/cora/protocol/Scanner.java @@ -11,12 +11,12 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . @@ -53,22 +53,22 @@ public class Scanner extends Thread { private static final Service POISONSERVICE = new Service(Protocol.http, null); private static final Object PRESENT = new Object(); - + public static enum Access {unknown, empty, granted, denied;} public static enum Protocol {http(80), https(443), ftp(21), smb(445); public int port; - private Protocol(int port) {this.port = port;} + private Protocol(final int port) {this.port = port;} } public static class Service { public Protocol protocol; public InetAddress inetAddress; private String hostname; - public Service(Protocol protocol, InetAddress inetAddress) { + public Service(final Protocol protocol, final InetAddress inetAddress) { this.protocol = protocol; this.inetAddress = inetAddress; this.hostname = null; } - public Service(String protocol, InetAddress inetAddress) { + public Service(final String protocol, final InetAddress inetAddress) { this.protocol = protocol.equals("http") ? Protocol.http : protocol.equals("https") ? Protocol.https : protocol.equals("ftp") ? 
Protocol.ftp : Protocol.smb; this.inetAddress = inetAddress; this.hostname = null; @@ -92,7 +92,7 @@ public class Scanner extends Thread { try { this.hostname = TimeoutRequest.getHostName(this.inetAddress, 100); Domains.setHostName(this.inetAddress, this.hostname); - } catch (ExecutionException e) { + } catch (final ExecutionException e) { this.hostname = this.inetAddress.getHostAddress(); } //this.hostname = Domains.getHostName(this.inetAddress); @@ -105,7 +105,7 @@ public String toString() { try { return new MultiProtocolURI(this.protocol.name() + "://" + this.inetAddress.getHostAddress() + "/").toNormalform(true, false); - } catch (MalformedURLException e) { + } catch (final MalformedURLException e) { return ""; } } @@ -114,11 +114,11 @@ return this.inetAddress.hashCode(); } @Override - public boolean equals(Object o) { + public boolean equals(final Object o) { return (o instanceof Service) && ((Service) o).protocol == this.protocol && ((Service) o).inetAddress.equals(this.inetAddress); } } - + private final static Map<Service, Access> scancache = new ConcurrentHashMap<Service, Access>(); //private static long scancacheUpdateTime = 0; //private static long scancacheValidUntilTime = Long.MAX_VALUE; @@ -127,17 +127,17 @@ public static int scancacheSize() { return scancache.size(); } - - public static void scancacheReplace(Scanner newScanner, long validTime) { + + public static void scancacheReplace(final Scanner newScanner, final long validTime) { scancache.clear(); scancache.putAll(newScanner.services()); //scancacheUpdateTime = System.currentTimeMillis(); //scancacheValidUntilTime = validTime == Long.MAX_VALUE ? Long.MAX_VALUE : scancacheUpdateTime + validTime; scancacheScanrange = newScanner.scanrange; } - - public static void scancacheExtend(Scanner newScanner, long validTime) { - Iterator<Map.Entry<Service, Access>> i = Scanner.scancache.entrySet().iterator(); + + public static void scancacheExtend(final Scanner newScanner, final long validTime) { + final Iterator<Map.Entry<Service, Access>> i = Scanner.scancache.entrySet().iterator(); Map.Entry<Service, Access> entry; while (i.hasNext()) { entry = i.next(); @@ -148,11 +148,11 @@ //scancacheValidUntilTime = validTime == Long.MAX_VALUE ? Long.MAX_VALUE : scancacheUpdateTime + validTime; scancacheScanrange = newScanner.scanrange; } - + public static Iterator<Map.Entry<Service, Access>> scancacheEntries() { return scancache.entrySet().iterator(); } - + /** * check if the url can be accepted by the scanner. the scanner accepts the url if: * - the host of the url is not supervised (it is not in the scan range), or * - the host is in the scan range and access to its service has been granted * @param url * @return true if the url shall be part of a search result */ - public static boolean acceptURL(MultiProtocolURI url) { + public static boolean acceptURL(final MultiProtocolURI url) { // if the scan range is empty, then all urls are accepted if (scancacheScanrange == null || scancacheScanrange.isEmpty()) return true; - + //if (System.currentTimeMillis() > scancacheValidUntilTime) return true; - InetAddress a = Domains.dnsResolve(url.getHost()); // try to avoid that! + final InetAddress a = url.getInetAddress(); // try to avoid that!
if (a == null) return true; - InetAddress n = normalize(a); + final InetAddress n = normalize(a); if (!scancacheScanrange.contains(n)) return true; - Access access = scancache.get(new Service(url.getProtocol(), a)); + final Access access = scancache.get(new Service(url.getProtocol(), a)); if (access == null) return false; return access == Access.granted; } - private static InetAddress normalize(InetAddress a) { + private static InetAddress normalize(final InetAddress a) { if (a == null) return null; - byte[] b = a.getAddress(); + final byte[] b = a.getAddress(); if (b[3] == 1) return a; b[3] = 1; try { return InetAddress.getByAddress(b); - } catch (UnknownHostException e) { + } catch (final UnknownHostException e) { return a; } } - - private int runnerCount; - private Set scanrange; - private BlockingQueue scanqueue; - private Map services; - private Map runner; - private int timeout; - public Scanner(Set scanrange, int concurrentRunner, int timeout) { + private final int runnerCount; + private final Set scanrange; + private final BlockingQueue scanqueue; + private final Map services; + private final Map runner; + private final int timeout; + + public Scanner(final Set scanrange, final int concurrentRunner, final int timeout) { this.runnerCount = concurrentRunner; this.scanrange = new HashSet(); - for (InetAddress a: scanrange) this.scanrange.add(normalize(a)); + for (final InetAddress a: scanrange) this.scanrange.add(normalize(a)); this.scanqueue = new LinkedBlockingQueue(); this.services = Collections.synchronizedMap(new HashMap()); this.runner = new ConcurrentHashMap(); this.timeout = timeout; } - - public Scanner(int concurrentRunner, int timeout) { + + public Scanner(final int concurrentRunner, final int timeout) { this(Domains.myIntranetIPs(), concurrentRunner, timeout); } - + @Override public void run() { Service uri; try { - while ((uri = scanqueue.take()) != POISONSERVICE) { - while (runner.size() >= this.runnerCount) { + while ((uri = this.scanqueue.take()) != POISONSERVICE) { + while (this.runner.size() >= this.runnerCount) { /*for (Runner r: runner.keySet()) { if (r.age() > 3000) synchronized(r) { r.interrupt(); } }*/ - if (runner.size() >= this.runnerCount) Thread.sleep(20); + if (this.runner.size() >= this.runnerCount) Thread.sleep(20); } - Runner runner = new Runner(uri); + final Runner runner = new Runner(uri); this.runner.put(runner, PRESENT); runner.start(); } - } catch (InterruptedException e) { + } catch (final InterruptedException e) { } } public int pending() { return this.scanqueue.size(); } - + public void terminate() { - for (int i = 0; i < runnerCount; i++) try { + for (int i = 0; i < this.runnerCount; i++) try { this.scanqueue.put(POISONSERVICE); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { } try { this.join(); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { } } - + public class Runner extends Thread { - private Service service; - private long starttime; - public Runner(Service service) { + private final Service service; + private final long starttime; + public Runner(final Service service) { this.service = service; this.starttime = System.currentTimeMillis(); } @Override public void run() { try { - if (TimeoutRequest.ping(this.service.getInetAddress().getHostAddress(), this.service.getProtocol().port, timeout)) { + if (TimeoutRequest.ping(this.service.getInetAddress().getHostAddress(), this.service.getProtocol().port, Scanner.this.timeout)) { Access access = this.service.getProtocol() == 
Protocol.http || this.service.getProtocol() == Protocol.https ? Access.granted : Access.unknown; - services.put(service, access); + Scanner.this.services.put(this.service, access); if (access == Access.unknown) { // ask the service if it lets us in if (this.service.getProtocol() == Protocol.ftp) { @@ -261,35 +261,35 @@ public class Scanner extends Thread { try { ftpClient.open(this.service.getInetAddress().getHostAddress(), this.service.getProtocol().port); ftpClient.login("anonymous", "anomic@"); - List list = ftpClient.list("/", false); + final List list = ftpClient.list("/", false); ftpClient.CLOSE(); access = list == null || list.isEmpty() ? Access.empty : Access.granted; - } catch (IOException e) { + } catch (final IOException e) { access = Access.denied; } } if (this.service.getProtocol() == Protocol.smb) { try { - MultiProtocolURI uri = new MultiProtocolURI(this.service.toString()); - String[] list = uri.list(); + final MultiProtocolURI uri = new MultiProtocolURI(this.service.toString()); + final String[] list = uri.list(); access = list == null || list.length == 0 ? Access.empty : Access.granted; - } catch (IOException e) { + } catch (final IOException e) { access = Access.denied; } } } - if (access != Access.unknown) services.put(this.service, access); + if (access != Access.unknown) Scanner.this.services.put(this.service, access); } - } catch (ExecutionException e) { + } catch (final ExecutionException e) { } - Object r = runner.remove(this); + final Object r = Scanner.this.runner.remove(this); assert r != null; } public long age() { return System.currentTimeMillis() - this.starttime; } @Override - public boolean equals(Object o) { + public boolean equals(final Object o) { return (o instanceof Runner) && this.service.equals(((Runner) o).service); } @Override @@ -297,76 +297,76 @@ public class Scanner extends Thread { return this.service.hashCode(); } } - - public void addHTTP(boolean bigrange) { + + public void addHTTP(final boolean bigrange) { addProtocol(Protocol.http, bigrange); } - public void addHTTPS(boolean bigrange) { + public void addHTTPS(final boolean bigrange) { addProtocol(Protocol.https, bigrange); } - public void addSMB(boolean bigrange) { + public void addSMB(final boolean bigrange) { addProtocol(Protocol.smb, bigrange); } - - public void addFTP(boolean bigrange) { + + public void addFTP(final boolean bigrange) { addProtocol(Protocol.ftp, bigrange); } - - private void addProtocol(Protocol protocol, boolean bigrange) { - for (InetAddress i: genlist(bigrange)) { + + private void addProtocol(final Protocol protocol, final boolean bigrange) { + for (final InetAddress i: genlist(bigrange)) { try { this.scanqueue.put(new Service(protocol, i)); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { } } } - - private final List genlist(boolean bigrange) { - ArrayList c = new ArrayList(10); - for (InetAddress i: scanrange) { + + private final List genlist(final boolean bigrange) { + final ArrayList c = new ArrayList(10); + for (final InetAddress i: this.scanrange) { for (int br = bigrange ? 1 : i.getAddress()[2]; br < (bigrange ? 
255 : i.getAddress()[2] + 1); br++) { for (int j = 1; j < 255; j++) { - byte[] address = i.getAddress(); + final byte[] address = i.getAddress(); address[2] = (byte) br; address[3] = (byte) j; try { c.add(InetAddress.getByAddress(address)); - } catch (UnknownHostException e) { + } catch (final UnknownHostException e) { } } } } return c; } - + public Map services() { return this.services; } - - public static byte[] inIndex(Map commentCache, String url) { - for (Map.Entry comment: commentCache.entrySet()) { + + public static byte[] inIndex(final Map commentCache, final String url) { + for (final Map.Entry comment: commentCache.entrySet()) { if (comment.getValue().contains(url)) return comment.getKey(); } return null; } - - public static void main(String[] args) { + + public static void main(final String[] args) { //try {System.out.println("192.168.1.91: " + ping(new MultiProtocolURI("smb://192.168.1.91/"), 1000));} catch (MalformedURLException e) {} - Scanner scanner = new Scanner(100, 10); + final Scanner scanner = new Scanner(100, 10); scanner.addFTP(false); scanner.addHTTP(false); scanner.addHTTPS(false); scanner.addSMB(false); scanner.start(); scanner.terminate(); - for (Service service: scanner.services().keySet()) { + for (final Service service: scanner.services().keySet()) { System.out.println(service.toString()); } try { HTTPClient.closeConnectionManager(); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { } } } diff --git a/source/net/yacy/cora/services/federated/solr/SolrScheme.java b/source/net/yacy/cora/services/federated/solr/SolrScheme.java index 6b2a95ff2..28d43f778 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrScheme.java +++ b/source/net/yacy/cora/services/federated/solr/SolrScheme.java @@ -37,7 +37,6 @@ import java.util.Set; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; -import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.storage.ConfigurationSet; @@ -103,7 +102,7 @@ public class SolrScheme extends ConfigurationSet { addSolr(solrdoc, "failreason_t", ""); // overwrite a possible fail reason (in case that there was a fail reason before) addSolr(solrdoc, "id", id); addSolr(solrdoc, "sku", digestURI.toNormalform(true, false), 3.0f); - final InetAddress address = Domains.dnsResolve(digestURI.getHost()); + final InetAddress address = digestURI.getInetAddress(); if (address != null) addSolr(solrdoc, "ip_s", address.getHostAddress()); if (digestURI.getHost() != null) addSolr(solrdoc, "host_s", digestURI.getHost()); addSolr(solrdoc, "title", yacydoc.dc_title()); @@ -354,16 +353,16 @@ public class SolrScheme extends ConfigurationSet { return solrdoc; } - private int relEval(String[] rel) { + private int relEval(final String[] rel) { int i = 0; - for (String s: rel) { - String s0 = s.toLowerCase().trim(); + for (final String s: rel) { + final String s0 = s.toLowerCase().trim(); if ("me".equals(s0)) i += 1; if ("nofollow".equals(s0)) i += 2; } return i; } - + public String solrGetID(final SolrDocument solr) { return (String) solr.getFieldValue("id"); } diff --git a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java b/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java index e9c9dadf5..847da1bbc 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java +++ b/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java @@ -279,7 +279,7 
@@ public class SolrSingleConnector implements SolrConnector { final SolrInputDocument solrdoc = new SolrInputDocument(); solrdoc.addField("id", ASCII.String(digestURI.hash())); solrdoc.addField("sku", digestURI.toNormalform(true, false), 3.0f); - final InetAddress address = Domains.dnsResolve(digestURI.getHost()); + final InetAddress address = digestURI.getInetAddress(); if (address != null) solrdoc.addField("ip_s", address.getHostAddress()); if (digestURI.getHost() != null) solrdoc.addField("host_s", digestURI.getHost()); diff --git a/source/net/yacy/kelondro/rwi/ReferenceContainer.java b/source/net/yacy/kelondro/rwi/ReferenceContainer.java index 5227a0288..1af1fd270 100644 --- a/source/net/yacy/kelondro/rwi/ReferenceContainer.java +++ b/source/net/yacy/kelondro/rwi/ReferenceContainer.java @@ -232,6 +232,7 @@ public class ReferenceContainer extends RowSet int pos = 0; while (i.hasNext()) { r = i.next(); + if (r == null) continue; mod = r.lastModified(); positions = tm.get(mod); if (positions == null) positions = new ArrayList(); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 267cfd419..b0c6ac375 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -1909,8 +1909,8 @@ public final class Switchboard extends serverSwitch { this.log.logFine("processResourceStack processCase=" + processCase + ", depth=" + response.depth() + ", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth())) + - ", must-match=" + ((response.profile() == null) ? "null" : response.profile().mustMatchPattern().toString()) + - ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().mustNotMatchPattern().toString()) + + ", must-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustMatchPattern().toString()) + + ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustNotMatchPattern().toString()) + ", initiatorHash=" + ((response.initiator() == null) ? "null" : ASCII.String(response.initiator())) + //", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) + ", url=" + response.url()); // DEBUG
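
The ordering of the new checks in CrawlStacker is deliberate: the URL regex filters operate on strings that are already in memory, while the IP and country filters require a DNS resolution, so the expensive tests run only after every cheap test has passed. The shared MATCH_ALL_PATTERN and MATCH_NEVER_PATTERN singletons support this: a profile whose filter is still the default can be detected with a plain == identity comparison and the lookup skipped entirely. The sketch below illustrates that order in standalone Java; it is a minimal illustration, not YaCy code — the pattern constants and the checkAcceptance helper are hypothetical stand-ins for CrawlProfile and CrawlStacker, and the country lookup is stubbed with a fixed Locale where YaCy derives it from the resolved IP via Domains.getLocale.

import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.Locale;
import java.util.regex.Pattern;

// Minimal stand-in for the filter order introduced by this patch:
// cheap URL regex checks first, DNS-dependent IP/country checks last.
public class CrawlFilterSketch {

    // shared default pattern, mirroring CrawlProfile.MATCH_ALL_PATTERN;
    // an identity check against it lets callers skip the DNS lookup
    static final Pattern MATCH_ALL_PATTERN = Pattern.compile(".*");

    // hypothetical profile values; in YaCy these come from the CrawlProfile map
    static final Pattern URL_MUST_MATCH = Pattern.compile(".*example\\.org.*");
    static final Pattern IP_MUST_MATCH = Pattern.compile("192\\.168\\..*");
    static final String[] COUNTRY_MUST_MATCH = {"DE", "AT", "CH"};

    /** @return null if the url is accepted, otherwise the rejection reason */
    static String checkAcceptance(final String url, final String host) {
        // 1. URL pattern: cheap, no network access needed
        if (!URL_MUST_MATCH.matcher(url).matches()) {
            return "url does not match must-match filter";
        }

        // 2. IP pattern: needs DNS, so it runs after all string checks;
        //    the identity comparison skips resolution for default profiles
        if (IP_MUST_MATCH != MATCH_ALL_PATTERN) {
            final InetAddress address;
            try {
                address = InetAddress.getByName(host); // stands in for YaCy's cached lookup
            } catch (final UnknownHostException e) {
                return "dns lookup failed for host " + host;
            }
            if (!IP_MUST_MATCH.matcher(address.getHostAddress()).matches()) {
                return "ip " + address.getHostAddress() + " does not match must-match filter";
            }
        }

        // 3. country list: only applies when a restriction is configured;
        //    YaCy derives the Locale from the resolved IP (Domains.getLocale)
        if (COUNTRY_MUST_MATCH.length == 0) return null; // no country restriction
        final Locale locale = Locale.GERMANY; // placeholder for the IP-based lookup
        final String c0 = locale.getCountry();
        for (final String c : COUNTRY_MUST_MATCH) {
            if (c0.equals(c)) return null; // accepted
        }
        return "country " + c0 + " does not match must-match filter";
    }

    public static void main(final String[] args) {
        System.out.println(checkAcceptance("http://www.example.org/index.html", "localhost"));
    }
}

Unlike the sketch, the committed CrawlStacker code dereferences url.getInetAddress() without a null check, even though Domains.dnsResolve (which backs it) is documented in this very patch to return null when resolution fails; that case is worth keeping in mind when the IP or country filters are enabled.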