From fe46536f6e203b4a8ea0816b998aedb33765f081 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 26 Dec 2010 16:25:17 +0000 Subject: [PATCH] enhanced network scanner (less name resolving during scanning and no name resolving during search) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7392 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CrawlStartScanner_p.java | 77 ++++--- source/net/yacy/cora/protocol/Domains.java | 21 +- source/net/yacy/cora/protocol/Scanner.java | 199 ++++++++++-------- .../yacy/cora/protocol/TimeoutRequest.java | 5 +- 4 files changed, 180 insertions(+), 122 deletions(-) diff --git a/htroot/CrawlStartScanner_p.java b/htroot/CrawlStartScanner_p.java index e5d4ccd4c..7b2841831 100644 --- a/htroot/CrawlStartScanner_p.java +++ b/htroot/CrawlStartScanner_p.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.net.InetAddress; +import java.net.MalformedURLException; import java.util.ConcurrentModificationException; import java.util.HashSet; import java.util.Iterator; @@ -28,7 +29,6 @@ import java.util.Map; import java.util.Set; import java.util.TreeMap; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.Scanner; @@ -39,6 +39,7 @@ import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; import de.anomic.data.WorkTables; +import de.anomic.search.SearchEventCache; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -50,6 +51,9 @@ public class CrawlStartScanner_p { final serverObjects prop = new serverObjects(); final Switchboard sb = (Switchboard)env; + // clean up all search events + SearchEventCache.cleanupEvents(true); + prop.put("noserverdetected", 0); prop.put("servertable", 0); prop.put("hosts", ""); @@ -124,12 +128,17 @@ public class CrawlStartScanner_p { // check crawl request if (post.containsKey("crawl")) { // make a pk/url mapping - Iterator> se = Scanner.scancacheEntries(); + Iterator> se = Scanner.scancacheEntries(); Map pkmap = new TreeMap(Base64Order.enhancedCoder); while (se.hasNext()) { - MultiProtocolURI u = se.next().getKey(); - DigestURI uu = new DigestURI(u); - pkmap.put(uu.hash(), uu); + Scanner.Service u = se.next().getKey(); + DigestURI uu; + try { + uu = new DigestURI(u.url()); + pkmap.put(uu.hash(), uu); + } catch (MalformedURLException e) { + Log.logException(e); + } } // search for crawl start requests in this mapping for (Map.Entry entry: post.entrySet()) { @@ -163,18 +172,22 @@ public class CrawlStartScanner_p { DigestURI u; try { int i = 0; - Iterator> se = Scanner.scancacheEntries(); - Map.Entry host; + Iterator> se = Scanner.scancacheEntries(); + Map.Entry host; while (se.hasNext()) { host = se.next(); - u = new DigestURI(host.getKey()); - urlString = u.toNormalform(true, false); - if (host.getValue() == Access.granted && inIndex(apiCommentCache, urlString) == null) { - String path = "/Crawler_p.html?createBookmark=off&xsstopw=off&crawlingDomMaxPages=10000&intention=&range=domain&indexMedia=on&recrawl=nodoubles&xdstopw=off&storeHTCache=on&sitemapURL=&repeat_time=7&crawlingQ=on&cachePolicy=iffresh&indexText=on&crawlingMode=url&mustnotmatch=&crawlingDomFilterDepth=1&crawlingDomFilterCheck=off&crawlingstart=Start%20New%20Crawl&xpstopw=off&repeat_unit=seldays&crawlingDepth=99"; - path += "&crawlingURL=" + urlString; - WorkTables.execAPICall("localhost", (int) sb.getConfigLong("port", 8080), sb.getConfig("adminAccountBase64MD5", ""), path, u.hash()); + try { + u = new DigestURI(host.getKey().url()); + urlString = u.toNormalform(true, false); + if (host.getValue() == Access.granted && inIndex(apiCommentCache, urlString) == null) { + String path = "/Crawler_p.html?createBookmark=off&xsstopw=off&crawlingDomMaxPages=10000&intention=&range=domain&indexMedia=on&recrawl=nodoubles&xdstopw=off&storeHTCache=on&sitemapURL=&repeat_time=7&crawlingQ=on&cachePolicy=iffresh&indexText=on&crawlingMode=url&mustnotmatch=&crawlingDomFilterDepth=1&crawlingDomFilterCheck=off&crawlingstart=Start%20New%20Crawl&xpstopw=off&repeat_unit=seldays&crawlingDepth=99"; + path += "&crawlingURL=" + urlString; + WorkTables.execAPICall("localhost", (int) sb.getConfigLong("port", 8080), sb.getConfig("adminAccountBase64MD5", ""), path, u.hash()); + } + i++; + } catch (MalformedURLException e) { + Log.logException(e); } - i++; } } catch (ConcurrentModificationException e) {} } @@ -194,24 +207,28 @@ public class CrawlStartScanner_p { table: while (true) { try { int i = 0; - Iterator> se = Scanner.scancacheEntries(); - Map.Entry host; + Iterator> se = Scanner.scancacheEntries(); + Map.Entry host; while (se.hasNext()) { host = se.next(); - u = new DigestURI(host.getKey()); - urlString = u.toNormalform(true, false); - prop.put("servertable_list_" + i + "_pk", new String(u.hash())); - prop.put("servertable_list_" + i + "_count", i); - prop.putHTML("servertable_list_" + i + "_protocol", u.getProtocol()); - prop.putHTML("servertable_list_" + i + "_ip", Domains.dnsResolve(u.getHost()).getHostAddress()); - prop.putHTML("servertable_list_" + i + "_url", urlString); - prop.put("servertable_list_" + i + "_accessUnknown", host.getValue() == Access.unknown ? 1 : 0); - prop.put("servertable_list_" + i + "_accessEmpty", host.getValue() == Access.empty ? 1 : 0); - prop.put("servertable_list_" + i + "_accessGranted", host.getValue() == Access.granted ? 1 : 0); - prop.put("servertable_list_" + i + "_accessDenied", host.getValue() == Access.denied ? 1 : 0); - prop.put("servertable_list_" + i + "_process", inIndex(apiCommentCache, urlString) == null ? 0 : 1); - prop.put("servertable_list_" + i + "_preselected", host.getValue() == Access.granted && inIndex(apiCommentCache, urlString) == null ? 1 : 0); - i++; + try { + u = new DigestURI(host.getKey().url()); + urlString = u.toNormalform(true, false); + prop.put("servertable_list_" + i + "_pk", new String(u.hash())); + prop.put("servertable_list_" + i + "_count", i); + prop.putHTML("servertable_list_" + i + "_protocol", u.getProtocol()); + prop.putHTML("servertable_list_" + i + "_ip", Domains.dnsResolve(u.getHost()).getHostAddress()); + prop.putHTML("servertable_list_" + i + "_url", urlString); + prop.put("servertable_list_" + i + "_accessUnknown", host.getValue() == Access.unknown ? 1 : 0); + prop.put("servertable_list_" + i + "_accessEmpty", host.getValue() == Access.empty ? 1 : 0); + prop.put("servertable_list_" + i + "_accessGranted", host.getValue() == Access.granted ? 1 : 0); + prop.put("servertable_list_" + i + "_accessDenied", host.getValue() == Access.denied ? 1 : 0); + prop.put("servertable_list_" + i + "_process", inIndex(apiCommentCache, urlString) == null ? 0 : 1); + prop.put("servertable_list_" + i + "_preselected", host.getValue() == Access.granted && inIndex(apiCommentCache, urlString) == null ? 1 : 0); + i++; + } catch (MalformedURLException e) { + Log.logException(e); + } } prop.put("servertable_list", i); prop.put("servertable_num", i); diff --git a/source/net/yacy/cora/protocol/Domains.java b/source/net/yacy/cora/protocol/Domains.java index 716f1a23f..3ad6198e0 100644 --- a/source/net/yacy/cora/protocol/Domains.java +++ b/source/net/yacy/cora/protocol/Domains.java @@ -464,7 +464,9 @@ public class Domains { public static String getHostName(final InetAddress i) { Collection hosts = nameCacheHit.getKeys(i); if (hosts.size() > 0) return hosts.iterator().next(); - return i.getHostName(); + String host = i.getHostName(); + nameCacheHit.put(host, i); + return host; /* // call i.getHostName() using concurrency to interrupt execution in case of a time-out try { @@ -572,6 +574,7 @@ public class Domains { private static String localHostName = "127.0.0.1"; private static Set localHostAddresses = new HashSet(); + private static Set localHostNames = new HashSet(); static { try { InetAddress localHostAddress = InetAddress.getLocalHost(); @@ -616,6 +619,15 @@ public class Domains { } catch (UnknownHostException e) { Log.logException(e); } + + // fill a cache of local host names + for (InetAddress a: localHostAddresses) { + String hostname = getHostName(a); + if (hostname != null) { + localHostNames.add(hostname); + localHostNames.add(a.getHostAddress()); + } + } } }.start(); } @@ -746,14 +758,17 @@ public class Domains { if (matchesList(host, localhostPatterns)) return true; if (host.startsWith("0:0:0:0:0:0:0:1")) return true; - // finally check if there are other local IP addresses that are not in + // check if there are other local IP addresses that are not in // the standard IP range + if (localHostNames.contains(host)) return true; + /* for (InetAddress a: localHostAddresses) { String hostname = getHostName(a); if (hostname != null && hostname.equals(host)) return true; if (a.getHostAddress().equals(host)) return true; } - + */ + // check dns lookup: may be a local address even if the domain name looks global if (!recursive) return false; InetAddress a = dnsResolve(host); diff --git a/source/net/yacy/cora/protocol/Scanner.java b/source/net/yacy/cora/protocol/Scanner.java index 00c5c6e00..f8ad23d7b 100644 --- a/source/net/yacy/cora/protocol/Scanner.java +++ b/source/net/yacy/cora/protocol/Scanner.java @@ -26,12 +26,12 @@ import java.net.MalformedURLException; import java.net.UnknownHostException; import java.util.ArrayList; import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; -import java.util.TreeMap; import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutionException; @@ -48,14 +48,60 @@ import net.yacy.kelondro.logging.Log; */ public class Scanner extends Thread { - private static final MultiProtocolURI POISONURI = new MultiProtocolURI(); + private static final Service POISONSERVICE = new Service(Protocol.http, null); private static final Object PRESENT = new Object(); public static enum Access {unknown, empty, granted, denied;} + public static enum Protocol {http(80), https(443), ftp(21), smb(445); + public int port; + private Protocol(int port) {this.port = port;} + } + public static class Service { + public Protocol protocol; + public InetAddress inetAddress; + private String hostname; + public Service(Protocol protocol, InetAddress inetAddress) { + this.protocol = protocol; + this.inetAddress = inetAddress; + this.hostname = null; + } + public Service(String protocol, InetAddress inetAddress) { + this.protocol = protocol.equals("http") ? Protocol.http : protocol.equals("https") ? Protocol.https : protocol.equals("ftp") ? Protocol.ftp : Protocol.smb; + this.inetAddress = inetAddress; + this.hostname = null; + } + public Protocol getProtocol() { + return this.protocol; + } + public InetAddress getInetAddress() { + return this.inetAddress; + } + public String getHostName() { + if (this.hostname != null) return this.hostname; + this.hostname = Domains.getHostName(this.inetAddress); + return this.hostname; + } + public MultiProtocolURI url() throws MalformedURLException { + return new MultiProtocolURI(this.protocol.name() + "://" + getHostName() + "/"); + } + public String toString() { + try { + return new MultiProtocolURI(this.protocol.name() + "://" + this.inetAddress.getHostAddress() + "/").toNormalform(true, false); + } catch (MalformedURLException e) { + return ""; + } + } + public int hashCode() { + return this.inetAddress.hashCode(); + } + public boolean equals(Object o) { + return (o instanceof Service) && ((Service) o).protocol == this.protocol && ((Service) o).inetAddress.equals(this.inetAddress); + } + } - private final static Map scancache = new TreeMap(); - private static long scancacheUpdateTime = 0; - private static long scancacheValidUntilTime = Long.MAX_VALUE; + private final static Map scancache = new HashMap(); + //private static long scancacheUpdateTime = 0; + //private static long scancacheValidUntilTime = Long.MAX_VALUE; private static Set scancacheScanrange = new HashSet(); public static int scancacheSize() { @@ -65,43 +111,45 @@ public class Scanner extends Thread { public static void scancacheReplace(Scanner newScanner, long validTime) { scancache.clear(); scancache.putAll(newScanner.services()); - scancacheUpdateTime = System.currentTimeMillis(); - scancacheValidUntilTime = validTime == Long.MAX_VALUE ? Long.MAX_VALUE : scancacheUpdateTime + validTime; + //scancacheUpdateTime = System.currentTimeMillis(); + //scancacheValidUntilTime = validTime == Long.MAX_VALUE ? Long.MAX_VALUE : scancacheUpdateTime + validTime; scancacheScanrange = newScanner.scanrange; } public static void scancacheExtend(Scanner newScanner, long validTime) { - Iterator> i = Scanner.scancache.entrySet().iterator(); - Map.Entry entry; + Iterator> i = Scanner.scancache.entrySet().iterator(); + Map.Entry entry; while (i.hasNext()) { entry = i.next(); if (entry.getValue() != Access.granted) i.remove(); } scancache.putAll(newScanner.services()); - scancacheUpdateTime = System.currentTimeMillis(); - scancacheValidUntilTime = validTime == Long.MAX_VALUE ? Long.MAX_VALUE : scancacheUpdateTime + validTime; + //scancacheUpdateTime = System.currentTimeMillis(); + //scancacheValidUntilTime = validTime == Long.MAX_VALUE ? Long.MAX_VALUE : scancacheUpdateTime + validTime; scancacheScanrange = newScanner.scanrange; } - public static Iterator> scancacheEntries() { + public static Iterator> scancacheEntries() { return scancache.entrySet().iterator(); } + /** + * check if the url can be accepted by the scanner. the scanner accepts the url if: + * - the host of the url is not supervised (it is not in the scan range), or + * - the host is supervised (it is in the scan range) and the host is in the scan cache + * @param url + * @return true if the url shall be part of a search result + */ public static boolean acceptURL(MultiProtocolURI url) { + // if the scan range is empty, then all urls are accepted if (scancacheScanrange == null || scancacheScanrange.size() == 0) return true; //if (System.currentTimeMillis() > scancacheValidUntilTime) return true; - InetAddress a = Domains.dnsResolve(url.getHost()); + InetAddress a = Domains.dnsResolve(url.getHost()); // try to avoid that! if (a == null) return true; InetAddress n = normalize(a); if (!scancacheScanrange.contains(n)) return true; - MultiProtocolURI uri; - try { - uri = produceURI(url.getProtocol(), a); - return scancache.containsKey(uri); - } catch (MalformedURLException e) { - return false; - } + return scancache.containsKey(new Service(url.getProtocol(), a)); } private static InetAddress normalize(InetAddress a) { @@ -118,27 +166,17 @@ public class Scanner extends Thread { private int runnerCount; private Set scanrange; - private BlockingQueue scanqueue; - private Map services; + private BlockingQueue scanqueue; + private Map services; private Map runner; private int timeout; - public Scanner(InetAddress scanrange, int concurrentRunner, int timeout) { - this.runnerCount = concurrentRunner; - this.scanrange = new HashSet(); - this.scanrange.add(normalize(scanrange)); - this.scanqueue = new LinkedBlockingQueue(); - this.services = Collections.synchronizedMap(new TreeMap()); - this.runner = new ConcurrentHashMap(); - this.timeout = timeout; - } - public Scanner(Set scanrange, int concurrentRunner, int timeout) { this.runnerCount = concurrentRunner; this.scanrange = new HashSet(); for (InetAddress a: scanrange) this.scanrange.add(normalize(a)); - this.scanqueue = new LinkedBlockingQueue(); - this.services = Collections.synchronizedMap(new TreeMap()); + this.scanqueue = new LinkedBlockingQueue(); + this.services = Collections.synchronizedMap(new HashMap()); this.runner = new ConcurrentHashMap(); this.timeout = timeout; } @@ -148,9 +186,9 @@ public class Scanner extends Thread { } public void run() { - MultiProtocolURI uri; + Service uri; try { - while ((uri = scanqueue.take()) != POISONURI) { + while ((uri = scanqueue.take()) != POISONSERVICE) { while (runner.size() >= this.runnerCount) { /*for (Runner r: runner.keySet()) { if (r.age() > 3000) synchronized(r) { r.interrupt(); } @@ -172,7 +210,7 @@ public class Scanner extends Thread { public void terminate() { for (int i = 0; i < runnerCount; i++) try { - this.scanqueue.put(POISONURI); + this.scanqueue.put(POISONSERVICE); } catch (InterruptedException e) { } try { @@ -181,52 +219,43 @@ public class Scanner extends Thread { } } - private static MultiProtocolURI produceURI(String protocol, InetAddress a) throws MalformedURLException { - return new MultiProtocolURI(protocol + "://" + Domains.getHostName(a) + "/"); - } - public class Runner extends Thread { - private MultiProtocolURI uri; + private Service service; private long starttime; - public Runner(MultiProtocolURI uri) { - this.uri = uri; + public Runner(Service service) { + this.service = service; this.starttime = System.currentTimeMillis(); } public void run() { try { - if (TimeoutRequest.ping(this.uri, timeout)) { - try { - MultiProtocolURI uri = produceURI(this.uri.getProtocol(), Domains.dnsResolve(this.uri.getHost())); - String protocol = uri.getProtocol(); - Access access = protocol.equals("http") || protocol.equals("https") ? Access.granted : Access.unknown; - services.put(uri, access); - if (access == Access.unknown) { - // ask the service if it lets us in - if (protocol.equals("ftp")) { - final FTPClient ftpClient = new FTPClient(); - try { - ftpClient.open(uri.getHost(), uri.getPort()); - ftpClient.login("anonymous", "anomic@"); - List list = ftpClient.list("/", false); - ftpClient.CLOSE(); - access = list == null || list.size() == 0 ? Access.empty : Access.granted; - } catch (IOException e) { - access = Access.denied; - } + if (TimeoutRequest.ping(this.service.getInetAddress().getHostAddress(), this.service.getProtocol().port, timeout)) { + Access access = this.service.getProtocol() == Protocol.http || this.service.getProtocol() == Protocol.https ? Access.granted : Access.unknown; + services.put(service, access); + if (access == Access.unknown) { + // ask the service if it lets us in + if (this.service.getProtocol() == Protocol.ftp) { + final FTPClient ftpClient = new FTPClient(); + try { + ftpClient.open(this.service.getInetAddress().getHostAddress(), this.service.getProtocol().port); + ftpClient.login("anonymous", "anomic@"); + List list = ftpClient.list("/", false); + ftpClient.CLOSE(); + access = list == null || list.size() == 0 ? Access.empty : Access.granted; + } catch (IOException e) { + access = Access.denied; } - if (protocol.equals("smb")) { - try { - String[] list = uri.list(); - access = list == null || list.length == 0 ? Access.empty : Access.granted; - } catch (IOException e) { - access = Access.denied; - } + } + if (this.service.getProtocol() == Protocol.smb) { + try { + MultiProtocolURI uri = new MultiProtocolURI(this.service.toString()); + String[] list = uri.list(); + access = list == null || list.length == 0 ? Access.empty : Access.granted; + } catch (IOException e) { + access = Access.denied; } } - if (access != Access.unknown) services.put(uri, access); - } catch (MalformedURLException e) { - e.printStackTrace(); } + if (access != Access.unknown) services.put(this.service, access); } } catch (ExecutionException e) { } @@ -237,35 +266,33 @@ public class Scanner extends Thread { return System.currentTimeMillis() - this.starttime; } public boolean equals(Object o) { - return (o instanceof Runner) && this.uri.toNormalform(true, false).equals(((Runner) o).uri.toNormalform(true, false)); + return (o instanceof Runner) && this.service.equals(((Runner) o).service); } public int hashCode() { - return this.uri.hashCode(); + return this.service.hashCode(); } } public void addHTTP(boolean bigrange) { - addProtocol("http", bigrange); + addProtocol(Protocol.http, bigrange); } public void addHTTPS(boolean bigrange) { - addProtocol("https", bigrange); + addProtocol(Protocol.https, bigrange); } public void addSMB(boolean bigrange) { - addProtocol("smb", bigrange); + addProtocol(Protocol.smb, bigrange); } public void addFTP(boolean bigrange) { - addProtocol("ftp", bigrange); + addProtocol(Protocol.ftp, bigrange); } - private void addProtocol(String protocol, boolean bigrange) { + private void addProtocol(Protocol protocol, boolean bigrange) { for (InetAddress i: genlist(bigrange)) { try { - this.scanqueue.put(new MultiProtocolURI(protocol + "://" + i.getHostAddress() + "/")); - } catch (MalformedURLException e) { - Log.logException(e); + this.scanqueue.put(new Service(protocol, i)); } catch (InterruptedException e) { Log.logException(e); } @@ -290,7 +317,7 @@ public class Scanner extends Thread { return c; } - public Map services() { + public Map services() { return this.services; } @@ -303,8 +330,8 @@ public class Scanner extends Thread { scanner.addSMB(false); scanner.start(); scanner.terminate(); - for (MultiProtocolURI service: scanner.services().keySet()) { - System.out.println(service.toNormalform(true, false)); + for (Service service: scanner.services().keySet()) { + System.out.println(service.toString()); } try { HTTPClient.closeConnectionManager(); diff --git a/source/net/yacy/cora/protocol/TimeoutRequest.java b/source/net/yacy/cora/protocol/TimeoutRequest.java index 77c41ef93..11e87eb36 100644 --- a/source/net/yacy/cora/protocol/TimeoutRequest.java +++ b/source/net/yacy/cora/protocol/TimeoutRequest.java @@ -37,7 +37,6 @@ import java.util.concurrent.TimeoutException; import jcifs.smb.SmbException; import jcifs.smb.SmbFile; -import net.yacy.cora.document.MultiProtocolURI; import net.yacy.kelondro.logging.Log; /** @@ -105,12 +104,12 @@ public class TimeoutRequest { * @return true if the server exists and replies within the given time-out * @throws ExecutionException */ - public static boolean ping(final MultiProtocolURI uri, final int timeout) throws ExecutionException { + public static boolean ping(final String host, final int port, final int timeout) throws ExecutionException { return new TimeoutRequest(new Callable() { public Boolean call() { try { Socket socket = new Socket(); - socket.connect(new InetSocketAddress(uri.getHost(), uri.getPort()), timeout); + socket.connect(new InetSocketAddress(host, port), timeout); if (socket.isConnected()) { socket.close(); return Boolean.TRUE;