enhanced network scanner (less name resolving during scanning and no name resolving during search)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7392 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent a083cf531e
commit fe46536f6e

@ -21,6 +21,7 @@
import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.util.ConcurrentModificationException;
import java.util.HashSet;
import java.util.Iterator;
@ -28,7 +29,6 @@ import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.Scanner;
@ -39,6 +39,7 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import de.anomic.data.WorkTables;
import de.anomic.search.SearchEventCache;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -50,6 +51,9 @@ public class CrawlStartScanner_p {
final serverObjects prop = new serverObjects();
final Switchboard sb = (Switchboard)env;
// clean up all search events
SearchEventCache.cleanupEvents(true);
prop.put("noserverdetected", 0);
prop.put("servertable", 0);
prop.put("hosts", "");
@ -124,12 +128,17 @@ public class CrawlStartScanner_p {
// check crawl request
if (post.containsKey("crawl")) {
// make a pk/url mapping
Iterator<Map.Entry<MultiProtocolURI, Scanner.Access>> se = Scanner.scancacheEntries();
Iterator<Map.Entry<Scanner.Service, Scanner.Access>> se = Scanner.scancacheEntries();
Map<byte[], DigestURI> pkmap = new TreeMap<byte[], DigestURI>(Base64Order.enhancedCoder);
while (se.hasNext()) {
MultiProtocolURI u = se.next().getKey();
DigestURI uu = new DigestURI(u);
pkmap.put(uu.hash(), uu);
Scanner.Service u = se.next().getKey();
DigestURI uu;
try {
uu = new DigestURI(u.url());
pkmap.put(uu.hash(), uu);
} catch (MalformedURLException e) {
Log.logException(e);
}
}
// search for crawl start requests in this mapping
for (Map.Entry<String, String> entry: post.entrySet()) {
@ -163,18 +172,22 @@ public class CrawlStartScanner_p {
DigestURI u;
try {
int i = 0;
Iterator<Map.Entry<MultiProtocolURI, Scanner.Access>> se = Scanner.scancacheEntries();
Map.Entry<MultiProtocolURI, Scanner.Access> host;
Iterator<Map.Entry<Scanner.Service, Scanner.Access>> se = Scanner.scancacheEntries();
Map.Entry<Scanner.Service, Scanner.Access> host;
while (se.hasNext()) {
host = se.next();
u = new DigestURI(host.getKey());
urlString = u.toNormalform(true, false);
if (host.getValue() == Access.granted && inIndex(apiCommentCache, urlString) == null) {
String path = "/Crawler_p.html?createBookmark=off&xsstopw=off&crawlingDomMaxPages=10000&intention=&range=domain&indexMedia=on&recrawl=nodoubles&xdstopw=off&storeHTCache=on&sitemapURL=&repeat_time=7&crawlingQ=on&cachePolicy=iffresh&indexText=on&crawlingMode=url&mustnotmatch=&crawlingDomFilterDepth=1&crawlingDomFilterCheck=off&crawlingstart=Start%20New%20Crawl&xpstopw=off&repeat_unit=seldays&crawlingDepth=99";
path += "&crawlingURL=" + urlString;
WorkTables.execAPICall("localhost", (int) sb.getConfigLong("port", 8080), sb.getConfig("adminAccountBase64MD5", ""), path, u.hash());
try {
u = new DigestURI(host.getKey().url());
urlString = u.toNormalform(true, false);
if (host.getValue() == Access.granted && inIndex(apiCommentCache, urlString) == null) {
String path = "/Crawler_p.html?createBookmark=off&xsstopw=off&crawlingDomMaxPages=10000&intention=&range=domain&indexMedia=on&recrawl=nodoubles&xdstopw=off&storeHTCache=on&sitemapURL=&repeat_time=7&crawlingQ=on&cachePolicy=iffresh&indexText=on&crawlingMode=url&mustnotmatch=&crawlingDomFilterDepth=1&crawlingDomFilterCheck=off&crawlingstart=Start%20New%20Crawl&xpstopw=off&repeat_unit=seldays&crawlingDepth=99";
path += "&crawlingURL=" + urlString;
WorkTables.execAPICall("localhost", (int) sb.getConfigLong("port", 8080), sb.getConfig("adminAccountBase64MD5", ""), path, u.hash());
}
i++;
} catch (MalformedURLException e) {
Log.logException(e);
}
i++;
}
} catch (ConcurrentModificationException e) {}
}
@ -194,24 +207,28 @@ public class CrawlStartScanner_p {
table: while (true) {
try {
int i = 0;
Iterator<Map.Entry<MultiProtocolURI, Scanner.Access>> se = Scanner.scancacheEntries();
Map.Entry<MultiProtocolURI, Scanner.Access> host;
Iterator<Map.Entry<Scanner.Service, Scanner.Access>> se = Scanner.scancacheEntries();
Map.Entry<Scanner.Service, Scanner.Access> host;
while (se.hasNext()) {
host = se.next();
u = new DigestURI(host.getKey());
urlString = u.toNormalform(true, false);
prop.put("servertable_list_" + i + "_pk", new String(u.hash()));
prop.put("servertable_list_" + i + "_count", i);
prop.putHTML("servertable_list_" + i + "_protocol", u.getProtocol());
prop.putHTML("servertable_list_" + i + "_ip", Domains.dnsResolve(u.getHost()).getHostAddress());
prop.putHTML("servertable_list_" + i + "_url", urlString);
prop.put("servertable_list_" + i + "_accessUnknown", host.getValue() == Access.unknown ? 1 : 0);
prop.put("servertable_list_" + i + "_accessEmpty", host.getValue() == Access.empty ? 1 : 0);
prop.put("servertable_list_" + i + "_accessGranted", host.getValue() == Access.granted ? 1 : 0);
prop.put("servertable_list_" + i + "_accessDenied", host.getValue() == Access.denied ? 1 : 0);
prop.put("servertable_list_" + i + "_process", inIndex(apiCommentCache, urlString) == null ? 0 : 1);
prop.put("servertable_list_" + i + "_preselected", host.getValue() == Access.granted && inIndex(apiCommentCache, urlString) == null ? 1 : 0);
i++;
try {
u = new DigestURI(host.getKey().url());
urlString = u.toNormalform(true, false);
prop.put("servertable_list_" + i + "_pk", new String(u.hash()));
prop.put("servertable_list_" + i + "_count", i);
prop.putHTML("servertable_list_" + i + "_protocol", u.getProtocol());
prop.putHTML("servertable_list_" + i + "_ip", Domains.dnsResolve(u.getHost()).getHostAddress());
prop.putHTML("servertable_list_" + i + "_url", urlString);
prop.put("servertable_list_" + i + "_accessUnknown", host.getValue() == Access.unknown ? 1 : 0);
prop.put("servertable_list_" + i + "_accessEmpty", host.getValue() == Access.empty ? 1 : 0);
prop.put("servertable_list_" + i + "_accessGranted", host.getValue() == Access.granted ? 1 : 0);
prop.put("servertable_list_" + i + "_accessDenied", host.getValue() == Access.denied ? 1 : 0);
prop.put("servertable_list_" + i + "_process", inIndex(apiCommentCache, urlString) == null ? 0 : 1);
prop.put("servertable_list_" + i + "_preselected", host.getValue() == Access.granted && inIndex(apiCommentCache, urlString) == null ? 1 : 0);
i++;
} catch (MalformedURLException e) {
Log.logException(e);
}
}
prop.put("servertable_list", i);
prop.put("servertable_num", i);

@ -464,7 +464,9 @@ public class Domains {
public static String getHostName(final InetAddress i) {
Collection<String> hosts = nameCacheHit.getKeys(i);
if (hosts.size() > 0) return hosts.iterator().next();
return i.getHostName();
String host = i.getHostName();
nameCacheHit.put(host, i);
return host;
/*
// call i.getHostName() using concurrency to interrupt execution in case of a time-out
try {
@ -572,6 +574,7 @@ public class Domains {
private static String localHostName = "127.0.0.1";
private static Set<InetAddress> localHostAddresses = new HashSet<InetAddress>();
private static Set<String> localHostNames = new HashSet<String>();
static {
try {
InetAddress localHostAddress = InetAddress.getLocalHost();
@ -616,6 +619,15 @@ public class Domains {
} catch (UnknownHostException e) {
Log.logException(e);
}
// fill a cache of local host names
for (InetAddress a: localHostAddresses) {
String hostname = getHostName(a);
if (hostname != null) {
localHostNames.add(hostname);
localHostNames.add(a.getHostAddress());
}
}
}
}.start();
}
@ -746,14 +758,17 @@ public class Domains {
if (matchesList(host, localhostPatterns)) return true;
if (host.startsWith("0:0:0:0:0:0:0:1")) return true;
// finally check if there are other local IP addresses that are not in
// check if there are other local IP addresses that are not in
// the standard IP range
if (localHostNames.contains(host)) return true;
/*
for (InetAddress a: localHostAddresses) {
String hostname = getHostName(a);
if (hostname != null && hostname.equals(host)) return true;
if (a.getHostAddress().equals(host)) return true;
}
*/
// check dns lookup: may be a local address even if the domain name looks global
if (!recursive) return false;
InetAddress a = dnsResolve(host);

@ -26,12 +26,12 @@ import java.net.MalformedURLException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
@ -48,14 +48,60 @@ import net.yacy.kelondro.logging.Log;
*/
public class Scanner extends Thread {
private static final MultiProtocolURI POISONURI = new MultiProtocolURI();
private static final Service POISONSERVICE = new Service(Protocol.http, null);
private static final Object PRESENT = new Object();
/**
 * Outcome of probing a service during a scan.
 * <ul>
 * <li>{@code unknown} - the service answered the ping but has not (yet) been asked for access</li>
 * <li>{@code empty} - the service let us in but the root listing was null or empty</li>
 * <li>{@code granted} - the service is reachable (http/https) or returned a non-empty listing</li>
 * <li>{@code denied} - asking the service for a listing failed with an IOException</li>
 * </ul>
 */
public static enum Access {
    unknown,
    empty,
    granted,
    denied
}
/**
 * The protocols the scanner can probe, each paired with the port number
 * that is used when a service of that protocol is pinged or opened.
 */
public static enum Protocol {
    http(80),
    https(443),
    ftp(21),
    smb(445);

    /** port number the scanner connects to for this protocol */
    public int port;

    private Protocol(int defaultPort) {
        this.port = defaultPort;
    }
}
/**
 * A service is a protocol offered at an internet address. Instances are used as
 * keys of the scan cache and of the per-scanner service map, so two services are
 * equal if and only if both protocol and address are equal.
 * <p>
 * The host name is not resolved at construction time; it is looked up on the
 * first call of {@link #getHostName()} and remembered afterwards.
 */
public static class Service {
    // Both fields take part in equals() and inetAddress in hashCode():
    // do not reassign them while the instance is stored in a hash-based collection.
    public Protocol protocol;
    public InetAddress inetAddress;
    private String hostname; // resolved lazily by getHostName()

    /**
     * @param protocol the protocol of the service
     * @param inetAddress the address of the host offering the service
     */
    public Service(Protocol protocol, InetAddress inetAddress) {
        this.protocol = protocol;
        this.inetAddress = inetAddress;
        this.hostname = null;
    }

    /**
     * @param protocol the protocol name; "http", "https" and "ftp" are recognized,
     *        every other value (including null) is mapped to {@link Protocol#smb}
     * @param inetAddress the address of the host offering the service
     */
    public Service(String protocol, InetAddress inetAddress) {
        // constant-first comparison: a null protocol name must not throw here
        this("http".equals(protocol) ? Protocol.http
                : "https".equals(protocol) ? Protocol.https
                : "ftp".equals(protocol) ? Protocol.ftp
                : Protocol.smb,
            inetAddress);
    }

    public Protocol getProtocol() {
        return this.protocol;
    }

    public InetAddress getInetAddress() {
        return this.inetAddress;
    }

    /**
     * @return the host name as delivered by Domains.getHostName for the address
     *         of this service; the result of the first lookup is reused
     */
    public String getHostName() {
        if (this.hostname != null) return this.hostname;
        this.hostname = Domains.getHostName(this.inetAddress);
        return this.hostname;
    }

    /**
     * Produce the root url of this service using the host name (this may trigger
     * a name lookup, see {@link #getHostName()}).
     * @return an url of the form protocol://hostname/
     * @throws MalformedURLException if the url cannot be constructed
     */
    public MultiProtocolURI url() throws MalformedURLException {
        return new MultiProtocolURI(this.protocol.name() + "://" + getHostName() + "/");
    }

    /**
     * Produce the normal form of the root url of this service using the numeric
     * host address, so no name lookup is needed.
     * @return the url string, or an empty string if the service has no address
     *         or the url cannot be constructed
     */
    @Override
    public String toString() {
        // the poison entry of the scan queue carries no address; toString must not throw for it
        if (this.inetAddress == null) return "";
        try {
            return new MultiProtocolURI(this.protocol.name() + "://" + this.inetAddress.getHostAddress() + "/").toNormalform(true, false);
        } catch (MalformedURLException e) {
            return "";
        }
    }

    /**
     * The hash depends on the address only; services that differ just in the
     * protocol share a hash value, which is consistent with {@link #equals(Object)}.
     */
    @Override
    public int hashCode() {
        return this.inetAddress == null ? 0 : this.inetAddress.hashCode();
    }

    /**
     * @return true if o is a Service with the same protocol and the same address;
     *         two services without an address are equal if their protocols are equal
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (!(o instanceof Service)) return false;
        final Service other = (Service) o;
        if (other.protocol != this.protocol) return false;
        if (this.inetAddress == null) return other.inetAddress == null;
        return this.inetAddress.equals(other.inetAddress);
    }
}
private final static Map<MultiProtocolURI, Access> scancache = new TreeMap<MultiProtocolURI, Access>();
private static long scancacheUpdateTime = 0;
private static long scancacheValidUntilTime = Long.MAX_VALUE;
private final static Map<Service, Access> scancache = new HashMap<Service, Access>();
//private static long scancacheUpdateTime = 0;
//private static long scancacheValidUntilTime = Long.MAX_VALUE;
private static Set<InetAddress> scancacheScanrange = new HashSet<InetAddress>();
public static int scancacheSize() {
@ -65,43 +111,45 @@ public class Scanner extends Thread {
public static void scancacheReplace(Scanner newScanner, long validTime) {
scancache.clear();
scancache.putAll(newScanner.services());
scancacheUpdateTime = System.currentTimeMillis();
scancacheValidUntilTime = validTime == Long.MAX_VALUE ? Long.MAX_VALUE : scancacheUpdateTime + validTime;
//scancacheUpdateTime = System.currentTimeMillis();
//scancacheValidUntilTime = validTime == Long.MAX_VALUE ? Long.MAX_VALUE : scancacheUpdateTime + validTime;
scancacheScanrange = newScanner.scanrange;
}
public static void scancacheExtend(Scanner newScanner, long validTime) {
Iterator<Map.Entry<MultiProtocolURI, Access>> i = Scanner.scancache.entrySet().iterator();
Map.Entry<MultiProtocolURI, Access> entry;
Iterator<Map.Entry<Service, Access>> i = Scanner.scancache.entrySet().iterator();
Map.Entry<Service, Access> entry;
while (i.hasNext()) {
entry = i.next();
if (entry.getValue() != Access.granted) i.remove();
}
scancache.putAll(newScanner.services());
scancacheUpdateTime = System.currentTimeMillis();
scancacheValidUntilTime = validTime == Long.MAX_VALUE ? Long.MAX_VALUE : scancacheUpdateTime + validTime;
//scancacheUpdateTime = System.currentTimeMillis();
//scancacheValidUntilTime = validTime == Long.MAX_VALUE ? Long.MAX_VALUE : scancacheUpdateTime + validTime;
scancacheScanrange = newScanner.scanrange;
}
public static Iterator<Map.Entry<MultiProtocolURI, Scanner.Access>> scancacheEntries() {
public static Iterator<Map.Entry<Service, Scanner.Access>> scancacheEntries() {
return scancache.entrySet().iterator();
}
/**
* Check whether the url can be accepted by the scanner. The scanner accepts the url if:
* - the host of the url is not supervised (it is not in the scan range), or
* - the host is supervised (it is in the scan range) and the host is in the scan cache
* @param url the url whose host is checked against the scan range and the scan cache
* @return true if the url shall be part of a search result
*/
public static boolean acceptURL(MultiProtocolURI url) {
// if the scan range is empty, then all urls are accepted
if (scancacheScanrange == null || scancacheScanrange.size() == 0) return true;
//if (System.currentTimeMillis() > scancacheValidUntilTime) return true;
InetAddress a = Domains.dnsResolve(url.getHost());
InetAddress a = Domains.dnsResolve(url.getHost()); // try to avoid that!
if (a == null) return true;
InetAddress n = normalize(a);
if (!scancacheScanrange.contains(n)) return true;
MultiProtocolURI uri;
try {
uri = produceURI(url.getProtocol(), a);
return scancache.containsKey(uri);
} catch (MalformedURLException e) {
return false;
}
return scancache.containsKey(new Service(url.getProtocol(), a));
}
private static InetAddress normalize(InetAddress a) {
@ -118,27 +166,17 @@ public class Scanner extends Thread {
private int runnerCount;
private Set<InetAddress> scanrange;
private BlockingQueue<MultiProtocolURI> scanqueue;
private Map<MultiProtocolURI, Access> services;
private BlockingQueue<Service> scanqueue;
private Map<Service, Access> services;
private Map<Runner, Object> runner;
private int timeout;
public Scanner(InetAddress scanrange, int concurrentRunner, int timeout) {
this.runnerCount = concurrentRunner;
this.scanrange = new HashSet<InetAddress>();
this.scanrange.add(normalize(scanrange));
this.scanqueue = new LinkedBlockingQueue<MultiProtocolURI>();
this.services = Collections.synchronizedMap(new TreeMap<MultiProtocolURI, Access>());
this.runner = new ConcurrentHashMap<Runner, Object>();
this.timeout = timeout;
}
public Scanner(Set<InetAddress> scanrange, int concurrentRunner, int timeout) {
this.runnerCount = concurrentRunner;
this.scanrange = new HashSet<InetAddress>();
for (InetAddress a: scanrange) this.scanrange.add(normalize(a));
this.scanqueue = new LinkedBlockingQueue<MultiProtocolURI>();
this.services = Collections.synchronizedMap(new TreeMap<MultiProtocolURI, Access>());
this.scanqueue = new LinkedBlockingQueue<Service>();
this.services = Collections.synchronizedMap(new HashMap<Service, Access>());
this.runner = new ConcurrentHashMap<Runner, Object>();
this.timeout = timeout;
}
@ -148,9 +186,9 @@ public class Scanner extends Thread {
}
public void run() {
MultiProtocolURI uri;
Service uri;
try {
while ((uri = scanqueue.take()) != POISONURI) {
while ((uri = scanqueue.take()) != POISONSERVICE) {
while (runner.size() >= this.runnerCount) {
/*for (Runner r: runner.keySet()) {
if (r.age() > 3000) synchronized(r) { r.interrupt(); }
@ -172,7 +210,7 @@ public class Scanner extends Thread {
public void terminate() {
for (int i = 0; i < runnerCount; i++) try {
this.scanqueue.put(POISONURI);
this.scanqueue.put(POISONSERVICE);
} catch (InterruptedException e) {
}
try {
@ -181,52 +219,43 @@ public class Scanner extends Thread {
}
}
private static MultiProtocolURI produceURI(String protocol, InetAddress a) throws MalformedURLException {
return new MultiProtocolURI(protocol + "://" + Domains.getHostName(a) + "/");
}
public class Runner extends Thread {
private MultiProtocolURI uri;
private Service service;
private long starttime;
public Runner(MultiProtocolURI uri) {
this.uri = uri;
public Runner(Service service) {
this.service = service;
this.starttime = System.currentTimeMillis();
}
public void run() {
try {
if (TimeoutRequest.ping(this.uri, timeout)) {
try {
MultiProtocolURI uri = produceURI(this.uri.getProtocol(), Domains.dnsResolve(this.uri.getHost()));
String protocol = uri.getProtocol();
Access access = protocol.equals("http") || protocol.equals("https") ? Access.granted : Access.unknown;
services.put(uri, access);
if (access == Access.unknown) {
// ask the service if it lets us in
if (protocol.equals("ftp")) {
final FTPClient ftpClient = new FTPClient();
try {
ftpClient.open(uri.getHost(), uri.getPort());
ftpClient.login("anonymous", "anomic@");
List<String> list = ftpClient.list("/", false);
ftpClient.CLOSE();
access = list == null || list.size() == 0 ? Access.empty : Access.granted;
} catch (IOException e) {
access = Access.denied;
}
if (TimeoutRequest.ping(this.service.getInetAddress().getHostAddress(), this.service.getProtocol().port, timeout)) {
Access access = this.service.getProtocol() == Protocol.http || this.service.getProtocol() == Protocol.https ? Access.granted : Access.unknown;
services.put(service, access);
if (access == Access.unknown) {
// ask the service if it lets us in
if (this.service.getProtocol() == Protocol.ftp) {
final FTPClient ftpClient = new FTPClient();
try {
ftpClient.open(this.service.getInetAddress().getHostAddress(), this.service.getProtocol().port);
ftpClient.login("anonymous", "anomic@");
List<String> list = ftpClient.list("/", false);
ftpClient.CLOSE();
access = list == null || list.size() == 0 ? Access.empty : Access.granted;
} catch (IOException e) {
access = Access.denied;
}
if (protocol.equals("smb")) {
try {
String[] list = uri.list();
access = list == null || list.length == 0 ? Access.empty : Access.granted;
} catch (IOException e) {
access = Access.denied;
}
}
if (this.service.getProtocol() == Protocol.smb) {
try {
MultiProtocolURI uri = new MultiProtocolURI(this.service.toString());
String[] list = uri.list();
access = list == null || list.length == 0 ? Access.empty : Access.granted;
} catch (IOException e) {
access = Access.denied;
}
}
if (access != Access.unknown) services.put(uri, access);
} catch (MalformedURLException e) {
e.printStackTrace();
}
if (access != Access.unknown) services.put(this.service, access);
}
} catch (ExecutionException e) {
}
@ -237,35 +266,33 @@ public class Scanner extends Thread {
return System.currentTimeMillis() - this.starttime;
}
public boolean equals(Object o) {
return (o instanceof Runner) && this.uri.toNormalform(true, false).equals(((Runner) o).uri.toNormalform(true, false));
return (o instanceof Runner) && this.service.equals(((Runner) o).service);
}
public int hashCode() {
return this.uri.hashCode();
return this.service.hashCode();
}
}
public void addHTTP(boolean bigrange) {
addProtocol("http", bigrange);
addProtocol(Protocol.http, bigrange);
}
public void addHTTPS(boolean bigrange) {
addProtocol("https", bigrange);
addProtocol(Protocol.https, bigrange);
}
public void addSMB(boolean bigrange) {
addProtocol("smb", bigrange);
addProtocol(Protocol.smb, bigrange);
}
public void addFTP(boolean bigrange) {
addProtocol("ftp", bigrange);
addProtocol(Protocol.ftp, bigrange);
}
private void addProtocol(String protocol, boolean bigrange) {
private void addProtocol(Protocol protocol, boolean bigrange) {
for (InetAddress i: genlist(bigrange)) {
try {
this.scanqueue.put(new MultiProtocolURI(protocol + "://" + i.getHostAddress() + "/"));
} catch (MalformedURLException e) {
Log.logException(e);
this.scanqueue.put(new Service(protocol, i));
} catch (InterruptedException e) {
Log.logException(e);
}
@ -290,7 +317,7 @@ public class Scanner extends Thread {
return c;
}
public Map<MultiProtocolURI, Access> services() {
public Map<Service, Access> services() {
return this.services;
}
@ -303,8 +330,8 @@ public class Scanner extends Thread {
scanner.addSMB(false);
scanner.start();
scanner.terminate();
for (MultiProtocolURI service: scanner.services().keySet()) {
System.out.println(service.toNormalform(true, false));
for (Service service: scanner.services().keySet()) {
System.out.println(service.toString());
}
try {
HTTPClient.closeConnectionManager();

@ -37,7 +37,6 @@ import java.util.concurrent.TimeoutException;
import jcifs.smb.SmbException;
import jcifs.smb.SmbFile;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.kelondro.logging.Log;
/**
@ -105,12 +104,12 @@ public class TimeoutRequest<E> {
* @return true if the server exists and replies within the given time-out
* @throws ExecutionException
*/
public static boolean ping(final MultiProtocolURI uri, final int timeout) throws ExecutionException {
public static boolean ping(final String host, final int port, final int timeout) throws ExecutionException {
return new TimeoutRequest<Boolean>(new Callable<Boolean>() {
public Boolean call() {
try {
Socket socket = new Socket();
socket.connect(new InetSocketAddress(uri.getHost(), uri.getPort()), timeout);
socket.connect(new InetSocketAddress(host, port), timeout);
if (socket.isConnected()) {
socket.close();
return Boolean.TRUE;

Loading…
Cancel
Save