From 091dd3f6ec5f5b6e52eaaa402de787f02057475c Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 8 Oct 2010 10:54:13 +0000 Subject: [PATCH] - enhanced intranet search speed - enhanced intranet portscan speed (better time-out) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7227 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/yacysearchitem.java | 2 +- .../anomic/crawler/retrieval/FileLoader.java | 7 +- .../anomic/crawler/retrieval/SMBLoader.java | 7 +- source/de/anomic/search/RankingProcess.java | 2 +- .../yacy/cora/document/MultiProtocolURI.java | 19 +- source/net/yacy/cora/protocol/Domains.java | 113 ++---- source/net/yacy/cora/protocol/Scanner.java | 42 +-- .../yacy/cora/protocol/TimeoutRequest.java | 325 ++++++++++++++++++ .../yacy/kelondro/data/meta/DigestURI.java | 15 +- 9 files changed, 390 insertions(+), 142 deletions(-) create mode 100644 source/net/yacy/cora/protocol/TimeoutRequest.java diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index bb5a1b5e5..a346bab02 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -106,7 +106,7 @@ public class yacysearchitem { final int port=result.url().getPort(); DigestURI faviconURL = null; - if (!result.url().isLocal()) try { + if (isHtml && !sb.isIntranetMode() && !result.url().isLocal()) try { faviconURL = new DigestURI(result.url().getProtocol() + "://" + result.url().getHost() + ((port != -1) ? 
(":" + port) : "") + "/favicon.ico", null); } catch (final MalformedURLException e1) { Log.logException(e1); diff --git a/source/de/anomic/crawler/retrieval/FileLoader.java b/source/de/anomic/crawler/retrieval/FileLoader.java index de3b055a3..238845e5d 100644 --- a/source/de/anomic/crawler/retrieval/FileLoader.java +++ b/source/de/anomic/crawler/retrieval/FileLoader.java @@ -64,14 +64,17 @@ public class FileLoader { } // process directories: transform them to html with meta robots=noindex (using the ftpc lib) - if (url.isDirectory()) { - String[] l = url.list(); + String[] l = null; + try {l = url.list();} catch (IOException e) {} + if (l != null) { + /* if (l == null) { // this can only happen if there is no connection or the directory does not exist //log.logInfo("directory listing not available. URL = " + request.url().toString()); sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, "directory listing not available. URL = " + request.url().toString()); throw new IOException("directory listing not available. URL = " + request.url().toString()); } + */ String u = url.toNormalform(true, true); List list = new ArrayList(); for (String s: l) { diff --git a/source/de/anomic/crawler/retrieval/SMBLoader.java b/source/de/anomic/crawler/retrieval/SMBLoader.java index 0dc563d02..67cf1b9b1 100644 --- a/source/de/anomic/crawler/retrieval/SMBLoader.java +++ b/source/de/anomic/crawler/retrieval/SMBLoader.java @@ -78,14 +78,17 @@ public class SMBLoader { } // process directories: transform them to html with meta robots=noindex (using the ftpc lib) - if (url.isDirectory()) { - String[] l = url.list(); + String[] l = null; + try {l = url.list();} catch (IOException e) {} + if (l != null) { + /* if (l == null) { // this can only happen if there is no connection or the directory does not exist //log.logInfo("directory listing not available. 
URL = " + request.url().toString()); sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, "directory listing not available. URL = " + request.url().toString()); throw new IOException("directory listing not available. URL = " + request.url().toString()); } + */ String u = url.toNormalform(true, true); List list = new ArrayList(); for (String s: l) { diff --git a/source/de/anomic/search/RankingProcess.java b/source/de/anomic/search/RankingProcess.java index a49a18858..c4e1a0614 100644 --- a/source/de/anomic/search/RankingProcess.java +++ b/source/de/anomic/search/RankingProcess.java @@ -203,7 +203,7 @@ public final class RankingProcess extends Thread { } // check tld domain - if (!DigestURI.matchesAnyDomDomain(iEntry.metadataHash(), this.query.zonecode)) { + if ((DigestURI.domDomain(iEntry.metadataHash()) & this.query.zonecode) == 0) { // filter out all tld that do not match with wanted tld domain continue; } diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index 8ff68abdb..d24a6dfc8 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -42,6 +42,7 @@ import jcifs.smb.SmbFileInputStream; import net.yacy.cora.document.Punycode.PunycodeException; import net.yacy.cora.protocol.Domains; +import net.yacy.cora.protocol.TimeoutRequest; import net.yacy.cora.protocol.ftp.FTPClient; import net.yacy.cora.protocol.http.HTTPClient; @@ -53,6 +54,8 @@ public class MultiProtocolURI implements Serializable, Comparable 0) return hosts.iterator().next(); // call i.getHostName() using concurrency to interrupt execution in case of a time-out - final Callable callable = new Callable() { - public String call() { return i.getHostName(); } - }; - ExecutorService service = Executors.newSingleThreadExecutor(); - final Future taskFuture = service.submit(callable); - Runnable t = new Runnable() { - public void run() 
{ taskFuture.cancel(true); } - }; - service.execute(t); - service.shutdown(); try { - return taskFuture.get(500, TimeUnit.MILLISECONDS); - } catch (CancellationException e) { - // callable was interrupted - return i.getHostAddress(); - } catch (InterruptedException e) { - // service was shutdown - return i.getHostAddress(); - } catch(ExecutionException e) { - // callable failed unexpectedly - return i.getHostAddress(); - } catch (TimeoutException e) { - // time-out + return TimeoutRequest.getHostName(i, 500); + } catch (ExecutionException e) { return i.getHostAddress(); } } - public static InetAddress dnsResolve(final String hostx) { - if ((hostx == null) || (hostx.length() == 0)) return null; - final String host = hostx.toLowerCase().trim(); + public static InetAddress dnsResolve(String host) { + if ((host == null) || (host.length() == 0)) return null; + host = host.toLowerCase().trim(); // try to simply parse the address InetAddress ip = parseInetAddress(host); if (ip != null) return ip; @@ -509,59 +482,9 @@ public class Domains { if (nameCacheMiss.containsKey(host)) return null; // call dnsResolveNetBased(host) using concurrency to interrupt execution in case of a time-out - final Callable callable = new Callable() { - public InetAddress call() { return dnsResolveNetBased(host); } - }; - ExecutorService service = Executors.newSingleThreadExecutor(); - final Future taskFuture = service.submit(callable); - Runnable t = new Runnable() { - public void run() { taskFuture.cancel(true); } - }; - service.execute(t); - service.shutdown(); - try { - return taskFuture.get(500, TimeUnit.MILLISECONDS); - } catch (CancellationException e) { - // callable was interrupted - return null; - } catch (InterruptedException e) { - // service was shutdown - return null; - } catch(ExecutionException e) { - // callable failed unexpectedly - return null; - } catch (TimeoutException e) { - // time-out - return null; - } - } - - - private static final InetAddress parseInetAddress(final 
String ip) { - if (ip == null) return null; - if (ip.length() < 8) return null; - final String[] ips = ip.split("\\."); - if (ips.length != 4) return null; - final byte[] ipb = new byte[4]; - try { - ipb[0] = (byte) Integer.parseInt(ips[0]); - ipb[1] = (byte) Integer.parseInt(ips[1]); - ipb[2] = (byte) Integer.parseInt(ips[2]); - ipb[3] = (byte) Integer.parseInt(ips[3]); - } catch (final NumberFormatException e) { - return null; - } - try { - return InetAddress.getByAddress(ipb); - } catch (final UnknownHostException e) { - return null; - } - } - - private static InetAddress dnsResolveNetBased(String host) { try { boolean doCaching = true; - InetAddress ip = InetAddress.getByName(host); // this makes the DNS request to backbone + ip = TimeoutRequest.getByName(host, 500); // this makes the DNS request to backbone if ((ip == null) || (ip.isLoopbackAddress()) || (nameCacheNoCachingList.containsKey(host)) @@ -580,7 +503,7 @@ public class Domains { nameCacheHit.put(host, ip); } return ip; - } catch (final UnknownHostException e) { + } catch (final ExecutionException e) { // remove old entries flushMissNameCache(); @@ -589,7 +512,27 @@ public class Domains { } return null; } - + + private static final InetAddress parseInetAddress(final String ip) { + if (ip == null) return null; + if (ip.length() < 8) return null; + final String[] ips = ip.split("\\."); + if (ips.length != 4) return null; + final byte[] ipb = new byte[4]; + try { + ipb[0] = (byte) Integer.parseInt(ips[0]); + ipb[1] = (byte) Integer.parseInt(ips[1]); + ipb[2] = (byte) Integer.parseInt(ips[2]); + ipb[3] = (byte) Integer.parseInt(ips[3]); + } catch (final NumberFormatException e) { + return null; + } + try { + return InetAddress.getByAddress(ipb); + } catch (final UnknownHostException e) { + return null; + } + } /** * Returns the number of entries in the nameCacheHit map diff --git a/source/net/yacy/cora/protocol/Scanner.java b/source/net/yacy/cora/protocol/Scanner.java index 578068044..27e9d6030 100644 --- 
a/source/net/yacy/cora/protocol/Scanner.java +++ b/source/net/yacy/cora/protocol/Scanner.java @@ -20,11 +20,8 @@ package net.yacy.cora.protocol; -import java.io.IOException; import java.net.InetAddress; -import java.net.InetSocketAddress; import java.net.MalformedURLException; -import java.net.Socket; import java.net.UnknownHostException; import java.util.ArrayList; import java.util.Collection; @@ -34,6 +31,7 @@ import java.util.Map; import java.util.TreeMap; import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutionException; import java.util.concurrent.LinkedBlockingQueue; import net.yacy.cora.document.MultiProtocolURI; @@ -122,7 +120,7 @@ public class Scanner extends Thread { for (InetAddress i: genlist(bigrange)) { try { - this.scanqueue.put(new MultiProtocolURI(protocol + "://" + Domains.getHostName(i) + "/")); + this.scanqueue.put(new MultiProtocolURI(protocol + "://" + i.getHostAddress() + "/")); } catch (MalformedURLException e) { Log.logException(e); } catch (InterruptedException e) { @@ -154,22 +152,17 @@ public class Scanner extends Thread { this.starttime = System.currentTimeMillis(); } public void run() { - if (ping(this.uri, timeout)) { - services.put(this.uri, ""); - /* - try { - byte[] b = this.uri.get(MultiProtocolURI.yacybotUserAgent, timeout); - if (b != null) services.put(this.uri, ""); - } catch (Exception e) { - // try a list + try { + if (TimeoutRequest.ping(this.uri, timeout)) { try { - String[] l = this.uri.list(); - if (l != null) services.put(this.uri, ""); - } catch (Exception e1) { - // this just failed. 
do nothing + services.put(new MultiProtocolURI(this.uri.getProtocol() + "://" + Domains.getHostName(InetAddress.getByName(this.uri.getHost())) + "/"), ""); + } catch (MalformedURLException e) { + e.printStackTrace(); + } catch (UnknownHostException e) { + e.printStackTrace(); } } - */ + } catch (ExecutionException e) { } Object r = runner.remove(this); assert r != null; @@ -185,21 +178,6 @@ public class Scanner extends Thread { } } - private static boolean ping(MultiProtocolURI uri, int timeout) { - try { - Socket socket = new Socket(); - socket.connect(new InetSocketAddress(Domains.dnsResolve(uri.getHost()), uri.getPort()), timeout); - if (socket.isConnected()) { - socket.close(); - return true; - } - return false; - } catch (UnknownHostException e) { - return false; - } catch (IOException e) { - return false; - } - } public Collection services() { return this.services.keySet(); diff --git a/source/net/yacy/cora/protocol/TimeoutRequest.java b/source/net/yacy/cora/protocol/TimeoutRequest.java new file mode 100644 index 000000000..7796aa300 --- /dev/null +++ b/source/net/yacy/cora/protocol/TimeoutRequest.java @@ -0,0 +1,325 @@ +/** + * TimeoutRequest + * Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany + * First released 08.10.2007 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + +package net.yacy.cora.protocol; + +import java.io.IOException; +import java.net.InetAddress; +import java.net.InetSocketAddress; +import java.net.Socket; +import java.net.UnknownHostException; +import java.util.concurrent.Callable; +import java.util.concurrent.CancellationException; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import jcifs.smb.SmbException; +import jcifs.smb.SmbFile; + +import net.yacy.cora.document.MultiProtocolURI; + +/** + * TimeoutRequest is a class that can apply a timeout to method calls that may block + * for an undefined time. Some network operations can only be accessed without a given + * time-out value. Using this class, all network operations may be timed out. + * This class also provides some static methods that offer ready-made solutions for typical + * network operations that should be timed out, like DNS resolving and reverse domain name resolving. 
+ */ +public class TimeoutRequest { + + private Callable call; + + /** + * initialize the TimeoutRequest with a callable method + */ + public TimeoutRequest(Callable call) { + this.call = call; + } + + /** + * call the method using a time-out + * @param timeout + * @return + * @throws ExecutionException + */ + public E call(long timeout) throws ExecutionException { + ExecutorService service = Executors.newSingleThreadExecutor(); + final Future taskFuture = service.submit(this.call); + Runnable t = new Runnable() { + public void run() { taskFuture.cancel(true); } + }; + service.execute(t); + service.shutdown(); + try { + return taskFuture.get(timeout, TimeUnit.MILLISECONDS); + } catch (CancellationException e) { + // callable was interrupted + throw new ExecutionException(e); + } catch (InterruptedException e) { + // service was shutdown + throw new ExecutionException(e); + } catch (ExecutionException e) { + // callable failed unexpectedly + throw e; + } catch (TimeoutException e) { + // time-out + throw new ExecutionException(e); + } + } + + /** + * ping a remote server using a given uri and a time-out + * @param uri + * @param timeout + * @return true if the server exists and replies within the given time-out + * @throws ExecutionException + */ + public static boolean ping(final MultiProtocolURI uri, final int timeout) throws ExecutionException { + return new TimeoutRequest(new Callable() { + public Boolean call() { + try { + Socket socket = new Socket(); + socket.connect(new InetSocketAddress(uri.getHost(), uri.getPort()), timeout); + if (socket.isConnected()) { + socket.close(); + return Boolean.TRUE; + } + return Boolean.FALSE; + } catch (UnknownHostException e) { + return Boolean.FALSE; + } catch (IOException e) { + return Boolean.FALSE; + } + } + }).call(timeout).booleanValue(); + } + + /** + * do a DNS lookup within a given time + * @param host + * @param timeout + * @return the InetAddress for a given domain name + * @throws ExecutionException + */ + public 
static InetAddress getByName(final String host, final long timeout) throws ExecutionException { + return new TimeoutRequest(new Callable() { + public InetAddress call() { + try { + return InetAddress.getByName(host); + } catch (UnknownHostException e) { + return null; + } + } + }).call(timeout); + } + + /** + * perform a reverse domain name lookup for a given InetAddress within a given timeout + * @param i + * @param timeout + * @return the host name of a given InetAddress + * @throws ExecutionException + */ + public static String getHostName(final InetAddress i, final long timeout) throws ExecutionException { + return new TimeoutRequest(new Callable() { + public String call() { return i.getHostName(); } + }).call(timeout); + } + + /** + * check if a smb file exists + * @param file + * @param timeout + * @return + * @throws IOException + */ + public static boolean exists(final SmbFile file, final long timeout) throws IOException { + try { + return new TimeoutRequest(new Callable() { + public Boolean call() { try { + return file.exists(); + } catch (SmbException e) { + return Boolean.FALSE; + } } + }).call(timeout).booleanValue(); + } catch (ExecutionException e) { + throw new IOException(e.getMessage()); + } + } + + /** + * check if a smb file can be read + * @param file + * @param timeout + * @return + * @throws IOException + */ + public static boolean canRead(final SmbFile file, final long timeout) throws IOException { + try { + return new TimeoutRequest(new Callable() { + public Boolean call() { try { + return file.canRead(); + } catch (SmbException e) { + return Boolean.FALSE; + } } + }).call(timeout).booleanValue(); + } catch (ExecutionException e) { + throw new IOException(e.getMessage()); + } + } + + /** + * check if a smb file can be written + * @param file + * @param timeout + * @return + * @throws IOException + */ + public static boolean canWrite(final SmbFile file, final long timeout) throws IOException { + try { + return new TimeoutRequest(new 
Callable() { + public Boolean call() { try { + return file.canWrite(); + } catch (SmbException e) { + return Boolean.FALSE; + } } + }).call(timeout).booleanValue(); + } catch (ExecutionException e) { + throw new IOException(e.getMessage()); + } + } + + /** + * check if a smb file is hidden + * @param file + * @param timeout + * @return + * @throws IOException + */ + public static boolean isHidden(final SmbFile file, final long timeout) throws IOException { + try { + return new TimeoutRequest(new Callable() { + public Boolean call() { try { + return file.isHidden(); + } catch (SmbException e) { + return Boolean.FALSE; + } } + }).call(timeout).booleanValue(); + } catch (ExecutionException e) { + throw new IOException(e.getMessage()); + } + } + + /** + * check if a smb file is a directory + * @param file + * @param timeout + * @return + * @throws IOException + */ + public static boolean isDirectory(final SmbFile file, final long timeout) throws IOException { + try { + return new TimeoutRequest(new Callable() { + public Boolean call() { try { + return file.isDirectory(); + } catch (SmbException e) { + return Boolean.FALSE; + } } + }).call(timeout).booleanValue(); + } catch (ExecutionException e) { + throw new IOException(e.getMessage()); + } + } + + /** + * get the size of a smb file + * @param file + * @param timeout + * @return + * @throws IOException + */ + public static long length(final SmbFile file, final long timeout) throws IOException { + try { + return new TimeoutRequest(new Callable() { + public Long call() { try { + return file.length(); + } catch (SmbException e) { + return Long.valueOf(0); + } } + }).call(timeout).longValue(); + } catch (ExecutionException e) { + throw new IOException(e.getMessage()); + } + } + + /** + * get last-modified time of a smb file + * @param file + * @param timeout + * @return + * @throws IOException + */ + public static long lastModified(final SmbFile file, final long timeout) throws IOException { + try { + return new 
TimeoutRequest(new Callable() { + public Long call() { try { + return file.lastModified(); + } catch (SmbException e) { + return Long.valueOf(0); + } } + }).call(timeout).longValue(); + } catch (ExecutionException e) { + throw new IOException(e.getMessage()); + } + } + + /** + * get list of a smb directory + * @param file + * @param timeout + * @return + * @throws IOException + */ + public static String[] list(final SmbFile file, final long timeout) throws IOException { + try { + return new TimeoutRequest(new Callable() { + public String[] call() { try { + return file.list(); + } catch (SmbException e) { + return null; + } } + }).call(timeout); + } catch (ExecutionException e) { + throw new IOException(e.getMessage()); + } + } + + public static void main(String[] args) { + try { + System.out.println(getByName("yacy.net", 100)); + } catch (ExecutionException e) { + e.printStackTrace(); + } + } +} diff --git a/source/net/yacy/kelondro/data/meta/DigestURI.java b/source/net/yacy/kelondro/data/meta/DigestURI.java index cfc39a142..0e32a2795 100644 --- a/source/net/yacy/kelondro/data/meta/DigestURI.java +++ b/source/net/yacy/kelondro/data/meta/DigestURI.java @@ -265,22 +265,15 @@ public class DigestURI extends MultiProtocolURI implements Serializable { public static boolean isDomDomain(final byte[] urlHash, final int id) { return domDomain(urlHash) == id; } - - public static boolean matchesAnyDomDomain(final byte[] urlHash, final int idset) { - // this is a boolean matching on a set of domDomains - return (domDomain(urlHash) | idset) != 0; - } // checks for local/global IP range and local IP public final boolean isLocal() { if (this.isSMB() || this.isFile()) return true; - if (this.hash == null) { - if (super.isLocal()) return true; - synchronized (this) { - if (this.hash == null) this.hash = urlHashComputation(); - } + if (this.hash == null) synchronized (this) { + // this is synchronized because another thread may also call the same method in between + // that is the 
reason that this.hash is checked again + if (this.hash == null) this.hash = urlHashComputation(); } - //if (domDomain(this.hash) != 7) System.out.println("*** DEBUG - not local: " + this.toNormalform(true, false)); return domDomain(this.hash) == 7; }