- a collection of bug fixes and some redesign of the Scanner class

- fixed smb crawling
- added smbget to download script generation

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7381 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 4d5bb4c4ca
commit 58b59f9bc8

@ -21,12 +21,11 @@
import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.ConcurrentModificationException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import net.yacy.cora.document.MultiProtocolURI;
@ -58,18 +57,14 @@ public class CrawlStartScanner_p {
// make a scanhosts entry
String hosts = post == null ? "" : post.get("scanhosts", "");
List<InetAddress> ips = Domains.myIntranetIPs();
Set<InetAddress> ips = Domains.myIntranetIPs();
prop.put("intranethosts", ips.toString());
prop.put("intranetHint", sb.isIntranetMode() ? 0 : 1);
if (hosts.length() == 0) {
InetAddress ip;
if (sb.isIntranetMode()) {
if (ips.size() > 0) ip = ips.get(0); else try {
ip = InetAddress.getByName("192.168.0.1");
} catch (UnknownHostException e) {
ip = null;
e.printStackTrace();
}
if (ips.size() > 0) ip = ips.iterator().next();
else ip = Domains.dnsResolve("192.168.0.1");
} else {
ip = Domains.myPublicLocalIP();
if (Domains.isThisHostIP(ip)) ip = sb.peers.mySeed().getInetAddress();
@ -80,18 +75,31 @@ public class CrawlStartScanner_p {
// parse post requests
if (post != null) {
int repeat_time = 0;
String repeat_unit = "seldays";
long validTime = 0;
// check scheduler
if (post.get("rescan", "").equals("scheduler")) {
    repeat_time = Integer.parseInt(post.get("repeat_time", "-1"));
    repeat_unit = post.get("repeat_unit", "selminutes"); // selminutes, selhours, seldays
    // Convert the repeat interval to milliseconds. The factors are long literals so that
    // the whole product is computed in 64 bit; with int factors the multiplication wraps
    // around before the assignment (from 25 days, 597 hours or 35792 minutes upward).
    if (repeat_unit.equals("selminutes")) validTime = repeat_time * 60L * 1000L;
    if (repeat_unit.equals("selhours")) validTime = repeat_time * 60L * 60L * 1000L;
    if (repeat_unit.equals("seldays")) validTime = repeat_time * 24L * 60L * 60L * 1000L;
}
// case: an IP range was given; scan the range for services and display result
if (post.containsKey("scan") && post.get("source", "").equals("hosts")) {
List<InetAddress> ia = new ArrayList<InetAddress>();
for (String host: hosts.split(",")) try {
Set<InetAddress> ia = new HashSet<InetAddress>();
for (String host: hosts.split(",")) {
if (host.startsWith("http://")) host = host.substring(7);
if (host.startsWith("https://")) host = host.substring(8);
if (host.startsWith("ftp://")) host = host.substring(6);
if (host.startsWith("smb://")) host = host.substring(6);
int p = host.indexOf('/');
if (p >= 0) host = host.substring(0, p);
ia.add(InetAddress.getByName(host));
} catch (UnknownHostException e) {}
ia.add(Domains.dnsResolve(host));
}
Scanner scanner = new Scanner(ia, 100, sb.isIntranetMode() ? 100 : 1000);
if (post.get("scanftp", "").equals("on")) scanner.addFTP(false);
if (post.get("scanhttp", "").equals("on")) scanner.addHTTP(false);
@ -99,7 +107,7 @@ public class CrawlStartScanner_p {
if (post.get("scansmb", "").equals("on")) scanner.addSMB(false);
scanner.start();
scanner.terminate();
if (post.get("accumulatescancache", "").equals("on") && !post.get("rescan", "").equals("scheduler")) enlargeScancache(scanner.services()); else Scanner.scancache = scanner.services();
if (post.get("accumulatescancache", "").equals("on") && !post.get("rescan", "").equals("scheduler")) Scanner.scancacheExtend(scanner, validTime); else Scanner.scancacheReplace(scanner, validTime);
}
if (post.containsKey("scan") && post.get("source", "").equals("intranet")) {
@ -110,14 +118,16 @@ public class CrawlStartScanner_p {
if (post.get("scansmb", "").equals("on")) scanner.addSMB(false);
scanner.start();
scanner.terminate();
if (post.get("accumulatescancache", "").equals("on") && !post.get("rescan", "").equals("scheduler")) enlargeScancache(scanner.services()); else Scanner.scancache = scanner.services();
if (post.get("accumulatescancache", "").equals("on") && !post.get("rescan", "").equals("scheduler")) Scanner.scancacheExtend(scanner, validTime); else Scanner.scancacheReplace(scanner, validTime);
}
// check crawl request
if (post.containsKey("crawl")) {
// make a pk/url mapping
Iterator<Map.Entry<MultiProtocolURI, Scanner.Access>> se = Scanner.scancacheEntries();
Map<byte[], DigestURI> pkmap = new TreeMap<byte[], DigestURI>(Base64Order.enhancedCoder);
for (MultiProtocolURI u: Scanner.scancache.keySet()) {
while (se.hasNext()) {
MultiProtocolURI u = se.next().getKey();
DigestURI uu = new DigestURI(u);
pkmap.put(uu.hash(), uu);
}
@ -138,9 +148,6 @@ public class CrawlStartScanner_p {
// check scheduler
if (post.get("rescan", "").equals("scheduler")) {
int repeat_time = Integer.parseInt(post.get("repeat_time", "-1"));
final String repeat_unit = post.get("repeat_unit", "selminutes"); // selminutes, selhours, seldays
// store this call as api call
if (repeat_time > 0) {
// store as scheduled api call
@ -148,7 +155,7 @@ public class CrawlStartScanner_p {
}
// execute the scan results
if (Scanner.scancache.size() > 0) {
if (Scanner.scancacheSize() > 0) {
// make a comment cache
Map<byte[], String> apiCommentCache = commentCache(sb);
@ -156,7 +163,10 @@ public class CrawlStartScanner_p {
DigestURI u;
try {
int i = 0;
for (final Map.Entry<MultiProtocolURI, Scanner.Access> host: Scanner.scancache.entrySet()) {
Iterator<Map.Entry<MultiProtocolURI, Scanner.Access>> se = Scanner.scancacheEntries();
Map.Entry<MultiProtocolURI, Scanner.Access> host;
while (se.hasNext()) {
host = se.next();
u = new DigestURI(host.getKey());
urlString = u.toNormalform(true, false);
if (host.getValue() == Access.granted && inIndex(apiCommentCache, urlString) == null) {
@ -173,7 +183,7 @@ public class CrawlStartScanner_p {
}
// write scan table
if (Scanner.scancache.size() > 0) {
if (Scanner.scancacheSize() > 0) {
// make a comment cache
Map<byte[], String> apiCommentCache = commentCache(sb);
@ -184,7 +194,10 @@ public class CrawlStartScanner_p {
table: while (true) {
try {
int i = 0;
for (final Map.Entry<MultiProtocolURI, Scanner.Access> host: Scanner.scancache.entrySet()) {
Iterator<Map.Entry<MultiProtocolURI, Scanner.Access>> se = Scanner.scancacheEntries();
Map.Entry<MultiProtocolURI, Scanner.Access> host;
while (se.hasNext()) {
host = se.next();
u = new DigestURI(host.getKey());
urlString = u.toNormalform(true, false);
prop.put("servertable_list_" + i + "_pk", new String(u.hash()));
@ -197,7 +210,7 @@ public class CrawlStartScanner_p {
prop.put("servertable_list_" + i + "_accessGranted", host.getValue() == Access.granted ? 1 : 0);
prop.put("servertable_list_" + i + "_accessDenied", host.getValue() == Access.denied ? 1 : 0);
prop.put("servertable_list_" + i + "_process", inIndex(apiCommentCache, urlString) == null ? 0 : 1);
prop.put("servertable_list_" + i + "_preselected", interesting(apiCommentCache, u, host.getValue()) ? 1 : 0);
prop.put("servertable_list_" + i + "_preselected", host.getValue() == Access.granted && inIndex(apiCommentCache, urlString) == null ? 1 : 0);
i++;
}
prop.put("servertable_list", i);
@ -211,23 +224,6 @@ public class CrawlStartScanner_p {
return prop;
}
/**
 * Merge a new scan result into the shared Scanner.scancache.
 * If no cache exists yet, the new map becomes the cache. Otherwise every
 * existing entry whose access state is not Access.granted is dropped first,
 * then all entries of the new map are put into the cache.
 * @param newCache the services found by the latest scan
 */
private static void enlargeScancache(Map<MultiProtocolURI, Access> newCache) {
if (Scanner.scancache == null) {
Scanner.scancache = newCache;
return;
}
// remove through the iterator so the map can be modified while it is traversed
Iterator<Map.Entry<MultiProtocolURI, Access>> i = Scanner.scancache.entrySet().iterator();
Map.Entry<MultiProtocolURI, Access> entry;
while (i.hasNext()) {
entry = i.next();
if (entry.getValue() != Access.granted) i.remove();
}
Scanner.scancache.putAll(newCache);
}
/**
 * Decide whether a scanned service is "interesting".
 * @param commentCache the comment cache that is handed to inIndex
 * @param uri the service url
 * @param access the access state recorded for the url
 * @return true only if inIndex finds no entry for the normalized url, the access
 *         state is Access.granted and the protocol is "smb" or "ftp"
 */
private static boolean interesting(Map<byte[], String> commentCache, MultiProtocolURI uri, Access access) {
return inIndex(commentCache, uri.toNormalform(true, false)) == null && access == Access.granted && (uri.getProtocol().equals("smb") || uri.getProtocol().equals("ftp"));
}
private static byte[] inIndex(Map<byte[], String> commentCache, String url) {
for (Map.Entry<byte[], String> comment: commentCache.entrySet()) {

@ -207,13 +207,13 @@ public class Crawler_p {
final boolean crawlingQ = "on".equals(post.get("crawlingQ", "off"));
env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false");
final boolean indexText = "on".equals(post.get("indexText", "off"));
final boolean indexText = "on".equals(post.get("indexText", "on"));
env.setConfig("indexText", (indexText) ? "true" : "false");
final boolean indexMedia = "on".equals(post.get("indexMedia", "off"));
final boolean indexMedia = "on".equals(post.get("indexMedia", "on"));
env.setConfig("indexMedia", (indexMedia) ? "true" : "false");
boolean storeHTCache = "on".equals(post.get("storeHTCache", "off"));
boolean storeHTCache = "on".equals(post.get("storeHTCache", "on"));
if (crawlingStartURL!= null &&(crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false;
env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");

@ -185,7 +185,7 @@ function resultLine(type, item, linenumber) {
}
// update download script
script += "curl -OL \"" + item.link + "\"\n";
if (item.link.indexOf("smb://") >= 0) script += "smbget -n -a -r \"" + item.link + "\"\n"; else script += "curl -OL \"" + item.link + "\"\n";
// make table row
var html = "";

@ -330,7 +330,7 @@ public class ResultFetcher {
page = rankingProcess.takeURL(true, this.timeout - System.currentTimeMillis());
//if (page == null) page = rankedCache.takeURL(false, this.timeout - System.currentTimeMillis());
if (page == null) {
System.out.println("page == null");
//System.out.println("page == null");
break; // no more available
}
if (workTables.failURLsContains(page.hash())) continue;

@ -150,6 +150,7 @@ public class Segments implements Iterable<Segment> {
}
public long URLCount() {
if (this.segments == null) return 0;
long c = 0;
for (Segment s: this.segments.values()) c += (long) s.urlMetadata().size();
return c;

@ -56,7 +56,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
private static final long serialVersionUID = -1173233022912141884L;
private static final long SMB_TIMEOUT = 500;
private static final long SMB_TIMEOUT = 1500;
public static final int TLD_any_zone_filter = 255; // from TLD zones can be filtered during search; this is the catch-all filter
private static final Pattern backPathPattern = Pattern.compile("(/[^/]+(?<!/\\.{1,2})/)[.]{2}(?=/|$)|/\\.(?=/)|/(?=/)");
@ -1107,6 +1107,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
if (isFile()) return getFSFile().list();
if (isSMB()) try {
SmbFile sf = getSmbFile();
if (!sf.isDirectory()) return null;
try {
return TimeoutRequest.list(sf, SMB_TIMEOUT);
} catch (SmbException e) {

@ -676,10 +676,10 @@ public class Domains {
* generate a list of intranet InetAddresses without the loopback address 127.0.0.1
* @return list of all intranet addresses
*/
public static List<InetAddress> myIntranetIPs() {
public static Set<InetAddress> myIntranetIPs() {
// list all local addresses
if (localHostAddresses.size() < 1) try {Thread.sleep(1000);} catch (InterruptedException e) {}
ArrayList<InetAddress> list = new ArrayList<InetAddress>(localHostAddresses.size());
Set<InetAddress> list = new HashSet<InetAddress>();
if (localHostAddresses.size() == 0) return list; // give up
for (InetAddress a: localHostAddresses) {
if ((0Xff & a.getAddress()[0]) == 127) continue;

@ -26,8 +26,11 @@ import java.net.MalformedURLException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
@ -50,10 +53,71 @@ public class Scanner extends Thread {
public static enum Access {unknown, empty, granted, denied;}
public static Map<MultiProtocolURI, Access> scancache = new TreeMap<MultiProtocolURI, Access>();
private final static Map<MultiProtocolURI, Access> scancache = new TreeMap<MultiProtocolURI, Access>();
private static long scancacheUpdateTime = 0;
private static long scancacheValidUntilTime = Long.MAX_VALUE;
private static Set<InetAddress> scancacheScanrange = new HashSet<InetAddress>();
/**
 * Report the size of the static scan cache.
 * @return the number of entries currently stored in scancache
 */
public static int scancacheSize() {
return scancache.size();
}
/**
 * Throw away the content of the static scan cache and fill it with the
 * services of the given scanner. The update time is set to the current time,
 * the scan range of the scanner is remembered and the validity limit is
 * recomputed.
 * @param newScanner the scanner whose services and scan range are taken over
 * @param validTime validity span in milliseconds, added to the update time;
 *        Long.MAX_VALUE is kept as Long.MAX_VALUE
 */
public static void scancacheReplace(Scanner newScanner, long validTime) {
    scancache.clear();
    scancache.putAll(newScanner.services());

    final long now = System.currentTimeMillis();
    scancacheUpdateTime = now;
    if (validTime == Long.MAX_VALUE) {
        scancacheValidUntilTime = Long.MAX_VALUE;
    } else {
        scancacheValidUntilTime = now + validTime;
    }
    scancacheScanrange = newScanner.scanrange;
}
/**
 * Add the services of the given scanner to the static scan cache.
 * Entries already in the cache survive only if their access state is
 * Access.granted; all others are removed before the new services are put in.
 * The update time, the validity limit and the remembered scan range are then
 * set from the current time and the given scanner.
 * @param newScanner the scanner whose services and scan range are taken over
 * @param validTime validity span in milliseconds, added to the update time;
 *        Long.MAX_VALUE is kept as Long.MAX_VALUE
 */
public static void scancacheExtend(Scanner newScanner, long validTime) {
    // purge every cached service that is not granted
    for (Iterator<Map.Entry<MultiProtocolURI, Access>> it = Scanner.scancache.entrySet().iterator(); it.hasNext();) {
        if (it.next().getValue() != Access.granted) it.remove();
    }
    scancache.putAll(newScanner.services());

    final long now = System.currentTimeMillis();
    scancacheUpdateTime = now;
    if (validTime == Long.MAX_VALUE) {
        scancacheValidUntilTime = Long.MAX_VALUE;
    } else {
        scancacheValidUntilTime = now + validTime;
    }
    scancacheScanrange = newScanner.scanrange;
}
/**
 * Give access to the entries of the static scan cache.
 * The iterator is taken directly from the entry set of scancache, not from a copy.
 * @return an iterator over the url/access pairs in scancache
 */
public static Iterator<Map.Entry<MultiProtocolURI, Scanner.Access>> scancacheEntries() {
return scancache.entrySet().iterator();
}
/**
 * Check a url against the static scan cache.
 * The url is accepted without consulting the cache if no scan range is
 * recorded, if the current time is past the validity limit, if the host
 * cannot be resolved, or if the normalized host address is not part of the
 * recorded scan range. In all other cases the url is accepted only if the
 * cache holds an entry for the service uri built from its protocol and address.
 * @param url the url to be checked
 * @return true if the url is accepted; false if the cache has no matching
 *         entry or the service uri cannot be built
 */
public static boolean acceptURL(MultiProtocolURI url) {
    if (scancacheScanrange == null
        || scancacheScanrange.size() == 0
        || System.currentTimeMillis() > scancacheValidUntilTime) {
        return true;
    }
    final InetAddress resolved = Domains.dnsResolve(url.getHost());
    if (resolved == null || !scancacheScanrange.contains(normalize(resolved))) {
        return true;
    }
    try {
        return scancache.containsKey(produceURI(url.getProtocol(), resolved));
    } catch (MalformedURLException e) {
        return false;
    }
}
/**
 * Map an IPv4 address to the address with the same first three octets and a
 * last octet of 1. Addresses that are not four bytes long (IPv6) are returned
 * unchanged, because overwriting byte index 3 of a 16-byte address would alter
 * its prefix instead of its last octet.
 * @param a the address to normalize, may be null
 * @return null if a is null; a itself if it is not a four-byte address, if its
 *         last octet already is 1, or if the new address cannot be created;
 *         otherwise the address with the last octet replaced by 1
 */
private static InetAddress normalize(InetAddress a) {
    if (a == null) return null;
    byte[] b = a.getAddress();
    // only four-byte (IPv4) addresses have their last octet at index 3
    if (b.length != 4 || b[3] == 1) return a;
    b[3] = 1;
    try {
        return InetAddress.getByAddress(b);
    } catch (UnknownHostException e) {
        return a;
    }
}
private int runnerCount;
private List<InetAddress> scanrange;
private Set<InetAddress> scanrange;
private BlockingQueue<MultiProtocolURI> scanqueue;
private Map<MultiProtocolURI, Access> services;
private Map<Runner, Object> runner;
@ -61,17 +125,18 @@ public class Scanner extends Thread {
public Scanner(InetAddress scanrange, int concurrentRunner, int timeout) {
this.runnerCount = concurrentRunner;
this.scanrange = new ArrayList<InetAddress>();
this.scanrange.add(scanrange);
this.scanrange = new HashSet<InetAddress>();
this.scanrange.add(normalize(scanrange));
this.scanqueue = new LinkedBlockingQueue<MultiProtocolURI>();
this.services = Collections.synchronizedMap(new TreeMap<MultiProtocolURI, Access>());
this.runner = new ConcurrentHashMap<Runner, Object>();
this.timeout = timeout;
}
public Scanner(List<InetAddress> scanrange, int concurrentRunner, int timeout) {
public Scanner(Set<InetAddress> scanrange, int concurrentRunner, int timeout) {
this.runnerCount = concurrentRunner;
this.scanrange = scanrange;
this.scanrange = new HashSet<InetAddress>();
for (InetAddress a: scanrange) this.scanrange.add(normalize(a));
this.scanqueue = new LinkedBlockingQueue<MultiProtocolURI>();
this.services = Collections.synchronizedMap(new TreeMap<MultiProtocolURI, Access>());
this.runner = new ConcurrentHashMap<Runner, Object>();
@ -116,6 +181,10 @@ public class Scanner extends Thread {
}
}
/**
 * Build the root uri of a service from a protocol and a host address.
 * The host part is the name that Domains.getHostName returns for the address.
 * @param protocol the protocol part of the uri, e.g. "http" or "smb"
 * @param a the address of the host; null (an unresolved host) is rejected
 * @return the uri protocol://hostname/
 * @throws MalformedURLException if a is null or the uri cannot be constructed
 */
private static MultiProtocolURI produceURI(String protocol, InetAddress a) throws MalformedURLException {
    // callers pass the result of Domains.dnsResolve, which may be null; report that
    // as the declared exception rather than handing null to Domains.getHostName
    if (a == null) throw new MalformedURLException("no host address given for protocol " + protocol);
    return new MultiProtocolURI(protocol + "://" + Domains.getHostName(a) + "/");
}
public class Runner extends Thread {
private MultiProtocolURI uri;
private long starttime;
@ -127,7 +196,7 @@ public class Scanner extends Thread {
try {
if (TimeoutRequest.ping(this.uri, timeout)) {
try {
MultiProtocolURI uri = new MultiProtocolURI(this.uri.getProtocol() + "://" + Domains.getHostName(InetAddress.getByName(this.uri.getHost())) + "/");
MultiProtocolURI uri = produceURI(this.uri.getProtocol(), Domains.dnsResolve(this.uri.getHost()));
String protocol = uri.getProtocol();
Access access = protocol.equals("http") || protocol.equals("https") ? Access.granted : Access.unknown;
services.put(uri, access);
@ -157,8 +226,6 @@ public class Scanner extends Thread {
if (access != Access.unknown) services.put(uri, access);
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (UnknownHostException e) {
e.printStackTrace();
}
}
} catch (ExecutionException e) {

@ -38,6 +38,7 @@ import jcifs.smb.SmbException;
import jcifs.smb.SmbFile;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.kelondro.logging.Log;
/**
* TimeoutRequest is a class that can apply a timeout on method calls that may block
@ -316,6 +317,7 @@ public class TimeoutRequest<E> {
public String[] call() { try {
return file.list();
} catch (SmbException e) {
Log.logWarning("TimeoutRequest:list", file.toString() + " - no list", e);
return null;
} }
}).call(timeout);

Loading…
Cancel
Save