diff --git a/htroot/CrawlStartIntranet_p.html b/htroot/CrawlStartIntranet_p.html deleted file mode 100644 index e85dafdad..000000000 --- a/htroot/CrawlStartIntranet_p.html +++ /dev/null @@ -1,67 +0,0 @@ - - - - YaCy '#[clientname]#': Intranet Crawl Start - #%env/templates/metas.template%# - - - - - #%env/templates/header.template%# - #%env/templates/submenuIndexCreate.template%# -

Intranet Crawl Start

-

- When an index domain is configured to contain intranet links, - the intranet may be scanned for available servers. - Please select below the servers in your intranet that you want to fetch into the search index. -

- - #(notintranet)#:: -

- This network definition does not allow intranet links. - A list of intranet servers is only available if you configure YaCy to index intranet targets. - To do so, open the Basic Configuration servlet and select the 'Intranet Indexing' use case. -

- #(/notintranet)# - - #(servertable)#:: -
- - - - - - - - - #{list}# - - - - - #(process)#::#(/process)# - - #{/list}# -
IPURLProcess
#[ip]##[url]#not in indexindexed
-

- - -

-
- #(/servertable)# - - - #%env/templates/footer.template%# - - diff --git a/htroot/CrawlStartIntranet_p.java b/htroot/CrawlStartIntranet_p.java deleted file mode 100644 index 035763fd6..000000000 --- a/htroot/CrawlStartIntranet_p.java +++ /dev/null @@ -1,122 +0,0 @@ -/** - * CrawlStartIntranet_p - * Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany - * First released 28.10.2010 at http://yacy.net - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program in the file lgpl21.txt - * If not, see . 
- */ - - -import java.io.IOException; -import java.util.Iterator; -import java.util.Map; - -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.protocol.Domains; -import net.yacy.cora.protocol.RequestHeader; -import net.yacy.cora.protocol.Scanner; -import net.yacy.kelondro.blob.Tables; -import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.logging.Log; - -import de.anomic.data.WorkTables; -import de.anomic.search.Switchboard; -import de.anomic.server.serverObjects; -import de.anomic.server.serverSwitch; - -public class CrawlStartIntranet_p { - - public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { - - final serverObjects prop = new serverObjects(); - final Switchboard sb = (Switchboard)env; - - prop.put("notintranet", 0); - prop.put("servertable", 0); - - // check if there is a intranet configuration - if (!sb.isIntranetMode()) { - prop.put("notintranet", 1); - return prop; - } - - // if there are no intranet addresses known, scan the net - if (Scanner.intranetURLs.size() == 0) { - Scanner scanner = new Scanner(100, 10); - scanner.addFTP(false); - scanner.addHTTP(false); - scanner.addHTTPS(false); - scanner.addSMB(false); - scanner.start(); - scanner.terminate(); - DigestURI url; - for (MultiProtocolURI service: scanner.services()) { - url = new DigestURI(service); - Scanner.intranetURLs.put(url.hash(), url); - } - } - - // check crawl request - if (post != null && post.containsKey("crawl")) { - for (Map.Entry entry: post.entrySet()) { - if (entry.getValue().startsWith("mark_")) { - byte [] pk = entry.getValue().substring(5).getBytes(); - DigestURI url = Scanner.intranetURLs.get(pk); - if (url != null) { - String path = 
"/Crawler_p.html?createBookmark=off&xsstopw=off&crawlingDomMaxPages=10000&intention=&range=domain&indexMedia=on&recrawl=nodoubles&xdstopw=off&storeHTCache=on&sitemapURL=&repeat_time=7&crawlingQ=on&cachePolicy=iffresh&indexText=on&crawlingMode=url&mustnotmatch=&crawlingDomFilterDepth=1&crawlingDomFilterCheck=off&crawlingstart=Start%20New%20Crawl&xpstopw=off&repeat_unit=seldays&crawlingDepth=99"; - path += "&crawlingURL=" + url.toNormalform(true, false); - WorkTables.execAPICall("localhost", (int) sb.getConfigLong("port", 8080), sb.getConfig("adminAccountBase64MD5", ""), path, pk); - } - } - } - } - - // show server table - prop.put("servertable", 1); - int i = 0; - String urlString; - for (final DigestURI url: Scanner.intranetURLs.values()) { - urlString = url.toNormalform(true, false); - prop.put("servertable_list_" + i + "_pk", new String(url.hash())); - prop.put("servertable_list_" + i + "_count", i); - prop.putHTML("servertable_list_" + i + "_ip", Domains.dnsResolve(url.getHost()).getHostAddress()); - prop.putHTML("servertable_list_" + i + "_url", urlString); - prop.put("servertable_list_" + i + "_process", inIndex(sb, urlString) == null ? 0 : 1); - i++; - } - prop.put("servertable_list", i); - prop.put("servertable_num", i); - return prop; - } - - private static byte[] inIndex(Switchboard sb, String url) { - Iterator i; - try { - i = sb.tables.iterator(WorkTables.TABLE_API_NAME); - Tables.Row row; - String comment; - while (i.hasNext()) { - row = i.next(); - comment = new String(row.get(WorkTables.TABLE_API_COL_COMMENT)); - if (comment.contains(url)) return row.getPK(); - } - return null; - } catch (IOException e) { - Log.logException(e); - return null; - } - } - -} diff --git a/htroot/CrawlStartScanner_p.html b/htroot/CrawlStartScanner_p.html index c9c5b4189..239615401 100644 --- a/htroot/CrawlStartScanner_p.html +++ b/htroot/CrawlStartScanner_p.html @@ -35,24 +35,32 @@ No servers had been detected in the given IP range #[iprange]#. 
Please enter a different IP range for another scan.

#(/noserverdetected)# - + #(enterrange)#:: -

-

- +
+ + + +
+
+
. . .[1-254] - -
-
- +

+
+
+
- -
-

+

+
+
+ + #(intranetHint)#::
Do not use intranet scan results; you are not in an intranet environment!
#(/intranetHint)# + #[intranethosts]# 
+
#(/enterrange)# - + #(servertable)#::

The following servers had been detected: @@ -65,15 +73,20 @@ Protocol IP URL + Access Process #{list}# - + #[protocol]# #[ip]# #[url]# - #(process)#not in index::indexed#(/process)# + #(accessUnknown)#::unknown#(/accessUnknown)# + #(accessEmpty)#::empty#(/accessEmpty)# + #(accessGranted)#::granted#(/accessGranted)# + #(accessDenied)#::denied#(/accessDenied)# + #(process)#not in index::indexed#(/process)# #{/list}# diff --git a/htroot/CrawlStartScanner_p.java b/htroot/CrawlStartScanner_p.java index 083f5b781..61b54ac93 100644 --- a/htroot/CrawlStartScanner_p.java +++ b/htroot/CrawlStartScanner_p.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.net.InetAddress; import java.net.UnknownHostException; +import java.util.ConcurrentModificationException; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -31,6 +32,7 @@ import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.Scanner; +import net.yacy.cora.protocol.Scanner.Access; import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; @@ -64,7 +66,6 @@ public class CrawlStartScanner_p { // case: an IP range was given; scan the range for services and display result if (post.containsKey("scanip") || post.containsKey("scanhost")) { - addSelectIPRange(sb, prop); InetAddress ia; try { if (post.containsKey("scanip")) { @@ -80,16 +81,28 @@ public class CrawlStartScanner_p { scanner.addSMB(false); scanner.start(); scanner.terminate(); - Scanner.scancache = scanner.services(); + enlargeScancache(sb, scanner); addScantable(sb, prop); } catch (UnknownHostException e) {} } + if (post.containsKey("scanintranet")) { + Scanner scanner = new Scanner(Domains.myIntranetIPs(), 100, sb.isIntranetMode() ? 
100 : 3000); + scanner.addFTP(false); + scanner.addHTTP(false); + scanner.addHTTPS(false); + scanner.addSMB(false); + scanner.start(); + scanner.terminate(); + enlargeScancache(sb, scanner); + addScantable(sb, prop); + } + // check crawl request if (post != null && post.containsKey("crawl")) { // make a pk/url mapping Map pkmap = new TreeMap(Base64Order.enhancedCoder); - for (MultiProtocolURI u: Scanner.scancache) { + for (MultiProtocolURI u: Scanner.scancache.keySet()) { DigestURI uu = new DigestURI(u); pkmap.put(uu.hash(), uu); } @@ -112,8 +125,10 @@ public class CrawlStartScanner_p { private static void addSelectIPRange(Switchboard sb, serverObjects prop) { InetAddress ip; + List ips = Domains.myIntranetIPs(); + prop.put("enterrange_intranethosts", ips.toString()); + prop.put("enterrange_intranetHint", 0); if (sb.isIntranetMode()) { - List ips = Domains.myIntranetIPs(); if (ips.size() > 0) ip = ips.get(0); else try { ip = InetAddress.getByName("192.168.0.1"); } catch (UnknownHostException e) { @@ -121,6 +136,7 @@ public class CrawlStartScanner_p { e.printStackTrace(); } } else { + prop.put("enterrange_intranetHint", 1); ip = Domains.myPublicLocalIP(); } addSelectIPRange(ip, prop); @@ -139,25 +155,55 @@ public class CrawlStartScanner_p { if (Scanner.scancache.size() > 0) { // show scancache table prop.put("servertable", 1); - int i = 0; String urlString; DigestURI u; - for (final MultiProtocolURI url: Scanner.scancache) { - u = new DigestURI(url); - urlString = u.toNormalform(true, false); - prop.put("servertable_list_" + i + "_pk", new String(u.hash())); - prop.put("servertable_list_" + i + "_count", i); - prop.putHTML("servertable_list_" + i + "_protocol", u.getProtocol()); - prop.putHTML("servertable_list_" + i + "_ip", Domains.dnsResolve(u.getHost()).getHostAddress()); - prop.putHTML("servertable_list_" + i + "_url", urlString); - prop.put("servertable_list_" + i + "_process", inIndex(sb, urlString) == null ? 
0 : 1); - i++; + table: while (true) { + try { + int i = 0; + for (final Map.Entry host: Scanner.scancache.entrySet()) { + u = new DigestURI(host.getKey()); + urlString = u.toNormalform(true, false); + prop.put("servertable_list_" + i + "_pk", new String(u.hash())); + prop.put("servertable_list_" + i + "_count", i); + prop.putHTML("servertable_list_" + i + "_protocol", u.getProtocol()); + prop.putHTML("servertable_list_" + i + "_ip", Domains.dnsResolve(u.getHost()).getHostAddress()); + prop.putHTML("servertable_list_" + i + "_url", urlString); + prop.put("servertable_list_" + i + "_accessUnknown", host.getValue() == Access.unknown ? 1 : 0); + prop.put("servertable_list_" + i + "_accessEmpty", host.getValue() == Access.empty ? 1 : 0); + prop.put("servertable_list_" + i + "_accessGranted", host.getValue() == Access.granted ? 1 : 0); + prop.put("servertable_list_" + i + "_accessDenied", host.getValue() == Access.denied ? 1 : 0); + prop.put("servertable_list_" + i + "_process", inIndex(sb, urlString) == null ? 0 : 1); + prop.put("servertable_list_" + i + "_preselected", interesting(sb, u, host.getValue()) ? 
1 : 0); + i++; + } + prop.put("servertable_list", i); + prop.put("servertable_num", i); + break table; + } catch (ConcurrentModificationException e) { + continue table; + } } - prop.put("servertable_list", i); - prop.put("servertable_num", i); } } + private static void enlargeScancache(Switchboard sb, Scanner scanner) { + if (Scanner.scancache == null) { + Scanner.scancache = scanner.services(); + return; + } + Iterator> i = Scanner.scancache.entrySet().iterator(); + Map.Entry entry; + while (i.hasNext()) { + entry = i.next(); + if (!interesting(sb, entry.getKey(), entry.getValue())) i.remove(); + } + Scanner.scancache.putAll(scanner.services()); + } + + private static boolean interesting(Switchboard sb, MultiProtocolURI uri, Access access) { + return inIndex(sb, uri.toNormalform(true, false)) == null && access == Access.granted && (uri.getProtocol().equals("smb") || uri.getProtocol().equals("ftp")); + } + private static byte[] inIndex(Switchboard sb, String url) { Iterator i; try { diff --git a/source/net/yacy/cora/protocol/Scanner.java b/source/net/yacy/cora/protocol/Scanner.java index 5eef48f7e..5976ff67f 100644 --- a/source/net/yacy/cora/protocol/Scanner.java +++ b/source/net/yacy/cora/protocol/Scanner.java @@ -20,11 +20,11 @@ package net.yacy.cora.protocol; +import java.io.IOException; import java.net.InetAddress; import java.net.MalformedURLException; import java.net.UnknownHostException; import java.util.ArrayList; -import java.util.Collection; import java.util.Collections; import java.util.List; import java.util.Map; @@ -35,10 +35,9 @@ import java.util.concurrent.ExecutionException; import java.util.concurrent.LinkedBlockingQueue; import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.protocol.ftp.FTPClient; import net.yacy.cora.protocol.http.HTTPClient; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.order.Base64Order; /** * a protocol scanner @@ -49,13 +48,14 @@ public class 
Scanner extends Thread { private static final MultiProtocolURI POISONURI = new MultiProtocolURI(); private static final Object PRESENT = new Object(); - public static Map intranetURLs = new TreeMap(Base64Order.enhancedCoder); // deprecated - public static Collection scancache = new ArrayList(1); + public static enum Access {unknown, empty, granted, denied;} + + public static Map scancache = new TreeMap(); private int runnerCount; private List scanrange; private BlockingQueue scanqueue; - private Map services; + private Map services; private Map runner; private int timeout; @@ -64,7 +64,7 @@ public class Scanner extends Thread { this.scanrange = new ArrayList(); this.scanrange.add(scanrange); this.scanqueue = new LinkedBlockingQueue(); - this.services = Collections.synchronizedMap(new TreeMap()); + this.services = Collections.synchronizedMap(new TreeMap()); this.runner = new ConcurrentHashMap(); this.timeout = timeout; } @@ -73,7 +73,7 @@ public class Scanner extends Thread { this.runnerCount = concurrentRunner; this.scanrange = scanrange; this.scanqueue = new LinkedBlockingQueue(); - this.services = Collections.synchronizedMap(new TreeMap()); + this.services = Collections.synchronizedMap(new TreeMap()); this.runner = new ConcurrentHashMap(); this.timeout = timeout; } @@ -127,7 +127,34 @@ public class Scanner extends Thread { try { if (TimeoutRequest.ping(this.uri, timeout)) { try { - services.put(new MultiProtocolURI(this.uri.getProtocol() + "://" + Domains.getHostName(InetAddress.getByName(this.uri.getHost())) + "/"), ""); + MultiProtocolURI uri = new MultiProtocolURI(this.uri.getProtocol() + "://" + Domains.getHostName(InetAddress.getByName(this.uri.getHost())) + "/"); + String protocol = uri.getProtocol(); + Access access = protocol.equals("http") || protocol.equals("https") ? 
Access.granted : Access.unknown; + services.put(uri, access); + if (access == Access.unknown) { + // ask the service if it lets us in + if (protocol.equals("ftp")) { + final FTPClient ftpClient = new FTPClient(); + try { + ftpClient.open(uri.getHost(), uri.getPort()); + ftpClient.login("anonymous", "anomic@"); + List list = ftpClient.list("/", false); + ftpClient.CLOSE(); + access = list == null || list.size() == 0 ? Access.empty : Access.granted; + } catch (IOException e) { + access = Access.denied; + } + } + if (protocol.equals("smb")) { + try { + String[] list = uri.list(); + access = list == null || list.length == 0 ? Access.empty : Access.granted; + } catch (IOException e) { + access = Access.denied; + } + } + } + if (access != Access.unknown) services.put(uri, access); } catch (MalformedURLException e) { e.printStackTrace(); } catch (UnknownHostException e) { @@ -196,8 +223,8 @@ public class Scanner extends Thread { return c; } - public Collection services() { - return this.services.keySet(); + public Map services() { + return this.services; } public static void main(String[] args) { @@ -209,7 +236,7 @@ public class Scanner extends Thread { scanner.addSMB(false); scanner.start(); scanner.terminate(); - for (MultiProtocolURI service: scanner.services()) { + for (MultiProtocolURI service: scanner.services().keySet()) { System.out.println(service.toNormalform(true, false)); } try { diff --git a/source/net/yacy/cora/protocol/ftp/FTPClient.java b/source/net/yacy/cora/protocol/ftp/FTPClient.java index 289425902..d2ba01e7e 100644 --- a/source/net/yacy/cora/protocol/ftp/FTPClient.java +++ b/source/net/yacy/cora/protocol/ftp/FTPClient.java @@ -1336,7 +1336,7 @@ public class FTPClient { log.info("---- ^---^---^---^---^---^---^---^---^---^---^---^---^---^---^---^---^---^---^"); } - private List list(final String path, final boolean extended) throws IOException { + public List list(final String path, final boolean extended) throws IOException { createDataSocket(); // 
send command to the control port @@ -2364,7 +2364,7 @@ public class FTPClient { * @param password * @throws IOException */ - private void login(final String account, final String password) throws IOException { + public void login(final String account, final String password) throws IOException { unsetLoginData(); // send user name diff --git a/source/net/yacy/kelondro/data/meta/DigestURI.java b/source/net/yacy/kelondro/data/meta/DigestURI.java index de10e0cd7..777da510c 100644 --- a/source/net/yacy/kelondro/data/meta/DigestURI.java +++ b/source/net/yacy/kelondro/data/meta/DigestURI.java @@ -314,7 +314,7 @@ public class DigestURI extends MultiProtocolURI implements Serializable { * checks for local/global IP range and local IP */ public final boolean isLocal() { - if (this.isSMB() || this.isFile()) return true; + if (this.isFile()) return true; if (this.hash == null) synchronized (this) { // this is synchronized because another thread may also call the same method in between // that is the reason that this.hash is checked again