From e63896f2a811f81fceadd263844dcd4ce6f2e3ab Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 28 Sep 2010 12:18:54 +0000 Subject: [PATCH] added an intranet scanner and a servlet which shows all intranet addresses and an option to start a site-crawl for all these addresses at once. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7203 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CrawlStartIntranet_p.html | 65 +++++ htroot/CrawlStartIntranet_p.java | 120 ++++++++++ htroot/Table_API_p.java | 2 +- .../env/templates/submenuIndexCreate.template | 1 + source/de/anomic/data/WorkTables.java | 20 +- source/de/anomic/search/Switchboard.java | 17 +- source/net/yacy/cora/protocol/Scanner.java | 224 ++++++++++++++++++ 7 files changed, 442 insertions(+), 7 deletions(-) create mode 100644 htroot/CrawlStartIntranet_p.html create mode 100644 htroot/CrawlStartIntranet_p.java create mode 100644 source/net/yacy/cora/protocol/Scanner.java diff --git a/htroot/CrawlStartIntranet_p.html b/htroot/CrawlStartIntranet_p.html new file mode 100644 index 000000000..dbc8e663c --- /dev/null +++ b/htroot/CrawlStartIntranet_p.html @@ -0,0 +1,65 @@ + + + + YaCy '#[clientname]#': Intranet Crawl Start + #%env/templates/metas.template%# + + + + + #%env/templates/header.template%# + #%env/templates/submenuIndexCreate.template%# +

Intranet Crawl Start

+

+ When an index domain is configured to contain intranet links, + the intranet may be scanned for available servers. + Please select below the servers in your intranet that you want to fetch into the search index. +

+ + #(notintranet)#:: +

+ This network definition does not allow intranet links. + A list of intranet servers is only available if you configure YaCy to index intranet targets. + To do so, open the Basic Configuration servlet and select the 'Intranet Indexing' use case. +

+ #(/notintranet)# + + #(servertable)#:: +
+ + + + + + + + #{list}# + + + + #(process)#::#(/process)# + + #{/list}# +
URLProcess
#[url]#not in indexindexed
+

+ + +

+
+ #(/servertable)# + + + #%env/templates/footer.template%# + + diff --git a/htroot/CrawlStartIntranet_p.java b/htroot/CrawlStartIntranet_p.java new file mode 100644 index 000000000..c52de868d --- /dev/null +++ b/htroot/CrawlStartIntranet_p.java @@ -0,0 +1,120 @@ +/** + * CrawlStartIntranet_p + * Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany + * First released 28.10.2010 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + + +import java.io.IOException; +import java.util.Iterator; +import java.util.Map; + +import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.protocol.Scanner; +import net.yacy.kelondro.blob.Tables; +import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.logging.Log; + +import de.anomic.data.WorkTables; +import de.anomic.search.Switchboard; +import de.anomic.server.serverObjects; +import de.anomic.server.serverSwitch; + +public class CrawlStartIntranet_p { + + public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { + + final serverObjects prop = new serverObjects(); + final Switchboard sb = (Switchboard)env; + + prop.put("notintranet", 0); + prop.put("servertable", 0); + + // check if there is a intranet configuration + if (!sb.isIntranetMode()) { + prop.put("notintranet", 1); + return prop; + } + + // if there are no intranet addresses known, scan the net + if (sb.intranetURLs.size() == 0) { + Scanner scanner = new Scanner(100, 10); + scanner.addFTP(false); + scanner.addHTTP(false); + scanner.addHTTPS(false); + scanner.addSMB(false); + scanner.start(); + scanner.terminate(); + DigestURI url; + for (MultiProtocolURI service: scanner.services()) { + url = new DigestURI(service); + sb.intranetURLs.put(url.hash(), url); + } + } + + // check crawl request + if (post != null && post.containsKey("crawl")) { + for (Map.Entry entry: post.entrySet()) { + if (entry.getValue().startsWith("mark_")) { + byte [] pk = entry.getValue().substring(5).getBytes(); + DigestURI url = sb.intranetURLs.get(pk); + if (url != null) { + String path = 
"/Crawler_p.html?createBookmark=off&xsstopw=off&crawlingDomMaxPages=10000&intention=&range=domain&indexMedia=on&recrawl=nodoubles&xdstopw=off&storeHTCache=on&sitemapURL=&repeat_time=7&crawlingQ=on&cachePolicy=iffresh&indexText=on&crawlingMode=url&mustnotmatch=&crawlingDomFilterDepth=1&crawlingDomFilterCheck=off&crawlingstart=Start%20New%20Crawl&xpstopw=off&repeat_unit=seldays&crawlingDepth=99"; + path += "&crawlingURL=" + url.toNormalform(true, false); + WorkTables.execAPICall("localhost", (int) sb.getConfigLong("port", 8080), sb.getConfig("adminAccountBase64MD5", ""), path, pk); + } + } + } + } + + // show server table + prop.put("servertable", 1); + int i = 0; + String urlString; + for (final DigestURI url: sb.intranetURLs.values()) { + urlString = url.toNormalform(true, false); + prop.put("servertable_list_" + i + "_pk", new String(url.hash())); + prop.put("servertable_list_" + i + "_count", i); + prop.putHTML("servertable_list_" + i + "_url", urlString); + prop.put("servertable_list_" + i + "_process", inIndex(sb, urlString) == null ? 
0 : 1); + i++; + } + prop.put("servertable_list", i); + prop.put("servertable_num", i); + return prop; + } + + private static byte[] inIndex(Switchboard sb, String url) { + Iterator i; + try { + i = sb.tables.iterator(WorkTables.TABLE_API_NAME); + Tables.Row row; + String comment; + while (i.hasNext()) { + row = i.next(); + comment = new String(row.get(WorkTables.TABLE_API_COL_COMMENT)); + if (comment.contains(url)) return row.getPK(); + } + return null; + } catch (IOException e) { + Log.logException(e); + return null; + } + } + +} diff --git a/htroot/Table_API_p.java b/htroot/Table_API_p.java index be3fb0c47..537f28b8c 100644 --- a/htroot/Table_API_p.java +++ b/htroot/Table_API_p.java @@ -144,7 +144,7 @@ public class Table_API_p { } // now call the api URLs and store the result status - Map l = sb.tables.execAPICall(pks, "localhost", (int) sb.getConfigLong("port", 8080), sb.getConfig("adminAccountBase64MD5", "")); + Map l = sb.tables.execAPICalls("localhost", (int) sb.getConfigLong("port", 8080), sb.getConfig("adminAccountBase64MD5", ""), pks); // construct result table prop.put("showexec", l.size() > 0 ? 
1 : 0); diff --git a/htroot/env/templates/submenuIndexCreate.template b/htroot/env/templates/submenuIndexCreate.template index 9717cf44b..3fb9d82e2 100644 --- a/htroot/env/templates/submenuIndexCreate.template +++ b/htroot/env/templates/submenuIndexCreate.template @@ -7,6 +7,7 @@ diff --git a/source/de/anomic/data/WorkTables.java b/source/de/anomic/data/WorkTables.java index 1f8d9f943..0032808b4 100644 --- a/source/de/anomic/data/WorkTables.java +++ b/source/de/anomic/data/WorkTables.java @@ -193,7 +193,7 @@ public class WorkTables extends Tables { * @param realm authentification realm * @return a map of the called urls and the http status code of the api call or -1 if any other IOException occurred */ - public Map execAPICall(Collection pks, String host, int port, String realm) { + public Map execAPICalls(String host, int port, String realm, Collection pks) { // now call the api URLs and store the result status final HTTPClient client = new HTTPClient(); client.setRealm(realm); @@ -222,6 +222,22 @@ public class WorkTables extends Tables { return l; } + public static int execAPICall(String host, int port, String realm, String path, byte[] pk) { + // now call the api URLs and store the result status + final HTTPClient client = new HTTPClient(); + client.setRealm(realm); + client.setTimout(120000); + String url = "http://" + host + ":" + port + path; + if (pk != null) url += "&" + WorkTables.TABLE_API_COL_APICALL_PK + "=" + new String(pk); + try { + client.GETbytes(url); + return client.getStatusCode(); + } catch (IOException e) { + Log.logException(e); + return -1; + } + } + /** * simplified call to execute a single entry in the api database table * @param pk the primary key of the entry @@ -233,7 +249,7 @@ public class WorkTables extends Tables { public int execAPICall(String pk, String host, int port, String realm) { ArrayList pks = new ArrayList(); pks.add(pk); - Map m = execAPICall(pks, host, port, realm); + Map m = execAPICalls(host, port, realm, pks); if 
(m.isEmpty()) return -1; return m.values().iterator().next().intValue(); } diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 2e2468e67..09528623d 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -228,6 +228,7 @@ public final class Switchboard extends serverSwitch { public LinkedBlockingQueue trail; public yacySeedDB peers; public WorkTables tables; + public TreeMap intranetURLs = new TreeMap(Base64Order.enhancedCoder); public WorkflowProcessor indexingDocumentProcessor; public WorkflowProcessor indexingCondensementProcessor; @@ -551,9 +552,9 @@ public final class Switchboard extends serverSwitch { this.crawler, this.indexSegments.segment(Segments.Process.LOCALCRAWLING), this.peers, - "local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0, - "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0); - + isIntranetMode(), + isGlobalMode()); // Intranet and Global mode may be both true! 
+ // initializing dht chunk generation this.dhtMaxReferenceCount = (int) getConfigLong(SwitchboardConstants.INDEX_DIST_CHUNK_SIZE_START, 50); @@ -993,6 +994,14 @@ public final class Switchboard extends serverSwitch { return sb; } + public boolean isIntranetMode() { + return "local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0; + } + + public boolean isGlobalMode() { + return "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0; + } + public boolean isRobinsonMode() { // we are in robinson mode, if we do not exchange index by dht distribution // we need to take care that search requests and remote indexing requests go only @@ -1523,7 +1532,7 @@ public final class Switchboard extends serverSwitch { Log.logException(e); continue; } - Map callResult = this.tables.execAPICall(pks, "localhost", (int) this.getConfigLong("port", 8080), this.getConfig("adminAccountBase64MD5", "")); + Map callResult = this.tables.execAPICalls("localhost", (int) this.getConfigLong("port", 8080), this.getConfig("adminAccountBase64MD5", ""), pks); for (Map.Entry call: callResult.entrySet()) { log.logInfo("Scheduler executed api call, response " + call.getValue() + ": " + call.getKey()); } diff --git a/source/net/yacy/cora/protocol/Scanner.java b/source/net/yacy/cora/protocol/Scanner.java new file mode 100644 index 000000000..ced8aabad --- /dev/null +++ b/source/net/yacy/cora/protocol/Scanner.java @@ -0,0 +1,224 @@ +/** + * Scanner + * Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany + * First released 28.10.2010 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.cora.protocol; + +import java.io.IOException; +import java.net.InetAddress; +import java.net.InetSocketAddress; +import java.net.MalformedURLException; +import java.net.Socket; +import java.net.UnknownHostException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.LinkedBlockingQueue; + +import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.protocol.http.HTTPClient; +import net.yacy.kelondro.logging.Log; + +/** + * a protocol scanner + * scans given ip's for existing http, https, ftp and smb services + */ +public class Scanner extends Thread { + + private static final MultiProtocolURI POISONURI = new MultiProtocolURI(); + private static final Object PRESENT = new Object(); + + private int runnerCount; + private List a; + private BlockingQueue scanqueue; + private Map services; + private Map runner; + private int timeout; + + public Scanner(int concurrentRunner, int timeout) { + this.runnerCount = concurrentRunner; + this.a = Domains.myIntranetIPs(); + this.scanqueue = new LinkedBlockingQueue(); + this.services = Collections.synchronizedMap(new TreeMap()); + this.runner = new ConcurrentHashMap(); + this.timeout = timeout; + } + + public void run() { + MultiProtocolURI uri; + try { + while ((uri = scanqueue.take()) != POISONURI) { + while (runner.size() >= this.runnerCount) { + /*for (Runner 
r: runner.keySet()) { + if (r.age() > 3000) synchronized(r) { r.interrupt(); } + }*/ + if (runner.size() >= this.runnerCount) Thread.sleep(1000); + } + Runner runner = new Runner(uri); + this.runner.put(runner, PRESENT); + runner.start(); + } + } catch (InterruptedException e) { + Log.logException(e); + } + } + + private final List genlist(boolean bigrange) { + ArrayList c = new ArrayList(10); + for (InetAddress i: a) { + for (int br = bigrange ? 1 : i.getAddress()[2]; br < (bigrange ? 255 : i.getAddress()[2] + 1); br++) { + for (int j = 1; j < 255; j++) { + byte[] address = i.getAddress(); + address[2] = (byte) br; + address[3] = (byte) j; + try { + c.add(InetAddress.getByAddress(address)); + } catch (UnknownHostException e) { + } + } + } + } + return c; + } + + public void addHTTP(boolean bigrange) { + addProtocol("http", bigrange); + } + + public void addHTTPS(boolean bigrange) { + addProtocol("https", bigrange); + } + + public void addSMB(boolean bigrange) { + addProtocol("smb", bigrange); + } + + public void addFTP(boolean bigrange) { + addProtocol("ftp", bigrange); + } + + private void addProtocol(String protocol, boolean bigrange) { + for (InetAddress i: genlist(bigrange)) { + try { + this.scanqueue.put(new MultiProtocolURI(protocol + "://" + i.getHostAddress() + "/")); + } catch (MalformedURLException e) { + Log.logException(e); + } catch (InterruptedException e) { + Log.logException(e); + } + } + } + + public int pending() { + return this.scanqueue.size(); + } + + public void terminate() { + for (int i = 0; i < runnerCount; i++) try { + this.scanqueue.put(POISONURI); + } catch (InterruptedException e) { + } + try { + this.join(); + } catch (InterruptedException e) { + } + } + + public class Runner extends Thread { + private MultiProtocolURI uri; + private long starttime; + public Runner(MultiProtocolURI uri) { + this.uri = uri; + this.starttime = System.currentTimeMillis(); + } + public void run() { + if (ping(this.uri, timeout)) { + services.put(this.uri, 
""); + /* + try { + byte[] b = this.uri.get(MultiProtocolURI.yacybotUserAgent, timeout); + if (b != null) services.put(this.uri, ""); + } catch (Exception e) { + // try a list + try { + String[] l = this.uri.list(); + if (l != null) services.put(this.uri, ""); + } catch (Exception e1) { + // this just failed. do nothing + } + } + */ + } + Object r = runner.remove(this); + assert r != null; + } + public long age() { + return System.currentTimeMillis() - this.starttime; + } + public boolean equals(Object o) { + return (o instanceof Runner) && this.uri.toNormalform(true, false).equals(((Runner) o).uri.toNormalform(true, false)); + } + public int hashCode() { + return this.uri.hashCode(); + } + } + + private static boolean ping(MultiProtocolURI uri, int timeout) { + try { + Socket socket = new Socket(); + socket.connect(new InetSocketAddress(Domains.dnsResolve(uri.getHost()), uri.getPort()), timeout); + if (socket.isConnected()) { + socket.close(); + return true; + } + return false; + } catch (UnknownHostException e) { + return false; + } catch (IOException e) { + return false; + } + } + + public Collection services() { + return this.services.keySet(); + } + + public static void main(String[] args) { + //try {System.out.println("192.168.1.91: " + ping(new MultiProtocolURI("smb://192.168.1.91/"), 1000));} catch (MalformedURLException e) {} + Scanner scanner = new Scanner(100, 10); + scanner.addFTP(false); + scanner.addHTTP(false); + scanner.addHTTPS(false); + scanner.addSMB(false); + scanner.start(); + scanner.terminate(); + for (MultiProtocolURI service: scanner.services()) { + System.out.println(service.toNormalform(true, false)); + } + try { + HTTPClient.closeConnectionManager(); + } catch (InterruptedException e) { + } + } +}