- removed old intranet scanner (the generic scanner now completely subsumes the old one)

- added information about granted access
- enhanced servlet design
- added submit-feedback (because it is a long-running task)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7372 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent acab6801d9
commit 99a7fe87f9

@ -1,67 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Intranet Crawl Start</title>
#%env/templates/metas.template%#
<script type="text/javascript">
<!--
// Inverts the checked state of every "item_<n>" checkbox in the form called
// `name`. The number of candidate checkboxes is read from the form's "num"
// field; indices without a matching element are skipped.
function setall(name) {
    var selectForm = document.forms.namedItem(name);
    var elements = selectForm.elements;
    var count = elements["num"].value;
    // the index is declared with var so it does not become an implicit global
    for (var i = 0; i < count; i++) {
        var box = elements["item_" + i];
        if (box == null) continue;
        box.checked = !box.checked;
    }
}
-->
</script>
<script type="text/javascript" src="/js/sorttable.js"></script>
</head>
<body id="IndexCreate">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>Intranet Crawl Start</h2>
<p>
When an index domain is configured to contain intranet links,
the intranet may be scanned for available servers.
Please select below the servers in your intranet that you want to fetch into the search index.
</p>
#(notintranet)#::
<p class="error">
This network definition does not allow intranet links.
A list of intranet servers is only available if you configure YaCy to index intranet targets.
To do so, open the <a href="ConfigBasic.html">Basic Configuration</a> servlet and select the 'Intranet Indexing' use case.
</p>
#(/notintranet)#
#(servertable)#::
<form id="servertable" name="servertable" action="CrawlStartIntranet_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8" ><fieldset>
<legend><label for="servertable">Available Intranet Server</label></legend>
<table class="sortable" border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">
<td><input type="checkbox" name="allswitch" onclick="setall(this.form.name)" /></td>
<td>IP</td>
<td>URL</td>
<td>Process</td>
</tr>
#{list}#
<tr class="TableCellLight">
<td align="left"><input type="checkbox" name="item_#[count]#" value="mark_#[pk]#" /></td>
<td><a href="#[url]#">#[ip]#</a></td>
<td><a href="#[url]#">#[url]#</a></td>
#(process)#<td class="error">not in index</td>::<td class="commit">indexed</td>#(/process)#
</tr>
#{/list}#
</table>
<p>
<input type="hidden" name="num" value="#[num]#" />
<input type="submit" name="crawl" value="Add Selected Servers to Crawler" />
</p>
</fieldset></form>
#(/servertable)#
#%env/templates/footer.template%#
</body>
</html>

@ -1,122 +0,0 @@
/**
* CrawlStartIntranet_p
* Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 28.10.2010 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.Scanner;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import de.anomic.data.WorkTables;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
/**
 * Servlet behind CrawlStartIntranet_p.html: lists the services stored in
 * {@code Scanner.intranetURLs} and starts a crawl for every entry the user
 * marked in the submitted form.
 */
public class CrawlStartIntranet_p {

    /** Crawl start API call without its target; "&amp;crawlingURL=..." is appended per selected server. */
    private static final String CRAWL_API_PATH = "/Crawler_p.html?createBookmark=off&xsstopw=off&crawlingDomMaxPages=10000&intention=&range=domain&indexMedia=on&recrawl=nodoubles&xdstopw=off&storeHTCache=on&sitemapURL=&repeat_time=7&crawlingQ=on&cachePolicy=iffresh&indexText=on&crawlingMode=url&mustnotmatch=&crawlingDomFilterDepth=1&crawlingDomFilterCheck=off&crawlingstart=Start%20New%20Crawl&xpstopw=off&repeat_unit=seldays&crawlingDepth=99";

    /**
     * Builds the template properties for the page.
     *
     * @param header request header (not used by this servlet)
     * @param post   submitted form values, or {@code null} if nothing was submitted
     * @param env    the server environment; cast to {@link Switchboard}
     * @return properties with "notintranet" set when intranet mode is off,
     *         otherwise with the "servertable" section filled
     */
    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
        final serverObjects prop = new serverObjects();
        final Switchboard sb = (Switchboard) env;
        prop.put("notintranet", 0);
        prop.put("servertable", 0);

        // check if there is an intranet configuration
        if (!sb.isIntranetMode()) {
            prop.put("notintranet", 1);
            return prop;
        }

        // if there are no intranet addresses known, scan the net
        if (Scanner.intranetURLs.size() == 0) {
            scanIntranet();
        }

        // check crawl request
        if (post != null && post.containsKey("crawl")) {
            startSelectedCrawls(sb, post);
        }

        // show server table
        fillServerTable(sb, prop);
        return prop;
    }

    /** Runs a scan for FTP, HTTP, HTTPS and SMB services and stores every hit in {@code Scanner.intranetURLs}. */
    private static void scanIntranet() {
        final Scanner scanner = new Scanner(100, 10);
        scanner.addFTP(false);
        scanner.addHTTP(false);
        scanner.addHTTPS(false);
        scanner.addSMB(false);
        scanner.start();
        scanner.terminate();
        for (final MultiProtocolURI service : scanner.services()) {
            final DigestURI url = new DigestURI(service);
            Scanner.intranetURLs.put(url.hash(), url);
        }
    }

    /** Issues one crawl start API call for each submitted "mark_&lt;pk&gt;" value that matches a known server. */
    private static void startSelectedCrawls(final Switchboard sb, final serverObjects post) {
        for (final Map.Entry<String, String> entry : post.entrySet()) {
            final String value = entry.getValue();
            // skip form fields that are not server selections (and tolerate missing values)
            if (value == null || !value.startsWith("mark_")) continue;
            final byte[] pk = value.substring(5).getBytes();
            final DigestURI url = Scanner.intranetURLs.get(pk);
            if (url == null) continue;
            final String path = CRAWL_API_PATH + "&crawlingURL=" + url.toNormalform(true, false);
            WorkTables.execAPICall("localhost", (int) sb.getConfigLong("port", 8080), sb.getConfig("adminAccountBase64MD5", ""), path, pk);
        }
    }

    /** Writes one "servertable_list_&lt;i&gt;_*" row per known server plus the row counters. */
    private static void fillServerTable(final Switchboard sb, final serverObjects prop) {
        prop.put("servertable", 1);
        int i = 0;
        for (final DigestURI url : Scanner.intranetURLs.values()) {
            final String prefix = "servertable_list_" + i + "_";
            final String urlString = url.toNormalform(true, false);
            prop.put(prefix + "pk", new String(url.hash()));
            prop.put(prefix + "count", i);
            prop.putHTML(prefix + "ip", hostAddress(url.getHost()));
            prop.putHTML(prefix + "url", urlString);
            prop.put(prefix + "process", inIndex(sb, urlString) == null ? 0 : 1);
            i++;
        }
        prop.put("servertable_list", i);
        prop.put("servertable_num", i);
    }

    /**
     * Resolves a host to its IP address for display.
     * NOTE(review): assumes Domains.dnsResolve may return null for hosts that
     * cannot be resolved - confirm; in that case the host name itself is shown
     * instead of failing the whole page.
     */
    private static String hostAddress(final String host) {
        final java.net.InetAddress address = Domains.dnsResolve(host);
        if (address != null) return address.getHostAddress();
        return host == null ? "" : host;
    }

    /**
     * Looks for an API table row whose comment contains the given url.
     *
     * @param sb  the switchboard holding the tables
     * @param url normalized url to search for
     * @return the primary key of the first matching row, or {@code null} if
     *         there is none or the table could not be read
     */
    private static byte[] inIndex(Switchboard sb, String url) {
        try {
            final Iterator<Tables.Row> rows = sb.tables.iterator(WorkTables.TABLE_API_NAME);
            while (rows.hasNext()) {
                final Tables.Row row = rows.next();
                final String comment = new String(row.get(WorkTables.TABLE_API_COL_COMMENT));
                if (comment.contains(url)) return row.getPK();
            }
        } catch (IOException e) {
            Log.logException(e);
        }
        return null;
    }
}

@ -35,24 +35,32 @@
No servers had been detected in the given IP range #[iprange]#. Please enter a different IP range for another scan.
</p>
#(/noserverdetected)#
#(enterrange)#::
<p>
<form id="enterrange" name="enterrange" action="CrawlStartScanner_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8" ><fieldset>
<legend><label for="servertable">Enter IP Range for Scanner</label></legend>
<div><table border="0"><tr valign="top">
<td width="270"><form onSubmit="document.getElementById('scanipinfo').innerHTML='Please wait...'" action="CrawlStartScanner_p.html" method="get">
<fieldset height="30">
<legend><label for="servertable">Scan with given IP range</label></legend><br>
<input type="text" name="ip4-0" value="#[ip4-0]#" size="3" maxlength="3" />.
<input type="text" name="ip4-1" value="#[ip4-1]#" size="3" maxlength="3" />.
<input type="text" name="ip4-2" value="#[ip4-2]#" size="3" maxlength="3" />.[1-254]
<input type="submit" name="scanip" value="Scan this IP range for services" />
</fieldset></form>
<form id="enterrange" name="enterrange" action="CrawlStartScanner_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8" ><fieldset>
<legend><label for="servertable">Enter Host Name for Scanner</label></legend>
<input type="submit" name="scanip" value="Scan"/><br><br><div class="error" id="scanipinfo"></div>
</fieldset></form></td>
<td width="270"><form onSubmit="document.getElementById('scanhostinfo').innerHTML='Please wait...'" action="CrawlStartScanner_p.html" method="get">
<fieldset>
<legend><label for="servertable">Scan range with given host name</label></legend><br>
<input type="text" name="host" value="#[host]#" size="28" maxlength="60" />
<input type="submit" name="scanhost" value="Scan this Host for services" />
</fieldset></form>
</p>
<input type="submit" name="scanhost" value="Scan" /><br><br><div class="error" id="scanhostinfo"></div>
</fieldset></form></td>
<td><form onSubmit="document.getElementById('scanintranetinfo').innerHTML='Please wait...'" action="CrawlStartScanner_p.html" method="get">
<fieldset>
<legend><label for="servertable">Full Intranet Scan</label></legend>
#(intranetHint)#::<div class="warning">Do not use intranet scan results, you are not in an intranet environment!</div>#(/intranetHint)#
#[intranethosts]#&nbsp;<input type="submit" name="scanintranet" value="Scan" /><div class="error" id="scanintranetinfo"></div>
</fieldset></form></td>
</tr></table></div>
#(/enterrange)#
#(servertable)#::
<p>
The following servers had been detected:
@ -65,15 +73,20 @@
<td>Protocol</td>
<td>IP</td>
<td>URL</td>
<td>Access</td>
<td>Process</td>
</tr>
#{list}#
<tr class="TableCellLight">
<td align="left"><input type="checkbox" name="item_#[count]#" value="mark_#[pk]#" /></td>
<td align="left"><input type="checkbox" name="item_#[count]#" value="mark_#[pk]#" #(preselected)#::checked#(/preselected)#/></td>
<td>#[protocol]#</td>
<td><a href="#[url]#">#[ip]#</a></td>
<td><a href="#[url]#">#[url]#</a></td>
#(process)#<td class="error">not in index</td>::<td class="commit">indexed</td>#(/process)#
#(accessUnknown)#::<td class="info">unknown</td>#(/accessUnknown)#
#(accessEmpty)#::<td class="info">empty</td>#(/accessEmpty)#
#(accessGranted)#::<td class="commit">granted</td>#(/accessGranted)#
#(accessDenied)#::<td class="error">denied</td>#(/accessDenied)#
#(process)#<td class="info">not in index</td>::<td class="commit">indexed</td>#(/process)#
</tr>
#{/list}#
</table>

@ -22,6 +22,7 @@
import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@ -31,6 +32,7 @@ import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.Scanner;
import net.yacy.cora.protocol.Scanner.Access;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
@ -64,7 +66,6 @@ public class CrawlStartScanner_p {
// case: an IP range was given; scan the range for services and display result
if (post.containsKey("scanip") || post.containsKey("scanhost")) {
addSelectIPRange(sb, prop);
InetAddress ia;
try {
if (post.containsKey("scanip")) {
@ -80,16 +81,28 @@ public class CrawlStartScanner_p {
scanner.addSMB(false);
scanner.start();
scanner.terminate();
Scanner.scancache = scanner.services();
enlargeScancache(sb, scanner);
addScantable(sb, prop);
} catch (UnknownHostException e) {}
}
if (post.containsKey("scanintranet")) {
Scanner scanner = new Scanner(Domains.myIntranetIPs(), 100, sb.isIntranetMode() ? 100 : 3000);
scanner.addFTP(false);
scanner.addHTTP(false);
scanner.addHTTPS(false);
scanner.addSMB(false);
scanner.start();
scanner.terminate();
enlargeScancache(sb, scanner);
addScantable(sb, prop);
}
// check crawl request
if (post != null && post.containsKey("crawl")) {
// make a pk/url mapping
Map<byte[], DigestURI> pkmap = new TreeMap<byte[], DigestURI>(Base64Order.enhancedCoder);
for (MultiProtocolURI u: Scanner.scancache) {
for (MultiProtocolURI u: Scanner.scancache.keySet()) {
DigestURI uu = new DigestURI(u);
pkmap.put(uu.hash(), uu);
}
@ -112,8 +125,10 @@ public class CrawlStartScanner_p {
private static void addSelectIPRange(Switchboard sb, serverObjects prop) {
InetAddress ip;
List<InetAddress> ips = Domains.myIntranetIPs();
prop.put("enterrange_intranethosts", ips.toString());
prop.put("enterrange_intranetHint", 0);
if (sb.isIntranetMode()) {
List<InetAddress> ips = Domains.myIntranetIPs();
if (ips.size() > 0) ip = ips.get(0); else try {
ip = InetAddress.getByName("192.168.0.1");
} catch (UnknownHostException e) {
@ -121,6 +136,7 @@ public class CrawlStartScanner_p {
e.printStackTrace();
}
} else {
prop.put("enterrange_intranetHint", 1);
ip = Domains.myPublicLocalIP();
}
addSelectIPRange(ip, prop);
@ -139,25 +155,55 @@ public class CrawlStartScanner_p {
if (Scanner.scancache.size() > 0) {
// show scancache table
prop.put("servertable", 1);
int i = 0;
String urlString;
DigestURI u;
for (final MultiProtocolURI url: Scanner.scancache) {
u = new DigestURI(url);
urlString = u.toNormalform(true, false);
prop.put("servertable_list_" + i + "_pk", new String(u.hash()));
prop.put("servertable_list_" + i + "_count", i);
prop.putHTML("servertable_list_" + i + "_protocol", u.getProtocol());
prop.putHTML("servertable_list_" + i + "_ip", Domains.dnsResolve(u.getHost()).getHostAddress());
prop.putHTML("servertable_list_" + i + "_url", urlString);
prop.put("servertable_list_" + i + "_process", inIndex(sb, urlString) == null ? 0 : 1);
i++;
table: while (true) {
try {
int i = 0;
for (final Map.Entry<MultiProtocolURI, Scanner.Access> host: Scanner.scancache.entrySet()) {
u = new DigestURI(host.getKey());
urlString = u.toNormalform(true, false);
prop.put("servertable_list_" + i + "_pk", new String(u.hash()));
prop.put("servertable_list_" + i + "_count", i);
prop.putHTML("servertable_list_" + i + "_protocol", u.getProtocol());
prop.putHTML("servertable_list_" + i + "_ip", Domains.dnsResolve(u.getHost()).getHostAddress());
prop.putHTML("servertable_list_" + i + "_url", urlString);
prop.put("servertable_list_" + i + "_accessUnknown", host.getValue() == Access.unknown ? 1 : 0);
prop.put("servertable_list_" + i + "_accessEmpty", host.getValue() == Access.empty ? 1 : 0);
prop.put("servertable_list_" + i + "_accessGranted", host.getValue() == Access.granted ? 1 : 0);
prop.put("servertable_list_" + i + "_accessDenied", host.getValue() == Access.denied ? 1 : 0);
prop.put("servertable_list_" + i + "_process", inIndex(sb, urlString) == null ? 0 : 1);
prop.put("servertable_list_" + i + "_preselected", interesting(sb, u, host.getValue()) ? 1 : 0);
i++;
}
prop.put("servertable_list", i);
prop.put("servertable_num", i);
break table;
} catch (ConcurrentModificationException e) {
continue table;
}
}
prop.put("servertable_list", i);
prop.put("servertable_num", i);
}
}
/**
 * Merges the results of a finished scan into {@code Scanner.scancache}.
 * If no cache exists yet, the scanner's service map becomes the cache.
 * Otherwise every cached entry that is not {@code interesting(...)} is
 * removed first, and the new results are then added on top.
 */
private static void enlargeScancache(Switchboard sb, Scanner scanner) {
    if (Scanner.scancache != null) {
        // prune through the iterator so entries can be removed while walking the map
        for (Iterator<Map.Entry<MultiProtocolURI, Access>> it = Scanner.scancache.entrySet().iterator(); it.hasNext();) {
            final Map.Entry<MultiProtocolURI, Access> cached = it.next();
            final boolean keep = interesting(sb, cached.getKey(), cached.getValue());
            if (!keep) it.remove();
        }
        Scanner.scancache.putAll(scanner.services());
    } else {
        Scanner.scancache = scanner.services();
    }
}
/**
 * Tells whether a scanned service is a crawl candidate: access was granted,
 * the protocol is smb or ftp, and {@code inIndex} finds no entry for its
 * normalized url.
 *
 * @param sb     the switchboard handed to {@code inIndex}
 * @param uri    the scanned service
 * @param access the access state recorded for the service
 * @return {@code true} only if all three conditions hold
 */
private static boolean interesting(Switchboard sb, MultiProtocolURI uri, Access access) {
    // Cheap checks come first so that the inIndex() lookup, which opens a table
    // iterator (presumably a full scan - confirm), only runs for real candidates.
    if (access != Access.granted) return false;
    final String protocol = uri.getProtocol();
    // constant-first equals keeps a missing protocol from raising a NullPointerException
    if (!"smb".equals(protocol) && !"ftp".equals(protocol)) return false;
    return inIndex(sb, uri.toNormalform(true, false)) == null;
}
private static byte[] inIndex(Switchboard sb, String url) {
Iterator<Tables.Row> i;
try {

@ -20,11 +20,11 @@
package net.yacy.cora.protocol;
import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
@ -35,10 +35,9 @@ import java.util.concurrent.ExecutionException;
import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
/**
* a protocol scanner
@ -49,13 +48,14 @@ public class Scanner extends Thread {
private static final MultiProtocolURI POISONURI = new MultiProtocolURI();
private static final Object PRESENT = new Object();
public static Map<byte[], DigestURI> intranetURLs = new TreeMap<byte[], DigestURI>(Base64Order.enhancedCoder); // deprecated
public static Collection<MultiProtocolURI> scancache = new ArrayList<MultiProtocolURI>(1);
public static enum Access {unknown, empty, granted, denied;}
public static Map<MultiProtocolURI, Access> scancache = new TreeMap<MultiProtocolURI, Access>();
private int runnerCount;
private List<InetAddress> scanrange;
private BlockingQueue<MultiProtocolURI> scanqueue;
private Map<MultiProtocolURI, String> services;
private Map<MultiProtocolURI, Access> services;
private Map<Runner, Object> runner;
private int timeout;
@ -64,7 +64,7 @@ public class Scanner extends Thread {
this.scanrange = new ArrayList<InetAddress>();
this.scanrange.add(scanrange);
this.scanqueue = new LinkedBlockingQueue<MultiProtocolURI>();
this.services = Collections.synchronizedMap(new TreeMap<MultiProtocolURI, String>());
this.services = Collections.synchronizedMap(new TreeMap<MultiProtocolURI, Access>());
this.runner = new ConcurrentHashMap<Runner, Object>();
this.timeout = timeout;
}
@ -73,7 +73,7 @@ public class Scanner extends Thread {
this.runnerCount = concurrentRunner;
this.scanrange = scanrange;
this.scanqueue = new LinkedBlockingQueue<MultiProtocolURI>();
this.services = Collections.synchronizedMap(new TreeMap<MultiProtocolURI, String>());
this.services = Collections.synchronizedMap(new TreeMap<MultiProtocolURI, Access>());
this.runner = new ConcurrentHashMap<Runner, Object>();
this.timeout = timeout;
}
@ -127,7 +127,34 @@ public class Scanner extends Thread {
try {
if (TimeoutRequest.ping(this.uri, timeout)) {
try {
services.put(new MultiProtocolURI(this.uri.getProtocol() + "://" + Domains.getHostName(InetAddress.getByName(this.uri.getHost())) + "/"), "");
MultiProtocolURI uri = new MultiProtocolURI(this.uri.getProtocol() + "://" + Domains.getHostName(InetAddress.getByName(this.uri.getHost())) + "/");
String protocol = uri.getProtocol();
Access access = protocol.equals("http") || protocol.equals("https") ? Access.granted : Access.unknown;
services.put(uri, access);
if (access == Access.unknown) {
// ask the service if it lets us in
if (protocol.equals("ftp")) {
final FTPClient ftpClient = new FTPClient();
try {
ftpClient.open(uri.getHost(), uri.getPort());
ftpClient.login("anonymous", "anomic@");
List<String> list = ftpClient.list("/", false);
ftpClient.CLOSE();
access = list == null || list.size() == 0 ? Access.empty : Access.granted;
} catch (IOException e) {
access = Access.denied;
}
}
if (protocol.equals("smb")) {
try {
String[] list = uri.list();
access = list == null || list.length == 0 ? Access.empty : Access.granted;
} catch (IOException e) {
access = Access.denied;
}
}
}
if (access != Access.unknown) services.put(uri, access);
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (UnknownHostException e) {
@ -196,8 +223,8 @@ public class Scanner extends Thread {
return c;
}
public Collection<MultiProtocolURI> services() {
return this.services.keySet();
public Map<MultiProtocolURI, Access> services() {
return this.services;
}
public static void main(String[] args) {
@ -209,7 +236,7 @@ public class Scanner extends Thread {
scanner.addSMB(false);
scanner.start();
scanner.terminate();
for (MultiProtocolURI service: scanner.services()) {
for (MultiProtocolURI service: scanner.services().keySet()) {
System.out.println(service.toNormalform(true, false));
}
try {

@ -1336,7 +1336,7 @@ public class FTPClient {
log.info("---- ^---^---^---^---^---^---^---^---^---^---^---^---^---^---^---^---^---^---^");
}
private List<String> list(final String path, final boolean extended) throws IOException {
public List<String> list(final String path, final boolean extended) throws IOException {
createDataSocket();
// send command to the control port
@ -2364,7 +2364,7 @@ public class FTPClient {
* @param password
* @throws IOException
*/
private void login(final String account, final String password) throws IOException {
public void login(final String account, final String password) throws IOException {
unsetLoginData();
// send user name

@ -314,7 +314,7 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
* checks for local/global IP range and local IP
*/
public final boolean isLocal() {
if (this.isSMB() || this.isFile()) return true;
if (this.isFile()) return true;
if (this.hash == null) synchronized (this) {
// this is synchronized because another thread may also call the same method in between
// that is the reason that this.hash is checked again

Loading…
Cancel
Save