added an intranet scanner and a servlet which shows all intranet addresses and an option to start a site-crawl for all these addresses at once.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7203 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent e54cb7fb0c
commit e63896f2a8

@ -0,0 +1,65 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Intranet Crawl Start</title>
#%env/templates/metas.template%#
<script type="text/javascript">
<!--
/**
 * Inverts the checked state of every "item_<n>" checkbox of a form.
 * The number of candidate checkboxes is read from the form field "num";
 * indices without a matching checkbox are skipped.
 * Does nothing when the form or its "num" field does not exist.
 *
 * @param name the name of the form that holds the checkboxes
 */
function setall(name) {
  var selectForm = document.forms.namedItem(name);
  if (selectForm == null || selectForm.elements["num"] == null) return;
  var count = Number(selectForm.elements["num"].value);
  // the counter is declared locally so that it does not leak into the global scope
  for (var i = 0; i < count; i++) {
    var box = selectForm.elements["item_" + i];
    if (box == null) continue;
    box.checked = !box.checked;
  }
}
-->
</script>
<script type="text/javascript" src="/js/sorttable.js"></script>
</head>
<body id="IndexCreate">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>Intranet Crawl Start</h2>
<p>
When an index domain is configured to contain intranet links,
the intranet may be scanned for available servers.
Please select below the servers in your intranet that you want to fetch into the search index.
</p>
#(notintranet)#::
<p class="error">
This network definition does not allow intranet links.
A list of intranet servers is only available if you configure YaCy to index intranet targets.
To do so, open the <a href="ConfigBasic.html">Basic Configuration</a> servlet and select the 'Intranet Indexing' use case.
</p>
#(/notintranet)#
#(servertable)#::
<form name="servertable" action="CrawlStartIntranet_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8" target="_self"><fieldset>
<legend><label for="table">Available Intranet Server</label></legend>
<table class="sortable" border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">
<td><input type="checkbox" name="allswitch" onclick="setall(this.form.name)" /></td>
<td>URL</td>
<td>Process</td>
</tr>
#{list}#
<tr class="TableCellLight">
<td align="left"><input type="checkbox" name="item_#[count]#" value="mark_#[pk]#" /></td>
<td><a href="#[url]#">#[url]#</a></td>
#(process)#<td class="error">not in index</td>::<td class="commit">indexed</td>#(/process)#
</tr>
#{/list}#
</table>
<p>
<input type="hidden" name="num" value="#[num]#" />
<input type="submit" name="crawl" value="Add Selected Servers to Crawler" />
</p>
</fieldset></form>
#(/servertable)#
#%env/templates/footer.template%#
</body>
</html>

@ -0,0 +1,120 @@
/**
* CrawlStartIntranet_p
* Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 28.10.2010 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.Scanner;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import de.anomic.data.WorkTables;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
/**
 * Servlet behind CrawlStartIntranet_p.html.
 * Lists the intranet servers known to the switchboard (scanning the intranet first
 * if none are known yet) and starts a crawl for every server selected in the form.
 */
public class CrawlStartIntranet_p {

    /**
     * Crawler_p.html request that is issued for each selected server;
     * the start URL is appended as the "crawlingURL" parameter.
     */
    private static final String CRAWL_START_PATH = "/Crawler_p.html?createBookmark=off&xsstopw=off&crawlingDomMaxPages=10000&intention=&range=domain&indexMedia=on&recrawl=nodoubles&xdstopw=off&storeHTCache=on&sitemapURL=&repeat_time=7&crawlingQ=on&cachePolicy=iffresh&indexText=on&crawlingMode=url&mustnotmatch=&crawlingDomFilterDepth=1&crawlingDomFilterCheck=off&crawlingstart=Start%20New%20Crawl&xpstopw=off&repeat_unit=seldays&crawlingDepth=99";

    /**
     * Produces the template properties for the page.
     *
     * @param header the request header (not used here)
     * @param post the posted form values, may be null
     * @param env the server environment; must be the Switchboard
     * @return the properties: "notintranet" is 1 when intranet indexing is not configured,
     *         otherwise "servertable" is 1 and the "servertable_list_*" entries describe the servers
     */
    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
        final serverObjects prop = new serverObjects();
        final Switchboard sb = (Switchboard) env;
        prop.put("notintranet", 0);
        prop.put("servertable", 0);

        // without an intranet configuration there is nothing to list
        if (!sb.isIntranetMode()) {
            prop.put("notintranet", 1);
            return prop;
        }

        // if there are no intranet addresses known, scan the net
        if (sb.intranetURLs.isEmpty()) {
            scanIntranet(sb);
        }

        // check crawl request
        if (post != null && post.containsKey("crawl")) {
            startSelectedCrawls(sb, post);
        }

        fillServerTable(sb, prop);
        return prop;
    }

    /** Scans the intranet for ftp, http, https and smb services and registers each hit in sb.intranetURLs. */
    private static void scanIntranet(final Switchboard sb) {
        final Scanner scanner = new Scanner(100, 10); // 100 concurrent runners, timeout value 10
        scanner.addFTP(false);
        scanner.addHTTP(false);
        scanner.addHTTPS(false);
        scanner.addSMB(false);
        scanner.start();
        scanner.terminate();
        for (final MultiProtocolURI service : scanner.services()) {
            final DigestURI url = new DigestURI(service);
            sb.intranetURLs.put(url.hash(), url);
        }
    }

    /** Issues one crawl start API call for every posted "mark_<url hash>" value that names a known intranet URL. */
    private static void startSelectedCrawls(final Switchboard sb, final serverObjects post) {
        for (final Map.Entry<String, String> entry : post.entrySet()) {
            final String value = entry.getValue();
            if (value == null || !value.startsWith("mark_")) continue;
            final byte[] pk = value.substring(5).getBytes();
            final DigestURI url = sb.intranetURLs.get(pk);
            if (url == null) continue; // hash does not belong to a known intranet server
            final String path = CRAWL_START_PATH + "&crawlingURL=" + url.toNormalform(true, false);
            WorkTables.execAPICall("localhost", (int) sb.getConfigLong("port", 8080), sb.getConfig("adminAccountBase64MD5", ""), path, pk);
        }
    }

    /** Writes one "servertable_list_<n>_*" group per known intranet URL plus the list counters. */
    private static void fillServerTable(final Switchboard sb, final serverObjects prop) {
        prop.put("servertable", 1);
        int count = 0;
        for (final DigestURI url : sb.intranetURLs.values()) {
            final String urlString = url.toNormalform(true, false);
            final String prefix = "servertable_list_" + count;
            prop.put(prefix + "_pk", new String(url.hash()));
            prop.put(prefix + "_count", count);
            prop.putHTML(prefix + "_url", urlString);
            prop.put(prefix + "_process", inIndex(sb, urlString) == null ? 0 : 1);
            count++;
        }
        prop.put("servertable_list", count);
        prop.put("servertable_num", count);
    }

    /**
     * Searches the API table for a row whose comment contains the given url.
     *
     * @param sb the switchboard that owns the tables
     * @param url the url string to look for
     * @return the primary key of the first matching row, or null if there is none
     *         or if reading the table failed
     */
    private static byte[] inIndex(Switchboard sb, String url) {
        try {
            final Iterator<Tables.Row> rows = sb.tables.iterator(WorkTables.TABLE_API_NAME);
            while (rows.hasNext()) {
                final Tables.Row row = rows.next();
                final byte[] comment = row.get(WorkTables.TABLE_API_COL_COMMENT);
                if (comment == null) continue; // a row without comment cannot match and must not break the listing
                if (new String(comment).contains(url)) return row.getPK();
            }
            return null;
        } catch (IOException e) {
            Log.logException(e);
            return null;
        }
    }
}

@ -144,7 +144,7 @@ public class Table_API_p {
}
// now call the api URLs and store the result status
Map<String, Integer> l = sb.tables.execAPICall(pks, "localhost", (int) sb.getConfigLong("port", 8080), sb.getConfig("adminAccountBase64MD5", ""));
Map<String, Integer> l = sb.tables.execAPICalls("localhost", (int) sb.getConfigLong("port", 8080), sb.getConfig("adminAccountBase64MD5", ""), pks);
// construct result table
prop.put("showexec", l.size() > 0 ? 1 : 0);

@ -7,6 +7,7 @@
<ul class="SubMenu">
<li><a href="/CrawlStartSite_p.html" class="MenuItemLink lock">Full Site Crawl/<br/>Sitemap Loader</a></li>
<li><a href="/CrawlStartExpert_p.html" class="MenuItemLink lock">Crawl Start<br/>(Expert)</a></li>
<li><a href="/CrawlStartIntranet_p.html" class="MenuItemLink lock">Intranet<br/>Scanner</a></li>
<li><a href="/Load_MediawikiWiki.html" class="MenuItemLink">Crawling of<br/>Media Wikis</a></li>
<li><a href="/Load_PHPBB3.html" class="MenuItemLink">Crawling of<br/>phpBB3 Forums</a></li>
</ul>

@ -193,7 +193,7 @@ public class WorkTables extends Tables {
* @param realm authentification realm
* @return a map of the called urls and the http status code of the api call or -1 if any other IOException occurred
*/
public Map<String, Integer> execAPICall(Collection<String> pks, String host, int port, String realm) {
public Map<String, Integer> execAPICalls(String host, int port, String realm, Collection<String> pks) {
// now call the api URLs and store the result status
final HTTPClient client = new HTTPClient();
client.setRealm(realm);
@ -222,6 +222,22 @@ public class WorkTables extends Tables {
return l;
}
/**
 * Executes one api call with a GET request to http://host:port + path.
 * If a primary key is given, it is appended to the request as an additional
 * parameter named by WorkTables.TABLE_API_COL_APICALL_PK.
 * @param host the host to call
 * @param port the port of the host
 * @param realm authentification realm
 * @param path the path of the api call including its query part
 * @param pk the primary key to hand over with the call, may be null
 * @return the http status code of the api call or -1 if an IOException occurred
 */
public static int execAPICall(String host, int port, String realm, String path, byte[] pk) {
    final HTTPClient http = new HTTPClient();
    http.setRealm(realm);
    http.setTimout(120000);

    // the primary key, if present, travels as one more query parameter
    final String pkParameter = (pk == null) ? "" : "&" + WorkTables.TABLE_API_COL_APICALL_PK + "=" + new String(pk);
    final String target = "http://" + host + ":" + port + path + pkParameter;

    int status;
    try {
        http.GETbytes(target);
        status = http.getStatusCode();
    } catch (IOException e) {
        Log.logException(e);
        status = -1;
    }
    return status;
}
/**
* simplified call to execute a single entry in the api database table
* @param pk the primary key of the entry
@ -233,7 +249,7 @@ public class WorkTables extends Tables {
public int execAPICall(String pk, String host, int port, String realm) {
ArrayList<String> pks = new ArrayList<String>();
pks.add(pk);
Map<String, Integer> m = execAPICall(pks, host, port, realm);
Map<String, Integer> m = execAPICalls(host, port, realm, pks);
if (m.isEmpty()) return -1;
return m.values().iterator().next().intValue();
}

@ -228,6 +228,7 @@ public final class Switchboard extends serverSwitch {
public LinkedBlockingQueue<String> trail;
public yacySeedDB peers;
public WorkTables tables;
public TreeMap<byte[], DigestURI> intranetURLs = new TreeMap<byte[], DigestURI>(Base64Order.enhancedCoder);
public WorkflowProcessor<indexingQueueEntry> indexingDocumentProcessor;
public WorkflowProcessor<indexingQueueEntry> indexingCondensementProcessor;
@ -551,9 +552,9 @@ public final class Switchboard extends serverSwitch {
this.crawler,
this.indexSegments.segment(Segments.Process.LOCALCRAWLING),
this.peers,
"local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0,
"global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0);
isIntranetMode(),
isGlobalMode()); // Intranet and Global mode may be both true!
// initializing dht chunk generation
this.dhtMaxReferenceCount = (int) getConfigLong(SwitchboardConstants.INDEX_DIST_CHUNK_SIZE_START, 50);
@ -993,6 +994,14 @@ public final class Switchboard extends serverSwitch {
return sb;
}
/**
 * Tells whether the network definition allows intranet targets.
 * The configured value of "network.unit.domain" (default "global") is looked up
 * as a substring of "local.any".
 * @return true if the configured value occurs within "local.any"
 */
public boolean isIntranetMode() {
    final String domain = getConfig("network.unit.domain", "global");
    return "local.any".contains(domain);
}
/**
 * Tells whether the network definition allows global targets.
 * The configured value of "network.unit.domain" (default "global") is looked up
 * as a substring of "global.any".
 * @return true if the configured value occurs within "global.any"
 */
public boolean isGlobalMode() {
    final String domain = getConfig("network.unit.domain", "global");
    return "global.any".contains(domain);
}
public boolean isRobinsonMode() {
// we are in robinson mode, if we do not exchange index by dht distribution
// we need to take care that search requests and remote indexing requests go only
@ -1523,7 +1532,7 @@ public final class Switchboard extends serverSwitch {
Log.logException(e);
continue;
}
Map<String, Integer> callResult = this.tables.execAPICall(pks, "localhost", (int) this.getConfigLong("port", 8080), this.getConfig("adminAccountBase64MD5", ""));
Map<String, Integer> callResult = this.tables.execAPICalls("localhost", (int) this.getConfigLong("port", 8080), this.getConfig("adminAccountBase64MD5", ""), pks);
for (Map.Entry<String, Integer> call: callResult.entrySet()) {
log.logInfo("Scheduler executed api call, response " + call.getValue() + ": " + call.getKey());
}

@ -0,0 +1,224 @@
/**
* Scanner
* Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 28.10.2010 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.protocol;
import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.Socket;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.kelondro.logging.Log;
/**
 * a protocol scanner
 * scans given ip's for existing http, https, ftp and smb services
 *
 * Usage: create the scanner, enqueue targets with the add* methods, call start()
 * and then terminate(). terminate() returns after all queued targets have been
 * probed, so services() holds the complete result afterwards.
 * A service counts as existing if a TCP connection to its host and port can be opened.
 */
public class Scanner extends Thread {

    private static final MultiProtocolURI POISONURI = new MultiProtocolURI();
    private static final Object PRESENT = new Object();

    private final int runnerCount;
    private final List<InetAddress> a;
    private final BlockingQueue<MultiProtocolURI> scanqueue;
    private final Map<MultiProtocolURI, String> services;
    private final Map<Runner, Object> runner;
    private final int timeout;

    /**
     * @param concurrentRunner maximum number of probe threads that run at the same time
     * @param timeout connect timeout of a single probe in milliseconds
     */
    public Scanner(int concurrentRunner, int timeout) {
        this.runnerCount = concurrentRunner;
        this.a = Domains.myIntranetIPs();
        this.scanqueue = new LinkedBlockingQueue<MultiProtocolURI>();
        this.services = Collections.synchronizedMap(new TreeMap<MultiProtocolURI, String>());
        this.runner = new ConcurrentHashMap<Runner, Object>();
        this.timeout = timeout;
    }

    /**
     * Takes targets from the queue and starts one probe thread per target, never
     * running more than runnerCount probes at once. After the poison entry was
     * taken, the method waits until all running probes have finished.
     */
    @Override
    public void run() {
        try {
            MultiProtocolURI uri;
            while ((uri = this.scanqueue.take()) != POISONURI) {
                // throttle: wait until a probe slot is free
                while (this.runner.size() >= this.runnerCount) {
                    Thread.sleep(1000);
                }
                final Runner probe = new Runner(uri);
                this.runner.put(probe, PRESENT);
                probe.start();
            }
            // drain: a join() on this thread must imply that every probe has reported its result
            while (!this.runner.isEmpty()) {
                Thread.sleep(100);
            }
        } catch (InterruptedException e) {
            Log.logException(e);
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Generates the addresses to probe from the intranet addresses of this host.
     *
     * @param bigrange if true, the third octet runs from 1 to 254;
     *        if false, the third octet of the own address is kept.
     *        The fourth octet always runs from 1 to 254.
     * @return the list of generated addresses
     */
    private List<InetAddress> genlist(boolean bigrange) {
        final ArrayList<InetAddress> c = new ArrayList<InetAddress>(10);
        for (final InetAddress i : this.a) {
            final byte[] base = i.getAddress();
            // the octet arithmetic below is only meaningful for 4-byte (IPv4) addresses
            if (base.length != 4) continue;
            final int own = base[2] & 0xFF; // bytes are signed, octets are not
            final int first = bigrange ? 1 : own;
            final int last = bigrange ? 254 : own;
            for (int br = first; br <= last; br++) {
                for (int j = 1; j < 255; j++) {
                    final byte[] address = base.clone();
                    address[2] = (byte) br;
                    address[3] = (byte) j;
                    try {
                        c.add(InetAddress.getByAddress(address));
                    } catch (UnknownHostException e) {
                        // only thrown for an illegal address length, which is excluded above
                    }
                }
            }
        }
        return c;
    }

    public void addHTTP(boolean bigrange) {
        addProtocol("http", bigrange);
    }

    public void addHTTPS(boolean bigrange) {
        addProtocol("https", bigrange);
    }

    public void addSMB(boolean bigrange) {
        addProtocol("smb", bigrange);
    }

    public void addFTP(boolean bigrange) {
        addProtocol("ftp", bigrange);
    }

    /** Enqueues protocol://address/ for every generated address. */
    private void addProtocol(String protocol, boolean bigrange) {
        for (final InetAddress i : genlist(bigrange)) {
            try {
                // the queue is unbounded, so offer() always succeeds and cannot be interrupted
                this.scanqueue.offer(new MultiProtocolURI(protocol + "://" + i.getHostAddress() + "/"));
            } catch (MalformedURLException e) {
                Log.logException(e);
            }
        }
    }

    /**
     * @return the number of entries that are still waiting in the scan queue
     */
    public int pending() {
        return this.scanqueue.size();
    }

    /**
     * Marks the end of the scan queue and waits until the scanner thread,
     * and with it all probes, has finished.
     */
    public void terminate() {
        // one poison entry is sufficient because the scanner thread is the only consumer of the queue
        this.scanqueue.offer(POISONURI);
        try {
            this.join();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * A single probe: checks one uri and records it as service if the check succeeds.
     */
    public class Runner extends Thread {

        private final MultiProtocolURI uri;
        private final long starttime;

        public Runner(MultiProtocolURI uri) {
            this.uri = uri;
            this.starttime = System.currentTimeMillis();
        }

        @Override
        public void run() {
            try {
                if (ping(this.uri, Scanner.this.timeout)) {
                    Scanner.this.services.put(this.uri, "");
                }
            } finally {
                // always free the slot, otherwise a failed probe would block the scanner
                Object r = Scanner.this.runner.remove(this);
                assert r != null;
            }
        }

        /**
         * @return milliseconds since this runner was created
         */
        public long age() {
            return System.currentTimeMillis() - this.starttime;
        }

        @Override
        public boolean equals(Object o) {
            return (o instanceof Runner) && this.uri.toNormalform(true, false).equals(((Runner) o).uri.toNormalform(true, false));
        }

        @Override
        public int hashCode() {
            // derived from the same value that equals() compares
            return this.uri.toNormalform(true, false).hashCode();
        }
    }

    /**
     * Tries to open a TCP connection to the host and port of the uri.
     *
     * @param uri the target
     * @param timeout connect timeout in milliseconds
     * @return true if the connection was established
     */
    private static boolean ping(MultiProtocolURI uri, int timeout) {
        final Socket socket = new Socket();
        try {
            socket.connect(new InetSocketAddress(Domains.dnsResolve(uri.getHost()), uri.getPort()), timeout);
            return socket.isConnected();
        } catch (UnknownHostException e) {
            return false;
        } catch (IOException e) {
            return false;
        } finally {
            // release the socket also if the connect failed
            try {
                socket.close();
            } catch (IOException e) {
                // nothing left to do for a probe socket
            }
        }
    }

    /**
     * @return the uris of all services found so far; complete after terminate() has returned
     */
    public Collection<MultiProtocolURI> services() {
        return this.services.keySet();
    }

    public static void main(String[] args) {
        //try {System.out.println("192.168.1.91: " + ping(new MultiProtocolURI("smb://192.168.1.91/"), 1000));} catch (MalformedURLException e) {}
        Scanner scanner = new Scanner(100, 10);
        scanner.addFTP(false);
        scanner.addHTTP(false);
        scanner.addHTTPS(false);
        scanner.addSMB(false);
        scanner.start();
        scanner.terminate();
        for (MultiProtocolURI service : scanner.services()) {
            System.out.println(service.toNormalform(true, false));
        }
        try {
            HTTPClient.closeConnectionManager();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}
Loading…
Cancel
Save