- crawl profile: don't add null values

- added settings and statistics for the URL fetcher's 'server' mode
- added a dedicated stack for fetchable URLs
- added the possibility to fill the stack by shifting URLs from the peer's queues, via POST (addurls=$count and url$num=$url; see the sketch below), or via file upload
- added "htroot" to the classpath of the Linux start script

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3370 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
karlchenofhell 18 years ago
parent a46dc43f45
commit d114a0136e
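
A minimal sketch of the POST interface named in the commit message: addurls carries the number of URLs and url0 .. url(addurls-1) carry the URLs themselves (parameter names are taken from the commit message and the addURLs(serverObjects, ...) method below). The host/port and the plain HttpURLConnection usage are assumptions for illustration; CrawlURLFetchStack_p is an admin-protected servlet, so a real request additionally needs authentication.

// Hedged sketch: filling the URL fetcher stack via POST.
// Assumes a YaCy peer listening on localhost:8080; authentication omitted.
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;

public class AddUrlsExample {
    public static void main(String[] args) throws Exception {
        // addurls gives the count, url0..urlN-1 the URLs to be stacked
        String body = "addurls=2"
                + "&url0=" + URLEncoder.encode("http://www.example.org/", "UTF-8")
                + "&url1=" + URLEncoder.encode("http://www.example.net/", "UTF-8");
        HttpURLConnection conn = (HttpURLConnection)
                new URL("http://localhost:8080/CrawlURLFetchStack_p.html").openConnection();
        conn.setRequestMethod("POST");
        conn.setDoOutput(true);
        conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
        OutputStream out = conn.getOutputStream();
        out.write(body.getBytes("UTF-8"));
        out.close();
        System.out.println("HTTP response code: " + conn.getResponseCode());
    }
}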

@@ -10,6 +10,7 @@ Roland Ramthun
Alexander Schier (Allo)
Matthias Söhnholz
Jan Sandbrink (NN)
Franz Brausse (FB, karlchenofhell)
Designers:
=========
@@ -21,6 +22,7 @@ Packagers:
slick
Alexander Schier
Oliver Wunder (daburna)
Franz Brausse
Translators:
============

@@ -0,0 +1,70 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': URL Fetcher Stack Management</title>
#%env/templates/metas.template%#
</head>
<body id="CrawlURLFetchStack_p">
#%env/templates/header.template%#
#%env/templates/submenuCrawlURLFetch.template%#
<h2>Manage stack for remote URL fetches</h2>
#(addedUrls)#::<span class="success">Added #[added]# URLs!</span>#(/addedUrls)#
<form method="post" action="CrawlURLFetchStack_p.html" enctype="multipart/form-data">
<fieldset><legend>Statistics</legend>
<dl>
<dt>Currently stacked URLs:</dt><dd>#[urlCount]#</dd>
<dt>Total fetched / added URLs:</dt><dd>#[totalFetched]# / #[totalAdded]#</dd>
#{peers}#
<dt>Fetched from #[peer]#</dt><dd>#[amount]#</dd>#{/peers}#
</dl>
</fieldset>
<fieldset><legend>Settings</legend>
<dl>
<dt><label for="maxSize">Maximum URLs for each transfer</label>:</dt>
<dd>
<input type="text" name="maxSize" id="maxSize" value="#[maxSize]#" maxlength="3" size="3" />
<input type="submit" name="setMaxSize" value="Set" />
#(set)#::
<span class="success">Set max. size for each transfer to #[value]#</span>::
<span class="error">Setting max. size for each transfer to #[value]# was unsuccessful: may not be negative</span>#(/set)#
</dd>
</dl>
</fieldset>
<fieldset><legend>Add URLs to stack</legend>
<dl>
<dt><label for="shiftloc">Shift URLs from Local Crawler</label>:</dt>
<dd>
<input type="text" name="shiftloc" id="shiftloc" value="#[locurlsVal]#" size="5" maxlength="5" style="text-align: right;" />
of <span class="tt">#[locurls]#</span> URLs
<input type="submit" name="shiftlcq" value="Shift" />
#(shiftloc)#::
<span class="success">Shifted #[value]# URLs from Local Crawler Queue to URL Fetcher Stack</span>::
<span class="error">Shifting URLs from Local Crawler Queue to URL Fetcher Stack was unsuccessful: #[error]#</span>#(/shiftloc)#
</dd>
<dt><label for="shiftrem">Shift URLs from Remote Crawler</label>:</dt>
<dd>
<input type="text" name="shiftrem" id="shiftrem" value="#[remurlsVal]#" size="5" maxlength="5" style="text-align: right;" />
of <span class="tt">#[remurls]#</span> URLs
<input type="submit" name="shiftrcq" value="Shift" />
#(shiftrem)#::
<span class="success">Shifted #[value]# URLs from Remote Crawler Queue to URL Fetcher Stack</span>::
<span class="error">Shifting URLs from Remote Crawler Queue to URL Fetcher Stack was unsuccessful: #[error]#</span>#(/shiftrem)#
</dd>
<dt><label for="upload">Upload URL-List</label>:</dt>
<dd>
<input type="file" name="upload" id="upload" /> #(uploadError)#::&nbsp;<span class="error">No file entered for upload</span>#(/uploadError)#<br />
<input type="radio" name="uploadType" id="plain" value="plain" checked="checked" /> <label for="plain">Plain text, line-seperated</label><br />
<input type="radio" name="uploadType" id="html" value="html" disabled="disabled" /> <label for="html">HTML file, links will be added</label><br />
<input type="submit" name="subupload" value="Upload File" />
#(upload)#::
<span class="success">Added #[added]# and rejected #[failed]# URLs from uploaded file successfully</span>::
<span class="error">An internal error occured processing the uploaded file: #[error]#</span>#(/upload)#
</dd>
</dl>
</fieldset>
</form>
#%env/templates/footer.template%#
</body>
</html>

@@ -0,0 +1,216 @@
// CrawlURLFetchStack_p.java
// -------------------------------------
// part of YACY
//
// (C) 2007 by Franz Brausse
//
// last change: $LastChangedDate: $ by $LastChangedBy: $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is at your own risk. The installation and usage (starting/running) of this
// software may allow other people or applications to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
import java.io.File;
import java.io.IOException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.Iterator;
import de.anomic.data.URLFetcherStack;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
public class CrawlURLFetchStack_p {
public static final HashMap /* of PeerName, sent URLs */ fetchMap = new HashMap();
private static URLFetcherStack stack = null;
public static int maxURLsPerFetch = 50;
public static URLFetcherStack getURLFetcherStack(serverSwitch env) {
if (stack == null) try {
stack = new URLFetcherStack(env.getConfig(plasmaSwitchboard.DBPATH, plasmaSwitchboard.DBPATH_DEFAULT));
} catch (IOException e) {
serverLog.logSevere("URLFETCHER", "Couldn't initialize URL stack: " + e.getMessage());
}
return stack;
}
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
final serverObjects prop = new serverObjects();
plasmaSwitchboard sb = (plasmaSwitchboard)env;
if (post != null) {
if (post.containsKey("addurls")) {
prop.put("addedUrls", 1);
prop.put("addedUrls_added", addURLs(post, post.getInt("addurls", -1), getURLFetcherStack(env)));
}
else if (post.containsKey("setMaxSize")) {
final int count = post.getInt("maxSize", maxURLsPerFetch);
if (count > 0) {
maxURLsPerFetch = count;
prop.put("set", 1);
prop.put("set_value", maxURLsPerFetch);
} else {
prop.put("set", 2);
prop.put("set_value", count);
}
}
else if (post.containsKey("shiftlcq")) {
int count = Math.min(post.getInt("shiftloc", 0), sb.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
try {
shiftFromNotice(sb.noticeURL, plasmaCrawlNURL.STACK_TYPE_CORE, getURLFetcherStack(env), count);
prop.put("shiftloc", 1);
prop.put("shiftloc_value", count);
} catch (IOException e) {
prop.put("shiftloc", 2);
prop.put("shiftloc_error", e.getMessage());
}
}
else if (post.containsKey("shiftrcq")) {
int count = post.getInt("shiftrem", 0);
try {
shiftFromNotice(sb.noticeURL, plasmaCrawlNURL.STACK_TYPE_LIMIT, getURLFetcherStack(env), count);
prop.put("shiftrem", 1);
prop.put("shiftrem_value", count);
} catch (IOException e) {
prop.put("shiftrem", 2);
prop.put("shiftrem_error", e.getMessage());
}
}
else if (post.containsKey("subupload")) {
if (post.get("upload", "").length() == 0) {
prop.put("uploadError", 1);
} else {
final File file = new File(post.get("upload", ""));
final String content = new String((byte[])post.get("upload$file"));
final String type = post.get("uploadType", "");
if (type.equals("plain")) {
prop.put("upload_added", addURLs(content.split("\n"), getURLFetcherStack(env)));
prop.put("upload_failed", 0);
prop.put("upload", 1);
} else if (type.equals("html")) {
try {
final htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL(file));
final Writer writer = new htmlFilterWriter(null, null, scraper, null, false);
serverFileUtils.write(content, writer);
writer.close();
final Iterator it = ((HashMap)scraper.getAnchors()).keySet().iterator();
int added = 0, failed = 0;
String url;
while (it.hasNext()) try {
url = (String)it.next();
getURLFetcherStack(env).push(new URL(url));
added++;
} catch (MalformedURLException e) { failed++; }
prop.put("upload", 1);
prop.put("upload_added", added);
prop.put("upload_failed", failed);
} catch (Exception e) {
e.printStackTrace();
prop.put("upload", 2);
prop.put("upload_error", e.getMessage());
}
}
}
}
}
putFetched(prop);
prop.put("urlCount", getURLFetcherStack(env).size());
prop.put("totalFetched", getURLFetcherStack(env).getPopped());
prop.put("totalAdded", getURLFetcherStack(env).getPushed());
prop.put("maxSize", maxURLsPerFetch);
prop.put("locurls", sb.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));
prop.put("remurls", sb.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT));
prop.put("locurlsVal", Math.min(sb.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE), 500));
prop.put("remurlsVal", Math.min(sb.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT), 500));
return prop;
}
private static void putFetched(serverObjects prop) {
Iterator it = fetchMap.keySet().iterator();
int count = 0;
while (it.hasNext()) {
String key = (String)it.next();
prop.put("peers_" + count + "_peer", key);
prop.put("peers_" + count + "_amount", ((Integer)fetchMap.get(key)).intValue());
count++;
}
prop.put("peers", count);
}
private static int addURLs(String[] urls, URLFetcherStack stack) {
int count = 0; // count successfully stacked URLs
for (int i=0; i<urls.length; i++) try {
if (urls[i].length() == 0) continue;
stack.push(new URL(urls[i]));
count++;
} catch (MalformedURLException e) { /* ignore this */ }
return count;
}
private static void shiftFromNotice(plasmaCrawlNURL nurl, int fromStackType, URLFetcherStack stack, int count) throws IOException {
plasmaCrawlNURL.Entry entry;
for (int i=0; i<count; i++) {
entry = nurl.pop(fromStackType);
stack.push(entry.url());
}
}
private static int addURLs(serverObjects post, int amount, URLFetcherStack stack) {
int count = 0;
String url;
for (int i=0; i<amount; i++) {
url = post.get("url" + count++, null);
if (url == null || url.length() == 0) continue;
try {
stack.push(new URL(url));
count++;
} catch (MalformedURLException e) {
serverLog.logInfo("URLFETCHER", "retrieved invalid url for adding to the stack: " + url);
}
}
return count;
}
}

@@ -1,11 +1,12 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Local Cache Management</title>
<title>YaCy '#[clientname]#': URL Fetcher Management</title>
#%env/templates/metas.template%#
</head>
<body id="CrawlURLFetch_p">
#%env/templates/header.template%#
#%env/templates/submenuCrawlURLFetch.template%#
<h2>URL-Fetcher</h2>
<form method="post" action="CrawlURLFetch_p.html" enctype="multipart/form-data">
<fieldset><legend>Fetch new URLs to crawl</legend>
@@ -38,6 +39,7 @@
<option value="random" selected="selected">Choose a random peer</option>#{peers}#
<option value="#[hash]#">#[name]#</option>#{/peers}#
</select>
<input type="submit" name="checkPeerURLCount" value="Check URL count" />
&nbsp;<label for="amount">Amount of URLs to request</label>:
<input type="text" name="amount" id="amount" value="50" maxlength="3" size="3" />
#(peerError)#::
@@ -53,9 +55,9 @@
<label for="frequency">every</label>
&nbsp;<input type="text" name="frequency" id="frequency" size="2" style="text-align: right;" maxlength="2"/>
<select name="freq_type">
<option value="weeks">Weeks</option>
<option value="days" selected="selected">Days</option>
<option value="hours">Hours</option>
<option value="days">Days</option>
<option value="hours" selected="selected">Hours</option>
<option value="minutes">Minutes</option>
</select>
#(freqError)#::&nbsp;<span class="error">Invalid period, fetching only once</span>#(/freqError)#
</dd>

@@ -1,4 +1,43 @@
// CrawlURLFetch_p.java
// -------------------------------------
// part of YACY
//
// (C) 2007 by Franz Brausse
//
// last change: $LastChangedDate: $ by $LastChangedBy: $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
import java.io.IOException;
import java.net.MalformedURLException;
@@ -17,6 +56,7 @@ import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverSwitch;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpc;
import de.anomic.server.serverObjects;
import de.anomic.server.logging.serverLog;
@@ -41,16 +81,11 @@ public class CrawlURLFetch_p {
private static plasmaCrawlProfile.entry profile = null;
private static ArrayList savedURLs = new ArrayList();
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
serverObjects prop = new serverObjects();
prop.put("host", "");
listURLs(prop); // List previously saved URLs for easy selection
listPeers(prop); // List known hosts
public static plasmaCrawlProfile.entry getCrawlProfile(serverSwitch env) {
if (profile == null) {
profile = ((plasmaSwitchboard)env).profiles.newEntry(
"URLFetcher", // Name
null, // URL
"", // URL
".*", ".*", // General / specific filter
0, 0, // General / specific depth
-1, -1, -1, // Recrawl / Dom-filter depth / Dom-max-pages
@@ -60,6 +95,20 @@ public class CrawlURLFetch_p {
false, // Remote indexing
true, false, false); // Exclude static / dynamic / parent stopwords
}
return profile;
}
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
serverObjects prop = new serverObjects();
prop.put("host", "");
// List previously saved URLs for easy selection
listURLs(prop);
// List known hosts
listPeers(prop,
post != null && post.containsKey("checkPeerURLCount"),
((plasmaSwitchboard)env).remoteProxyConfig);
if (post != null) {
if (post.containsKey("start")) {
@@ -82,59 +131,65 @@ public class CrawlURLFetch_p {
if (fetcher != null) fetcher.interrupt();
fetcher = null;
if (post.get("source", "").equals("peer") &&
post.get("peerhash", "").equals("random")) {
fetcher = new URLFetcher(
env,
profile,
count,
frequency);
} else {
URL url = null;
if (post.get("source", "").equals("url")) {
try {
url = new URL(post.get("host", null));
if (!savedURLs.contains(url.toNormalform()))
savedURLs.add(url.toNormalform());
prop.put("host", post.get("host", url.toString()));
} catch (MalformedURLException e) {
prop.put("host", post.get("host", ""));
prop.put("hostError", ERR_HOST_MALFORMED_URL);
}
} else if (post.get("source", "").equals("savedURL")) {
try {
url = new URL(post.get("saved", ""));
} catch (MalformedURLException e) {
/* should never appear, except for invalid input, see above */
}
} else if (post.get("source", "").equals("peer")) {
yacySeed ys = null;
try {
ys = yacyCore.seedDB.get(post.get("peerhash", ""));
try {
if (post.get("source", "").equals("peer") &&
post.get("peerhash", "").equals("random")) {
fetcher = new URLFetcher(
env,
getCrawlProfile(env),
count,
frequency);
} else {
URL url = null;
if (post.get("source", "").equals("url")) {
try {
url = new URL(post.get("host", null));
if (!savedURLs.contains(url.toNormalform()))
savedURLs.add(url.toNormalform());
prop.put("host", post.get("host", url.toString()));
} catch (MalformedURLException e) {
prop.put("host", post.get("host", ""));
prop.put("hostError", ERR_HOST_MALFORMED_URL);
}
} else if (post.get("source", "").equals("savedURL")) {
try {
url = new URL(post.get("saved", ""));
} catch (MalformedURLException e) {
/* should never appear, except for invalid input, see above */
}
} else if (post.get("source", "").equals("peer")) {
yacySeed ys = null;
ys = yacyCore.seedDB.get(post.get("peerhash", null));
if (ys != null) {
url = new URL("http://" + ys.getAddress() + URLFetcher.LIST_SERVLET);
if ((url = URLFetcher.getListServletURL(
ys.getAddress(),
URLFetcher.MODE_LIST,
count,
yacyCore.seedDB.mySeed.hash)) == null) {
prop.put("peerError", ERR_PEER_GENERAL_CONN);
prop.put("peerError_hash", post.get("peerhash", ""));
prop.put("peerError_name", ys.getName());
}
} else {
prop.put("peerError", ERR_PEER_OFFLINE);
prop.put("peerError_hash", post.get("peerhash", ""));
}
} catch (MalformedURLException e) {
prop.put("peerError", ERR_PEER_GENERAL_CONN);
prop.put("peerError_hash", post.get("peerhash", ""));
prop.put("peerError_name", ys.getName());
}
if (url != null) {
fetcher = new URLFetcher(
env,
getCrawlProfile(env),
url,
count,
frequency);
}
}
if (url != null) {
fetcher = new URLFetcher(
env,
profile,
url,
count,
frequency);
}
if (fetcher != null)
fetcher.start();
} catch (IOException e) {
e.printStackTrace();
}
if (fetcher != null)
fetcher.start();
}
else if (post.containsKey("stop")) {
if (fetcher != null) {
@@ -145,22 +200,26 @@
}
else if (post.containsKey("restart")) {
if (fetcher != null) {
fetcher.interrupt();
if (fetcher.url == null) {
fetcher = new URLFetcher(
env,
profile,
fetcher.count,
fetcher.delay);
} else {
fetcher = new URLFetcher(
env,
profile,
fetcher.url,
fetcher.count,
fetcher.delay);
try {
fetcher.interrupt();
if (fetcher.url == null) {
fetcher = new URLFetcher(
env,
getCrawlProfile(env),
fetcher.count,
fetcher.delay);
} else {
fetcher = new URLFetcher(
env,
getCrawlProfile(env),
fetcher.url,
fetcher.count,
fetcher.delay);
}
fetcher.start();
} catch (IOException e) {
e.printStackTrace();
}
fetcher.start();
} else {
prop.put("threadError", ERR_THREAD_RESUME);
}
@@ -204,7 +263,7 @@ public class CrawlURLFetch_p {
return savedURLs.size();
}
private static int listPeers(serverObjects prop) {
private static int listPeers(serverObjects prop, boolean checkURLCount, httpRemoteProxyConfig theRemoteProxyConfig) {
int peerCount = 0;
if (yacyCore.seedDB != null && yacyCore.seedDB.sizeConnected() > 0) {
prop.put("peersKnown", 1);
@@ -213,14 +272,15 @@
final Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, yacyVersion.YACY_PROVIDES_CRAWLS_VIA_LIST_HTML);
while (e.hasMoreElements()) {
yacySeed seed = (yacySeed) e.nextElement();
if (seed != null) hostList.put(seed.get(yacySeed.NAME, "nameless"),seed.hash);
if (seed != null && (!checkURLCount || getURLs2Fetch(seed, theRemoteProxyConfig) > 0))
hostList.put(seed.get(yacySeed.NAME, "nameless"), seed.hash);
}
String peername;
while ((peername = (String) hostList.firstKey()) != null) {
final String Hash = (String) hostList.get(peername);
if (Hash.equals(yacyCore.seedDB.mySeed.hash)) continue;
prop.put("peersKnown_peers_" + peerCount + "_hash", Hash);
final String hash = (String) hostList.get(peername);
if (hash.equals(yacyCore.seedDB.mySeed.hash)) continue;
prop.put("peersKnown_peers_" + peerCount + "_hash", hash);
prop.put("peersKnown_peers_" + peerCount + "_name", peername);
hostList.remove(peername);
peerCount++;
@@ -233,15 +293,37 @@
return peerCount;
}
private static int getURLs2Fetch(yacySeed seed, httpRemoteProxyConfig theRemoteProxyConfig) {
try {
String answer = new String(httpc.wget(
URLFetcher.getListServletURL(seed.getAddress(), URLFetcher.MODE_COUNT, 0, null),
seed.getIP(),
5000,
null, null,
theRemoteProxyConfig));
if (answer.matches("\\d+"))
return Integer.parseInt(answer);
else {
System.err.println("RETRIEVED INVALID ANSWER FROM " + seed.getName() + ": '" + answer + "'");
return -1;
}
} catch (MalformedURLException e) {
/* should not happen */
return -3;
} catch (IOException e) {
return -2;
}
}
private static long getDate(String count, String type) {
long r = 0;
if (count != null && count.matches("\\d+")) r = Long.parseLong(count);
if (r < 1) return -1;
r *= 3600000;
if (type.equals("weeks")) return r * 24 * 7;
else if (type.equals("days")) return r * 24;
else if (type.equals("hours")) return r;
r *= 60000;
if (type.equals("days")) return r * 60 * 24;
else if (type.equals("hours")) return r * 60;
else if (type.equals("minutes")) return r;
else return -1;
}
@@ -250,7 +332,8 @@ public class CrawlURLFetch_p {
public static final long DELAY_ONCE = -1;
public static final long DELAY_SELF_DET = 0;
private static final String LIST_SERVLET = "/yacy/list.html?list=queueUrls";
public static final int MODE_LIST = 0;
public static final int MODE_COUNT = 1;
public static int totalRuns = 0;
public static int totalFetchedURLs = 0;
@@ -271,12 +354,35 @@
public boolean paused = false;
public static URL getListServletURL(String host, int mode, int count, String peerHash) {
String r = "http://" + host + "/yacy/list.html?list=queueUrls&display=";
switch (mode) {
case MODE_LIST: r += "list"; break;
case MODE_COUNT: r += "count"; break;
}
if (count > 0) r += "&count=" + count;
if (peerHash != null && peerHash.length() > 0) {
r += "&iam=" + peerHash;
} else if (mode == MODE_LIST) {
r += "&iam=" + yacyCore.seedDB.mySeed.hash;
}
try {
return new URL(r);
} catch (MalformedURLException e) {
return null;
}
}
public URLFetcher(
serverSwitch env,
plasmaCrawlProfile.entry profile,
URL url,
int count,
long delayMs) {
long delayMs) throws IOException {
if (env == null || profile == null || url == null)
throw new NullPointerException("env, profile or url must not be null");
this.sb = (plasmaSwitchboard)env;
@@ -291,7 +397,7 @@
serverSwitch env,
plasmaCrawlProfile.entry profile,
int count,
long delayMs) {
long delayMs) throws IOException {
if (env == null || profile == null)
throw new NullPointerException("env or profile must not be null");
this.sb = (plasmaSwitchboard)env;
@@ -317,6 +423,7 @@
totalFetchedURLs += stackURLs(getURLs(url));
this.lastRun = System.currentTimeMillis() - start;
totalRuns++;
serverLog.logInfo(this.getName(), "Loaded " + this.lastFetchedURLs + " URLs from " + url + " in " + this.lastRun + " ms into stackcrawler.");
if (this.delay < 0 || isInterrupted()) {
return;
} else synchronized (this) {
@@ -347,9 +454,7 @@
}
if (ys == null) return null;
try {
return new URL("http://" + ys.getAddress() + LIST_SERVLET + "&count=" + this.count);
} catch (MalformedURLException ee) { return null; }
return getListServletURL(ys.getAddress(), MODE_LIST, this.count, yacyCore.seedDB.mySeed.hash);
}
private int stackURLs(String[] urls) throws InterruptedException {
@@ -359,7 +464,6 @@
String reason;
for (int i=0; i<urls.length && !isInterrupted(); i++) {
if (urls[i].trim().length() == 0) continue;
serverLog.logFine(this.getName(), "stacking " + urls[i]);
reason = this.sb.sbStackCrawlThread.stackCrawl(
urls[i],
null,
@@ -369,8 +473,10 @@
this.profile.generalDepth(),
this.profile);
if (reason == null) {
serverLog.logFine(this.getName(), "stacked " + urls[i]);
this.lastFetchedURLs++;
} else {
serverLog.logFine(this.getName(), "error on stacking " + urls[i] + ": " + reason);
this.lastFailed++;
totalFailed++;
this.failed.put(urls[i], reason);

@@ -0,0 +1,7 @@
<div class="SubMenu">
<h3>URL Fetcher Menu</h3>
<ul class="SubMenu">
<li><a href="/CrawlURLFetch_p.html" class="MenuItemLink lock">URL Fetcher</a></li>
<li><a href="/CrawlURLFetchStack_p.html" class="MenuItemLink lock">URL Stack</a></li>
</ul>
</div>

@@ -48,18 +48,21 @@
// javac -classpath .:../../classes list.java
// if the shell's current path is HTROOT
// contains contributions by [FB] to support listing URLs for URL Fetcher
import java.io.File;
import java.io.IOException;
import de.anomic.data.URLFetcherStack;
import de.anomic.data.listManager;
import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.net.URL;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
public final class list {
@@ -72,6 +75,7 @@ public final class list {
final String col = post.get("col", "");
final File listsPath = new File(ss.getRootPath(),ss.getConfig("listsPath", "DATA/LISTS"));
final yacySeed otherPeer = yacyCore.seedDB.get(post.get("iam", null));
final String otherPeerName = (otherPeer == null) ? "unknown" : otherPeer.get(yacySeed.NAME, "unknown"); // avoid NPE when the requesting peer is not in the seed DB
if (col.equals("black")) {
final StringBuffer out = new StringBuffer();
@@ -89,27 +93,39 @@
} // if filenamesarray.length > 0
prop.put("list",out);
} else if (col.length() == 0 && post.get("list", "").equals("queueUrls")) {
// list urls from remote crawler queue for other peers
int count = 50;
if (post.get("count", "").length() > 0 && post.get("count", "").matches("\\d+"))
count = Integer.parseInt(post.get("count", ""));
final StringBuffer sb = new StringBuffer();
plasmaCrawlNURL.Entry entry;
for (int i=0; i<count && count - i<((plasmaSwitchboard)ss).noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT); i++) {
try {
entry = ((plasmaSwitchboard)ss).noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT);
sb.append(wikiCode.deReplaceHTMLEntities(entry.url().toNormalform())).append("\n");
} catch (IOException e) {
serverLog.logSevere("/yacy/list.html", "CANNOT FETCH ENTRY " + i + "/" + count + ": " + e.getMessage());
}
// start contrib by [FB]
else if (col.length() == 0 && post.get("list", "").equals("queueUrls")) {
final URLFetcherStack db = CrawlURLFetchStack_p.getURLFetcherStack(ss);
final String display = post.get("display", "list");
if (display.equals("list")) {
// list urls from remote crawler queue for other peers
final int count = Math.min(post.getInt("count", 50), CrawlURLFetchStack_p.maxURLsPerFetch);
if (count > 0 && db.size() > 0) {
final StringBuffer sb = new StringBuffer();
URL url;
int cnt = 0;
for (int i=0; i<count; i++) {
if ((url = db.pop()) == null) continue;
sb.append(wikiCode.deReplaceHTMLEntities(url.toNormalform())).append("\n");
cnt++;
}
prop.put("list", sb);
CrawlURLFetchStack_p.fetchMap.put(otherPeerName, new Integer(cnt));
serverLog.logInfo("URLFETCHER", "sent " + cnt + " URLs to peer " + otherPeerName);
} else {
prop.put("list", "");
serverLog.logInfo("URLFETCHER", "couldn't satisfy URL request of " + otherPeerName + ": stack is empty");
}
} else if (display.equals("count")) {
prop.put("list", db.size());
}
prop.put("list", sb);
// end contrib by [FB]
} else {
prop.putASIS("list","");
}
return prop;
}
}
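
The hunk above wires list.java into the fetcher protocol: a peer requests /yacy/list.html?list=queueUrls&display=count to learn how many URLs are stacked (a plain decimal answer, as getURLs2Fetch above expects), or display=list to pop up to maxURLsPerFetch URLs, one per line. A minimal client-side sketch under stated assumptions; "peer.example.org:8080" is a placeholder address, and YaCy itself goes through the httpc wrapper with its proxy configuration instead of plain JDK networking.

// Hedged sketch: querying another peer's fetchable-URL count via the list servlet.
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;

public class ListServletCountExample {
    public static void main(String[] args) throws Exception {
        URL countUrl = new URL("http://peer.example.org:8080/yacy/list.html?list=queueUrls&display=count");
        BufferedReader in = new BufferedReader(new InputStreamReader(countUrl.openStream()));
        String answer = in.readLine();   // the servlet answers with a plain number
        in.close();
        System.out.println("URLs available for fetching: " + answer);
    }
}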

@@ -0,0 +1,138 @@
// URLFetcherStack.java
// -------------------------------------
// part of YACY
//
// (C) 2007 by Franz Brausse
//
// last change: $LastChangedDate: $ by $LastChangedBy: $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is at your own risk. The installation and usage (starting/running) of this
// software may allow other people or applications to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.data;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Iterator;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroStack;
import de.anomic.net.URL;
import de.anomic.server.logging.serverLog;
public class URLFetcherStack {
public static final String DBFILE = "urlRemote2.stack";
private static final kelondroRow rowdef = new kelondroRow(
"String urlstring-256",
kelondroBase64Order.enhancedCoder,
0
);
private final kelondroStack db;
private final serverLog log;
private int popped = 0;
private int pushed = 0;
public URLFetcherStack(String path) throws IOException {
this.db = new kelondroStack(
new File(path + File.separator + DBFILE),
rowdef);
this.log = new serverLog("URLFETCHERSTACK");
}
public int getPopped() { return this.popped; }
public int getPushed() { return this.pushed; }
public void clearStat() { this.popped = 0; this.pushed = 0; }
public void finalize() throws Throwable {
this.db.close();
}
public boolean push(URL url) {
try {
this.db.push(this.db.row().newEntry(
new byte[][] { url.toNormalform().getBytes() }
));
this.pushed++;
return true;
} catch (IOException e) {
this.log.logSevere("error storing entry", e);
return false;
}
}
public URL pop() {
try {
kelondroRow.Entry r = this.db.pop();
if (r == null) return null;
final String url = r.getColString(0, null);
try {
this.popped++;
return new URL(url);
} catch (MalformedURLException e) {
this.log.logSevere("found invalid URL-entry: " + url);
return null;
}
} catch (IOException e) {
this.log.logSevere("error retrieving entry", e);
return null;
}
}
public String[] top(int count) {
try {
final ArrayList ar = new ArrayList();
Iterator it = db.contentRows(500);
kelondroRow.EntryIndex ei;
for (int i=0; i<count && it.hasNext(); i++) {
ei = (kelondroRow.EntryIndex)it.next();
if (ei == null) continue;
ar.add(ei.getColString(0, null));
}
return (String[])ar.toArray(new String[ar.size()]);
} catch (kelondroException e) {
this.log.logSevere("error retrieving entry", e);
return null;
}
}
public int size() {
return this.db.size();
}
}
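
A short usage sketch for the URLFetcherStack class above, assuming the YaCy classes are on the classpath and that the given directory exists and is writable ("DATA/PLASMADB" is an assumed path; the servlet passes the configured DB path instead):

// Hedged usage sketch for URLFetcherStack.
import de.anomic.data.URLFetcherStack;
import de.anomic.net.URL;

public class URLFetcherStackExample {
    public static void main(String[] args) throws Exception {
        URLFetcherStack stack = new URLFetcherStack("DATA/PLASMADB");
        stack.push(new URL("http://www.example.org/")); // returns false on I/O error
        System.out.println("stacked URLs: " + stack.size());
        URL next = stack.pop();                         // null when the stack is empty
        System.out.println("popped: " + next + ", total popped so far: " + stack.getPopped());
    }
}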

@@ -258,27 +258,28 @@ public class plasmaCrawlProfile {
boolean storeHTCache, boolean storeTXCache,
boolean remoteIndexing,
boolean xsstopw, boolean xdstopw, boolean xpstopw) {
if (name == null || name.length() == 0) throw new NullPointerException("name must not be null");
String handle = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, crawlProfileHandleLength);
mem = new HashMap();
mem.put("handle", handle);
mem.put("name", name);
mem.put("startURL", startURL);
mem.put("generalFilter", generalFilter);
mem.put("specificFilter", specificFilter);
mem.put("generalDepth", Integer.toString(generalDepth));
mem.put("specificDepth", Integer.toString(specificDepth));
mem.put("recrawlIfOlder", Integer.toString(recrawlIfOlder));
mem.put("domFilterDepth", Integer.toString(domFilterDepth));
mem.put("domMaxPages", Integer.toString(domMaxPages));
mem.put("crawlingQ", (crawlingQ) ? "true" : "false"); // crawling of urls with '?'
mem.put("indexText", (indexText) ? "true" : "false");
mem.put("indexMedia", (indexMedia) ? "true" : "false");
mem.put("storeHTCache", (storeHTCache) ? "true" : "false");
mem.put("storeTXCache", (storeTXCache) ? "true" : "false");
mem.put("remoteIndexing", (remoteIndexing) ? "true" : "false");
mem.put("xsstopw", (xsstopw) ? "true" : "false"); // exclude static stop-words
mem.put("xdstopw", (xdstopw) ? "true" : "false"); // exclude dynamic stop-word
mem.put("xpstopw", (xpstopw) ? "true" : "false"); // exclude parent stop-words
mem.put("handle", handle);
mem.put("name", name);
mem.put("startURL", (startURL == null) ? "" : startURL);
mem.put("generalFilter", (generalFilter == null) ? ".*" : generalFilter);
mem.put("specificFilter", (specificFilter == null) ? ".*" : specificFilter);
mem.put("generalDepth", Integer.toString(generalDepth));
mem.put("specificDepth", Integer.toString(specificDepth));
mem.put("recrawlIfOlder", Integer.toString(recrawlIfOlder));
mem.put("domFilterDepth", Integer.toString(domFilterDepth));
mem.put("domMaxPages", Integer.toString(domMaxPages));
mem.put("crawlingQ", (crawlingQ) ? "true" : "false"); // crawling of urls with '?'
mem.put("indexText", (indexText) ? "true" : "false");
mem.put("indexMedia", (indexMedia) ? "true" : "false");
mem.put("storeHTCache", (storeHTCache) ? "true" : "false");
mem.put("storeTXCache", (storeTXCache) ? "true" : "false");
mem.put("remoteIndexing", (remoteIndexing) ? "true" : "false");
mem.put("xsstopw", (xsstopw) ? "true" : "false"); // exclude static stop-words
mem.put("xdstopw", (xdstopw) ? "true" : "false"); // exclude dynamic stop-word
mem.put("xpstopw", (xpstopw) ? "true" : "false"); // exclude parent stop-words
doms = new HashMap();
}

@@ -93,7 +93,7 @@
CLASSPATH=""
for N in lib/*.jar; do CLASSPATH="$CLASSPATH$N:"; done
for N in libx/*.jar; do CLASSPATH="$CLASSPATH$N:"; done
CLASSPATH="classes:.:$CLASSPATH"
CLASSPATH="classes:.:htroot:$CLASSPATH"
cmdline="";
