- added servlet: remote crawler queue overview

- added servlet: crawl profile editor

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3731 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
karlchenofhell 18 years ago
parent 4dc45d6e97
commit 086239da36

@@ -0,0 +1,49 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Crawl Profile Editor</title>
#%env/templates/metas.template%#
</head>
<body id="IndexCreateWWWGlobalQueue">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>Crawl Profile Editor</h2>
<p>
A crawl profile holds the settings for a specific start URL and is used internally to control the crawl it belongs to.
The profiles for remote crawls, <a href="/ProxyIndexingMonitor_p.html">indexing via proxy</a> and snippet fetches
cannot be edited here because they are hard-coded.
</p>
<form action="/CrawlProfileEditor_p.html" method="post" enctype="multipart/form-data">
<fieldset><legend>Select the profile to edit</legend>
<select name="handle">#{profiles}#
<option value="#[handle]#"#(selected)#:: selected="selected"#(/selected)#>#[name]#</option>#{/profiles}#
</select>
<input type="submit" name="edit" value="Edit profile" />
</fieldset>
</form>
#(error)#::
<p class="error">An error occured during editing the crawl profile: #[message]#</p>
#(/error)#
#(edit)#::
<form action="/CrawlProfileEditor_p.html" method="post" enctype="multipart/form-data">
<fieldset><legend>Edit Profile #[name]#</legend>
<input type="hidden" name="handle" value="#[handle]#" />
<dl>#{entries}#
<dt>#(readonly)#<label for="#[name]#">#[label]#</label>::#[label]##(/readonly)#</dt>
<dd>#(readonly)#
<input id="#[name]#" name="#[name]#"
#(type)# type="checkbox"#(checked)#:: checked="checked"#(/checked)#::
type="text" value="#[value]#"::
type="text" value="#[value]#"#(/type)# />::
<strong>#(type)##(checked)#false::true#(/checked)#::#[value]#::#[value]##(/type)#</strong>#(/readonly)#
</dd>#{/entries}#
</dl>
<input type="submit" name="submit" value="Submit changes" />
</fieldset>
</form>
#(/edit)#
#%env/templates/footer.template%#
</body>
</html>
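
The placeholders in this template follow the YaCy servlet convention: #[x]# is replaced by a value, #(flag)#A::B#(/flag)# selects one of two alternatives, and #{profiles}#...#{/profiles}# repeats once per numbered entry. The matching CrawlProfileEditor_p.java below fills them through prop.put() calls with prefixed keys. A minimal illustrative sketch of that convention (the profile name and handle are invented):

import de.anomic.server.servletProperties;

// Hypothetical values only; the real ones are produced by CrawlProfileEditor_p.respond() below.
public class TemplateConventionSketch {
    public static servletProperties exampleProfileList() {
        servletProperties prop = new servletProperties();
        prop.put("profiles_0_name", "example-profile");  // fills #[name]# inside #{profiles}#
        prop.put("profiles_0_handle", "0123456789ab");   // fills #[handle]#
        prop.put("profiles_0_selected", 1);              // picks the selected="selected" branch of #(selected)#
        prop.put("profiles", 1);                         // number of #{profiles}# iterations
        return prop;
    }
}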

@@ -0,0 +1,165 @@
// CrawlProfileEditor_p.java
// -------------------------------
// part of the AnomicHTTPD caching proxy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004, 2005
// last major change: 04.07.2005
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is at your own risk. The installation and usage (starting/running) of this
// software may allow other people or applications to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
// You must compile this file with
// javac -classpath .:../classes CrawlProfileEditor_p.java
// if the shell's current path is HTROOT
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlProfile.entry;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.servletProperties;
public class CrawlProfileEditor_p {
public static class eentry {
public static final int BOOLEAN = 0;
public static final int INTEGER = 1;
public static final int STRING = 2;
public final String name;
public final String label;
public final boolean readonly;
public final int type;
public eentry(String name, String label, boolean readonly, int type) {
this.name = name;
this.label = label;
this.readonly = readonly;
this.type = type;
}
}
private static final ArrayList /*<eentry>*/ labels = new ArrayList();
static {
labels.add(new eentry(entry.NAME, "Name", true, eentry.STRING));
labels.add(new eentry(entry.START_URL, "Start URL", true, eentry.STRING));
labels.add(new eentry(entry.GENERAL_FILTER, "General Filter", false, eentry.STRING));
labels.add(new eentry(entry.SPECIFIC_FILTER, "Specific Filter", false, eentry.STRING));
labels.add(new eentry(entry.GENERAL_DEPTH, "General Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.SPECIFIC_DEPTH, "Specific Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
labels.add(new eentry(entry.DOM_FILTER_DEPTH, "Domain Filter Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
labels.add(new eentry(entry.CRAWLING_Q, "CrawlingQ / '?'-URLs", false, eentry.BOOLEAN));
labels.add(new eentry(entry.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN));
labels.add(new eentry(entry.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN));
labels.add(new eentry(entry.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN));
labels.add(new eentry(entry.STORE_TXCACHE, "Store in TXCache", false, eentry.BOOLEAN));
labels.add(new eentry(entry.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN));
labels.add(new eentry(entry.XSSTOPW, "Static stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(entry.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(entry.XPSTOPW, "Parent stop-words", false, eentry.BOOLEAN));
}
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
final servletProperties prop = new servletProperties();
final plasmaSwitchboard sb = (plasmaSwitchboard)env;
String handle = (post == null) ? "" : post.get("handle", "");
int count = 0;
Iterator it = sb.profiles.profiles(true);
entry e;
while (it.hasNext()) {
e = (entry)it.next();
if (e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_PROXY) ||
e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_REMOTE) ||
e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_TEXT) ||
e.name().equals(plasmaSwitchboard.CRAWL_PROFILE_SNIPPET_MEDIA))
continue;
prop.put("profiles_" + count + "_name", e.name());
prop.put("profiles_" + count + "_handle", e.handle());
if (handle.equals(e.handle()))
prop.put("profiles_" + count + "_selected", 1);
count++;
}
prop.put("profiles", count);
e = sb.profiles.getEntry(handle);
if (e == null) return prop;
if (post.containsKey("submit")) try {
it = labels.iterator();
eentry tee;
while (it.hasNext()) {
tee = (eentry)it.next();
String cval = (String)e.map().get(tee.name);
String val = (tee.type == eentry.BOOLEAN)
? Boolean.toString(post.containsKey(tee.name))
: post.get(tee.name, cval);
if (!cval.equals(val))
e.changeEntry(tee.name, val);
}
} catch (IOException ex) {
prop.put("error", 1);
prop.put("error_message", ex.getMessage());
}
prop.put("edit", 1);
prop.put("edit_name", e.name());
prop.put("edit_handle", e.handle());
it = labels.iterator();
count = 0;
while (it.hasNext()) {
eentry ee = (eentry)it.next();
Object val = e.map().get(ee.name);
prop.put("edit_entries_" + count + "_readonly", ee.readonly ? 1 : 0);
prop.put("edit_entries_" + count + "_readonly_name", ee.name);
prop.put("edit_entries_" + count + "_readonly_label", ee.label);
prop.put("edit_entries_" + count + "_readonly_type", ee.type);
if (ee.type == eentry.BOOLEAN) {
prop.put("edit_entries_" + count + "_readonly_type_checked", Boolean.parseBoolean((String)val) ? 1 : 0);
} else {
prop.put("edit_entries_" + count + "_readonly_type_value", val);
}
count++;
}
prop.put("edit_entries", count);
return prop;
}
}
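
The submit branch of respond() above relies on HTML form semantics: a checked checkbox arrives as a present field, an unchecked one is simply absent, so Boolean.toString(post.containsKey(name)) yields the new boolean value, while text fields fall back to the currently stored value. A condensed sketch of that write-back path, using the field constants this commit introduces in plasmaCrawlProfile (the helper class and values are illustrative):

import java.io.IOException;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.server.serverObjects;

// Sketch only: turning POSTed form fields into profile values via changeEntry().
public class ProfileWriteBackSketch {
    static void apply(plasmaCrawlProfile.entry e, serverObjects post) throws IOException {
        // checkbox: present in the POST means "checked"
        e.changeEntry(plasmaCrawlProfile.entry.INDEX_TEXT,
                Boolean.toString(post.containsKey(plasmaCrawlProfile.entry.INDEX_TEXT)));
        // text field: keep the stored value if the field is missing from the POST
        String current = (String) e.map().get(plasmaCrawlProfile.entry.GENERAL_DEPTH);
        e.changeEntry(plasmaCrawlProfile.entry.GENERAL_DEPTH,
                post.get(plasmaCrawlProfile.entry.GENERAL_DEPTH, (current == null) ? "0" : current));
    }
}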

@@ -0,0 +1,65 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Index Creation / WWW Remote Crawl Queue</title>
#%env/templates/metas.template%#
</head>
<body id="IndexCreateWWWGlobalQueue">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>Index Creation: WWW Remote Crawl Queue</h2>
<p>
This queue stores the URLs that other peers have sent to you so that you perform a remote crawl for them.
</p>
#(crawler-queue)#
<p>The remote crawler queue is empty</p>
::
<form action="IndexCreateWWWRemoteQueue_p.html" method="post" enctype="multipart/form-data">
<fieldset>
<input type="submit" name="clearcrawlqueue" value="clear remote crawl queue" />
</fieldset>
</form>
<p>
There are <strong>#[num]#</strong> entries in the remote crawler queue.
Showing <strong>#[show-num]#</strong> most recent entries.
</p>
<p>
Show last <a href="IndexCreateWWWRemoteQueue_p.html?limit=50">50</a> |
<a href="IndexCreateWWWRemoteQueue_p.html?limit=100">100</a> |
<a href="IndexCreateWWWRemoteQueue_p.html?limit=250">250</a> |
<a href="IndexCreateWWWRemoteQueue_p.html?limit=500">500</a> entries.
</p>
<table border="0" cellpadding="2" cellspacing="1">
<colgroup>
<col width="60" span="2" />
<col width="10" />
<col width="80" />
<col width="180" />
<col />
<col width="10" />
</colgroup>
<tr class="TableHeader">
<th>Initiator</th>
<th>Profile</th>
<th>Depth</th>
<th>Modified Date</th>
<th>Anchor Name</th>
<th>URL</th>
<th>Delete</th>
</tr>
#{list}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
<td>#[initiator]#</td>
<td>#[profile]#</td>
<td>#[depth]#</td>
<td>#[modified]#</td>
<td>#[anchor]#</td>
<td><a href="#[url]#">#[url]#</a></td>
<td><a href="IndexCreateWWWRemoteQueue_p.html?deleteEntry=#[hash]#">[Delete]</a></td>
</tr>
#{/list}#
</table>
#(/crawler-queue)#
#%env/templates/footer.template%#
</body>
</html>
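
The table above is wrapped in a #(crawler-queue)# switch: the servlet puts 0 to show only the "queue is empty" message and 1 for the populated branch, and every nested placeholder is addressed through the enclosing prefix, e.g. crawler-queue_list_0_url for the first row. A small illustrative sketch with invented values (the real ones come from IndexCreateWWWRemoteQueue_p.respond() below):

import de.anomic.server.servletProperties;

// Hypothetical single-row example of the nested key prefixes used by this template.
public class RemoteQueueTemplateSketch {
    public static servletProperties oneRow() {
        servletProperties prop = new servletProperties();
        prop.put("crawler-queue", 1);                                 // populated branch of #(crawler-queue)#
        prop.put("crawler-queue_num", 1);                             // #[num]#
        prop.put("crawler-queue_show-num", 1);                        // #[show-num]#
        prop.put("crawler-queue_list_0_dark", 1);                     // row shading flag
        prop.put("crawler-queue_list_0_url", "http://example.org/");  // invented URL
        prop.put("crawler-queue_list", 1);                            // number of #{list}# rows
        return prop;
    }
}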

@@ -0,0 +1,142 @@
// IndexCreateWWWRemoteQueue_p.java
// -------------------------------
// part of the AnomicHTTPD caching proxy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004, 2005
// last major change: 04.07.2005
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is at your own risk. The installation and usage (starting/running) of this
// software may allow other people or applications to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
// You must compile this file with
// javac -classpath .:../classes IndexCreateWWWRemoteQueue_p.java
// if the shell's current path is HTROOT
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.servletProperties;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
public class IndexCreateWWWRemoteQueue_p {
private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
private static String daydate(Date date) {
if (date == null) return "";
return dayFormatter.format(date);
}
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
final servletProperties prop = new servletProperties();
final plasmaSwitchboard sb = (plasmaSwitchboard)env;
int showLimit = 100;
if (post != null) {
if (post.containsKey("limit")) {
try {
showLimit = Integer.parseInt((String)post.get("limit"));
} catch (NumberFormatException e) { }
}
if (post.containsKey("clearcrawlqueue")) {
int c = sb.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE);
sb.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_REMOTE);
try { sb.cleanProfiles(); } catch (InterruptedException e) { /* Ignore this */}
/*
int c = 0;
while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) > 0) {
urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT).hash();
if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; }
}
*/
prop.put("info", 3); // crawling queue cleared
prop.put("info_numEntries", c);
} else if (post.containsKey("deleteEntry")) {
String urlHash = (String) post.get("deleteEntry");
sb.noticeURL.remove(urlHash);
prop.put("LOCATION","");
return prop;
}
}
int stackSize = sb.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE);
if (stackSize == 0) {
prop.put("crawler-queue", 0);
} else {
prop.put("crawler-queue", 1);
plasmaCrawlEntry[] crawlerList = sb.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_REMOTE, showLimit);
plasmaCrawlEntry urle;
boolean dark = true;
yacySeed initiator;
String profileHandle;
plasmaCrawlProfile.entry profileEntry;
int i, showNum = 0;
for (i = 0; (i < crawlerList.length) && (showNum < showLimit); i++) {
urle = crawlerList[i];
if (urle != null && urle.url() != null) {
initiator = yacyCore.seedDB.getConnected(urle.initiator());
profileHandle = urle.profileHandle();
profileEntry = (profileHandle == null) ? null : sb.profiles.getEntry(profileHandle);
prop.put("crawler-queue_list_" + showNum + "_dark", ((dark) ? 1 : 0) );
prop.put("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
prop.put("crawler-queue_list_" + showNum + "_depth", urle.depth());
prop.put("crawler-queue_list_" + showNum + "_modified", daydate(urle.loaddate()) );
prop.put("crawler-queue_list_" + showNum + "_anchor", urle.name());
prop.put("crawler-queue_list_" + showNum + "_url", urle.url().toString());
prop.put("crawler-queue_list_" + showNum + "_hash", urle.urlhash());
dark = !dark;
showNum++;
} else {
stackSize--;
}
}
prop.put("crawler-queue_show-num", showNum); //showin sjow-num most recent
prop.put("crawler-queue_num", stackSize);//num Entries
prop.put("crawler-queue_list", showNum);
}
return prop;
}
}

@@ -71,7 +71,7 @@
<td>#(localCrawlPaused)#&nbsp;::(paused)#(/localCrawlPaused)#</td>
</tr>
<tr>
<td>Remote triggered Crawl</td>
<td><a href="IndexCreateWWWRemoteQueue_p.html">Remote triggered Crawl</a></td>
<td>#[remoteTriggeredCrawlQueueSize]#</td>
<td><a href="Status.html?#(remoteTriggeredCrawlPaused)#pauseCrawlJob::continueCrawlJob#(/remoteTriggeredCrawlPaused)#=&amp;jobType=remoteTriggeredCrawl" title="#(remoteTriggeredCrawlPaused)#pause remote triggered crawl::continue remote triggered crawl#(/remoteTriggeredCrawlPaused)#"><img src="env/grafics/#(remoteTriggeredCrawlPaused)#stop.gif::start.gif#(/remoteTriggeredCrawlPaused)#" alt="#(remoteTriggeredCrawlPaused)#pause remote triggered crawl::continue remote triggered crawl#(/remoteTriggeredCrawlPaused)#" style="width:12px;height:12px;" /></a></td>
<td>#(remoteTriggeredCrawlPaused)#&nbsp;::(paused)#(/remoteTriggeredCrawlPaused)#</td>

@@ -1,9 +1,15 @@
<div class="SubMenu">
<h3>Index Creation Menu</h3>
<div class="SubMenugroup">
<h3>Control Queues</h3>
<h3>Administration</h3>
<ul class="SubMenu">
<li><a href="/IndexCreate_p.html" class="MenuItemLink lock">Crawl Start</a></li>
<li><a href="/CrawlProfileEditor_p.html" class="MenuItemLink lock">Crawl Profile Editor</a></li>
</ul>
</div>
<div class="SubMenugroup">
<h3>Control Queues</h3>
<ul class="SubMenu">
<li><a href="/IndexCreateIndexingQueue_p.html" class="MenuItemLink lock">Indexing</a></li>
<li><a href="/IndexCreateLoaderQueue_p.html" class="MenuItemLink lock">Loader</a></li>
</ul>
@@ -13,6 +19,7 @@
<ul class="SubMenu">
<li><a href="/IndexCreateWWWLocalQueue_p.html" class="MenuItemLink lock">Local</a></li>
<li><a href="/IndexCreateWWWGlobalQueue_p.html" class="MenuItemLink lock">Global</a></li>
<li><a href="/IndexCreateWWWRemoteQueue_p.html" class="MenuItemLink lock">Remote</a></li>
<li><!--<a href="/IndexCreateWWWOverhangQueue_p.html" class="MenuItemLink">--><em class="lock">Overhang</em><!--</a>--></li>
</ul>
</div>

@@ -228,7 +228,27 @@ public class plasmaCrawlProfile {
public class entry {
// this is a simple record structure that hold all properties of a single crawl start
public static final String HANDLE = "handle";
public static final String NAME = "name";
public static final String START_URL = "startURL";
public static final String GENERAL_FILTER = "generalFilter";
public static final String SPECIFIC_FILTER = "specificFilter";
public static final String GENERAL_DEPTH = "generalDepth";
public static final String SPECIFIC_DEPTH = "specificDepth";
public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
public static final String DOM_FILTER_DEPTH = "domFilterDepth";
public static final String DOM_MAX_PAGES = "domMaxPages";
public static final String CRAWLING_Q = "crawlingQ";
public static final String INDEX_TEXT = "indexText";
public static final String INDEX_MEDIA = "indexMedia";
public static final String STORE_HTCACHE = "storeHTCache";
public static final String STORE_TXCACHE = "storeTXCache";
public static final String REMOTE_INDEXING = "remoteIndexing";
public static final String XSSTOPW = "xsstopw";
public static final String XDSTOPW = "xdstopw";
public static final String XPSTOPW = "xpstopw";
private Map mem;
private Map doms;
@@ -243,25 +263,25 @@ public class plasmaCrawlProfile {
if (name == null || name.length() == 0) throw new NullPointerException("name must not be null");
String handle = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, crawlProfileHandleLength);
mem = new HashMap();
mem.put("handle", handle);
mem.put("name", name);
mem.put("startURL", (startURL == null) ? "" : startURL);
mem.put("generalFilter", (generalFilter == null) ? ".*" : generalFilter);
mem.put("specificFilter", (specificFilter == null) ? ".*" : specificFilter);
mem.put("generalDepth", Integer.toString(generalDepth));
mem.put("specificDepth", Integer.toString(specificDepth));
mem.put("recrawlIfOlder", Integer.toString(recrawlIfOlder));
mem.put("domFilterDepth", Integer.toString(domFilterDepth));
mem.put("domMaxPages", Integer.toString(domMaxPages));
mem.put("crawlingQ", (crawlingQ) ? "true" : "false"); // crawling of urls with '?'
mem.put("indexText", (indexText) ? "true" : "false");
mem.put("indexMedia", (indexMedia) ? "true" : "false");
mem.put("storeHTCache", (storeHTCache) ? "true" : "false");
mem.put("storeTXCache", (storeTXCache) ? "true" : "false");
mem.put("remoteIndexing", (remoteIndexing) ? "true" : "false");
mem.put("xsstopw", (xsstopw) ? "true" : "false"); // exclude static stop-words
mem.put("xdstopw", (xdstopw) ? "true" : "false"); // exclude dynamic stop-word
mem.put("xpstopw", (xpstopw) ? "true" : "false"); // exclude parent stop-words
mem.put(HANDLE, handle);
mem.put(NAME, name);
mem.put(START_URL, (startURL == null) ? "" : startURL);
mem.put(GENERAL_FILTER, (generalFilter == null) ? ".*" : generalFilter);
mem.put(SPECIFIC_FILTER, (specificFilter == null) ? ".*" : specificFilter);
mem.put(GENERAL_DEPTH, Integer.toString(generalDepth));
mem.put(SPECIFIC_DEPTH, Integer.toString(specificDepth));
mem.put(RECRAWL_IF_OLDER, Integer.toString(recrawlIfOlder));
mem.put(DOM_FILTER_DEPTH, Integer.toString(domFilterDepth));
mem.put(DOM_MAX_PAGES, Integer.toString(domMaxPages));
mem.put(CRAWLING_Q, Boolean.toString(crawlingQ)); // crawling of urls with '?'
mem.put(INDEX_TEXT, Boolean.toString(indexText));
mem.put(INDEX_MEDIA, Boolean.toString(indexMedia));
mem.put(STORE_HTCACHE, Boolean.toString(storeHTCache));
mem.put(STORE_TXCACHE, Boolean.toString(storeTXCache));
mem.put(REMOTE_INDEXING, Boolean.toString(remoteIndexing));
mem.put(XSSTOPW, Boolean.toString(xsstopw)); // exclude static stop-words
mem.put(XDSTOPW, Boolean.toString(xdstopw)); // exclude dynamic stop-word
mem.put(XPSTOPW, Boolean.toString(xpstopw)); // exclude parent stop-words
doms = new HashMap();
}
@@ -278,7 +298,7 @@ public class plasmaCrawlProfile {
public entry(Map mem) {
this.mem = mem;
this.doms = (HashMap) domsCache.get(this.mem.get("handle"));
this.doms = (HashMap) domsCache.get(this.mem.get(HANDLE));
if (this.doms == null) this.doms = new HashMap();
}
@@ -286,27 +306,27 @@ public class plasmaCrawlProfile {
return mem;
}
public String handle() {
String r = (String) mem.get("handle");
String r = (String) mem.get(HANDLE);
if (r == null) return null; else return r;
}
public String name() {
String r = (String) mem.get("name");
String r = (String) mem.get(NAME);
if (r == null) return ""; else return r;
}
public String startURL() {
String r = (String) mem.get("startURL");
String r = (String) mem.get(START_URL);
if (r == null) return null; else return r;
}
public String generalFilter() {
String r = (String) mem.get("generalFilter");
String r = (String) mem.get(GENERAL_FILTER);
if (r == null) return ".*"; else return r;
}
public String specificFilter() {
String r = (String) mem.get("specificFilter");
String r = (String) mem.get(SPECIFIC_FILTER);
if (r == null) return ".*"; else return r;
}
public int generalDepth() {
String r = (String) mem.get("generalDepth");
String r = (String) mem.get(GENERAL_DEPTH);
if (r == null) return 0; else try {
return Integer.parseInt(r);
} catch (NumberFormatException e) {
@@ -314,7 +334,7 @@ public class plasmaCrawlProfile {
}
}
public int specificDepth() {
String r = (String) mem.get("specificDepth");
String r = (String) mem.get(SPECIFIC_DEPTH);
if (r == null) return 0; else try {
return Integer.parseInt(r);
} catch (NumberFormatException e) {
@@ -324,7 +344,7 @@ public class plasmaCrawlProfile {
public long recrawlIfOlder() {
// returns a long (millis) that is the minimum age that
// an entry must have to be re-crawled
String r = (String) mem.get("recrawlIfOlder");
String r = (String) mem.get(RECRAWL_IF_OLDER);
if (r == null) return Long.MAX_VALUE; else try {
long l = Long.parseLong(r) * ((long) 60000);
if (l < 0) return Long.MAX_VALUE; else return l;
@@ -336,7 +356,7 @@ public class plasmaCrawlProfile {
// if the crawl depth is equal to or less than this depth,
// then the domain of the current URL is fed into the crawl filter
// if this is -1, all domains are fed
String r = (String) mem.get("domFilterDepth");
String r = (String) mem.get(DOM_FILTER_DEPTH);
if (r == null) return Integer.MAX_VALUE; else try {
int i = Integer.parseInt(r);
if (i < 0) return Integer.MAX_VALUE;
@@ -348,7 +368,7 @@ public class plasmaCrawlProfile {
public int domMaxPages() {
// this is the maximum number of pages that are crawled for a single domain
// if -1, this means no limit
String r = (String) mem.get("domMaxPages");
String r = (String) mem.get(DOM_MAX_PAGES);
if (r == null) return Integer.MAX_VALUE; else try {
int i = Integer.parseInt(r);
if (i < 0) return Integer.MAX_VALUE;
@@ -358,40 +378,40 @@ public class plasmaCrawlProfile {
}
}
public boolean crawlingQ() {
String r = (String) mem.get("crawlingQ");
if (r == null) return false; else return (r.equals("true"));
String r = (String) mem.get(CRAWLING_Q);
if (r == null) return false; else return (r.equals(Boolean.TRUE.toString()));
}
public boolean indexText() {
String r = (String) mem.get("indexText");
if (r == null) return true; else return (r.equals("true"));
String r = (String) mem.get(INDEX_TEXT);
if (r == null) return true; else return (r.equals(Boolean.TRUE.toString()));
}
public boolean indexMedia() {
String r = (String) mem.get("indexMedia");
if (r == null) return true; else return (r.equals("true"));
String r = (String) mem.get(INDEX_MEDIA);
if (r == null) return true; else return (r.equals(Boolean.TRUE.toString()));
}
public boolean storeHTCache() {
String r = (String) mem.get("storeHTCache");
if (r == null) return false; else return (r.equals("true"));
String r = (String) mem.get(STORE_HTCACHE);
if (r == null) return false; else return (r.equals(Boolean.TRUE.toString()));
}
public boolean storeTXCache() {
String r = (String) mem.get("storeTXCache");
if (r == null) return false; else return (r.equals("true"));
String r = (String) mem.get(STORE_TXCACHE);
if (r == null) return false; else return (r.equals(Boolean.TRUE.toString()));
}
public boolean remoteIndexing() {
String r = (String) mem.get("remoteIndexing");
if (r == null) return false; else return (r.equals("true"));
String r = (String) mem.get(REMOTE_INDEXING);
if (r == null) return false; else return (r.equals(Boolean.TRUE.toString()));
}
public boolean excludeStaticStopwords() {
String r = (String) mem.get("xsstopw");
if (r == null) return false; else return (r.equals("true"));
String r = (String) mem.get(XSSTOPW);
if (r == null) return false; else return (r.equals(Boolean.TRUE.toString()));
}
public boolean excludeDynamicStopwords() {
String r = (String) mem.get("xdstopw");
if (r == null) return false; else return (r.equals("true"));
String r = (String) mem.get(XDSTOPW);
if (r == null) return false; else return (r.equals(Boolean.TRUE.toString()));
}
public boolean excludeParentStopwords() {
String r = (String) mem.get("xpstopw");
if (r == null) return false; else return (r.equals("true"));
String r = (String) mem.get(XPSTOPW);
if (r == null) return false; else return (r.equals(Boolean.TRUE.toString()));
}
public void changeEntry(String propName, String newValue) throws IOException {
mem.put(propName, newValue);
@@ -409,7 +429,7 @@ public class plasmaCrawlProfile {
doms.put(domain, dp);
}
}
domsCache.put(this.mem.get("handle"), doms);
domsCache.put(this.mem.get(HANDLE), doms);
}
public boolean grantedDomAppearance(String domain) {
int max = domFilterDepth();

@@ -134,11 +134,11 @@ public final class serverDate {
}
public static long UTCDiff(String diffString) {
if (diffString.length() != 5) throw new RuntimeException("UTC String malformed (wrong size):" + diffString);
if (diffString.length() != 5) throw new IllegalArgumentException("UTC String malformed (wrong size):" + diffString);
boolean ahead = true;
if (diffString.charAt(0) == '+') ahead = true;
else if (diffString.charAt(0) == '-') ahead = false;
else throw new RuntimeException("UTC String malformed (wrong sign):" + diffString);
else throw new IllegalArgumentException("UTC String malformed (wrong sign):" + diffString);
long oh = Long.parseLong(diffString.substring(1, 3));
long om = Long.parseLong(diffString.substring(3));
return ((ahead) ? (long) 1 : (long) -1) * (oh * hourMillis + om * minuteMillis);
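
The serverDate change above only replaces RuntimeException with the more precise IllegalArgumentException; the parsing itself is unchanged: a five-character "+HHMM"/"-HHMM" offset is converted into a signed number of milliseconds. A quick usage sketch (the import path is assumed from the surrounding de.anomic.server classes):

import de.anomic.server.serverDate;

// Sketch: interpreting UTC offset strings with the method shown above.
public class UTCDiffSketch {
    public static void main(String[] args) {
        long plus = serverDate.UTCDiff("+0130");   // 1 h 30 min ahead of UTC =  5400000 ms
        long minus = serverDate.UTCDiff("-0500");  // 5 h behind UTC          = -18000000 ms
        System.out.println(plus + " " + minus);
        // a malformed value such as "+01:00" (wrong length) now raises IllegalArgumentException
    }
}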

@@ -72,8 +72,6 @@ import java.util.Map;
import java.util.Properties;
import java.util.Set;
import com.sun.org.apache.bcel.internal.generic.LLOAD;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.net.natLib;
import de.anomic.plasma.plasmaCondenser;
