Merge branch 'master' of git://gitorious.org/yacy/rc1.git

pull/1/head
reger 13 years ago
commit 4f92389550

@@ -0,0 +1,12 @@
#!/bin/bash
# Call a servlet on the local YaCy peer, authenticating with the admin
# account credentials read from the peer's configuration file.
cd "$(dirname "$0")"
port=$(grep ^port= ../DATA/SETTINGS/yacy.conf | cut -d= -f2)
pw=$(grep ^adminAccountBase64MD5= ../DATA/SETTINGS/yacy.conf | cut -d= -f2)
if which curl &>/dev/null; then
curl -s --header "Authorization: realm=$pw" "http://127.0.0.1:$port/$1"
elif which wget &>/dev/null; then
wget -q -t 1 --timeout=5 --header "Authorization: realm=$pw" "http://127.0.0.1:$port/$1"
else
exit 1
fi
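A usage sketch: the first argument is the servlet path to fetch from the running peer, and the response is written to stdout (Status.html here is just an example page):

./apicall.sh Status.html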

@@ -388,6 +388,7 @@
<copy todir="${release_main}/bin">
<fileset dir="bin">
<include name="apicall.sh"/>
<include name="apicat.sh"/>
<include name="clearall.sh"/>
<include name="clearcache.sh"/>
<include name="clearindex.sh"/>

@@ -109,8 +109,8 @@ inboundlinks_tag_txt
## total number of inbound links, int
inboundlinkscount_i
## number of inbound links with noindex tag, int
inboundlinksnoindexcount_i
## number of inbound links with nofollow tag, int
inboundlinksnofollowcount_i
## external links, normalized (absolute URLs), as <a> tags with anchor text and nofollow attribute, textgen
outboundlinks_tag_txt
@@ -136,8 +136,8 @@ outboundlinks_tag_txt
## total number of external (outbound) links, int
outboundlinkscount_i
## number of external links with noindex tag, int
outboundlinksnoindexcount_i
## number of external links with nofollow tag, int
outboundlinksnofollowcount_i
## all image tags, encoded as <img> tags including the alt and title properties, textgen
images_tag_txt

@@ -61,6 +61,7 @@ public class ConfigParser {
}
}
env.setConfig(SwitchboardConstants.PARSER_MIME_DENY, TextParser.getDenyMime());
env.setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, TextParser.getDenyExtension());
}
}

@@ -6,14 +6,22 @@
<script type="text/javascript" src="/js/ajax.js"></script>
<script type="text/javascript" src="/js/xml.js"></script>
<script type="text/javascript" src="/js/html.js"></script>
<script type="text/javascript" src="/js/Crawler.js"></script></head>
<body id="Crawler" onload="initCrawler();">
<script type="text/javascript" src="/js/Crawler.js"></script>
<script type="text/javascript">
function refreshiframe()
{
var f = document.getElementById('QueuesTable');
f.contentWindow.location.reload(true);
setTimeout("refreshiframe()", 2000);
}
</script>
</head>
<body id="Crawler" onload="initCrawler();refreshiframe();">
#%env/templates/header.template%#
#%env/templates/submenuCrawlMonitor.template%#
<h2>Crawler Queues</h2>
<noscript><p>(Please enable JavaScript to automatically update this page!)</p></noscript>
<p> Next update in <input type="text" id="nextUpdate" onfocus="changeInterval()" onblur="newInterval()" size="2" /> seconds. <img src="/env/grafics/empty.gif" id="ajax" alt="empty"/>
&nbsp;See the access timing <a href="/api/latency_p.xml">here</a></p>
<table border="0" cellpadding="2" cellspacing="1" class="watchCrawler">
<tbody>
<tr class="TableHeader">
@@ -71,20 +79,6 @@
</tbody>
</table>
<form action="Crawler_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<table border="0" cellpadding="2" cellspacing="1" class="watchCrawler">
<tbody>
<tr class="TableHeader">
<th colspan="3">Speed</th>
</tr>
<tr class="TableCellLight">
<td align="left" #(crawlingSpeedMinChecked)#::class="TableCellDark"#(/crawlingSpeedMinChecked)#><input type="submit" name="crawlingPerformance" value="minimum" /></td>
<td align="left" #(crawlingSpeedCustChecked)#::class="TableCellDark"#(/crawlingSpeedCustChecked)#><input name="customPPM" type="text" size="5" maxlength="5" value="#[customPPMdefault]#" />PPM <input type="submit" name="crawlingPerformance" value="custom" /></td>
<td align="left" #(crawlingSpeedMaxChecked)#::class="TableCellDark"#(/crawlingSpeedMaxChecked)#><input type="submit" name="crawlingPerformance" value="maximum" /></td>
</tr>
</tbody>
</table>
</form>
<table border="0" cellpadding="2" cellspacing="1" class="watchCrawler">
<tbody>
@@ -103,15 +97,24 @@
</tbody>
</table>
<form action="Crawler_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<table border="0" cellpadding="2" cellspacing="1" class="watchCrawler">
<tbody>
<tr class="TableHeader">
<th>Indicator</th>
<th colspan="2">Level</th>
</tr>
<tr class="TableCellLight">
<td align="left">Speed</td>
<td align="left" colspan="2">
<input #(crawlingSpeedMinChecked)#::class="TableCellDark"#(/crawlingSpeedMinChecked)# type="submit" name="crawlingPerformance" value="minimum" />
<input #(crawlingSpeedCustChecked)#::class="TableCellDark"#(/crawlingSpeedCustChecked)# name="customPPM" type="text" size="5" maxlength="5" value="#[customPPMdefault]#" />PPM <input type="submit" name="crawlingPerformance" value="custom" />
<input #(crawlingSpeedMaxChecked)#::class="TableCellDark"#(/crawlingSpeedMaxChecked)# type="submit" name="crawlingPerformance" value="maximum" />
</td>
</tr>
<tr class="TableCellLight">
<td align="left">PPM (Pages Per Minute)</td>
<td align="left"><span id="ppmNum">&nbsp;&nbsp;&nbsp;</span></td>
<td align="left" width="20"><span id="ppmNum">&nbsp;&nbsp;&nbsp;</span></td>
<td align="left"><span id="ppmSpan">&nbsp;&nbsp;&nbsp;</span></td>
</tr>
<tr class="TableCellLight">
@@ -126,6 +129,7 @@
</tr>
</tbody>
</table>
</form>
<p class="watchCrawler"> #(info)#
<!-- 0 -->
@@ -157,23 +161,10 @@
<!-- crawl queues -->
<p id="crawlingQueues"><strong>Crawl Queue:</strong></p>
<table border="0" cellpadding="2" cellspacing="1" id="queueTable">
<tbody>
<tr class="TableHeader">
<th>Queue</th>
<th>Profile</th>
<th>Initiator</th>
<th>Depth</th>
<th>Modified Date</th>
<th>Anchor Name</th>
<th>URL</th>
<th>Size</th>
<th>Delete</th>
</tr>
</tbody>
</table>
<p>See the <a href="/api/latency_p.xml">access timing</a></p>
<iframe id="QueuesTable" src="IndexCreateQueues_p.html?embed=&urlsPerHost=1" width="100%" height="0" align="left" scrolling="no" marginheight="0" marginwidth="0" frameborder="0" ></iframe>
#%env/templates/footer.template%#
</body>
</html>

@@ -0,0 +1,95 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': '#[queuename]#' Crawl Queue</title>
#%env/templates/metas.template%#
</head>
<body id="IndexCreateQueues">
<div id="fullcontent">
#(embed)#
#%env/templates/header.template%#
#%env/templates/submenuCrawlMonitor.template%#
<h2>'#[queuename]#' Crawl Queue</h2>
::#(/embed)#
#(crawler)#
<p>This crawler queue is empty</p>
::
#(embed)#
<form action="IndexCreateQueues_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
Delete Entries:
<input type="text" name="pattern" value="#[deletepattern]#" size="40" maxlength="200" />
<select name="option" size="1">
<option value="5">Initiator</option>
<option value="3">Profile</option>
<option value="4">Depth</option>
<option value="6">Modified Date</option>
<option value="2">Anchor Name</option>
<option value="1" selected="selected">URL</option>
</select>
<input type="hidden" name="stack" value="#[queuename]#" />
<input type="submit" name="delete" value="Delete" />
</fieldset>
</form>
::#(/embed)#
<table border="0" cellpadding="2" cellspacing="1">
<colgroup>
<col width="5" />
<col width="10" />
<col width="30" />
<col width="10" />
<col width="10" />
<col width="10" />
<col width="10" />
<col width="10" />
<col width="10" />
<col />
</colgroup>
<tr class="TableHeader">
<th>Count</th>
<th>Delta/ms</th>
<th>Host</th>
<th>Initiator</th>
<th>Profile</th>
<th>Depth</th>
<th>Modified Date</th>
<th>Anchor Name</th>
<th>Delta/ms</th>
<th>URL</th>
</tr>
#{host}#
<tr class="TableCellDark">
<td>#[hostcount]#</td>
<td>#[hostdelta]#</td>
<td><a href="IndexCreateQueues_p.html?#(embed)#::embed=&#(/embed)#delete=&stack=#[queuename]#&option=1&pattern=.*#[hostname]#.*&urlsPerHost=#[urlsPerHost]#"><img src="env/grafics/trash.gif"></a>&nbsp;#[hostname]#</td>
<td colspan="7"></td>
</tr>
#{list}#
<tr class="TableCellLight">
<td colspan="3"></td>
<td>#[initiator]#</td>
<td>#[profile]#</td>
<td>#[depth]#</td>
<td>#[modified]#</td>
<td>#[anchor]#</td>
<td>#[delta]#</td>
<td><a href="#[url]#">#[url]#</a></td>
</tr>
#{/list}#
#{/host}#
</table>
#(/crawler)#
#(embed)#
#%env/templates/footer.template%#
::#(/embed)#
</div>
<script type="text/javascript">
<!--
parentPage = parent.document.getElementById('QueuesTable');
if (parentPage != null) parentPage.height = document.getElementById('fullcontent').offsetHeight + 30;
-->
</script>
</body>
</html>

@@ -1,192 +1,169 @@
// IndexCreateWWWLocalQueue_p.java
// -------------------------------
// part of the AnomicHTTPD caching proxy
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004, 2005
//
//$LastChangedDate$
//$LastChangedRevision$
//$LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// You must compile this file with
// javac -classpath .:../classes IndexCreate_p.java
// if the shell's current path is HTROOT
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.logging.Log;
import net.yacy.peers.Seed;
import net.yacy.search.Switchboard;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.NoticedURL;
import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.crawler.retrieval.Request;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
public class IndexCreateWWWLocalQueue_p {
private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
private static String daydate(final Date date) {
if (date == null) return "";
return dayFormatter.format(date);
}
private static final int INVALID = 0;
private static final int URL = 1;
private static final int ANCHOR = 2;
private static final int PROFILE = 3;
private static final int DEPTH = 4;
private static final int INITIATOR = 5;
private static final int MODIFIED = 6;
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
int showLimit = 100;
if (post != null) {
showLimit = post.getInt("limit", 100);
if (post.containsKey("deleteEntries")) {
int c = 0;
final String pattern = post.get("pattern", ".*").trim();
final int option = post.getInt("option", INVALID);
if (".*".equals(pattern)) {
c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE);
sb.crawlQueues.noticeURL.clear(NoticedURL.StackType.CORE);
try { sb.cleanProfiles(); } catch (final InterruptedException e) {/* ignore this */}
} else if (option > INVALID) {
try {
// compiling the regular expression
final Pattern compiledPattern = Pattern.compile(pattern);
if (option == PROFILE) {
// search and delete the crawl profile (_much_ faster, independent of queue size)
// XXX: what to do about the annoying LOST PROFILE messages in the log?
CrawlProfile entry;
for (final byte[] handle: sb.crawler.getActive()) {
entry = sb.crawler.getActive(handle);
final String name = entry.name();
if (name.equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE))
continue;
if (compiledPattern.matcher(name).find()) sb.crawler.removeActive(entry.handle().getBytes());
}
} else {
// iterating through the list of URLs
final Iterator<Request> iter = sb.crawlQueues.noticeURL.iterator(NoticedURL.StackType.CORE);
Request entry;
final List<byte[]> removehashes = new ArrayList<byte[]>();
while (iter.hasNext()) {
if ((entry = iter.next()) == null) continue;
String value = null;
location: switch (option) {
case URL: value = (entry.url() == null) ? null : entry.url().toString(); break location;
case ANCHOR: value = entry.name(); break location;
case DEPTH: value = Integer.toString(entry.depth()); break location;
case INITIATOR:
value = (entry.initiator() == null || entry.initiator().length == 0) ? "proxy" : ASCII.String(entry.initiator());
break location;
case MODIFIED: value = daydate(entry.appdate()); break location;
default: value = null; break location;
}
if (value != null && compiledPattern.matcher(value).matches()) removehashes.add(entry.url().hash());
}
Log.logInfo("IndexCreateWWWLocalQueue", "created a remove list with " + removehashes.size() + " entries for pattern '" + pattern + "'");
for (final byte[] b: removehashes) {
sb.crawlQueues.noticeURL.removeByURLHash(b);
}
}
} catch (final PatternSyntaxException e) {
Log.logException(e);
}
}
prop.put("info", "3");//crawling queue cleared
prop.putNum("info_numEntries", c);
} else if (post.containsKey("deleteEntry")) {
final String urlHash = post.get("deleteEntry");
sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes());
prop.put("LOCATION","");
return prop;
}
}
int showNum = 0, stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE);
if (stackSize == 0) {
prop.put("crawler-queue", "0");
} else {
prop.put("crawler-queue", "1");
final List<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.CORE, (int) (showLimit * 1.20));
Request urle;
boolean dark = true;
Seed initiator;
String profileHandle;
CrawlProfile profileEntry;
int i;
for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
urle = crawlerList.get(i);
if ((urle != null)&&(urle.url()!=null)) {
initiator = sb.peers.getConnected(urle.initiator() == null ? "" : ASCII.String(urle.initiator()));
profileHandle = urle.profileHandle();
profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes());
prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.appdate()) );
prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name());
prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash());
dark = !dark;
showNum++;
} else {
stackSize--;
}
}
prop.putNum("crawler-queue_list", showNum);
prop.putNum("crawler-queue_num", stackSize);//num Entries
prop.putNum("crawler-queue_show-num", showNum); //showin sjow-num most recent
}
// return rewrite properties
return prop;
}
}
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.logging.Log;
import net.yacy.peers.Seed;
import net.yacy.search.Switchboard;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.crawler.NoticedURL.StackType;
import de.anomic.crawler.retrieval.Request;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
public class IndexCreateQueues_p {
private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
private static String daydate(final Date date) {
if (date == null) return "";
return dayFormatter.format(date);
}
private static final int INVALID = 0;
private static final int URL = 1;
private static final int ANCHOR = 2;
private static final int PROFILE = 3;
private static final int DEPTH = 4;
private static final int INITIATOR = 5;
private static final int MODIFIED = 6;
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
StackType stackType = StackType.LOCAL;
int urlsPerHost = 5;
boolean embed = false;
String deletepattern = ".*";
if (post != null) {
stackType = StackType.valueOf(post.get("stack", stackType.name()).toUpperCase());
urlsPerHost = post.getInt("urlsPerHost", urlsPerHost);
if (post.containsKey("embed")) embed = true;
if (post.containsKey("delete")) {
deletepattern = post.get("pattern", deletepattern).trim();
final int option = post.getInt("option", INVALID);
if (".*".equals(deletepattern)) {
sb.crawlQueues.noticeURL.clear(stackType);
try { sb.cleanProfiles(); } catch (final InterruptedException e) {/* ignore this */}
} else if (option > INVALID) {
try {
// compiling the regular expression
final Pattern compiledPattern = Pattern.compile(deletepattern);
if (option == PROFILE) {
// search and delete the crawl profile (_much_ faster, independent of queue size)
// XXX: what to do about the annoying LOST PROFILE messages in the log?
CrawlProfile entry;
for (final byte[] handle: sb.crawler.getActive()) {
entry = sb.crawler.getActive(handle);
final String name = entry.name();
if (name.equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE))
continue;
if (compiledPattern.matcher(name).find()) sb.crawler.removeActive(entry.handle().getBytes());
}
} else {
// iterating through the list of URLs
final Iterator<Request> iter = sb.crawlQueues.noticeURL.iterator(stackType);
Request entry;
final List<byte[]> removehashes = new ArrayList<byte[]>();
while (iter.hasNext()) {
if ((entry = iter.next()) == null) continue;
String value = null;
location: switch (option) {
case URL: value = (entry.url() == null) ? null : entry.url().toString(); break location;
case ANCHOR: value = entry.name(); break location;
case DEPTH: value = Integer.toString(entry.depth()); break location;
case INITIATOR:
value = (entry.initiator() == null || entry.initiator().length == 0) ? "proxy" : ASCII.String(entry.initiator());
break location;
case MODIFIED: value = daydate(entry.appdate()); break location;
default: value = null; break location;
}
if (value != null && compiledPattern.matcher(value).matches()) removehashes.add(entry.url().hash());
}
Log.logInfo("IndexCreateQueues_p", "created a remove list with " + removehashes.size() + " entries for pattern '" + deletepattern + "'");
for (final byte[] b: removehashes) {
sb.crawlQueues.noticeURL.removeByURLHash(b);
}
}
} catch (final PatternSyntaxException e) {
Log.logException(e);
}
}
}
}
int stackSize = sb.crawlQueues.noticeURL.stackSize(stackType);
if (stackSize == 0) {
prop.put("crawler", "0");
} else {
prop.put("crawler", "1");
prop.put("crawler_embed", embed ? 1 : 0);
prop.put("crawler_embed_deletepattern", deletepattern);
prop.put("crawler_embed_queuename", stackType.name());
final Map<String, Integer[]> hosts = sb.crawlQueues.noticeURL.getDomainStackHosts(stackType);
int hc = 0;
for (Map.Entry<String, Integer[]> host: hosts.entrySet()) {
prop.putHTML("crawler_host_" + hc + "_hostname", host.getKey());
prop.put("crawler_host_" + hc + "_embed", embed ? 1 : 0);
prop.put("crawler_host_" + hc + "_urlsPerHost", urlsPerHost);
prop.putHTML("crawler_host_" + hc + "_queuename", stackType.name());
prop.put("crawler_host_" + hc + "_hostcount", host.getValue()[0]);
prop.put("crawler_host_" + hc + "_hostdelta", host.getValue()[1]);
List<Request> domainStackReferences = sb.crawlQueues.noticeURL.getDomainStackReferences(stackType, host.getKey(), urlsPerHost);
Seed initiator;
String profileHandle;
CrawlProfile profileEntry;
int count = 0;
for (Request request: domainStackReferences) {
if (request == null) continue;
initiator = sb.peers.getConnected(request.initiator() == null ? "" : ASCII.String(request.initiator()));
profileHandle = request.profileHandle();
profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes());
prop.putHTML("crawler_host_" + hc + "_list_" + count + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
prop.put("crawler_host_" + hc + "_list_" + count + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
prop.put("crawler_host_" + hc + "_list_" + count + "_depth", request.depth());
prop.put("crawler_host_" + hc + "_list_" + count + "_modified", daydate(request.appdate()) );
prop.putHTML("crawler_host_" + hc + "_list_" + count + "_anchor", request.name());
prop.put("crawler_host_" + hc + "_list_" + count + "_delta", sb.crawlQueues.noticeURL.getDomainSleepTime(stackType, sb.crawler, request));
prop.putHTML("crawler_host_" + hc + "_list_" + count + "_url", request.url().toNormalform(false, true));
prop.put("crawler_host_" + hc + "_list_" + count + "_hash", request.url().hash());
count++;
}
prop.putNum("crawler_host_" + hc + "_list", count);
hc++;
}
prop.put("crawler_host", hc);
}
prop.put("embed", embed ? 1 : 0);
prop.put("queuename", stackType.name().charAt(0) + stackType.name().substring(1).toLowerCase());
prop.put("embed_queuename", stackType.name().charAt(0) + stackType.name().substring(1).toLowerCase());
// return rewrite properties
return prop;
}
}
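The same per-host deletion that the trash icon in the template above triggers can also be scripted against the new servlet. A hedged sketch using the apicall.sh helper from this commit (example.com is a placeholder host; option=1 selects the URL field, mirroring the template's own link form):

./apicall.sh "IndexCreateQueues_p.html?delete=&stack=LOCAL&option=1&pattern=.*example.com.*"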

@@ -1,58 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Global Crawl Queue</title>
#%env/templates/metas.template%#
</head>
<body id="IndexCreateWWWGlobalQueue">
#%env/templates/header.template%#
#%env/templates/submenuCrawlMonitor.template%#
<h2>Global Crawl Queue</h2>
<p>
This queue stores the URLs that shall be sent to other peers to perform a remote crawl.
If no peer is available for remote crawling, the links are crawled locally.
</p>
#(crawler-queue)#
<p>The global crawler queue is empty</p>
::
<form action="IndexCreateWWWGlobalQueue_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<input type="submit" name="clearcrawlqueue" value="clear global crawl queue" />
</fieldset>
</form>
<p>There are <strong>#[num]#</strong> entries in the global crawler queue. Showing <strong>#[show-num]#</strong> most recent entries.</p>
<p>Show last <a href="IndexCreateWWWGlobalQueue_p.html?limit=50">50</a> | <a href="IndexCreateWWWGlobalQueue_p.html?limit=100">100</a> | <a href="IndexCreateWWWGlobalQueue_p.html?limit=250">250</a> | <a href="IndexCreateWWWGlobalQueue_p.html?limit=500">500</a> entries.</p>
<table border="0" cellpadding="2" cellspacing="1">
<colgroup>
<col width="60" span="2" />
<col width="10" />
<col width="80" />
<col width="180" />
<col />
<col width="10" />
</colgroup>
<tr class="TableHeader">
<th>Initiator</th>
<th>Profile</th>
<th>Depth</th>
<th>Modified Date</th>
<th>Anchor Name</th>
<th>URL</th>
<th>Delete</th>
</tr>
#{list}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
<td>#[initiator]#</td>
<td>#[profile]#</td>
<td>#[depth]#</td>
<td>#[modified]#</td>
<td>#[anchor]#</td>
<td><a href="#[url]#">#[url]#</a></td>
<td><a href="IndexCreateWWWGlobalQueue_p.html?deleteEntry=#[hash]#">[Delete]</a></td>
</tr>
#{/list}#
</table>
#(/crawler-queue)#
#%env/templates/footer.template%#
</body>
</html>

@@ -1,125 +0,0 @@
// IndexCreateWWWGlobalQueue_p.java
// -------------------------------
// part of the AnomicHTTPD caching proxy
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004, 2005
//
//$LastChangedDate$
//$LastChangedRevision$
//$LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// You must compile this file with
// javac -classpath .:../classes IndexCreate_p.java
// if the shell's current path is HTROOT
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.peers.Seed;
import net.yacy.search.Switchboard;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.NoticedURL;
import de.anomic.crawler.retrieval.Request;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
public class IndexCreateWWWGlobalQueue_p {
private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
private static String daydate(final Date date) {
if (date == null) return "";
return dayFormatter.format(date);
}
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
int showLimit = 100;
if (post != null) {
showLimit = post.getInt("limit", 100);
if (post.containsKey("clearcrawlqueue")) {
final int c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
sb.crawlQueues.noticeURL.clear(NoticedURL.StackType.LIMIT);
try { sb.cleanProfiles(); } catch (final InterruptedException e) { /* Ignore this */}
/*
int c = 0;
while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.StackType.LIMIT) > 0) {
urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.StackType.LIMIT).hash();
if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; }
}
*/
prop.put("info", "3");//crawling queue cleared
prop.putNum("info_numEntries", c);
} else if (post.containsKey("deleteEntry")) {
final String urlHash = post.get("deleteEntry");
sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes());
prop.put("LOCATION","");
return prop;
}
}
int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
if (stackSize == 0) {
prop.put("crawler-queue", "0");
} else {
prop.put("crawler-queue", "1");
final List<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.LIMIT, showLimit);
Request urle;
boolean dark = true;
Seed initiator;
String profileHandle;
CrawlProfile profileEntry;
int i, showNum = 0;
for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
urle = crawlerList.get(i);
if (urle != null && urle.url() != null) {
initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : ASCII.String(urle.initiator()));
profileHandle = urle.profileHandle();
profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes());
prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.appdate()) );
prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name());
prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash());
dark = !dark;
showNum++;
} else {
stackSize--;
}
}
prop.putNum("crawler-queue_show-num", showNum); //showin sjow-num most recent
prop.putNum("crawler-queue_num", stackSize);//num Entries
prop.putNum("crawler-queue_list", showNum);
}
// return rewrite properties
return prop;
}
}

@@ -1,69 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Local Crawl Queue</title>
#%env/templates/metas.template%#
</head>
<body id="IndexCreateWWWLocalQueue">
#%env/templates/header.template%#
#%env/templates/submenuCrawlMonitor.template%#
<h2>Local Crawl Queue</h2>
<p>
This queue stores the URLs that shall be crawled locally by this peer.
It may also contain URLs that are computed by the proxy-prefetch.
</p>
#(crawler-queue)#
<p>The local crawler queue is empty</p>
::
<form action="IndexCreateWWWLocalQueue_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
Delete Entries:
<input type="text" name="pattern" value=".*" size="40" maxlength="200" />
<select name="option" size="1">
<option value="5">Initiator</option>
<option value="3">Profile</option>
<option value="4">Depth</option>
<option value="6">Modified Date</option>
<option value="2">Anchor Name</option>
<option value="1" selected="selected">URL</option>
</select>
<input type="submit" name="deleteEntries" value="Delete" /><em>This may take a quite long time.</em>
</fieldset>
</form>
<p>There are <strong>#[num]#</strong> entries in the local crawler queue. Showing <strong>#[show-num]#</strong> most recent entries.</p>
<p>Show last <a href="IndexCreateWWWLocalQueue_p.html?limit=50">50</a> | <a href="IndexCreateWWWLocalQueue_p.html?limit=100">100</a> | <a href="IndexCreateWWWLocalQueue_p.html?limit=250">250</a> | <a href="IndexCreateWWWLocalQueue_p.html?limit=500">500</a> entries.</p>
<table border="0" cellpadding="2" cellspacing="1">
<colgroup>
<col width="60" span="2" />
<col width="10" />
<col width="80" />
<col width="180" />
<col />
<col width="10" />
</colgroup>
<tr class="TableHeader">
<th>Initiator</th>
<th>Profile</th>
<th>Depth</th>
<th>Modified Date</th>
<th>Anchor Name</th>
<th>URL</th>
<th>Delete</th>
</tr>
#{list}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
<td>#[initiator]#</td>
<td>#[profile]#</td>
<td>#[depth]#</td>
<td>#[modified]#</td>
<td>#[anchor]#</td>
<td><a href="#[url]#">#[url]#</a></td>
<td><a href="IndexCreateWWWLocalQueue_p.html?deleteEntry=#[hash]#">[Delete]</a></td>
</tr>
#{/list}#
</table>
#(/crawler-queue)#
#%env/templates/footer.template%#
</body>
</html>

@@ -1,65 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Remote Crawl Queue</title>
#%env/templates/metas.template%#
</head>
<body id="IndexCreateWWWGlobalQueue">
#%env/templates/header.template%#
#%env/templates/submenuCrawlMonitor.template%#
<h2>Remote Crawl Queue</h2>
<p>
This queue stores the URLs that other peers sent to you in order to perform a remote crawl for them.
</p>
#(crawler-queue)#
<p>The remote crawler queue is empty</p>
::
<form action="IndexCreateWWWRemoteQueue_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<input type="submit" name="clearcrawlqueue" value="clear remote crawl queue" />
</fieldset>
</form>
<p>
There are <strong>#[num]#</strong> entries in the remote crawler queue.
Showing <strong>#[show-num]#</strong> most recent entries.
</p>
<p>
Show last <a href="IndexCreateWWWRemoteQueue_p.html?limit=50">50</a> |
<a href="IndexCreateWWWRemoteQueue_p.html?limit=100">100</a> |
<a href="IndexCreateWWWRemoteQueue_p.html?limit=250">250</a> |
<a href="IndexCreateWWWRemoteQueue_p.html?limit=500">500</a> entries.
</p>
<table border="0" cellpadding="2" cellspacing="1">
<colgroup>
<col width="60" span="2" />
<col width="10" />
<col width="80" />
<col width="180" />
<col />
<col width="10" />
</colgroup>
<tr class="TableHeader">
<th>Initiator</th>
<th>Profile</th>
<th>Depth</th>
<th>Modified Date</th>
<th>Anchor Name</th>
<th>URL</th>
<th>Delete</th>
</tr>
#{list}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
<td>#[initiator]#</td>
<td>#[profile]#</td>
<td>#[depth]#</td>
<td>#[modified]#</td>
<td>#[anchor]#</td>
<td><a href="#[url]#">#[url]#</a></td>
<td><a href="IndexCreateWWWRemoteQueue_p.html?deleteEntry=#[hash]#">[Delete]</a></td>
</tr>
#{/list}#
</table>
#(/crawler-queue)#
#%env/templates/footer.template%#
</body>
</html>

@@ -1,120 +0,0 @@
// IndexCreateWWWRemoteQueue_p.java
// -------------------------------
// part of the AnomicHTTPD caching proxy
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004, 2005
// last major change: 04.07.2005
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// You must compile this file with
// javac -classpath .:../classes IndexCreateWWWRemoteQueue_p.java
// if the shell's current path is HTROOT
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.peers.Seed;
import net.yacy.search.Switchboard;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.NoticedURL;
import de.anomic.crawler.retrieval.Request;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.servletProperties;
public class IndexCreateWWWRemoteQueue_p {
private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
private static String daydate(final Date date) {
if (date == null) return "";
return dayFormatter.format(date);
}
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
final servletProperties prop = new servletProperties();
final Switchboard sb = (Switchboard)env;
int showLimit = 100;
if (post != null) {
showLimit = post.getInt("limit", 100);
if (post.containsKey("clearcrawlqueue")) {
final int c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.REMOTE);
sb.crawlQueues.noticeURL.clear(NoticedURL.StackType.REMOTE);
try { sb.cleanProfiles(); } catch (final InterruptedException e) { /* Ignore this */}
/*
int c = 0;
while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.StackType.LIMIT) > 0) {
urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.StackType.LIMIT).hash();
if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; }
}
*/
prop.put("info", "3"); // crawling queue cleared
prop.putNum("info_numEntries", c);
} else if (post.containsKey("deleteEntry")) {
final String urlHash = post.get("deleteEntry");
sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes());
prop.put("LOCATION","");
return prop;
}
}
int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.REMOTE);
if (stackSize == 0) {
prop.put("crawler-queue", "0");
} else {
prop.put("crawler-queue", "1");
final List<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.REMOTE, showLimit);
Request urle;
boolean dark = true;
Seed initiator;
String profileHandle;
CrawlProfile profileEntry;
int i, showNum = 0;
for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
urle = crawlerList.get(i);
if (urle != null && urle.url() != null) {
initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : ASCII.String(urle.initiator()));
profileHandle = urle.profileHandle();
profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes());
prop.put("crawler-queue_list_" + showNum + "_dark", dark ? "1" : "0");
prop.putHTML("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
prop.put("crawler-queue_list_" + showNum + "_depth", urle.depth());
prop.put("crawler-queue_list_" + showNum + "_modified", daydate(urle.appdate()) );
prop.putHTML("crawler-queue_list_" + showNum + "_anchor", urle.name());
prop.putHTML("crawler-queue_list_" + showNum + "_url", urle.url().toString());
prop.put("crawler-queue_list_" + showNum + "_hash", urle.url().hash());
dark = !dark;
showNum++;
} else {
stackSize--;
}
}
prop.putNum("crawler-queue_show-num", showNum); //showin sjow-num most recent
prop.putNum("crawler-queue_num", stackSize);//num Entries
prop.putNum("crawler-queue_list", showNum);
}
return prop;
}
}

@@ -1,124 +0,0 @@
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.peers.Seed;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segments;
import de.anomic.crawler.NoticedURL;
import de.anomic.crawler.retrieval.Request;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
public class queues_p {
public static final String STATE_RUNNING = "running";
public static final String STATE_PAUSED = "paused";
private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
private static String daydate(final Date date) {
if (date == null) return "";
return dayFormatter.format(date);
}
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final Switchboard sb = (Switchboard) env;
//wikiCode wikiTransformer = new wikiCode(switchboard);
final serverObjects prop = new serverObjects();
Segment segment = null;
final boolean html = post != null && post.containsKey("html");
prop.setLocalized(html);
if (post != null && post.containsKey("segment") && sb.verifyAuthentication(header)) {
segment = sb.indexSegments.segment(post.get("segment"));
}
if (segment == null) segment = sb.indexSegments.segment(Segments.Process.PUBLIC);
prop.put("rejected", "0");
//int showRejectedCount = 10;
Seed initiator;
// index size
prop.putNum("urlpublictextSize", segment.urlMetadata().size());
prop.putNum("rwipublictextSize", segment.termIndex().sizesMax());
// loader queue
prop.putNum("loaderSize", sb.crawlQueues.workerSize());
prop.putNum("loaderMax", sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10));
if (sb.crawlQueues.workerSize() == 0) {
prop.put("list-loader", "0");
} else {
final Request[] w = sb.crawlQueues.activeWorkerEntries();
int count = 0;
for (final Request r : w) {
if (r == null) continue;
prop.put("list-loader_"+count+"_profile", r.profileHandle());
initiator = sb.peers.getConnected((r.initiator() == null) ? "" : ASCII.String(r.initiator()));
prop.putHTML("list-loader_"+count+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("list-loader_"+count+"_depth", r.depth());
prop.putXML("list-loader_"+count+"_url", r.url().toString());
count++;
}
prop.put("list-loader", count);
}
//local crawl queue
prop.putNum("localCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL).getJobCount());
prop.put("localCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL) ? STATE_PAUSED : STATE_RUNNING);
int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE);
addNTable(sb, prop, "list-local", sb.crawlQueues.noticeURL.top(NoticedURL.StackType.CORE, Math.min(10, stackSize)));
//global crawl queue
prop.putNum("limitCrawlSize", sb.crawlQueues.limitCrawlJobSize());
prop.put("limitCrawlState", STATE_RUNNING);
stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
//remote crawl queue
prop.putNum("remoteCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount());
prop.put("remoteCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) ? STATE_PAUSED : STATE_RUNNING);
stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
if (stackSize == 0) {
prop.put("list-remote", "0");
} else {
addNTable(sb, prop, "list-remote", sb.crawlQueues.noticeURL.top(NoticedURL.StackType.LIMIT, Math.min(10, stackSize)));
}
//noload crawl queue
prop.putNum("noloadCrawlSize", sb.crawlQueues.noloadCrawlJobSize());
prop.put("noloadCrawlState", STATE_RUNNING);
//stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.NOLOAD);
// return rewrite properties
return prop;
}
public static final void addNTable(final Switchboard sb, final serverObjects prop, final String tableName, final List<Request> crawlerList) {
int showNum = 0;
Seed initiator;
for (final Request urle : crawlerList) {
if ((urle != null) && (urle.url() != null)) {
initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : UTF8.String(urle.initiator()));
prop.put(tableName + "_" + showNum + "_profile", urle.profileHandle());
prop.put(tableName + "_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put(tableName + "_" + showNum + "_depth", urle.depth());
prop.put(tableName + "_" + showNum + "_modified", daydate(urle.appdate()));
prop.putXML(tableName + "_" + showNum + "_anchor", urle.name());
prop.putXML(tableName + "_" + showNum + "_url", urle.url().toNormalform(false, true));
prop.put(tableName + "_" + showNum + "_hash", urle.url().hash());
showNum++;
}
}
prop.put(tableName, showNum);
}
}

@@ -1,71 +0,0 @@
<?xml version="1.0"?>
<queues>
<dbsize>
<urlpublictext>#[urlpublictextSize]#</urlpublictext>
<rwipublictext>#[rwipublictextSize]#</rwipublictext>
</dbsize>
<loaderqueue>
<size>#[loaderSize]#</size>
<max>#[loaderMax]#</max>
#{list-loader}#
<entry>
<profile>#[profile]#</profile>
<initiator>#[initiator]#</initiator>
<depth>#[depth]#</depth>
<url>#[url]#</url>
</entry>
#{/list-loader}#
</loaderqueue>
<localcrawlerqueue>
<size>#[localCrawlSize]#</size>
<state>#[localCrawlState]#</state>
#{list-local}#
<entry>
<profile>#[profile]#</profile>
<initiator>#[initiator]#</initiator>
<depth>#[depth]#</depth>
<modified>#[modified]#</modified>
<anchor>#[anchor]#</anchor>
<url>#[url]#</url>
<hash>#[hash]#</hash>
<inProcess>#(inProcess)#false::true#(/inProcess)#</inProcess>
</entry>
#{/list-local}#
</localcrawlerqueue>
<limitcrawlerqueue>
<size>#[limitCrawlSize]#</size>
<state>#[limitCrawlState]#</state>
#{list-limit}#
<entry>
<profile>#[profile]#</profile>
<initiator>#[initiator]#</initiator>
<depth>#[depth]#</depth>
<modified>#[modified]#</modified>
<anchor>#[anchor]#</anchor>
<url>#[url]#</url>
<hash>#[hash]#</hash>
<inProcess>#(inProcess)#false::true#(/inProcess)#</inProcess>
</entry>
#{/list-limit}#
</limitcrawlerqueue>
<remotecrawlerqueue>
<size>#[remoteCrawlSize]#</size>
<state>#[remoteCrawlState]#</state>
#{list-remote}#
<entry>
<profile>#[profile]#</profile>
<initiator>#[initiator]#</initiator>
<depth>#[depth]#</depth>
<modified>#[modified]#</modified>
<anchor>#[anchor]#</anchor>
<url>#[url]#</url>
<hash>#[hash]#</hash>
<inProcess>#(inProcess)#false::true#(/inProcess)#</inProcess>
</entry>
#{/list-remote}#
</remotecrawlerqueue>
<noloadcrawlerqueue>
<size>#[noloadCrawlSize]#</size>
<state>#[noloadCrawlState]#</state>
</noloadcrawlerqueue>
</queues>

@@ -39,7 +39,7 @@
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_en.txt"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/><filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.EnglishMinimalStemFilterFactory"/>
@@ -57,7 +57,7 @@
</fields>
<uniqueKey>id</uniqueKey>
<defaultSearchField>description</defaultSearchField>
<defaultSearchField>sku</defaultSearchField>
<solrQueryParser defaultOperator="AND"/>
</schema>

@@ -1,4 +1,29 @@
// status_p
// (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 18.12.2006 on http://www.anomic.de
// this file was created using an implementation from IndexCreate_p.java, published 02.12.2004
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.io.ByteCount;
@@ -13,6 +38,8 @@ import de.anomic.server.serverSwitch;
public class status_p {
public static final String STATE_RUNNING = "running";
public static final String STATE_PAUSED = "paused";
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
@@ -53,6 +80,30 @@ public class status_p {
prop.put("trafficProxy", ByteCount.getAccountCount(ByteCount.PROXY));
prop.put("trafficCrawler", ByteCount.getAccountCount(ByteCount.CRAWLER));
// index size
prop.putNum("urlpublictextSize", segment.urlMetadata().size());
prop.putNum("rwipublictextSize", segment.termIndex().sizesMax());
// loader queue
prop.putNum("loaderSize", sb.crawlQueues.workerSize());
prop.putNum("loaderMax", sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10));
//local crawl queue
prop.putNum("localCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL).getJobCount());
prop.put("localCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL) ? STATE_PAUSED : STATE_RUNNING);
//global crawl queue
prop.putNum("limitCrawlSize", sb.crawlQueues.limitCrawlJobSize());
prop.put("limitCrawlState", STATE_RUNNING);
//remote crawl queue
prop.putNum("remoteCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount());
prop.put("remoteCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) ? STATE_PAUSED : STATE_RUNNING);
//noload crawl queue
prop.putNum("noloadCrawlSize", sb.crawlQueues.noloadCrawlJobSize());
prop.put("noloadCrawlState", STATE_RUNNING);
// return rewrite properties
return prop;
}

@@ -1,35 +1,52 @@
<?xml version="1.0"?>
<status>
<ppm>#[ppm]#</ppm>
<wordCacheSize>#[wordCacheSize]#</wordCacheSize>
<wordCacheMaxSize>#[wordCacheMaxSize]#</wordCacheMaxSize>
<memory>
<free>#[freeMemory]#</free>
<total>#[totalMemory]#</total>
<max>#[maxMemory]#</max>
</memory>
<processors>#[processors]#</processors>
<traffic>
<in>#[trafficIn]#</in>
<proxy>#[trafficProxy]#</proxy>
<crawler>#[trafficCrawler]#</crawler>
</traffic>
<dbsize>
<urlpublictext>#[urlpublictextSize]#</urlpublictext>
<rwipublictext>#[rwipublictextSize]#</rwipublictext>
</dbsize>
<loaderqueue>
<size>#[loaderSize]#</size>
<max>#[loaderMax]#</max>
</loaderqueue>
<localcrawlerqueue>
<size>#[localCrawlSize]#</size>
<state>#[localCrawlState]#</state>
</localcrawlerqueue>
<limitcrawlerqueue>
<size>#[limitCrawlSize]#</size>
<state>#[limitCrawlState]#</state>
</limitcrawlerqueue>
<remotecrawlerqueue>
<size>#[remoteCrawlSize]#</size>
<state>#[remoteCrawlState]#</state>
</remotecrawlerqueue>
<noloadcrawlerqueue>
<size>#[noloadCrawlSize]#</size>
<state>#[noloadCrawlState]#</state>
</noloadcrawlerqueue>
<memory>
<free>#[freeMemory]#</free>
<total>#[totalMemory]#</total>
<max>#[maxMemory]#</max>
</memory>
<processors>#[processors]#</processors>
<traffic>
<in>#[trafficIn]#</in>
<proxy>#[trafficProxy]#</proxy>
<crawler>#[trafficCrawler]#</crawler>
</traffic>
</status>
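The queue counters that Crawler.js previously polled from queues_p.xml are now read from this merged status document. A quick check with the apicall.sh helper added in this commit (assuming a running peer):

./apicall.sh api/status_p.xml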

@@ -9,7 +9,7 @@
// $LastChangedBy: orbiter $
//
// LICENSE
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@@ -24,9 +24,9 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.TreeSet;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.protocol.RequestHeader;
@@ -43,7 +43,6 @@ import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segments;
import net.yacy.search.query.QueryParams;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -52,18 +51,18 @@ public final class timeline {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
if ((post == null) || (env == null)) return prop;
final boolean authenticated = sb.adminAuthenticated(header) >= 2;
Segment segment = null;
if (post.containsKey("segment") && authenticated) {
segment = sb.indexSegments.segment(post.get("segment"));
} else {
segment = sb.indexSegments.segment(Segments.Process.PUBLIC);
}
final String querystring = post.get("query", ""); // a string of word hashes that shall be searched and combined
final int count = Math.min((authenticated) ? 1000 : 10, post.getInt("maximumRecords", 1000)); // SRU syntax
final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE);
@@ -75,22 +74,22 @@ public final class timeline {
language = (agent == null) ? "en" : ISO639.userAgentLanguageDetection(agent);
if (language == null) language = "en";
}
final TreeSet<String>[] query = QueryParams.cleanQuery(querystring); // converts also umlaute
final Collection<String>[] query = QueryParams.cleanQuery(querystring); // converts also umlaute
HandleSet q = Word.words2hashesHandles(query[0]);
// tell all threads to do nothing for a specific time
sb.intermissionAllThreads(3000);
// prepare search
final long timestamp = System.currentTimeMillis();
// prepare an abstract result
int indexabstractContainercount = 0;
int joincount = 0;
// retrieve index containers
//yacyCore.log.logInfo("INIT TIMELINE SEARCH: " + plasmaSearchQuery.anonymizedQueryHashes(query[0]) + " - " + count + " links");
// get the index container with the result vector
TermSearch<WordReference> search = null;
try {
@@ -99,7 +98,7 @@
Log.logException(e);
}
ReferenceContainer<WordReference> index = search.joined();
Iterator<WordReference> i = index.entries();
WordReference entry;
int c = 0;
@@ -117,14 +116,14 @@
c++;
}
prop.put("event", c);
// log
Network.log.logInfo("EXIT TIMELINE SEARCH: " +
QueryParams.anonymizedQueryHashes(q) + " - " + joincount + " links found, " +
prop.get("linkcount", "?") + " links selected, " +
indexabstractContainercount + " index abstracts, " +
(System.currentTimeMillis() - timestamp) + " milliseconds");
return prop;
}

Binary file not shown (added image, 932 B).

@@ -14,9 +14,10 @@
<div class="SubMenugroup">
<h3>Queues</h3>
<ul class="SubMenu">
<li><a href="/IndexCreateWWWLocalQueue_p.html" class="MenuItemLink lock">Local</a></li>
<li><a href="/IndexCreateWWWGlobalQueue_p.html" class="MenuItemLink lock">Global</a></li>
<li><a href="/IndexCreateWWWRemoteQueue_p.html" class="MenuItemLink lock">Remote</a></li>
<li><a href="/IndexCreateQueues_p.html?stack=LOCAL" class="MenuItemLink lock">Local</a></li>
<li><a href="/IndexCreateQueues_p.html?stack=GLOBAL" class="MenuItemLink lock">Global</a></li>
<li><a href="/IndexCreateQueues_p.html?stack=REMOTE" class="MenuItemLink lock">Remote</a></li>
<li><a href="/IndexCreateQueues_p.html?stack=NOLOAD" class="MenuItemLink lock">No-Load</a></li>
</ul>
</div>

@@ -100,17 +100,17 @@
<tr>
<td><label for="count">Results per page</label>:</td>
<td>
<input type="radio" name="maximumRecords" value="10" #(count-10)#::checked="checked"#(/count-10)#/>10
<input type="radio" name="maximumRecords" value="50" #(count-50)#::checked="checked"#(/count-50)#/>50
<input type="radio" name="maximumRecords" value="100" #(count-100)#::checked="checked"#(/count-100)#/>100
<input type="radio" name="maximumRecords" id="mr10" value="10" #(count-10)#::checked="checked"#(/count-10)#/><label for="mr10">10</label>
<input type="radio" name="maximumRecords" id="mr50" value="50" #(count-50)#::checked="checked"#(/count-50)#/><label for="mr50">50</label>
<input type="radio" name="maximumRecords" id="mr100" value="100" #(count-100)#::checked="checked"#(/count-100)#/><label for="mr100">100</label>
</td>
</tr>
<tr>
#(resource-select)#::
<td><label for="resource">Resource</label>:</td>
<td>
<input type="radio" name="resource" value="global" #(global)#::checked="checked"#(/global)# #(global-disabled)#::disabled="disabled"#(/global-disabled)#/>the peer-to-peer network
<input type="radio" name="resource" value="local" #(local)#::checked="checked"#(/local)#/>only the local index
<input type="radio" name="resource" id="rglobal" value="global" #(global)#::checked="checked"#(/global)# #(global-disabled)#::disabled="disabled"#(/global-disabled)#/><label for="rglobal">the peer-to-peer network</label>
<input type="radio" name="resource" id="rlocal" value="local" #(local)#::checked="checked"#(/local)#/><label for="rlocal">only the local index</label>
</td>
#(/resource-select)#
</tr>

@@ -5,12 +5,10 @@ WORDCACHEBAR_LENGTH=1/4;
var statusRPC;
var queuesRPC;
var refreshInterval=5;
var refreshInterval=3;
var wait=0;
var changing=false; //change the interval
var statusLoaded=true;
var queueLoaded=true;
function initCrawler(){
refresh();
@@ -38,21 +36,20 @@ function newInterval(){
countInterval=window.setInterval("countdown()", 1000);
changing=false;
}
function countdown(){
if(statusLoaded && queueLoaded){
document.getElementById("nextUpdate").value=wait;
wait--;
if(wait==0){
if(statusLoaded){
wait--;
if (wait == 0) {
refresh();
}
}
}
function refresh(){
wait=refreshInterval;
statusLoaded=false;
queueLoaded=false;
requestStatus();
requestQueues();
}
function requestStatus(){
@@ -61,13 +58,6 @@ function requestStatus(){
statusRPC.onreadystatechange = handleStatus;
statusRPC.send(null);
}
function requestQueues(){
queuesRPC=createRequestObject();
queuesRPC.open('get', '/api/queues_p.xml?html=');
queuesRPC.onreadystatechange = handleQueues;
queuesRPC.send(null);
}
function handleStatus(){
if(statusRPC.readyState != 4){
@ -118,65 +108,44 @@ function handleStatus(){
img.setAttribute("src", BAR_IMG1);
wordCacheSpan.appendChild(img);
}
statusLoaded=true;
}
function handleQueues(){
if(queuesRPC.readyState != 4){
return;
}
var queuesResponse = queuesRPC.responseXML;
//xml=getFirstChild(queuesResponse);
xml=getFirstChild(queuesResponse, "queues");
if(queuesResponse != null){
clearTable(document.getElementById("queueTable"), 1);
dbsize=getFirstChild(xml, "dbsize");
urlpublictextSize=getValue(getFirstChild(dbsize, "urlpublictext"));
rwipublictextSize=getValue(getFirstChild(dbsize, "rwipublictext"));
document.getElementById("urldbsize").firstChild.nodeValue=urlpublictextSize;
document.getElementById("rwidbsize").firstChild.nodeValue=rwipublictextSize;
loaderqueue=getFirstChild(xml, "loaderqueue");
updateTable(loaderqueue, "loader");
loaderqueue_size=getValue(getFirstChild(loaderqueue, "size"));
loaderqueue_max=getValue(getFirstChild(loaderqueue, "max"));
document.getElementById("loaderqueuesize").firstChild.nodeValue=loaderqueue_size;
document.getElementById("loaderqueuemax").firstChild.nodeValue=loaderqueue_max;
localcrawlerqueue=getFirstChild(xml, "localcrawlerqueue");
localcrawlerqueue_size=getValue(getFirstChild(localcrawlerqueue, "size"));
localcrawlerqueue_state=getValue(getFirstChild(localcrawlerqueue, "state"));
document.getElementById("localcrawlerqueuesize").firstChild.nodeValue=localcrawlerqueue_size;
putQueueState("localcrawler", localcrawlerqueue_state);
updateTable(localcrawlerqueue, "local crawler");
limitcrawlerqueue=getFirstChild(xml, "limitcrawlerqueue");
updateTable(limitcrawlerqueue, "limitCrawlerTable");
limitcrawlerqueue_size=getValue(getFirstChild(limitcrawlerqueue, "size"));
limitcrawlerqueue_state=getValue(getFirstChild(limitcrawlerqueue, "state"));
document.getElementById("limitcrawlerqueuesize").firstChild.nodeValue=limitcrawlerqueue_size;
putQueueState("limitcrawler", limitcrawlerqueue_state);
updateTable(limitcrawlerqueue, "limit crawler");
remotecrawlerqueue=getFirstChild(xml, "remotecrawlerqueue");
updateTable(remotecrawlerqueue, "remoteCrawlerTable");
remotecrawlerqueue_size=getValue(getFirstChild(remotecrawlerqueue, "size"));
remotecrawlerqueue_state=getValue(getFirstChild(remotecrawlerqueue, "state"));
document.getElementById("remotecrawlerqueuesize").firstChild.nodeValue=remotecrawlerqueue_size;
putQueueState("remotecrawler", remotecrawlerqueue_state);
updateTable(remotecrawlerqueue, "remote crawler");
noloadcrawlerqueue=getFirstChild(xml, "noloadcrawlerqueue");
noloadcrawlerqueue_size=getValue(getFirstChild(noloadcrawlerqueue, "size"));
noloadcrawlerqueue_state=getValue(getFirstChild(noloadcrawlerqueue, "state"));
document.getElementById("noloadcrawlerqueuesize").firstChild.nodeValue=noloadcrawlerqueue_size;
putQueueState("noloadcrawler", noloadcrawlerqueue_state);
dbsize=getFirstChild(statusTag, "dbsize");
urlpublictextSize=getValue(getFirstChild(dbsize, "urlpublictext"));
rwipublictextSize=getValue(getFirstChild(dbsize, "rwipublictext"));
document.getElementById("urldbsize").firstChild.nodeValue=urlpublictextSize;
document.getElementById("rwidbsize").firstChild.nodeValue=rwipublictextSize;
loaderqueue=getFirstChild(statusTag, "loaderqueue");
loaderqueue_size=getValue(getFirstChild(loaderqueue, "size"));
loaderqueue_max=getValue(getFirstChild(loaderqueue, "max"));
document.getElementById("loaderqueuesize").firstChild.nodeValue=loaderqueue_size;
document.getElementById("loaderqueuemax").firstChild.nodeValue=loaderqueue_max;
localcrawlerqueue=getFirstChild(statusTag, "localcrawlerqueue");
localcrawlerqueue_size=getValue(getFirstChild(localcrawlerqueue, "size"));
localcrawlerqueue_state=getValue(getFirstChild(localcrawlerqueue, "state"));
document.getElementById("localcrawlerqueuesize").firstChild.nodeValue=localcrawlerqueue_size;
putQueueState("localcrawler", localcrawlerqueue_state);
limitcrawlerqueue=getFirstChild(statusTag, "limitcrawlerqueue");
limitcrawlerqueue_size=getValue(getFirstChild(limitcrawlerqueue, "size"));
limitcrawlerqueue_state=getValue(getFirstChild(limitcrawlerqueue, "state"));
document.getElementById("limitcrawlerqueuesize").firstChild.nodeValue=limitcrawlerqueue_size;
putQueueState("limitcrawler", limitcrawlerqueue_state);
remotecrawlerqueue=getFirstChild(statusTag, "remotecrawlerqueue");
remotecrawlerqueue_size=getValue(getFirstChild(remotecrawlerqueue, "size"));
remotecrawlerqueue_state=getValue(getFirstChild(remotecrawlerqueue, "state"));
document.getElementById("remotecrawlerqueuesize").firstChild.nodeValue=remotecrawlerqueue_size;
putQueueState("remotecrawler", remotecrawlerqueue_state);
noloadcrawlerqueue=getFirstChild(statusTag, "noloadcrawlerqueue");
noloadcrawlerqueue_size=getValue(getFirstChild(noloadcrawlerqueue, "size"));
noloadcrawlerqueue_state=getValue(getFirstChild(noloadcrawlerqueue, "state"));
document.getElementById("noloadcrawlerqueuesize").firstChild.nodeValue=noloadcrawlerqueue_size;
putQueueState("noloadcrawler", noloadcrawlerqueue_state);
}
queueLoaded=true;
statusLoaded=true;
}
function putQueueState(queue, state) {
@ -184,53 +153,17 @@ function putQueueState(queue, state) {
img = document.getElementById(queue + "stateIMG");
if (state == "paused") {
a.href = "Crawler_p.html?continue=" + queue;
a.title = "Continue this queue";
a.title = "Continue this queue (" + state + ")";
img.src = "/env/grafics/start.gif";
img.alt = "Continue this queue";
} else {
a.href = "Crawler_p.html?pause=" + queue;
a.title = "Pause this queue";
a.title = "Pause this queue (" + state + ")";
img.src = "/env/grafics/stop.gif";
img.alt = "Pause this queue";
}
}
function updateTable(indexingqueue, tablename){
indexingTable=document.getElementById("queueTable");
entries=indexingqueue.getElementsByTagName("entry");
dark=false;
for(i=0;i<entries.length;i++){
profile=getValue(getFirstChild(entries[i], "profile"));
initiator=getValue(getFirstChild(entries[i], "initiator"));
depth=getValue(getFirstChild(entries[i], "depth"));
modified=getValue(getFirstChild(entries[i], "modified"));
anchor=getValue(getFirstChild(entries[i], "anchor"));
url=getValue(getFirstChild(entries[i], "url"));
size=getValue(getFirstChild(entries[i], "size"));
hash=getValue(getFirstChild(entries[i], "hash"));
inProcess=false;
if(getValue(getFirstChild(entries[i], "inProcess"))=="true"){
inProcess=true;
}
if (tablename=="indexingTable")
deletebutton=createLinkCol("IndexCreateIndexingQueue_p.html?deleteEntry="+hash, DELETE_STRING);
else
deletebutton=createCol("");
row=createIndexingRow(tablename, profile, initiator, depth, modified, anchor, url, size, deletebutton);
//create row
if(inProcess){
row.setAttribute("class", "TableCellActive");
}else if(dark){
row.setAttribute("class", "TableCellDark");
}else{
row.setAttribute("class", "TableCellLight");
}
getFirstChild(indexingTable, "tbody").appendChild(row);
dark=!dark;
}
}
function shortenURL(url) {
if (url.length > 80) {

@ -60,7 +60,7 @@ public class urls {
if (post.get("call", "").equals("remotecrawl")) {
// perform a remote crawl url handover
final NoticedURL.StackType stackType = NoticedURL.StackType.LIMIT;
final NoticedURL.StackType stackType = NoticedURL.StackType.GLOBAL;
int maxCount = Math.min(100, post.getInt("count", 10));
final long maxTime = Math.min(20000, Math.max(1000, post.getInt("time", 10000)));
final long timeout = System.currentTimeMillis() + maxTime;

@ -602,14 +602,14 @@ public class yacysearch {
(post == null) ? sb.getConfig("search.navigation", "all") : post.get("nav", "");
// the query
final TreeSet<String>[] query = QueryParams.cleanQuery(querystring.trim()); // also converts umlauts
final Collection<String>[] query = QueryParams.cleanQuery(querystring.trim()); // also converts umlauts
final int maxDistance = (querystring.indexOf('"', 0) >= 0) ? query.length - 1 : Integer.MAX_VALUE;
// filter out stopwords
final SortedSet<String> filtered = SetTools.joinConstructive(query[0], Switchboard.stopwords);
final SortedSet<String> filtered = SetTools.joinConstructiveByTest(query[0], Switchboard.stopwords);
if ( !filtered.isEmpty() ) {
SetTools.excludeDestructive(query[0], Switchboard.stopwords);
SetTools.excludeDestructiveByTestSmallInLarge(query[0], Switchboard.stopwords);
}
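
The two renamed SetTools calls keep the same two-pass semantics while choosing an algorithm suited to testing a small set against a large one: a constructive join first computes which query words are stopwords, then a destructive exclusion removes them from the query in place. A minimal sketch of those semantics over plain java.util collections (illustrative names, not YaCy's actual SetTools implementation):

    import java.util.SortedSet;
    import java.util.TreeSet;

    public class StopwordFilterSketch {
        // intersection without modifying either input (the "constructive join")
        static SortedSet<String> joinConstructive(SortedSet<String> query, SortedSet<String> stopwords) {
            final SortedSet<String> result = new TreeSet<String>(query);
            result.retainAll(stopwords);
            return result;
        }

        // in-place removal of stopwords from the query (the "destructive exclusion")
        static void excludeDestructive(SortedSet<String> query, SortedSet<String> stopwords) {
            query.removeAll(stopwords);
        }

        public static void main(String[] args) {
            final SortedSet<String> query = new TreeSet<String>();
            query.add("the"); query.add("yacy"); query.add("crawler");
            final SortedSet<String> stopwords = new TreeSet<String>();
            stopwords.add("the"); stopwords.add("a");
            final SortedSet<String> filtered = joinConstructive(query, stopwords);
            if (!filtered.isEmpty()) excludeDestructive(query, stopwords);
            System.out.println(query); // [crawler, yacy]
        }
    }
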
// if a minus-button was hit, remove a special reference first

@ -25,8 +25,8 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.net.MalformedURLException;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
@ -165,7 +165,7 @@ public class yacysearchitem {
prop.putHTML("content_publisher", result.publisher());
prop.putHTML("content_creator", result.creator());// author
prop.putHTML("content_subject", result.subject());
final Set<String>[] query = theQuery.queryWords();
final Collection<String>[] query = theQuery.queryWords();
final StringBuilder s = new StringBuilder(query[0].size() * 20);
for (final String t: query[0]) {
s.append('+').append(t);

@ -29,22 +29,20 @@ package de.anomic.crawler;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ConcurrentMap;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.index.BufferedObjectIndex;
import net.yacy.kelondro.index.HandleSet;
@ -53,7 +51,6 @@ import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.table.Table;
import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.kelondro.util.MemoryControl;
import de.anomic.crawler.retrieval.Request;
import de.anomic.http.client.Cache;
@ -74,9 +71,6 @@ public class Balancer {
// class variables computed during operation
private final ConcurrentMap<String, HandleSet> domainStacks; // a map from host name to lists with url hashs
private final ConcurrentLinkedQueue<byte[]> top; // a list of url-hashes that shall be taken next
private final SortedMap<Long, byte[]> delayed;
private final HandleSet ddc;
private final HandleSet double_push_check; // for debugging
private long lastDomainStackFill;
private int domStackInitSize;
@ -91,13 +85,10 @@ public class Balancer {
final boolean exceed134217727) {
this.cacheStacksPath = cachePath;
this.domainStacks = new ConcurrentHashMap<String, HandleSet>();
this.top = new ConcurrentLinkedQueue<byte[]>();
this.delayed = new TreeMap<Long, byte[]>();
this.minimumLocalDelta = minimumLocalDelta;
this.minimumGlobalDelta = minimumGlobalDelta;
this.myAgentIDs = myAgentIDs;
this.domStackInitSize = Integer.MAX_VALUE;
this.ddc = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
this.double_push_check = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
// create a stack for newly entered entries
@ -145,12 +136,7 @@ public class Balancer {
Log.logException(e);
}
this.domainStacks.clear();
this.top.clear();
synchronized (this.delayed) {
this.delayed.clear();
}
this.double_push_check.clear();
this.ddc.clear();
}
public Request get(final byte[] urlhash) throws IOException {
@ -202,28 +188,11 @@ public class Balancer {
if (entry != null) removedCounter++;
// remove from double-check caches
this.ddc.remove(urlhash);
this.double_push_check.remove(urlhash);
}
if (removedCounter == 0) return 0;
assert this.urlFileIndex.size() + removedCounter == s : "urlFileIndex.size() = " + this.urlFileIndex.size() + ", s = " + s;
// iterate through the top list
final Iterator<byte[]> j = this.top.iterator();
byte[] urlhash;
while (j.hasNext()) {
urlhash = j.next();
if (urlHashes.has(urlhash)) j.remove();
}
// remove from delayed
synchronized (this.delayed) {
final Iterator<Map.Entry<Long, byte[]>> k = this.delayed.entrySet().iterator();
while (k.hasNext()) {
if (urlHashes.has(k.next().getValue())) k.remove();
}
}
// iterate through the domain stacks
final Iterator<Map.Entry<String, HandleSet>> q = this.domainStacks.entrySet().iterator();
HandleSet stack;
@ -237,7 +206,7 @@ public class Balancer {
}
public boolean has(final byte[] urlhashb) {
return this.urlFileIndex.has(urlhashb) || this.ddc.has(urlhashb);
return this.urlFileIndex.has(urlhashb) || this.double_push_check.has(urlhashb);
}
public boolean notEmpty() {
@ -277,7 +246,6 @@ public class Balancer {
synchronized (this) {
// double-check
if (this.double_push_check.has(hash)) return "double occurrence in double_push_check";
if (this.ddc.has(hash)) return "double occurrence in ddc";
if (this.urlFileIndex.has(hash)) return "double occurrence in urlFileIndex";
if (this.double_push_check.size() > 10000 || MemoryControl.shortStatus()) this.double_push_check.clear();
@ -297,16 +265,16 @@ public class Balancer {
/**
* get a list of domains that are currently maintained as domain stacks
* @return a map of clear text strings of host names to the size of the domain stack
* @return a map of clear text strings of host names to an integer array: {the size of the domain stack, guessed delta waiting time}
*/
public Map<String, Integer> getDomainStackHosts() {
Map<String, Integer> map = new HashMap<String, Integer>();
public Map<String, Integer[]> getDomainStackHosts() {
Map<String, Integer[]> map = new TreeMap<String, Integer[]>(); // we use a tree map to get a stable ordering
for (Map.Entry<String, HandleSet> entry: this.domainStacks.entrySet()) {
map.put(entry.getKey(), entry.getValue().size());
map.put(entry.getKey(), new Integer[]{entry.getValue().size(), (int) Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta)});
}
return map;
}
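
The widened return type packs two values per host: the stack size and the guessed remaining waiting time. A hedged sketch of a consumer, e.g. a servlet rendering the crawl queue page; the printing is illustrative and "balancer" stands for an initialized Balancer instance:

    static void printHostQueues(Balancer balancer) {
        final java.util.Map<String, Integer[]> hosts = balancer.getDomainStackHosts();
        for (final java.util.Map.Entry<String, Integer[]> e : hosts.entrySet()) {
            final int stackSize = e.getValue()[0]; // number of queued URLs for this host
            final int waitMs    = e.getValue()[1]; // guessed remaining delta waiting time in milliseconds
            System.out.println(e.getKey() + ": " + stackSize + " urls, ~" + waitMs + " ms wait");
        }
    }
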
/**
* compute the current sleep time for a given crawl entry
* @param cs
@ -315,20 +283,20 @@ public class Balancer {
*/
public long getDomainSleepTime(final CrawlSwitchboard cs, Request crawlEntry) {
final CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
return getDomainSleepTime(cs, profileEntry, crawlEntry);
return getDomainSleepTime(cs, profileEntry, crawlEntry.url());
}
private long getDomainSleepTime(final CrawlSwitchboard cs, final CrawlProfile profileEntry, Request crawlEntry) {
private long getDomainSleepTime(final CrawlSwitchboard cs, final CrawlProfile profileEntry, final DigestURI crawlURL) {
if (profileEntry == null) {
return 0;
}
long sleeptime = (
profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
(profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlEntry.url()))
) ? 0 : Latency.waitingRemaining(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
(profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
) ? 0 : Latency.waitingRemaining(crawlURL, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime;
}
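
In words: an entry that will be served from the local cache never pays a politeness delay, because no remote server is contacted; only entries that actually hit the network wait as robots.txt/latency demand. A stripped-down sketch of that decision, with stand-in types replacing YaCy's CacheStrategy, Cache and Latency:

    public class SleepTimeSketch {
        enum CacheStrategy { NOCACHE, IFEXIST, CACHEONLY }

        static long domainSleepTime(CacheStrategy strategy, boolean isCached, long latencyWaitMs) {
            final boolean servedFromCache =
                strategy == CacheStrategy.CACHEONLY ||
                (strategy == CacheStrategy.IFEXIST && isCached);
            // a cache hit never contacts the remote server, so no politeness delay applies
            return servedFromCache ? 0 : latencyWaitMs;
        }

        public static void main(String[] args) {
            System.out.println(domainSleepTime(CacheStrategy.IFEXIST, true, 5000));  // 0
            System.out.println(domainSleepTime(CacheStrategy.NOCACHE, true, 5000));  // 5000
        }
    }
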
/**
* get lists of crawl request entries for a specific host
* @param host
@ -360,7 +328,7 @@ public class Balancer {
}
return cel;
}
private void pushHashToDomainStacks(String host, final byte[] urlhash) throws RowSpaceExceededException {
// extend domain stack
if (host == null) host = localhost;
@ -388,21 +356,6 @@ public class Balancer {
if (domainList.isEmpty()) this.domainStacks.remove(host);
}
private byte[] nextFromDelayed() {
if (this.delayed.isEmpty()) return null;
final Long first = this.delayed.firstKey();
if (first.longValue() < System.currentTimeMillis()) {
return this.delayed.remove(first);
}
return null;
}
private byte[] anyFromDelayed() {
if (this.delayed.isEmpty()) return null;
final Long first = this.delayed.firstKey();
return this.delayed.remove(first);
}
/**
* get the next entry in this crawl queue in such a way that the domain access time delta is maximized
* and always above the given minimum delay time. An additional delay time is computed using the robots.txt
@ -418,41 +371,13 @@ public class Balancer {
public Request pop(final boolean delay, final CrawlSwitchboard cs) throws IOException {
// returns a crawl entry from the stack and ensures minimum delta times
try {
filltop(delay, -600000, false);
filltop(delay, -60000, false);
filltop(delay, -10000, false);
filltop(delay, -6000, false);
filltop(delay, -4000, false);
filltop(delay, -3000, false);
filltop(delay, -2000, false);
filltop(delay, -1000, false);
filltop(delay, -500, false);
filltop(delay, 0, true);
filltop(delay, 500, true);
filltop(delay, 1000, true);
filltop(delay, 2000, true);
filltop(delay, 3000, true);
filltop(delay, 4000, true);
filltop(delay, 6000, true);
filltop(delay, Long.MAX_VALUE, true);
} catch (final RowSpaceExceededException e) {}
long sleeptime = 0;
Request crawlEntry = null;
synchronized (this) {
byte[] failhash = null;
while (!this.urlFileIndex.isEmpty()) {
// first simply take one of the entries in the top list, that should be one without any delay
byte[] nexthash = nextFromDelayed();
//System.out.println("*** nextFromDelayed=" + nexthash);
if (nexthash == null && !this.top.isEmpty()) {
nexthash = this.top.remove();
//System.out.println("*** top.remove()=" + nexthash);
}
if (nexthash == null) {
nexthash = anyFromDelayed();
}
byte[] nexthash = getbest();
if (nexthash == null) return null;
// check minimumDelta and if necessary force a sleep
//final int s = urlFileIndex.size();
@ -485,37 +410,14 @@ public class Balancer {
return null;
}
// depending on the caching policy we need sleep time to avoid DoS-like situations
sleeptime = getDomainSleepTime(cs, profileEntry, crawlEntry);
sleeptime = getDomainSleepTime(cs, profileEntry, crawlEntry.url());
assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + ASCII.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + ASCII.String(rowEntry.getPrimaryKeyBytes());
assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + ASCII.String(nexthash) + ", crawlEntry.url().hash() = " + ASCII.String(crawlEntry.url().hash());
if (failhash != null && Base64Order.enhancedCoder.equal(failhash, nexthash)) break; // prevent endless loops
if (delay && sleeptime > 0 && this.domStackInitSize > 1) {
//System.out.println("*** putback: nexthash=" + nexthash + ", failhash="+failhash);
// put that thing back to omit a delay here
if (!ByteBuffer.contains(this.delayed.values(), nexthash)) {
//System.out.println("*** delayed +=" + nexthash);
this.delayed.put(Long.valueOf(System.currentTimeMillis() + sleeptime + 1), nexthash);
}
try {
this.urlFileIndex.put(rowEntry);
String host = crawlEntry.url().getHost();
if (host == null) host = localhost;
this.domainStacks.remove(host);
failhash = nexthash;
} catch (final RowSpaceExceededException e) {
Log.logException(e);
}
continue;
}
break;
}
if (crawlEntry != null) {
if (this.ddc.size() > 10000 || MemoryControl.shortStatus()) this.ddc.clear();
try { this.ddc.put(crawlEntry.url().hash()); } catch (final RowSpaceExceededException e) {}
}
}
if (crawlEntry == null) return null;
@ -524,7 +426,7 @@ public class Balancer {
// in best case, this should never happen if the balancer works properly
// this is only a protection against the worst case, where the crawler could
// behave in a DoS-manner
Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta) + ", top.size() = " + this.top.size() + ", delayed.size() = " + this.delayed.size() + ", domainStacks.size() = " + this.domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);
Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta) + ", domainStacks.size() = " + this.domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);
long loops = sleeptime / 1000;
long rest = sleeptime % 1000;
if (loops < 3) {
@ -537,15 +439,11 @@ public class Balancer {
try {this.wait(1000); } catch (final InterruptedException e) {}
}
}
this.ddc.remove(crawlEntry.url().hash());
Latency.update(crawlEntry.url());
return crawlEntry;
}
private void filltop(final boolean delay, final long maximumwaiting, final boolean acceptonebest) throws RowSpaceExceededException {
if (!this.top.isEmpty()) return;
//System.out.println("*** DEBUG started filltop delay=" + ((delay) ? "true":"false") + ", maximumwaiting=" + maximumwaiting + ", acceptonebest=" + ((acceptonebest) ? "true":"false"));
private byte[] getbest() {
// check if we need to get entries from the file index
try {
@ -560,6 +458,7 @@ public class Balancer {
long smallestWaiting = Long.MAX_VALUE;
byte[] besturlhash = null;
String besthost = null;
Map<String, byte[]> zeroWaitingCandidates = new HashMap<String, byte[]>();
while (i.hasNext()) {
entry = i.next();
@ -571,34 +470,52 @@ public class Balancer {
final byte[] n = entry.getValue().removeOne();
if (n == null) continue;
if (delay) {
final long w = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
if (w > maximumwaiting) {
if (w < smallestWaiting) {
smallestWaiting = w;
besturlhash = n;
besthost = entry.getKey();
}
entry.getValue().put(n); // put entry back
continue;
final long w = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
if (w < smallestWaiting) {
smallestWaiting = w;
besturlhash = n;
besthost = entry.getKey();
if (w <= 0) {
zeroWaitingCandidates.put(besthost, besturlhash);
}
}
this.top.add(n);
if (entry.getValue().isEmpty()) i.remove();
try {
entry.getValue().put(n); // put the entry back; we are only checking
} catch (RowSpaceExceededException e) {
e.printStackTrace();
}
}
// if we could not find any entry, then take the best we have seen so far
if (acceptonebest && !this.top.isEmpty() && besturlhash != null) {
removeHashFromDomainStacks(besthost, besturlhash);
this.top.add(besturlhash);
if (besturlhash == null) return null; // worst case
// best case: if we have some zeroWaitingCandidates,
// select the one with the largest stack
if (zeroWaitingCandidates.size() > 0) {
int largestStack = -1;
String largestStackHost = null;
byte[] largestStackHash = null;
for (Map.Entry<String, byte[]> z: zeroWaitingCandidates.entrySet()) {
HandleSet hs = this.domainStacks.get(z.getKey());
if (hs == null || hs.size() <= largestStack) continue;
largestStack = hs.size();
largestStackHost = z.getKey();
largestStackHash = z.getValue();
}
if (largestStackHost != null && largestStackHash != null) {
removeHashFromDomainStacks(largestStackHost, largestStackHash);
//Log.logInfo("Balancer", "*** picked one from largest stack");
return largestStackHash;
}
}
// default case: just take the one with the least waiting time
removeHashFromDomainStacks(besthost, besturlhash);
return besturlhash;
}
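
The selection policy of getbest(), in brief: among hosts whose remaining wait is already zero or less, take the one with the deepest domain stack so the busiest domain drains first; otherwise fall back to the host with the smallest remaining wait. A standalone sketch of that policy over plain maps (assumed to share the same key set; names are illustrative):

    static String pickHost(java.util.Map<String, Integer> queueSizes, java.util.Map<String, Long> waitMs) {
        String best = null;
        long smallestWait = Long.MAX_VALUE;
        String largestZeroHost = null;
        int largestStack = -1;
        for (final java.util.Map.Entry<String, Long> e : waitMs.entrySet()) {
            final long w = e.getValue();
            if (w < smallestWait) { smallestWait = w; best = e.getKey(); }
            final Integer size = queueSizes.get(e.getKey());
            if (w <= 0 && size != null && size > largestStack) {
                largestStack = size;
                largestZeroHost = e.getKey();
            }
        }
        // a host that may be crawled immediately and has the deepest stack wins;
        // otherwise take the host with the least remaining wait
        return (largestZeroHost != null) ? largestZeroHost : best;
    }
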
private void fillDomainStacks() throws IOException {
if (!this.domainStacks.isEmpty() && System.currentTimeMillis() - this.lastDomainStackFill < 120000L) return;
if (!this.domainStacks.isEmpty() && System.currentTimeMillis() - this.lastDomainStackFill < 60000L) return;
this.domainStacks.clear();
this.top.clear();
this.lastDomainStackFill = System.currentTimeMillis();
final HandleSet handles = this.urlFileIndex.keysFromBuffer(objectIndexBufferSize / 2);
final CloneableIterator<byte[]> i = handles.keys(true, null);
@ -621,51 +538,6 @@ public class Balancer {
this.domStackInitSize = this.domainStacks.size();
}
public List<Request> top(int count) {
final List<Request> cel = new ArrayList<Request>();
if (count == 0) return cel;
byte[][] ta = new byte[Math.min(count, this.top.size())][];
ta = this.top.toArray(ta);
for (final byte[] n: ta) {
if (n == null) break;
try {
final Row.Entry rowEntry = this.urlFileIndex.get(n, false);
if (rowEntry == null) continue;
final Request crawlEntry = new Request(rowEntry);
cel.add(crawlEntry);
count--;
if (count <= 0) break;
} catch (final IOException e) {}
}
int depth = 0;
loop: while (count > 0) {
// iterate over the domain stacks
final int celsize = cel.size();
ll: for (final HandleSet list: this.domainStacks.values()) {
if (list.size() <= depth) continue ll;
final byte[] n = list.getOne(depth);
if (n == null) continue ll;
try {
final Row.Entry rowEntry = this.urlFileIndex.get(n, false);
if (rowEntry == null) continue;
final Request crawlEntry = new Request(rowEntry);
cel.add(crawlEntry);
count--;
if (count <= 0) break loop;
} catch (final IOException e) {}
}
if (cel.size() == celsize) break loop;
depth++;
}
if (cel.size() < count) try {
final List<Row.Entry> list = this.urlFileIndex.top(count - cel.size());
for (final Row.Entry entry: list) cel.add(new Request(entry));
} catch (final IOException e) { }
return cel;
}
public Iterator<Request> iterator() throws IOException {
return new EntryIterator();
}
@ -678,10 +550,12 @@ public class Balancer {
this.rowIterator = Balancer.this.urlFileIndex.rows();
}
@Override
public boolean hasNext() {
return (this.rowIterator == null) ? false : this.rowIterator.hasNext();
}
@Override
public Request next() {
final Row.Entry entry = this.rowIterator.next();
try {
@ -693,6 +567,7 @@ public class Balancer {
}
}
@Override
public void remove() {
if (this.rowIterator != null) this.rowIterator.remove();
}

@ -215,7 +215,7 @@ public class CrawlQueues {
}
public int coreCrawlJobSize() {
return this.noticeURL.stackSize(NoticedURL.StackType.CORE) + this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD);
return this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) + this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD);
}
public boolean coreCrawlJob() {
@ -226,14 +226,14 @@ public class CrawlQueues {
// move some tasks to the core crawl job so we have something to do
final int toshift = Math.min(10, limitCrawlJobSize()); // this cannot be a big number because the balancer forces a wait if it cannot balance
for (int i = 0; i < toshift; i++) {
this.noticeURL.shift(NoticedURL.StackType.LIMIT, NoticedURL.StackType.CORE, this.sb.crawler);
this.noticeURL.shift(NoticedURL.StackType.GLOBAL, NoticedURL.StackType.LOCAL, this.sb.crawler);
}
this.log.logInfo("shifted " + toshift + " jobs from global crawl to local crawl (coreCrawlJobSize()=" + coreCrawlJobSize() +
", limitCrawlJobSize()=" + limitCrawlJobSize() + ", cluster.mode=" + this.sb.getConfig(SwitchboardConstants.CLUSTER_MODE, "") +
", robinsonMode=" + ((this.sb.isRobinsonMode()) ? "on" : "off"));
}
final String queueCheckCore = loadIsPossible(NoticedURL.StackType.CORE);
final String queueCheckCore = loadIsPossible(NoticedURL.StackType.LOCAL);
final String queueCheckNoload = loadIsPossible(NoticedURL.StackType.NOLOAD);
if (queueCheckCore != null && queueCheckNoload != null) {
if (this.log.isFine()) {
@ -251,11 +251,11 @@ public class CrawlQueues {
// do a local crawl
Request urlEntry;
while (this.noticeURL.stackSize(NoticedURL.StackType.CORE) > 0 || this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) {
while (this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) > 0 || this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) {
final String stats = "LOCALCRAWL[" +
this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) + ", " +
this.noticeURL.stackSize(NoticedURL.StackType.CORE) + ", " +
this.noticeURL.stackSize(NoticedURL.StackType.LIMIT) + ", " +
this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) + ", " +
this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL) + ", " +
this.noticeURL.stackSize(NoticedURL.StackType.OVERHANG) +
", " + this.noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]";
try {
@ -284,7 +284,7 @@ public class CrawlQueues {
return true;
}
urlEntry = this.noticeURL.pop(NoticedURL.StackType.CORE, true, this.sb.crawler);
urlEntry = this.noticeURL.pop(NoticedURL.StackType.LOCAL, true, this.sb.crawler);
if (urlEntry == null) {
continue;
}
@ -300,7 +300,7 @@ public class CrawlQueues {
} catch (final IOException e) {
this.log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e);
if (e.getMessage().indexOf("hash is null",0) > 0) {
this.noticeURL.clear(NoticedURL.StackType.CORE);
this.noticeURL.clear(NoticedURL.StackType.LOCAL);
}
}
}
@ -547,7 +547,7 @@ public class CrawlQueues {
}
public int limitCrawlJobSize() {
return this.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
return this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL);
}
public int noloadCrawlJobSize() {
@ -579,7 +579,7 @@ public class CrawlQueues {
}
// we don't want to crawl a global URL globally, since WE are the global part. (from this point of view)
final String stats = "REMOTETRIGGEREDCRAWL[" + this.noticeURL.stackSize(NoticedURL.StackType.CORE) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.LIMIT) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.OVERHANG) + ", "
final String stats = "REMOTETRIGGEREDCRAWL[" + this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.OVERHANG) + ", "
+ this.noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]";
try {
final Request urlEntry = this.noticeURL.pop(NoticedURL.StackType.REMOTE, true, this.sb.crawler);

@ -370,14 +370,14 @@ public final class CrawlStacker {
// it may be possible that global == true and local == true, so do not check an error case against it
if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LIMIT, entry);
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.GLOBAL, entry);
} else if (local) {
if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry);
} else if (proxy) {
if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry);
} else if (remote) {
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry);
}

@ -146,7 +146,7 @@ public class Latency {
// return time that is remaining
//System.out.println("Latency: " + (waiting - timeSinceLastAccess));
return waiting - timeSinceLastAccess;
return Math.max(0, waiting - timeSinceLastAccess);
}
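
The Math.max clamp matters because the elapsed time since the last access can exceed the required waiting time, and the old code then returned a negative "remaining" value to callers. A minimal sketch of the clamped computation:

    static long waitingRemaining(long requiredWaitMs, long elapsedMs) {
        // clamp: once enough time has elapsed, report zero instead of a negative value
        return Math.max(0, requiredWaitMs - elapsedMs);
    }
    // waitingRemaining(1000, 1500) -> 0 (the unclamped version returned -500)
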
/**

@ -44,7 +44,7 @@ import de.anomic.crawler.retrieval.Request;
public class NoticedURL {
public enum StackType {
NULL, CORE, LIMIT, OVERHANG, REMOTE, NOLOAD, IMAGE, MOVIE, MUSIC;
LOCAL, GLOBAL, OVERHANG, REMOTE, NOLOAD;
}
public static final long minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
@ -146,8 +146,8 @@ public class NoticedURL {
public int stackSize(final StackType stackType) {
switch (stackType) {
case NOLOAD: return (this.noloadStack == null) ? 0 : this.noloadStack.size();
case CORE: return (this.coreStack == null) ? 0 : this.coreStack.size();
case LIMIT: return (this.limitStack == null) ? 0 : this.limitStack.size();
case LOCAL: return (this.coreStack == null) ? 0 : this.coreStack.size();
case GLOBAL: return (this.limitStack == null) ? 0 : this.limitStack.size();
case OVERHANG: return 0;
case REMOTE: return (this.remoteStack == null) ? 0 : this.remoteStack.size();
default: return -1;
@ -172,9 +172,9 @@ public class NoticedURL {
public String push(final StackType stackType, final Request entry) {
try {
switch (stackType) {
case CORE:
case LOCAL:
return this.coreStack.push(entry);
case LIMIT:
case GLOBAL:
return this.limitStack.push(entry);
case REMOTE:
return this.remoteStack.push(entry);
@ -233,30 +233,30 @@ public class NoticedURL {
* get a list of domains that are currently maintained as domain stacks
* @return a map of clear text strings of host names to an integer array: {the size of the domain stack, guessed delta waiting time}
*/
public Map<String, Integer> getDomainStackHosts(final StackType stackType) {
public Map<String, Integer[]> getDomainStackHosts(final StackType stackType) {
switch (stackType) {
case CORE: return this.coreStack.getDomainStackHosts();
case LIMIT: return this.limitStack.getDomainStackHosts();
case LOCAL: return this.coreStack.getDomainStackHosts();
case GLOBAL: return this.limitStack.getDomainStackHosts();
case REMOTE: return this.remoteStack.getDomainStackHosts();
case NOLOAD: return this.noloadStack.getDomainStackHosts();
default: return null;
}
}
/**
* get the sleep time for the domain of the given crawl entry on the given stack
* @return the remaining sleep time in milliseconds
*/
public long getDomainSleepTime(final StackType stackType, final CrawlSwitchboard cs, Request crawlEntry) {
switch (stackType) {
case CORE: return this.coreStack.getDomainSleepTime(cs, crawlEntry);
case LIMIT: return this.limitStack.getDomainSleepTime(cs, crawlEntry);
case LOCAL: return this.coreStack.getDomainSleepTime(cs, crawlEntry);
case GLOBAL: return this.limitStack.getDomainSleepTime(cs, crawlEntry);
case REMOTE: return this.remoteStack.getDomainSleepTime(cs, crawlEntry);
case NOLOAD: return this.noloadStack.getDomainSleepTime(cs, crawlEntry);
default: return 0;
}
}
/**
* get lists of crawl request entries for a specific host
* @param host
@ -265,28 +265,18 @@ public class NoticedURL {
*/
public List<Request> getDomainStackReferences(final StackType stackType, String host, int maxcount) {
switch (stackType) {
case CORE: return this.coreStack.getDomainStackReferences(host, maxcount);
case LIMIT: return this.limitStack.getDomainStackReferences(host, maxcount);
case LOCAL: return this.coreStack.getDomainStackReferences(host, maxcount);
case GLOBAL: return this.limitStack.getDomainStackReferences(host, maxcount);
case REMOTE: return this.remoteStack.getDomainStackReferences(host, maxcount);
case NOLOAD: return this.noloadStack.getDomainStackReferences(host, maxcount);
default: return null;
}
}
public List<Request> top(final StackType stackType, final int count) {
switch (stackType) {
case CORE: return top(this.coreStack, count);
case LIMIT: return top(this.limitStack, count);
case REMOTE: return top(this.remoteStack, count);
case NOLOAD: return top(this.noloadStack, count);
default: return null;
}
}
public Request pop(final StackType stackType, final boolean delay, final CrawlSwitchboard cs) throws IOException {
switch (stackType) {
case CORE: return pop(this.coreStack, delay, cs);
case LIMIT: return pop(this.limitStack, delay, cs);
case LOCAL: return pop(this.coreStack, delay, cs);
case GLOBAL: return pop(this.limitStack, delay, cs);
case REMOTE: return pop(this.remoteStack, delay, cs);
case NOLOAD: return pop(this.noloadStack, false, cs);
default: return null;
@ -310,8 +300,8 @@ public class NoticedURL {
public void clear(final StackType stackType) {
Log.logInfo("NoticedURL", "CLEARING STACK " + stackType);
switch (stackType) {
case CORE: this.coreStack.clear(); break;
case LIMIT: this.limitStack.clear(); break;
case LOCAL: this.coreStack.clear(); break;
case GLOBAL: this.limitStack.clear(); break;
case REMOTE: this.remoteStack.clear(); break;
case NOLOAD: this.noloadStack.clear(); break;
default: return;
@ -340,17 +330,11 @@ public class NoticedURL {
return null;
}
private static List<Request> top(final Balancer balancer, int count) {
// this is a filo - top
if (count > balancer.size()) count = balancer.size();
return balancer.top(count);
}
public Iterator<Request> iterator(final StackType stackType) {
// returns an iterator of plasmaCrawlBalancerEntry Objects
try {switch (stackType) {
case CORE: return this.coreStack.iterator();
case LIMIT: return this.limitStack.iterator();
case LOCAL: return this.coreStack.iterator();
case GLOBAL: return this.limitStack.iterator();
case REMOTE: return this.remoteStack.iterator();
case NOLOAD: return this.noloadStack.iterator();
default: return null;

@ -143,7 +143,7 @@ public class BookmarkHelper {
//load the links
final ContentScraper scraper = new ContentScraper(baseURL);
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
final Writer writer= new TransformerWriter(null,null,scraper, null, false);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(input,writer);
writer.close();
links = scraper.getAnchors();

@ -40,6 +40,7 @@ import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.kelondro.blob.ArrayStack;
import net.yacy.kelondro.blob.Compressor;
@ -172,26 +173,30 @@ public final class Cache {
* @return true if the content of the url is in the cache, false otherwise
*/
public static boolean has(final DigestURI url) {
return has(url.hash());
}
public static boolean has(final byte[] urlhash) {
boolean headerExists;
boolean fileExists;
//synchronized (responseHeaderDB) {
headerExists = responseHeaderDB.containsKey(url.hash());
fileExists = fileDB.containsKey(url.hash());
headerExists = responseHeaderDB.containsKey(urlhash);
fileExists = fileDB.containsKey(urlhash);
//}
if (headerExists && fileExists) return true;
if (!headerExists && !fileExists) return false;
// if not both are there, we do a clean-up
if (headerExists) try {
log.logWarning("header but not content of url " + url.toString() + " in cache; cleaned up");
log.logWarning("header but not content of urlhash " + ASCII.String(urlhash) + " in cache; cleaned up");
if (responseHeaderDB instanceof MapHeap) {
((MapHeap) responseHeaderDB).delete(url.hash());
((MapHeap) responseHeaderDB).delete(urlhash);
} else {
responseHeaderDB.remove(url.hash());
responseHeaderDB.remove(urlhash);
}
} catch (final IOException e) {}
if (fileExists) try {
//log.logWarning("content but not header of url " + url.toString() + " in cache; cleaned up");
fileDB.delete(url.hash());
fileDB.delete(urlhash);
} catch (final IOException e) {}
return false;
}

@ -1039,18 +1039,18 @@ public final class HTTPDFileHandler {
if (mimeType.startsWith("text")) {
// every text-file distributed by yacy is UTF-8
if(!path.startsWith("/repository")) {
if (!path.startsWith("/repository")) {
mimeType = mimeType + "; charset=UTF-8";
} else {
// detect charset of html-files
if((path.endsWith("html") || path.endsWith("htm"))) {
if ((path.endsWith("html") || path.endsWith("htm"))) {
// save position
fis.mark(1000);
// scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream(fis,"UTF-8",new DigestURI("http://localhost"),null,false);
final ScraperInputStream htmlFilter = new ScraperInputStream(fis, "UTF-8", new DigestURI("http://localhost"), null, false);
final String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
if(charset != null)
mimeType = mimeType + "; charset="+charset;
htmlFilter.close();
if (charset != null) mimeType = mimeType + "; charset="+charset;
// reset position
fis.reset();
}

@ -560,7 +560,12 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
private void identPort(final String inputURL, final int dflt) throws MalformedURLException {
// identify ref in file
final int r = this.host.indexOf(':');
int pss = 0;
int ip6 = this.host.indexOf('[');
if (ip6 >= 0 && ((ip6 = this.host.indexOf("]", ip6)) > 0)) {
pss = ip6 + 1;
}
final int r = this.host.indexOf(":", pss);
if (r < 0) {
this.port = dflt;
} else {
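
The fix skips past a bracketed IPv6 literal before searching for the port separator: in http://[2a00:1450:400c:c01::69]:8080/ the host part contains several colons, and only the one after the closing ']' delimits the port. A simplified standalone version of the same scan (error handling omitted, not the exact method from the patch):

    static int identPort(String host, int dflt) {
        int pss = 0;
        int ip6 = host.indexOf('[');
        if (ip6 >= 0 && (ip6 = host.indexOf(']', ip6)) > 0) {
            pss = ip6 + 1; // start the colon search after the ']' of the IPv6 literal
        }
        final int r = host.indexOf(':', pss);
        return (r < 0) ? dflt : Integer.parseInt(host.substring(r + 1));
    }
    // identPort("[2a00:1450:400c:c01::69]", 80)      -> 80
    // identPort("[2a00:1450:400c:c01::69]:8080", 80) -> 8080
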
@ -1164,13 +1169,14 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
return splitpattern.split(normalizedURL.toLowerCase()); // word components of the url
}
/*
public static void main(final String[] args) {
for (final String s: args) System.out.println(toTokens(s));
}
/*
*/
public static void main(final String[] args) {
final String[][] test = new String[][]{
new String[]{null, "http://[2a00:1450:400c:c01::69]/"},
new String[]{null, "C:WINDOWS\\CMD0.EXE"},
new String[]{null, "file://C:WINDOWS\\CMD0.EXE"},
new String[]{null, "file:/bin/yacy1"}, // file://<host>/<path> may have many '/' if the host is omitted and the path starts with '/'
@ -1221,9 +1227,9 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
String environment, url;
MultiProtocolURI aURL, aURL1;
java.net.URL jURL;
for (int i = 0; i < test.length; i++) {
environment = test[i][0];
url = test[i][1];
for (String[] element : test) {
environment = element[0];
url = element[1];
try {aURL = MultiProtocolURI.newURL(environment, url);} catch (final MalformedURLException e) {e.printStackTrace(); aURL = null;}
if (environment == null) {
try {jURL = new java.net.URL(url);} catch (final MalformedURLException e) {jURL = null;}
@ -1255,6 +1261,5 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
}
}
}
*/
}

@ -31,6 +31,7 @@ import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
@ -148,7 +149,7 @@ public class SolrScheme extends ConfigurationSet {
wordcount_i(Types.integer, true, true),
paths_txt(Types.text_general, true, true, true),
inboundlinkscount_i(Types.integer, true, true),
inboundlinksnoindexcount_i(Types.integer, true, true),
inboundlinksnofollowcount_i(Types.integer, true, true),
inboundlinks_tag_txt(Types.text_general, true, true, true),
inboundlinks_protocol_txt(Types.text_general, true, true, true),
inboundlinks_urlstub_txt(Types.text_general, true, true, true),
@ -157,7 +158,7 @@ public class SolrScheme extends ConfigurationSet {
inboundlinks_relflags_txt(Types.text_general, true, true, true),
inboundlinks_text_txt(Types.text_general, true, true, true),
outboundlinkscount_i(Types.integer, true, true),
outboundlinksnoindexcount_i(Types.integer, true, true),
outboundlinksnofollowcount_i(Types.integer, true, true),
outboundlinks_tag_txt(Types.text_general, true, true, true),
outboundlinks_protocol_txt(Types.text_general, true, true, true),
outboundlinks_urlstub_txt(Types.text_general, true, true, true),
@ -297,7 +298,7 @@ public class SolrScheme extends ConfigurationSet {
final Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors();
int c = 0;
if (isEmpty() || contains(Field.inboundlinkscount_i.name())) addSolr(solrdoc, Field.inboundlinkscount_i, yacydoc.inboundLinkCount());
if (isEmpty() || contains(Field.inboundlinksnoindexcount_i.name())) addSolr(solrdoc, Field.inboundlinksnoindexcount_i, yacydoc.inboundLinkNoindexCount());
if (isEmpty() || contains(Field.inboundlinksnofollowcount_i.name())) addSolr(solrdoc, Field.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount());
final String[] inboundlinksTag = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksURLProtocol = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksURLStub = new String[yacydoc.inboundLinkCount()];
@ -325,7 +326,7 @@ public class SolrScheme extends ConfigurationSet {
c++;
}
if (isEmpty() || contains(Field.inboundlinks_tag_txt.name())) addSolr(solrdoc, Field.inboundlinks_tag_txt, inboundlinksTag);
if (isEmpty() || contains(Field.inboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.inboundlinks_protocol_txt, inboundlinksURLProtocol);
if (isEmpty() || contains(Field.inboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.inboundlinks_protocol_txt, protocolList2indexedList(inboundlinksURLProtocol));
if (isEmpty() || contains(Field.inboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.inboundlinks_urlstub_txt, inboundlinksURLStub);
if (isEmpty() || contains(Field.inboundlinks_name_txt.name())) addSolr(solrdoc, Field.inboundlinks_name_txt, inboundlinksName);
if (isEmpty() || contains(Field.inboundlinks_rel_txt.name())) addSolr(solrdoc, Field.inboundlinks_rel_txt, inboundlinksRel);
@ -334,7 +335,7 @@ public class SolrScheme extends ConfigurationSet {
c = 0;
if (isEmpty() || contains(Field.outboundlinkscount_i.name())) addSolr(solrdoc, Field.outboundlinkscount_i, yacydoc.outboundLinkCount());
if (isEmpty() || contains(Field.outboundlinksnoindexcount_i.name())) addSolr(solrdoc, Field.outboundlinksnoindexcount_i, yacydoc.outboundLinkNoindexCount());
if (isEmpty() || contains(Field.outboundlinksnofollowcount_i.name())) addSolr(solrdoc, Field.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount());
final String[] outboundlinksTag = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksURLProtocol = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksURLStub = new String[yacydoc.outboundLinkCount()];
@ -362,7 +363,7 @@ public class SolrScheme extends ConfigurationSet {
c++;
}
if (isEmpty() || contains(Field.outboundlinks_tag_txt.name())) addSolr(solrdoc, Field.outboundlinks_tag_txt, outboundlinksTag);
if (isEmpty() || contains(Field.outboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.outboundlinks_protocol_txt, outboundlinksURLProtocol);
if (isEmpty() || contains(Field.outboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.outboundlinks_protocol_txt, protocolList2indexedList(outboundlinksURLProtocol));
if (isEmpty() || contains(Field.outboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.outboundlinks_urlstub_txt, outboundlinksURLStub);
if (isEmpty() || contains(Field.outboundlinks_name_txt.name())) addSolr(solrdoc, Field.outboundlinks_name_txt, outboundlinksName);
if (isEmpty() || contains(Field.outboundlinks_rel_txt.name())) addSolr(solrdoc, Field.outboundlinks_rel_txt, outboundlinksRel);
@ -476,7 +477,7 @@ public class SolrScheme extends ConfigurationSet {
}
addSolr(solrdoc, Field.imagescount_i, imgtags.length);
if (isEmpty() || contains(Field.images_tag_txt.name())) addSolr(solrdoc, Field.images_tag_txt, imgtags);
if (isEmpty() || contains(Field.images_protocol_txt.name())) addSolr(solrdoc, Field.images_protocol_txt, imgprots);
if (isEmpty() || contains(Field.images_protocol_txt.name())) addSolr(solrdoc, Field.images_protocol_txt, protocolList2indexedList(imgprots));
if (isEmpty() || contains(Field.images_urlstub_txt.name())) addSolr(solrdoc, Field.images_urlstub_txt, imgstubs);
if (isEmpty() || contains(Field.images_alt_txt.name())) addSolr(solrdoc, Field.images_alt_txt, imgalts);
@ -556,6 +557,18 @@ public class SolrScheme extends ConfigurationSet {
return solrdoc;
}
private static String[] protocolList2indexedList(String[] protocol) {
List<String> a = new ArrayList<String>();
for (int i = 0; i < protocol.length; i++) {
if (!protocol[i].equals("http")) {
String c = Integer.toString(i);
while (c.length() < 3) c = "0" + c;
a.add(c + "-" + protocol[i]);
}
}
return a.toArray(new String[a.size()]);
}
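
The helper stores only the exceptions from the http default: an entry such as "003-https" records that the link at index 3 uses https, and the three-digit zero padding keeps the entries sortable so the original positions can be restored. A hypothetical decoder for this format (not part of the patch) would rebuild the full array like this:

    // hypothetical inverse of protocolList2indexedList: rebuild the protocol
    // array for "count" links, treating every unlisted position as "http"
    static String[] indexedList2protocolList(String[] indexed, int count) {
        final String[] protocol = new String[count];
        java.util.Arrays.fill(protocol, "http");
        for (final String entry : indexed) {
            final int p = entry.indexOf('-');
            protocol[Integer.parseInt(entry.substring(0, p))] = entry.substring(p + 1);
        }
        return protocol;
    }
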
/**
* encode a string containing attributes from anchor rel properties binary:
* bit 0: "me" contained in rel
@ -615,7 +628,7 @@ public class SolrScheme extends ConfigurationSet {
}
/*
* standard solr scheme
standard solr schema
<field name="name" type="textgen" indexed="true" stored="true"/>
<field name="cat" type="string" indexed="true" stored="true" multiValued="true"/>
@ -641,6 +654,5 @@ public class SolrScheme extends ConfigurationSet {
<field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="last_modified" type="date" indexed="true" stored="true"/>
<field name="links" type="string" indexed="true" stored="true" multiValued="true"/>
*/
}

@ -121,13 +121,13 @@ public class ConfigurationSet extends AbstractSet<String> implements Set<String>
return false;
}
public void fill(final ConfigurationSet other) {
public void fill(final ConfigurationSet other, final boolean defaultActivated) {
final Iterator<Entry> i = other.allIterator();
Entry e;
while (i.hasNext()) {
e = i.next();
if (contains(e.key) || containsDisabled(e.key)) continue;
this.add(e.key(), other.commentHeadline(e.key()), e.enabled());
this.add(e.key(), other.commentHeadline(e.key()), defaultActivated && e.enabled());
}
}
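
The added defaultActivated flag controls whether keys copied from the template start out enabled: with false, even entries that are enabled in the source set arrive disabled, because the flag is ANDed with e.enabled(). A hedged usage sketch (variable names assumed):

    // merge a template schema into the local configuration, but leave
    // every newly introduced key switched off until an admin enables it
    localConfig.fill(templateConfig, false);
    // passing true preserves the template's enabled-state, as before
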

@ -634,22 +634,22 @@ dc_rights
return (this.outboundlinks == null) ? 0 : this.outboundlinks.size();
}
public int inboundLinkNoindexCount() {
public int inboundLinkNofollowCount() {
if (this.inboundlinks == null) resortLinks();
if (this.inboundlinks == null) return 0;
int c = 0;
for (final String tag: this.inboundlinks.values()) {
if (tag.contains("noindex")) c++;
if (tag.contains("nofollow")) c++;
}
return c;
}
public int outboundLinkNoindexCount() {
public int outboundLinkNofollowCount() {
if (this.outboundlinks == null) resortLinks();
if (this.outboundlinks == null) return 0;
int c = 0;
for (final String tag: this.outboundlinks.values()) {
if (tag.contains("noindex")) c++;
if (tag.contains("nofollow")) c++;
}
return c;
}
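
Both renamed counters scan each link's rel/property string for the substring "nofollow" instead of the unrelated "noindex" robots directive, which is what the renaming corrects. A minimal illustration of the counting step (illustrative helper, not the document's code):

    // counting nofollow links over a collection of rel attribute values
    static int nofollowCount(java.util.Collection<String> relValues) {
        int c = 0;
        for (final String rel : relValues) {
            if (rel.contains("nofollow")) c++; // e.g. rel="nofollow" or rel="external nofollow"
        }
        return c;
    }
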

@ -472,6 +472,7 @@ public final class TextParser {
}
public static void grantExtension(final String ext, final boolean grant) {
if (ext == null || ext.length() == 0) return;
if (grant) denyExtensionx.remove(ext); else denyExtensionx.put(ext, v);
}

@ -131,6 +131,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private float lon, lat;
private MultiProtocolURI canonical;
/**
* {@link MultiProtocolURI} to the favicon that belongs to the document
*/
@ -151,6 +152,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// the root value here will not be used to load the resource.
// it is only the reference for relative links
super(linkTags0, linkTags1);
assert root != null;
this.root = root;
this.evaluationScores = new Evaluation();
this.rss = new HashMap<MultiProtocolURI, String>();
@ -175,6 +177,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.canonical = null;
}
@Override
public void finish() {
this.content.trimToSize();
}
private void mergeAnchors(final MultiProtocolURI url, final Properties p) {
final Properties p0 = this.anchors.get(url);
if (p0 == null) {
@ -485,17 +492,23 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
try {
FileUtils.copy(new CharArrayReader(inlineHtml), writer);
writer.close();
} catch (final IOException e) {
Log.logException(e);
return cleanLine(super.stripAll(inlineHtml));
} finally {
try {
writer.close();
} catch (IOException e) {
}
}
for (final Map.Entry<MultiProtocolURI, Properties> entry: scraper.getAnchors().entrySet()) {
mergeAnchors(entry.getKey(), entry.getValue());
}
this.images.putAll(scraper.images);
return cleanLine(super.stripAll(scraper.content.getChars()));
String line = cleanLine(super.stripAll(scraper.content.getChars()));
scraper.close();
return line;
}
private final static String cleanLine(final String s) {
@ -885,14 +898,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new MultiProtocolURI("http://localhost"),null,false);
String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
if(charset == null)
charset = Charset.defaultCharset().toString();
htmlFilter.close();
if (charset == null) charset = Charset.defaultCharset().toString();
// scrape content
final ContentScraper scraper = new ContentScraper(new MultiProtocolURI("http://localhost"));
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
writer.close();
return scraper;
}

@ -34,7 +34,6 @@ import java.util.TreeSet;
import net.yacy.cora.document.ASCII;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.logging.Log;
public class ContentTransformer extends AbstractTransformer implements Transformer {
@ -90,11 +89,7 @@ public class ContentTransformer extends AbstractTransformer implements Transform
}
bb.append("</FONT> ");
final char[] result = bb.getChars();
try {
bb.close();
} catch (IOException e) {
Log.logException(e);
}
bb.close();
return result;
}

@ -1,4 +1,4 @@
// Scraper.java
// Scraper.java
// ---------------------------
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
@ -39,10 +39,12 @@ public interface Scraper {
public void scrapeTag1(String tagname, Properties tagopts, char[] text);
public void scrapeComment(final char[] comment);
public void finish();
public void close();
public void registerHtmlFilterEventListener(ScraperListener listener);
public void deregisterHtmlFilterEventListener(ScraperListener listener);
}

@ -9,7 +9,7 @@
// $LastChangedBy$
//
// LICENSE
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@ -39,11 +39,11 @@ import net.yacy.cora.document.MultiProtocolURI;
public class ScraperInputStream extends InputStream implements ScraperListener {
private static final int MODE_PRESCAN = 0;
private static final int MODE_PRESCAN_FINISHED = 1;
private int mode = 1;
private static final long preBufferSize = 4096;
private long preRead = 0;
private final BufferedInputStream bufferedIn;
@ -51,10 +51,10 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
private String detectedCharset;
private boolean charsetChanged = false;
private boolean endOfHead = false;
private Reader reader;
private Writer writer;
public ScraperInputStream(
final InputStream inStream,
final String inputStreamCharset,
@ -65,10 +65,10 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
// create an input stream for buffering
this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize);
this.bufferedIn.mark((int) preBufferSize);
final ContentScraper scraper = new ContentScraper(rooturl);
scraper.registerHtmlFilterEventListener(this);
try {
this.reader = (inputStreamCharset == null) ? new InputStreamReader(this) : new InputStreamReader(this,inputStreamCharset);
} catch (UnsupportedEncodingException e) {
@ -78,17 +78,17 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
// how is that possible?
this.reader = new InputStreamReader(this);
}
}
}
this.writer = new TransformerWriter(null,null,scraper,transformer,passbyIfBinarySuspect);
}
private static String extractCharsetFromMimetypeHeader(final String mimeType) {
if (mimeType == null) return null;
final String[] parts = mimeType.split(";");
if (parts == null || parts.length <= 1) return null;
for (int i=1; i < parts.length; i++) {
for (int i=1; i < parts.length; i++) {
final String param = parts[i].trim();
if (param.startsWith("charset=")) {
String charset = param.substring("charset=".length()).trim();
@ -97,13 +97,14 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
return charset.trim();
}
}
return null;
return null;
}
@Override
public void scrapeTag0(final String tagname, final Properties tagopts) {
if (tagname == null || tagname.length() == 0) return;
if (tagname.equalsIgnoreCase("meta")) {
if (tagopts.containsKey("http-equiv")) {
final String value = tagopts.getProperty("http-equiv");
@ -113,7 +114,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
this.detectedCharset = extractCharsetFromMimetypeHeader(contentType);
if (this.detectedCharset != null && this.detectedCharset.length() > 0) {
this.charsetChanged = true;
} else if (tagopts.containsKey("charset")) {
} else if (tagopts.containsKey("charset")) {
// sometimes the charset property is configured as an extra attribute. try it ...
this.detectedCharset = tagopts.getProperty("charset");
this.charsetChanged = true;
@ -123,48 +124,54 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
}
}
@Override
public void scrapeTag1(final String tagname, final Properties tagopts, final char[] text) {
if (tagname == null || tagname.length() == 0) return;
if (tagname.equalsIgnoreCase("head")) {
this.endOfHead = true;
}
}
public String detectCharset() throws IOException {
this.mode = MODE_PRESCAN;
this.mode = MODE_PRESCAN;
// loop until we have detected the header element or the charset data
int c;
while ((c = this.reader.read())!= -1) {
this.writer.write(c);
if (this.charsetChanged) break; // that's enough
}
// free writer
this.writer = null;
// don't close writer here, otherwise it will shutdown our source stream
this.writer = null;
// don't close writer here, otherwise it will shutdown our source stream
// reset the buffer if not already done
if (this.mode != MODE_PRESCAN_FINISHED) {
this.mode++;
this.bufferedIn.reset();
}
// return scanning result
return (this.charsetChanged) ? this.detectedCharset : null;
}
@Override
public int read() throws IOException {
// mode 0 is called from within the detectCharset function
if (this.mode == MODE_PRESCAN) {
if (this.mode == MODE_PRESCAN) {
if (this.endOfHead || this.charsetChanged || this.preRead >= preBufferSize - 1) {
return -1;
return -1;
}
this.preRead++;
}
this.preRead++;
}
return this.bufferedIn.read();
}
@Override
public void close() throws IOException {
if (this.writer != null) this.writer.close();
}
}
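The detectCharset()/read() pair above implements a bounded pre-scan: at most preBufferSize bytes are read through the marked BufferedInputStream, the scraper reports any &lt;meta&gt; charset it sees, and reset() rewinds the stream for the real parse. A minimal sketch of that detect-then-rewind pattern, with a simplified regex scan standing in for the ContentScraper (all names here are illustrative, not the YaCy classes):

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Sketch: pre-scan a buffered stream for a <meta charset=...> declaration,
// then reset so the real parser re-reads the document from the beginning.
public class CharsetPrescan {
    private static final int PRE_BUFFER_SIZE = 4096;
    private static final Pattern META_CHARSET =
        Pattern.compile("charset=[\"']?([A-Za-z0-9_\\-]+)", Pattern.CASE_INSENSITIVE);

    public static String detectCharset(BufferedInputStream in) throws IOException {
        in.mark(PRE_BUFFER_SIZE);                  // remember the start position
        byte[] head = new byte[PRE_BUFFER_SIZE];
        int n = in.read(head, 0, head.length);     // read at most the pre-buffer
        in.reset();                                // rewind for the real parser
        if (n <= 0) return null;
        // HTML heads are ASCII-compatible, so a Latin-1 decode is safe for scanning
        String headText = new String(head, 0, n, StandardCharsets.ISO_8859_1);
        Matcher m = META_CHARSET.matcher(headText);
        return m.find() ? m.group(1) : null;
    }

    public static void main(String[] args) throws IOException {
        InputStream doc = new ByteArrayInputStream(
            "<html><head><meta charset=\"UTF-8\"></head><body>x</body></html>"
                .getBytes(StandardCharsets.ISO_8859_1));
        System.out.println(detectCharset(new BufferedInputStream(doc))); // prints UTF-8
    }
}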

@ -127,11 +127,7 @@ public final class TransformerWriter extends Writer {
}
bb.append('>');
final char[] result = bb.getChars();
try {
bb.close();
} catch (final IOException e) {
Log.logException(e);
}
bb.close();
return result;
}
@ -147,11 +143,7 @@ public final class TransformerWriter extends Writer {
bb.append(text);
bb.append('<').append('/').append(tagname).append('>');
final char[] result = bb.getChars();
try {
bb.close();
} catch (final IOException e) {
Log.logException(e);
}
bb.close();
return result;
}
@ -165,11 +157,7 @@ public final class TransformerWriter extends Writer {
}
bb.append('>');
final char[] result = bb.getChars();
try {
bb.close();
} catch (final IOException e) {
Log.logException(e);
}
bb.close();
return result;
}
@ -178,11 +166,7 @@ public final class TransformerWriter extends Writer {
final CharBuffer cb = new CharBuffer(ContentScraper.MAX_DOCSIZE, gt0, gt0.length + text.length + tagname.length() + 3);
cb.append(text).append('<').append('/').append(tagname).append('>');
final char[] result = cb.getChars();
try {
cb.close();
} catch (final IOException e) {
Log.logException(e);
}
cb.close();
return result;
}
@ -202,11 +186,7 @@ public final class TransformerWriter extends Writer {
result = bb.getChars(1);
else
result = bb.getChars();
try {
bb.close();
} catch (final IOException ex) {
Log.logException(ex);
}
bb.close();
return result;
}
@ -227,12 +207,7 @@ public final class TransformerWriter extends Writer {
// this single tag is collected at once here
final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
this.scraper.scrapeTag0(tag, charBuffer.propParser());
try {
charBuffer.close();
} catch (final IOException e) {
// TODO Auto-generated catch block
Log.logException(e);
}
charBuffer.close();
}
if ((this.transformer != null) && (this.transformer.isTag0(tag))) {
// this single tag is collected at once here
@ -240,11 +215,7 @@ public final class TransformerWriter extends Writer {
try {
return this.transformer.transformTag0(tag, scb.propParser(), quotechar);
} finally {
try {
scb.close();
} catch (final IOException e) {
Log.logException(e);
}
scb.close();
}
} else if (((this.scraper != null) && (this.scraper.isTag1(tag))) ||
((this.transformer != null) && (this.transformer.isTag1(tag)))) {
@ -252,11 +223,7 @@ public final class TransformerWriter extends Writer {
this.filterTag = tag;
final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
this.filterOpts = scb.propParser();
try {
scb.close();
} catch (final IOException e) {
Log.logException(e);
}
scb.close();
if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset();
return new char[0];
} else {
@ -543,6 +510,7 @@ public final class TransformerWriter extends Writer {
// the filter process is messed up
// instead, we simply flush the underlying output stream
if (this.out != null) this.out.flush();
if (this.scraper != null) this.scraper.finish();
// if you want to flush all, call close() at end of writing;
}
@ -567,8 +535,7 @@ public final class TransformerWriter extends Writer {
this.filterOpts = null;
if (this.filterCont != null) this.filterCont.close();
this.filterCont = null;
// if (scraper != null) {scraper.close(); scraper = null;}
// if (transformer != null) {transformer.close(); transformer = null;}
if (this.scraper != null) this.scraper.finish();
}
private static boolean binaryHint(final char c) {

@ -203,8 +203,9 @@ public class htmlParser extends AbstractParser implements Parser {
} catch (final IOException e) {
throw new Parser.Failure("IO error:" + e.getMessage(), location);
} finally {
writer.flush();
sourceStream.close();
writer.close();
//writer.close();
}
//OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
//serverFileUtils.copy(sourceFile, hfos);

@ -144,14 +144,13 @@ public class pdfParser extends AbstractParser implements Parser {
try {
writer.append(stripper.getText(pdfDoc));
} catch (final Throwable e) {}
}
};
}
};
t.start();
t.join(3000);
if (t.isAlive()) t.interrupt();
pdfDoc.close();
contentBytes = writer.getBytes(); // get final text before closing writer
writer.close();
contentBytes = writer.getBytes(); // get final text before closing writer
} catch (final IOException e) {
// close the writer
if (writer != null) try { writer.close(); } catch (final Exception ex) {}
@ -166,6 +165,7 @@ public class pdfParser extends AbstractParser implements Parser {
//throw new Parser.Failure(e.getMessage(), location);
} finally {
try {pdfDoc.close();} catch (final IOException e) {}
writer.close();
}
String[] docKeywords = null;
@ -175,7 +175,7 @@ public class pdfParser extends AbstractParser implements Parser {
if (docTitle == null) {
docTitle = docSubject;
}
// clear resources in pdfbox. they claim the issue is resolved, but it is not. see:
// https://issues.apache.org/jira/browse/PDFBOX-313
// https://issues.apache.org/jira/browse/PDFBOX-351
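The reordering above is the important part of the pdfParser change: getBytes() is called while the writer is still open, the extraction thread is given at most three seconds via join(3000), and the document and writer are released in finally. A hedged sketch of that watchdog pattern (stand-in names, not the actual pdfParser types):

import java.io.ByteArrayOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;

// Sketch: run a possibly hanging extraction in a worker thread, wait a bounded
// time, interrupt on timeout, and collect the partial output before closing.
public class WatchdogDemo {
    public static void main(String[] args) throws Exception {
        final ByteArrayOutputStream sink = new ByteArrayOutputStream();
        final Writer writer = new OutputStreamWriter(sink, StandardCharsets.UTF_8);
        Thread t = new Thread() {
            @Override public void run() {
                try {
                    writer.append("extracted text"); // stand-in for stripper.getText(pdfDoc)
                    writer.flush();
                } catch (final Throwable e) {}       // swallow, as the parser does
            }
        };
        t.start();
        t.join(3000);                                // wait at most 3 seconds
        if (t.isAlive()) t.interrupt();              // abandon a hanging extractor
        byte[] contentBytes = sink.toByteArray();    // read before close, like getBytes()
        writer.close();
        System.out.println(new String(contentBytes, StandardCharsets.UTF_8));
    }
}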

@ -0,0 +1,174 @@
/**
* MapColumnIndex
* Copyright 2012 by Michael Christen
* First released 01.02.2012 at http://yacy.net
*
* $LastChangedDate$
* $LastChangedRevision$
* $LastChangedBy$
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.kelondro.blob;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import net.yacy.cora.document.ASCII;
import net.yacy.kelondro.order.NaturalOrder;
/**
* a mapping from a column name to a map from that column's values to the primary keys of the rows in which each value occurs
*/
public class MapColumnIndex {
private static final long serialVersionUID=-424741536889467566L;
private final Map<String, Map<String, Collection<byte[]>>> index;
public MapColumnIndex() {
this.index = new HashMap<String, Map<String, Collection<byte[]>>>();
}
public synchronized Collection<byte[]> getIndex(final String whereKey, final String isValue) throws UnsupportedOperationException {
Map<String, Collection<byte[]>> references = this.index.get(whereKey);
if (references == null) throw new UnsupportedOperationException();
Collection<byte[]> indexes = references.get(isValue);
if (indexes == null) return new ArrayList<byte[]>(0); // empty collection
return indexes;
}
public synchronized void clear() {
this.index.clear();
}
/**
* create a full index for the whereKey
* @param whereKey
* @param isValue
* @param table
*/
public synchronized void init(final String whereKey, final String isValue, final Iterator<Map.Entry<byte[], Map<String, String>>> table) {
Map<String, Collection<byte[]>> valueIdxMap = new HashMap<String, Collection<byte[]>>();
this.index.put(whereKey, valueIdxMap);
Map.Entry<byte[], Map<String, String>> line;
while (table.hasNext()) {
line = table.next();
String value = line.getValue().get(whereKey);
if (value == null) continue; // we don't need to remember that
indexupdate(line.getKey(), valueIdxMap, value);
}
}
/**
* update an index entry
* @param primarykey the primary key for the row that is updated
* @param row the row that was updated (a mapping from column names to values)
*/
public synchronized void update(final byte[] primarykey, final Map<String, String> row) {
for (Map.Entry<String, Map<String, Collection<byte[]>>> entry: this.index.entrySet()) {
// create an index for all columns that we track
String value = row.get(entry.getKey());
if (value == null) continue; // we don't need to remember that
indexupdate(primarykey, entry.getValue(), value);
}
}
private void indexupdate(final byte[] primarykey, final Map<String, Collection<byte[]>> valueIdxMap, final String value) {
Collection<byte[]> indexes = valueIdxMap.get(value);
if (indexes == null) {
// create a new index entry
indexes = new ArrayList<byte[]>(1);
indexes.add(primarykey);
valueIdxMap.put(value, indexes);
} else {
// update the existing index entry
// check if the value already exists
if (!net.yacy.kelondro.util.ByteBuffer.contains(indexes, primarykey)) {
indexes.add(primarykey);
}
}
}
/**
* delete all references to the primary key
* @param primarykey
*/
public synchronized void delete(final byte[] primarykey) {
for (Map.Entry<String, Map<String, Collection<byte[]>>> entry: this.index.entrySet()) {
// we must check all index reference maps: iterate over entries
indexdelete(primarykey, entry.getValue());
}
}
private void indexdelete(final byte[] index, final Map<String, Collection<byte[]>> valueIdxMap) {
Iterator<Map.Entry<String, Collection<byte[]>>> i = valueIdxMap.entrySet().iterator();
Map.Entry<String, Collection<byte[]>> ref;
while (i.hasNext()) {
ref = i.next();
net.yacy.kelondro.util.ByteBuffer.remove(ref.getValue(), index);
if (ref.getValue().isEmpty()) {
i.remove();
}
}
}
private static Collection<byte[]> getIndexWithExceptionHandler(final MapColumnIndex idx, final String whereKey, final String isValue, Map<byte[], Map<String, String>> table) {
try {
return idx.getIndex(whereKey, isValue);
} catch (UnsupportedOperationException e) {
idx.init(whereKey, isValue, table.entrySet().iterator());
try {
return idx.getIndex(whereKey, isValue);
} catch (UnsupportedOperationException ee) {
throw ee;
}
}
}
private static void printIndex(Collection<byte[]> index) {
System.out.print("idx{");
int c = 0;
for (byte[] a: index) {
if (c++ != 0) System.out.print(", ");
System.out.print(ASCII.String(a));
}
System.out.print("}");
}
public static void main(String[] args) {
Map<byte[], Map<String, String>> table = new TreeMap<byte[], Map<String, String>>(NaturalOrder.naturalOrder);
Map<String, String> row;
row = new HashMap<String, String>(); row.put("a", "1"); row.put("b", "2"); row.put("c", "2"); table.put("line1".getBytes(), row);
row = new HashMap<String, String>(); row.put("a", "3"); row.put("b", "2"); row.put("c", "4"); table.put("line2".getBytes(), row);
row = new HashMap<String, String>(); row.put("a", "5"); row.put("b", "2"); row.put("c", "4"); table.put("line3".getBytes(), row);
row = new HashMap<String, String>(); row.put("a", "6"); row.put("b", "7"); row.put("c", "8"); table.put("line4".getBytes(), row);
MapColumnIndex idx = new MapColumnIndex();
System.out.print("colum b, value 2: "); printIndex(getIndexWithExceptionHandler(idx, "b", "2", table)); System.out.println();
System.out.print("colum c, value 4: "); printIndex(getIndexWithExceptionHandler(idx, "c", "4", table)); System.out.println();
System.out.print("colum b, value 2: "); printIndex(getIndexWithExceptionHandler(idx, "b", "7", table)); System.out.println();
System.out.print("colum d, value 0: "); printIndex(getIndexWithExceptionHandler(idx, "d", "0", table)); System.out.println();
row = new HashMap<String, String>(); row.put("a", "9"); row.put("b", "9"); row.put("c", "4"); table.put("line5".getBytes(), row);
idx.update("line5".getBytes(), row);
System.out.print("colum c, value 4: "); printIndex(getIndexWithExceptionHandler(idx, "c", "4", table)); System.out.println();
}
}

@ -29,6 +29,7 @@ package net.yacy.kelondro.blob;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
@ -55,6 +56,7 @@ public class MapDataMining extends MapHeap {
private Map<String, ScoreMap<String>> sortClusterMap; // a String-kelondroMScoreCluster - relation
private Map<String, Long> accLong; // to store accumulations of Long cells
private Map<String, Float> accFloat; // to store accumulations of Float cells
private final MapColumnIndex columnIndex; // to store fast select-where indexes
@SuppressWarnings("unchecked")
public MapDataMining(final File heapFile,
@ -73,6 +75,8 @@ public class MapDataMining extends MapHeap {
this.longaccfields = longaccfields;
this.floataccfields = floataccfields;
this.columnIndex = new MapColumnIndex();
ScoreMap<String>[] cluster = null;
if (sortfields == null) this.sortClusterMap = null; else {
this.sortClusterMap = new ConcurrentHashMap<String, ScoreMap<String>>();
@ -192,6 +196,8 @@ public class MapDataMining extends MapHeap {
this.accFloat.put(floataccfield, FLOAT0);
}
}
this.columnIndex.clear();
}
@Override
@ -216,6 +222,8 @@ public class MapDataMining extends MapHeap {
// update sortCluster
if (this.sortClusterMap != null) updateSortCluster(UTF8.String(key), newMap);
this.columnIndex.update(key, newMap);
}
private void updateAcc(final Map<String, String> map, final boolean add) {
@ -294,6 +302,8 @@ public class MapDataMining extends MapHeap {
}
}
super.delete(key);
this.columnIndex.delete(key);
}
private void deleteSortCluster(final String key) {
@ -315,6 +325,10 @@ public class MapDataMining extends MapHeap {
return new string2bytearrayIterator(cluster.keys(up));
}
private synchronized Iterator<byte[]> keys() throws IOException {
return super.keys(true, null);
}
private static class string2bytearrayIterator implements Iterator<byte[]> {
private final Iterator<String> s;
@ -342,15 +356,25 @@ public class MapDataMining extends MapHeap {
}
@Override
public synchronized Iterator<Map.Entry<byte[], Map<String, String>>> entries(final String whereKey, final String isValue) throws IOException {
return super.entries(whereKey, isValue);
public synchronized Collection<byte[]> select(final String whereKey, final String isValue) throws IOException {
Collection<byte[]> idx = null;
try {
idx = this.columnIndex.getIndex(whereKey, isValue);
} catch (UnsupportedOperationException e) {
this.columnIndex.init(whereKey, isValue, new FullMapIterator(keys()));
try {
idx = this.columnIndex.getIndex(whereKey, isValue);
} catch (UnsupportedOperationException ee) {
throw ee;
}
}
return idx;
}
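The new select() method replaces the filtered MapIterator with an index that is built lazily: the first query on a column triggers one full scan via MapColumnIndex.init(), and every later query on that column is a hash lookup. A self-contained sketch of the same lazy select-where idea (simplified types, illustrative names, not the YaCy classes):

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;

// Sketch: the first query on a column indexes it with one full scan; later
// queries on that column are plain hash lookups.
public class LazySelect {
    private final Map<String, Map<String, String>> rows =
        new HashMap<String, Map<String, String>>();                  // pk -> row
    private final Map<String, Map<String, Collection<String>>> index =
        new HashMap<String, Map<String, Collection<String>>>();     // column -> value -> pks

    public void put(String pk, Map<String, String> row) { this.rows.put(pk, row); }

    public Collection<String> select(String whereKey, String isValue) {
        Map<String, Collection<String>> colIdx = this.index.get(whereKey);
        if (colIdx == null) {                      // column never indexed: scan once
            colIdx = new HashMap<String, Collection<String>>();
            for (Map.Entry<String, Map<String, String>> e : this.rows.entrySet()) {
                String v = e.getValue().get(whereKey);
                if (v == null) continue;           // rows without the column are skipped
                Collection<String> pks = colIdx.get(v);
                if (pks == null) { pks = new ArrayList<String>(); colIdx.put(v, pks); }
                pks.add(e.getKey());
            }
            this.index.put(whereKey, colIdx);
        }
        Collection<String> hit = colIdx.get(isValue);
        return hit == null ? new ArrayList<String>(0) : hit;
    }

    public static void main(String[] args) {
        LazySelect t = new LazySelect();
        Map<String, String> row = new HashMap<String, String>();
        row.put("b", "2");
        t.put("line1", row);
        System.out.println(t.select("b", "2")); // [line1]; second call hits the index
    }
}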
public synchronized Iterator<Map.Entry<byte[], Map<String, String>>> entries(final boolean up, final String field) {
return new MapIterator(keys(up, field), null, null);
return new FullMapIterator(keys(up, field));
}
public synchronized long getLongAcc(final String field) {
final Long accumulator = this.accLong.get(field);
if (accumulator == null) return -1;

@ -82,6 +82,14 @@ public class MapHeap implements Map<byte[], Map<String, String>> {
return this.blob.keylength();
}
/**
* get the ordering of the primary keys
* @return
*/
public ByteOrder ordering() {
return this.blob.ordering();
}
/**
* clears the content of the database
* @throws IOException
@ -366,6 +374,10 @@ public class MapHeap implements Map<byte[], Map<String, String>> {
return new KeyIterator(up, rotating, firstKey, secondKey);
}
public synchronized CloneableIterator<byte[]> keys(boolean up, byte[] firstKey) throws IOException {
return this.blob.keys(up, firstKey);
}
public class KeyIterator implements CloneableIterator<byte[]>, Iterator<byte[]> {
final boolean up, rotating;
@ -406,17 +418,13 @@ public class MapHeap implements Map<byte[], Map<String, String>> {
}
}
public synchronized Iterator<Map.Entry<byte[], Map<String, String>>> entries(final String whereKey, final String isValue) throws IOException {
return new MapIterator(this.blob.keys(true, null), whereKey, isValue);
}
public synchronized Iterator<Map.Entry<byte[], Map<String, String>>> entries(final boolean up, final boolean rotating) throws IOException {
return new MapIterator(keys(up, rotating), null, null);
return new FullMapIterator(keys(up, rotating));
}
public synchronized Iterator<Map.Entry<byte[], Map<String, String>>> entries(final boolean up, final boolean rotating, final byte[] firstKey, final byte[] secondKey) throws IOException {
return new MapIterator(keys(up, rotating, firstKey, secondKey), null, null);
return new FullMapIterator(keys(up, rotating, firstKey, secondKey));
}
/**
@ -448,18 +456,15 @@ public class MapHeap implements Map<byte[], Map<String, String>> {
public void finalize() {
close();
}
public class MapIterator extends LookAheadIterator<Map.Entry<byte[], Map<String, String>>> implements Iterator<Map.Entry<byte[], Map<String, String>>> {
protected class FullMapIterator extends LookAheadIterator<Map.Entry<byte[], Map<String, String>>> implements Iterator<Map.Entry<byte[], Map<String, String>>> {
// enumerates Map-Type elements
// the key is also included in every map that is returned; its key is 'key'
private final Iterator<byte[]> keyIterator;
private final String whereKey, isValue;
MapIterator(final Iterator<byte[]> keyIterator, final String whereKey, final String isValue) {
FullMapIterator(final Iterator<byte[]> keyIterator) {
this.keyIterator = keyIterator;
this.whereKey = whereKey;
this.isValue = isValue;
}
@Override
@ -479,19 +484,14 @@ public class MapHeap implements Map<byte[], Map<String, String>> {
continue;
}
if (map == null) continue; // circumvention of a modified exception
// check if the where case holds
if (this.whereKey != null && this.isValue != null) {
String v = map.get(this.whereKey);
if (v == null) continue;
if (!v.equals(this.isValue)) continue;
}
// produce entry
Map.Entry<byte[], Map<String, String>> entry = new AbstractMap.SimpleImmutableEntry<byte[], Map<String, String>>(nextKey, map);
return entry;
}
return null;
}
} // class mapIterator
} // class FullMapIterator
@Override
public void putAll(final Map<? extends byte[], ? extends Map<String, String>> map) {

@ -189,7 +189,7 @@ public class URIMetadataRow implements URIMetadata {
final String dc_publisher,
final float lat,
final float lon) {
final CharBuffer s = new CharBuffer(20000, 360);
final CharBuffer s = new CharBuffer(3600, 360);
s.append(url.toNormalform(false, true)).appendLF();
s.append(dc_title).appendLF();
if (dc_creator.length() > 80) s.append(dc_creator, 0, 80); else s.append(dc_creator);

@ -26,6 +26,7 @@
package net.yacy.kelondro.data.word;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
@ -118,11 +119,11 @@ public class Word {
private final static byte lowByte = Base64Order.alpha_enhanced[0];
private final static byte highByte = Base64Order.alpha_enhanced[Base64Order.alpha_enhanced.length - 1];
public static boolean isPrivate(byte[] hash) {
return hash[0] == highByte && hash[1] == highByte && hash[2] == highByte && hash[3] == highByte && hash[4] == highByte;
}
// create a word hash
public static final byte[] word2hash(final String word) {
final String wordlc = word.toLowerCase(Locale.ENGLISH);
@ -148,7 +149,7 @@ public class Word {
public final static byte PRIVATE_TYPE_COPY = 'C'; // used for a private local copy of the index
public final static byte PRIVATE_TYPE_PHONETIC = 'K'; // used for ColognePhonetics
public static final byte[] hash2private(final byte[] hash, byte privateType) {
byte[] p = new byte[commonHashLength];
p[0] = highByte; p[1] = highByte; p[2] = highByte; p[3] = highByte; p[4] = highByte; p[5] = privateType;
@ -156,7 +157,7 @@ public class Word {
return p;
}
public static final HandleSet words2hashesHandles(final Set<String> words) {
public static final HandleSet words2hashesHandles(final Collection<String> words) {
final HandleSet hashes = new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, words.size());
for (final String word: words)
try {

@ -73,27 +73,6 @@ public final class CharBuffer extends Writer {
this.maximumLength = maximumLength;
}
public CharBuffer(final int maximumLength, final char[] bb, final int of, final int le) {
if (of * 2 > bb.length) {
this.buffer = new char[le];
System.arraycopy(bb, of, this.buffer, 0, le);
this.length = le;
this.offset = 0;
} else {
this.buffer = bb;
this.length = le;
this.offset = of;
}
this.maximumLength = maximumLength;
}
public CharBuffer(final CharBuffer bb) {
this.buffer = bb.buffer;
this.length = bb.length;
this.offset = bb.offset;
this.maximumLength = bb.maximumLength;
}
public CharBuffer(final File f) throws IOException {
// initially fill the buffer with the content of a file
if (f.length() > Integer.MAX_VALUE) throw new IOException("file is too large for buffering");
@ -130,8 +109,7 @@ public final class CharBuffer extends Writer {
}
private void grow(int minSize) {
int newsize = this.buffer.length + 1024;
if (newsize < minSize) newsize = minSize+1;
int newsize = 12 * Math.max(this.buffer.length, minSize) / 10; // grow by 20%
char[] tmp = new char[newsize];
System.arraycopy(this.buffer, this.offset, tmp, 0, this.length);
this.buffer = tmp;
@ -187,7 +165,7 @@ public final class CharBuffer extends Writer {
}
public CharBuffer append(final char[] bb) {
write(bb);
write(bb, 0, bb.length);
return this;
}
@ -205,14 +183,14 @@ public final class CharBuffer extends Writer {
public CharBuffer append(final String s) {
final char[] temp = new char[s.length()];
s.getChars(0, temp.length, temp, 0);
write(temp);
write(temp, 0, temp.length);
return this;
}
public CharBuffer append(final String s, final int off, final int len) {
final char[] temp = new char[len];
s.getChars(off, (off + len), temp, 0);
write(temp);
write(temp, 0, len);
return this;
}
@ -479,15 +457,12 @@ public final class CharBuffer extends Writer {
this.offset = 0;
}
public void reset(final int newSize) {
this.resize(newSize);
this.reset();
}
public void resize(final int newSize) {
if(newSize < 0) throw new IllegalArgumentException("Illegal array size: " + newSize);
final char[] v = new char[newSize];
System.arraycopy(this.buffer,0,v,0,newSize > this.buffer.length ? this.buffer.length : newSize);
/**
* call trimToSize() once a CharBuffer will no longer be extended and is kept to store its content permanently
*/
public void trimToSize() {
final char[] v = new char[this.length];
System.arraycopy(this.buffer, this.offset, v, 0, this.length);
this.buffer = v;
}
@ -498,13 +473,15 @@ public final class CharBuffer extends Writer {
}
@Override
public void close() throws IOException {
public void close() {
this.length = 0;
this.offset = 0;
this.buffer = null; // assist with garbage collection
}
@Override
public void flush() throws IOException {
// TODO Auto-generated method stub
public void flush() {
trimToSize();
}
}
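The grow() rewrite swaps fixed 1024-char steps for proportional growth, 20% over the larger of the current capacity and the requested minimum, which makes repeated appends amortized O(1); trimToSize(), now also called from flush(), sheds the slack once a buffer is kept long-term. A small sketch of the policy (illustrative, not the full CharBuffer):

// Sketch of the proportional-growth policy introduced in grow(): grow by 20%
// over the larger of the current capacity and the requested minimum.
public class GrowDemo {
    private char[] buffer = new char[10];
    private int length = 0;

    private void grow(int minSize) {
        int newsize = 12 * Math.max(this.buffer.length, minSize) / 10; // +20%
        char[] tmp = new char[newsize];
        System.arraycopy(this.buffer, 0, tmp, 0, this.length);
        this.buffer = tmp;
    }

    public void append(char c) {
        if (this.length + 1 > this.buffer.length) grow(this.length + 1);
        this.buffer[this.length++] = c;
    }

    public static void main(String[] args) {
        GrowDemo d = new GrowDemo();
        for (int i = 0; i < 100000; i++) d.append('x'); // only a few reallocations
        System.out.println(d.buffer.length + " capacity for " + d.length + " chars");
    }
}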

@ -320,7 +320,9 @@ public class SplitTable implements Index, Iterable<Row.Entry> {
public Row.Entry get(final byte[] key, final boolean forcecopy) throws IOException {
final Index keeper = keeperOf(key);
if (keeper == null) return null;
return keeper.get(key, forcecopy);
synchronized (this) { // avoid concurrent IO from different methods
return keeper.get(key, forcecopy);
}
}
@Override
@ -376,8 +378,10 @@ public class SplitTable implements Index, Iterable<Row.Entry> {
public Row.Entry replace(final Row.Entry row) throws IOException, RowSpaceExceededException {
assert row.objectsize() <= this.rowdef.objectsize;
Index keeper = keeperOf(row.getPrimaryKeyBytes());
if (keeper != null) return keeper.replace(row);
synchronized (this.tables) {
if (keeper != null) synchronized (this) { // avoid concurrent IO from different methods
return keeper.replace(row);
}
synchronized (this) {
assert this.current == null || this.tables.get(this.current) != null : "this.current = " + this.current;
keeper = (this.current == null) ? newTable() : checkTable(this.tables.get(this.current));
}
@ -397,12 +401,11 @@ public class SplitTable implements Index, Iterable<Row.Entry> {
assert row.objectsize() <= this.rowdef.objectsize;
final byte[] key = row.getPrimaryKeyBytes();
if (this.tables == null) return true;
Index keeper = null;
synchronized (this.tables) {
keeper = keeperOf(key);
Index keeper = keeperOf(key);
if (keeper != null) synchronized (this) { // avoid concurrent IO from different methods
return keeper.put(row);
}
if (keeper != null) return keeper.put(row);
synchronized (this.tables) {
synchronized (this) {
keeper = keeperOf(key); // we must check that again because it could have changed in between
if (keeper != null) return keeper.put(row);
assert this.current == null || this.tables.get(this.current) != null : "this.current = " + this.current;
@ -425,12 +428,12 @@ public class SplitTable implements Index, Iterable<Row.Entry> {
@Override
public void addUnique(final Row.Entry row) throws IOException, RowSpaceExceededException {
assert row.objectsize() <= this.rowdef.objectsize;
Index table = (this.current == null) ? null : this.tables.get(this.current);
synchronized (this.tables) {
Index keeper = (this.current == null) ? null : this.tables.get(this.current);
synchronized (this) {
assert this.current == null || this.tables.get(this.current) != null : "this.current = " + this.current;
if (table == null) table = newTable(); else table = checkTable(table);
if (keeper == null) keeper = newTable(); else keeper = checkTable(keeper);
}
table.addUnique(row);
keeper.addUnique(row);
}
@Override
@ -447,14 +450,18 @@ public class SplitTable implements Index, Iterable<Row.Entry> {
public boolean delete(final byte[] key) throws IOException {
final Index table = keeperOf(key);
if (table == null) return false;
return table.delete(key);
synchronized (this) { // avoid concurrent IO from different methods
return table.delete(key);
}
}
@Override
public Row.Entry remove(final byte[] key) throws IOException {
final Index table = keeperOf(key);
if (table == null) return null;
return table.remove(key);
synchronized (this) { // avoid concurrent IO from different methods
return table.remove(key);
}
}
@Override
@ -472,7 +479,9 @@ public class SplitTable implements Index, Iterable<Row.Entry> {
if (maxtable == null) {
return null;
}
return maxtable.removeOne();
synchronized (this) { // avoid concurrent IO from different methods
return maxtable.removeOne();
}
}
@Override
@ -490,7 +499,9 @@ public class SplitTable implements Index, Iterable<Row.Entry> {
if (maxtable == null) {
return null;
}
return maxtable.top(count);
synchronized (this) { // avoid concurrent IO from different methods
return maxtable.top(count);
}
}
@Override
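The SplitTable changes all follow one pattern: look up the responsible table optimistically, then do the actual IO inside synchronized (this), re-checking the lookup under the lock because another thread may have created or rotated a table in between. A minimal sketch of that check-lock-recheck scheme (a StringBuilder stands in for a table; all names are illustrative):

import java.util.HashMap;
import java.util.Map;

// Sketch: probe for the responsible table without the IO lock, then
// synchronize for the IO itself and re-check under the lock.
public class KeeperDemo {
    private final Map<String, StringBuilder> tables = new HashMap<String, StringBuilder>();

    private StringBuilder keeperOf(String key) {
        synchronized (this.tables) { return this.tables.get(key); }
    }

    public void put(String key, char row) {
        StringBuilder keeper = keeperOf(key);      // optimistic, no IO lock yet
        if (keeper != null) synchronized (this) {  // avoid concurrent IO from different methods
            keeper.append(row);
            return;
        }
        synchronized (this) {
            keeper = keeperOf(key);                // re-check under the lock
            if (keeper == null) {
                keeper = new StringBuilder();      // stands in for newTable()
                synchronized (this.tables) { this.tables.put(key, keeper); }
            }
            keeper.append(row);
        }
    }

    public static void main(String[] args) {
        KeeperDemo d = new KeeperDemo();
        d.put("t", 'a'); d.put("t", 'b');
        System.out.println(d.keeperOf("t")); // ab
    }
}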

@ -32,6 +32,7 @@ import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
@ -67,7 +68,7 @@ public final class ByteBuffer extends OutputStream {
}
public ByteBuffer(final String s) {
this.buffer = s.getBytes(UTF8.charset);
this.buffer = UTF8.getBytes(s);
this.length = this.buffer.length;
this.offset = 0;
}
@ -140,6 +141,7 @@ public final class ByteBuffer extends OutputStream {
this.offset = 0;
}
@Override
public void write(final int b) {
write((byte) (b & 0xff));
}
@ -518,6 +520,20 @@ public final class ByteBuffer extends OutputStream {
return false;
}
public static int remove(final Collection<byte[]> collection, final byte[] key) {
Iterator<byte[]> i = collection.iterator();
byte[] v;
int c = 0;
while (i.hasNext()) {
v = i.next();
if (equals(v, key)) {
i.remove();
c++;
}
}
return c;
}
public static List<byte[]> split(final byte[] b, final byte s) {
final ArrayList<byte[]> a = new ArrayList<byte[]>();
int c = 0;

@ -49,12 +49,12 @@ import net.yacy.kelondro.logging.Log;
public final class SetTools {
//public static Comparator fastStringComparator = fastStringComparator(true);
// ------------------------------------------------------------------------------------------------
// helper methods
public static int log2a(int x) {
// this computes 1 + log2
// it is the number of bits needed to represent x, not the base-2 logarithm
@ -72,10 +72,10 @@ public final class SetTools {
// - join by pairwise enumeration
// - join by iterative tests (where we distinguish left-right and right-left tests)
public static <A, B> SortedMap<A, B> joinConstructive(final Collection<SortedMap<A, B>> maps, final boolean concatStrings) {
// this joins all TreeMap(s) contained in maps
// first order entities by their size
final SortedMap<Long, SortedMap<A, B>> orderMap = new TreeMap<Long, SortedMap<A, B>>();
SortedMap<A, B> singleMap;
@ -84,18 +84,18 @@ public final class SetTools {
while (i.hasNext()) {
// get next entity:
singleMap = i.next();
// check result
if ((singleMap == null) || (singleMap.isEmpty())) return new TreeMap<A, B>();
// store result in order of result size
orderMap.put(Long.valueOf(singleMap.size() * 1000 + count), singleMap);
count++;
}
// check if there is any result
if (orderMap.isEmpty()) return new TreeMap<A, B>();
// we now must pairwise build up a conjunction of these maps
Long k = orderMap.firstKey(); // the smallest, which means, the one with the least entries
SortedMap<A, B> mapA, mapB, joinResult = orderMap.remove(k);
@ -114,7 +114,7 @@ public final class SetTools {
if (joinResult.isEmpty()) return new TreeMap<A, B>();
return joinResult;
}
public static <A, B> SortedMap<A, B> joinConstructive(final SortedMap<A, B> map1, final SortedMap<A, B> map2, final boolean concatStrings) {
// comparators must be equal
if ((map1 == null) || (map2 == null)) return null;
@ -134,7 +134,7 @@ public final class SetTools {
}
return joinConstructiveByEnumeration(map1, map2, concatStrings);
}
@SuppressWarnings("unchecked")
private static <A, B> SortedMap<A, B> joinConstructiveByTest(final SortedMap<A, B> small, final SortedMap<A, B> large, final boolean concatStrings) {
final SortedMap<A, B> result = new TreeMap<A, B>(large.comparator());
@ -198,7 +198,7 @@ public final class SetTools {
}
return result;
}
// now the same for set-set
public static <A> SortedSet<A> joinConstructive(final SortedSet<A> set1, final SortedSet<A> set2) {
// comparators must be equal
@ -220,9 +220,9 @@ public final class SetTools {
return joinConstructiveByEnumeration(set1, set2);
}
private static <A> SortedSet<A> joinConstructiveByTest(final SortedSet<A> small, final SortedSet<A> large) {
public static <A> SortedSet<A> joinConstructiveByTest(final Collection<A> small, final SortedSet<A> large) {
final Iterator<A> mi = small.iterator();
final SortedSet<A> result = new TreeSet<A>(small.comparator());
final SortedSet<A> result = new TreeSet<A>(large.comparator());
A o;
while (mi.hasNext()) {
o = mi.next();
@ -256,7 +256,7 @@ public final class SetTools {
}
return result;
}
/**
* test if one set is totally included in another set
* @param <A>
@ -269,8 +269,8 @@ public final class SetTools {
if (!large.contains(o)) return false;
}
return true;
}
}
/**
* test if one set is totally included in another set
* @param small
@ -282,8 +282,8 @@ public final class SetTools {
if (!large.has(handle)) return false;
}
return true;
}
}
/**
* test if the intersection of two sets is not empty
* @param <A>
@ -379,7 +379,7 @@ public final class SetTools {
}
return false;
}
private static boolean anymatchByEnumeration(final HandleSet set1, final HandleSet set2) {
// implement pairwise enumeration
final Comparator<byte[]> comp = set1.comparator();
@ -402,7 +402,7 @@ public final class SetTools {
}
return false;
}
// ------------------------------------------------------------------------------------------------
// exclude
@ -416,7 +416,7 @@ public final class SetTools {
return excludeConstructiveByTestMapInSet(map, set);
// return excludeConstructiveByEnumeration(map, set);
}
private static <A, B> TreeMap<A, B> excludeConstructiveByTestMapInSet(final TreeMap<A, B> map, final Set<A> set) {
final TreeMap<A, B> result = new TreeMap<A, B>(map.comparator());
A o;
@ -427,7 +427,7 @@ public final class SetTools {
return result;
}
*/
public static <A, B> void excludeDestructive(final Map<A, B> map, final Set<A> set) {
// comparators must be equal
if (map == null) return;
@ -440,40 +440,40 @@ public final class SetTools {
else
excludeDestructiveByTestSetInMap(map, set);
}
private static <A, B> void excludeDestructiveByTestMapInSet(final Map<A, B> map, final Set<A> set) {
final Iterator<A> mi = map.keySet().iterator();
while (mi.hasNext()) if (set.contains(mi.next())) mi.remove();
}
private static <A, B> void excludeDestructiveByTestSetInMap(final Map<A, B> map, final Set<A> set) {
final Iterator<A> si = set.iterator();
while (si.hasNext()) map.remove(si.next());
}
// and the same again with set-set
public static <A> void excludeDestructive(final Set<A> set1, final Set<A> set2) {
if (set1 == null) return;
if (set2 == null) return;
assert !(set1 instanceof SortedSet<?> && set2 instanceof SortedSet<?>) || ((SortedSet<A>) set1).comparator() == ((SortedSet<A>) set2).comparator();
if (set1.isEmpty() || set2.isEmpty()) return;
if (set1.size() < set2.size())
excludeDestructiveByTestSmallInLarge(set1, set2);
else
excludeDestructiveByTestLargeInSmall(set1, set2);
}
private static <A> void excludeDestructiveByTestSmallInLarge(final Set<A> small, final Set<A> large) {
public static <A> void excludeDestructiveByTestSmallInLarge(final Collection<A> small, final Set<A> large) {
final Iterator<A> mi = small.iterator();
while (mi.hasNext()) if (large.contains(mi.next())) mi.remove();
}
private static <A> void excludeDestructiveByTestLargeInSmall(final Set<A> large, final Set<A> small) {
public static <A> void excludeDestructiveByTestLargeInSmall(final Set<A> large, final Collection<A> small) {
final Iterator<A> si = small.iterator();
while (si.hasNext()) large.remove(si.next());
}
// ------------------------------------------------------------------------------------------------
public static SortedMap<String, String> loadMap(final String filename, final String sep) {
@ -488,13 +488,13 @@ public final class SetTools {
if ((line.length() > 0 && line.charAt(0) != '#') && ((pos = line.indexOf(sep)) > 0))
map.put(line.substring(0, pos).trim().toLowerCase(), line.substring(pos + sep.length()).trim());
}
} catch (final IOException e) {
} catch (final IOException e) {
} finally {
if (br != null) try { br.close(); } catch (final Exception e) {}
}
return map;
}
public static SortedMap<String, List<String>> loadMapMultiValsPerKey(final String filename, final String sep) {
final SortedMap<String, List<String>> map = new TreeMap<String, List<String>>();
BufferedReader br = null;
@ -511,17 +511,17 @@ public final class SetTools {
map.get(key).add(value);
}
}
} catch (final IOException e) {
} catch (final IOException e) {
} finally {
if (br != null) try { br.close(); } catch (final Exception e) {}
}
return map;
}
public static SortedSet<String> loadList(final File file, final Comparator<String> c) {
final SortedSet<String> list = new TreeSet<String>(c);
if (!(file.exists())) return list;
BufferedReader br = null;
try {
br = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
@ -531,7 +531,7 @@ public final class SetTools {
if (line.length() > 0 && line.charAt(0) != '#') list.add(line.trim().toLowerCase());
}
br.close();
} catch (final IOException e) {
} catch (final IOException e) {
} finally {
if (br != null) try{br.close();}catch(final Exception e){}
}
@ -547,7 +547,7 @@ public final class SetTools {
}
return sb.toString();
}
public static String setToString(final Set<String> set, final char separator) {
final Iterator<String> i = set.iterator();
final StringBuilder sb = new StringBuilder(set.size() * 7);
@ -560,7 +560,7 @@ public final class SetTools {
// ------------------------------------------------------------------------------------------------
public static void main(final String[] args) {
final SortedMap<String, String> m = new TreeMap<String, String>();
final SortedMap<String, String> s = new TreeMap<String, String>();

@ -29,11 +29,10 @@ import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.lang.ref.SoftReference;
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
@ -98,8 +97,6 @@ public final class SeedDB implements AlternativeDomainNames {
private Seed mySeed; // my own seed
private final Set<String> myBotIDs; // list of id's that this bot accepts as robots.txt identification
private final Map<String, String> nameLookupCache; // a name-to-hash relation
private final Map<InetAddress, SoftReference<Seed>> ipLookupCache;
public SeedDB(
final File networkRoot,
@ -128,12 +125,6 @@ public final class SeedDB implements AlternativeDomainNames {
this.seedPassiveDB = openSeedTable(this.seedPassiveDBFile);
this.seedPotentialDB = openSeedTable(this.seedPotentialDBFile);
// start our virtual DNS service for yacy peers with empty cache
this.nameLookupCache = new HashMap<String, String>();
// cache for reverse name lookup
this.ipLookupCache = new HashMap<InetAddress, SoftReference<Seed>>();
// check if we are in the seedCaches: this can happen if someone else published our seed
removeMySeed();
@ -184,12 +175,6 @@ public final class SeedDB implements AlternativeDomainNames {
this.seedPassiveDB = openSeedTable(this.seedPassiveDBFile);
this.seedPotentialDB = openSeedTable(this.seedPotentialDBFile);
// start our virtual DNS service for yacy peers with empty cache
this.nameLookupCache.clear();
// cache for reverse name lookup
this.ipLookupCache.clear();
// check if we are in the seedCaches: this can happen if someone else published our seed
removeMySeed();
@ -497,7 +482,6 @@ public final class SeedDB implements AlternativeDomainNames {
//seed.put(yacySeed.LASTSEEN, yacyCore.shortFormatter.format(new Date(yacyCore.universalTime())));
synchronized (this) {
try {
this.nameLookupCache.put(seed.getName(), seed.hash);
final ConcurrentMap<String, String> seedPropMap = seed.getMap();
this.seedActiveDB.insert(ASCII.getBytes(seed.hash), seedPropMap);
this.seedPassiveDB.delete(ASCII.getBytes(seed.hash));
@ -513,7 +497,6 @@ public final class SeedDB implements AlternativeDomainNames {
if (seed.isProper(false) != null) return;
synchronized (this) {
try {
this.nameLookupCache.remove(seed.getName());
this.seedActiveDB.delete(ASCII.getBytes(seed.hash));
this.seedPotentialDB.delete(ASCII.getBytes(seed.hash));
} catch (final Exception e) { Log.logWarning("yacySeedDB", "could not remove hash ("+ e.getClass() +"): "+ e.getMessage()); }
@ -532,7 +515,6 @@ public final class SeedDB implements AlternativeDomainNames {
if (seed.isProper(false) != null) return;
synchronized (this) {
try {
this.nameLookupCache.remove(seed.getName());
this.seedActiveDB.delete(ASCII.getBytes(seed.hash));
this.seedPassiveDB.delete(ASCII.getBytes(seed.hash));
} catch (final Exception e) { Log.logWarning("yacySeedDB", "could not remove hash ("+ e.getClass() +"): "+ e.getMessage()); }
@ -637,52 +619,35 @@ public final class SeedDB implements AlternativeDomainNames {
return this.mySeed;
}
// then try to use the cache
peerName = peerName.toLowerCase();
final String seedhash = this.nameLookupCache.get(peerName);
Seed seed;
if (seedhash != null) {
seed = this.get(seedhash);
if (seed != null) {
//System.out.println("*** found lookupByName in cache: " + peerName);
return seed;
}
}
// enumerate the cache
String name = Seed.checkPeerName(peerName);
Map.Entry<byte[], Map<String, String>> entry;
try {
Iterator<Map.Entry<byte[], Map<String, String>>> mmap = this.seedActiveDB.entries(Seed.NAME, name);
while (mmap.hasNext()) {
entry = mmap.next();
if (entry == null) break;
seed = this.getConnected(ASCII.String(entry.getKey()));
synchronized (this) { try {
Collection<byte[]> idx = this.seedActiveDB.select(Seed.NAME, name);
for (byte[] pk: idx) {
seed = this.getConnected(ASCII.String(pk));
if (seed == null) continue;
if (seed.isProper(false) == null) this.nameLookupCache.put(seed.getName().toLowerCase(), seed.hash);
//System.out.println("*** found lookupByName in seedActiveDB: " + peerName);
return seed;
}
} catch ( IOException e ) {
}
try {
Iterator<Map.Entry<byte[], Map<String, String>>> mmap = this.seedPassiveDB.entries(Seed.NAME, name);
while (mmap.hasNext()) {
entry = mmap.next();
if (entry == null) break;
seed = this.getConnected(ASCII.String(entry.getKey()));
}}
synchronized (this) { try {
Collection<byte[]> idx = this.seedPassiveDB.select(Seed.NAME, name);
for (byte[] pk: idx) {
seed = this.getDisconnected(ASCII.String(pk));
if (seed == null) continue;
if (seed.isProper(false) == null) this.nameLookupCache.put(seed.getName().toLowerCase(), seed.hash);
//System.out.println("*** found lookupByName in seedPassiveDB: " + peerName);
return seed;
}
} catch ( IOException e ) {
}
}}
// check local seed
if (this.mySeed == null) initMySeed();
name = this.mySeed.getName().toLowerCase();
if (this.mySeed.isProper(false) == null) this.nameLookupCache.put(name, this.mySeed.hash);
if (name.equals(peerName)) return this.mySeed;
// nothing found
return null;
@ -705,31 +670,16 @@ public final class SeedDB implements AlternativeDomainNames {
}
// then try to use the cache
final SoftReference<Seed> ref = this.ipLookupCache.get(peerIP);
Seed seed = null;
if (ref != null) {
seed = ref.get();
if (seed != null) {
//System.out.println("*** found lookupByIP in cache: " + peerIP.toString() + " -> " + this.mySeed.getName());
return seed;
}
}
String ipString = peerIP.getHostAddress();
Map.Entry<byte[], Map<String, String>> entry;
if (lookupConnected) {
if (lookupConnected) synchronized (this) {
try {
Iterator<Map.Entry<byte[], Map<String, String>>> mmap = this.seedActiveDB.entries(Seed.IP, ipString);
while (mmap.hasNext()) {
entry = mmap.next();
if (entry == null) break;
String p = entry.getValue().get(Seed.PORT);
if (p == null) continue;
if (port > 0 && Integer.parseInt(p) != port) continue;
seed = this.getConnected(ASCII.String(entry.getKey()));
Collection<byte[]> idx = this.seedActiveDB.select(Seed.IP, ipString);
for (byte[] pk: idx) {
seed = this.getConnected(ASCII.String(pk));
if (seed == null) continue;
this.ipLookupCache.put(peerIP, new SoftReference<Seed>(seed));
if (seed.getPort() != port) continue;
//System.out.println("*** found lookupByIP in connected: " + peerIP.toString() + " -> " + seed.getName());
return seed;
}
@ -737,18 +687,13 @@ public final class SeedDB implements AlternativeDomainNames {
}
}
if (lookupDisconnected) {
if (lookupDisconnected) synchronized (this) {
try {
Iterator<Map.Entry<byte[], Map<String, String>>> mmap = this.seedPassiveDB.entries(Seed.IP, ipString);
while (mmap.hasNext()) {
entry = mmap.next();
if (entry == null) break;
String p = entry.getValue().get(Seed.PORT);
if (p == null) continue;
if (port > 0 && Integer.parseInt(p) != port) continue;
seed = this.getDisconnected(ASCII.String(entry.getKey()));
Collection<byte[]> idx = this.seedPassiveDB.select(Seed.IP, ipString);
for (byte[] pk: idx) {
seed = this.getDisconnected(ASCII.String(pk));
if (seed == null) continue;
this.ipLookupCache.put(peerIP, new SoftReference<Seed>(seed));
if (seed.getPort() != port) continue;
//System.out.println("*** found lookupByIP in disconnected: " + peerIP.toString() + " -> " + seed.getName());
return seed;
}
@ -756,18 +701,13 @@ public final class SeedDB implements AlternativeDomainNames {
}
}
if (lookupPotential) {
if (lookupPotential) synchronized (this) {
try {
Iterator<Map.Entry<byte[], Map<String, String>>> mmap = this.seedPotentialDB.entries(Seed.IP, ipString);
while (mmap.hasNext()) {
entry = mmap.next();
if (entry == null) break;
String p = entry.getValue().get(Seed.PORT);
if (p == null) continue;
if (port > 0 && Integer.parseInt(p) != port) continue;
seed = this.getPotential(ASCII.String(entry.getKey()));
Collection<byte[]> idx = this.seedPotentialDB.select(Seed.IP, ipString);
for (byte[] pk: idx) {
seed = this.getPotential(ASCII.String(pk));
if (seed == null) continue;
this.ipLookupCache.put(peerIP, new SoftReference<Seed>(seed));
if (seed.getPort() != port) continue;
//System.out.println("*** found lookupByIP in potential: " + peerIP.toString() + " -> " + seed.getName());
return seed;
}

@ -365,6 +365,7 @@ public final class yacyRelease extends yacyVersion {
try {
final CharBuffer signBuffer = new CharBuffer(getSignatureFile());
final byte[] signByteBuffer = Base64Order.standardCoder.decode(signBuffer.toString().trim());
signBuffer.close();
final CryptoLib cl = new CryptoLib();
for(final yacyUpdateLocation updateLocation : latestReleaseLocations) {
try {

@ -295,7 +295,7 @@ public final class Switchboard extends serverSwitch
}
// init TrayIcon if possible
tray = new Tray(this);
this.tray = new Tray(this);
// remote proxy configuration
initRemoteProxy();
@ -636,6 +636,7 @@ public final class Switchboard extends serverSwitch
// define a realtime parsable mimetype list
this.log.logConfig("Parser: Initializing Mime Type deny list");
TextParser.setDenyMime(getConfig(SwitchboardConstants.PARSER_MIME_DENY, ""));
TextParser.setDenyExtension(getConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, ""));
// prepare a solr index profile switch list
final File solrBackupProfile = new File("defaults/solr.keys.list");
@ -650,7 +651,7 @@ public final class Switchboard extends serverSwitch
// update the working scheme with the backup scheme. This is necessary to include new features.
// new features are always activated by default
workingScheme.fill(backupScheme);
workingScheme.fill(backupScheme, false);
// set up the solr interface
final String solrurls =
@ -1598,7 +1599,7 @@ public final class Switchboard extends serverSwitch
Domains.close();
AccessTracker.dumpLog(new File("DATA/LOG/queries.log"));
UPnP.deletePortMapping();
tray.remove();
this.tray.remove();
try {
HTTPClient.closeConnectionManager();
} catch ( final InterruptedException e ) {
@ -3327,7 +3328,7 @@ public final class Switchboard extends serverSwitch
this.peers.mySeed().put(Seed.NCOUNT, Integer.toString(this.crawlQueues.noticeURL.size())); // the number of links that the peer has noticed, but not loaded (NURL's)
this.peers.mySeed().put(
Seed.RCOUNT,
Integer.toString(this.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT))); // the number of links that the peer provides for remote crawling (ZURL's)
Integer.toString(this.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.GLOBAL))); // the number of links that the peer provides for remote crawling (ZURL's)
this.peers.mySeed().put(Seed.ICOUNT, Long.toString(this.indexSegments.RWICount())); // the minimum number of words that the peer has indexed (as it says)
this.peers.mySeed().put(Seed.SCOUNT, Integer.toString(this.peers.sizeConnected())); // the number of seeds that the peer has stored
this.peers.mySeed().put(

@ -599,19 +599,21 @@ public final class MetadataRepository implements Iterable<byte[]> {
public Map<String, URLHashCounter> domainSampleCollector() throws IOException {
final Map<String, URLHashCounter> map = new HashMap<String, URLHashCounter>();
// first collect all domains and calculate statistics about it
final CloneableIterator<byte[]> i = this.urlIndexFile.keys(true, null);
String hosthash;
byte[] urlhashb;
URLHashCounter ds;
if (i != null) while (i.hasNext()) {
urlhashb = i.next();
hosthash = ASCII.String(urlhashb, 6, 6);
ds = map.get(hosthash);
if (ds == null) {
ds = new URLHashCounter(urlhashb);
map.put(hosthash, ds);
} else {
ds.count++;
synchronized (this) {
final CloneableIterator<byte[]> i = this.urlIndexFile.keys(true, null);
String hosthash;
byte[] urlhashb;
URLHashCounter ds;
if (i != null) while (i.hasNext()) {
urlhashb = i.next();
hosthash = ASCII.String(urlhashb, 6, 6);
ds = map.get(hosthash);
if (ds == null) {
ds = new URLHashCounter(urlhashb);
map.put(hosthash, ds);
} else {
ds.count++;
}
}
}
return map;
@ -739,11 +741,13 @@ public final class MetadataRepository implements Iterable<byte[]> {
// first collect all url hashes that belong to the domain
assert hosthash.length() == 6;
final ArrayList<String> l = new ArrayList<String>();
final CloneableIterator<byte[]> i = this.urlIndexFile.keys(true, null);
String hash;
while (i != null && i.hasNext()) {
hash = ASCII.String(i.next());
if (hosthash.equals(hash.substring(6))) l.add(hash);
synchronized (this) {
final CloneableIterator<byte[]> i = this.urlIndexFile.keys(true, null);
String hash;
while (i != null && i.hasNext()) {
hash = ASCII.String(i.next());
if (hosthash.equals(hash.substring(6))) l.add(hash);
}
}
// then delete the urls using this list
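Both methods above exploit the layout of YaCy URL hashes: the 12-character hash carries a 6-character host hash at positions 6-11, so ASCII.String(urlhashb, 6, 6) groups URLs by host without loading any metadata. A tiny sketch of that grouping (the hash values here are made up for the demo):

import java.util.HashMap;
import java.util.Map;

// Sketch: count URLs per host by keying on the host-hash substring of the URL hash.
public class DomainSample {
    public static Map<String, Integer> countPerHost(Iterable<String> urlHashes) {
        Map<String, Integer> perHost = new HashMap<String, Integer>();
        for (String urlhash : urlHashes) {
            String hosthash = urlhash.substring(6, 12);   // positions 6..11
            Integer c = perHost.get(hosthash);
            perHost.put(hosthash, c == null ? 1 : c + 1);
        }
        return perHost;
    }

    public static void main(String[] args) {
        System.out.println(countPerHost(java.util.Arrays.asList(
            "AAAAAAhost01", "BBBBBBhost01", "CCCCCChost02"))); // {host01=2, host02=1}
    }
}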

@ -73,7 +73,7 @@ public class Segment {
public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
public static final int wCacheMaxChunk = 800; // maximum number of references for each urlhash
public static final int lowcachedivisor = 900;
public static final long targetFileSize = 256 * 1024 * 1024; // 256 MB
public static final long targetFileSize = 64 * 1024 * 1024; // 64 MB
public static final int writeBufferSize = 4 * 1024 * 1024;
// the reference factory

@ -35,7 +35,6 @@ import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
@ -56,7 +55,6 @@ import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.SetTools;
import net.yacy.peers.Seed;
import net.yacy.search.index.Segment;
@ -162,7 +160,7 @@ public final class QueryParams {
}
} else {
this.queryString = queryString;
final TreeSet<String>[] cq = cleanQuery(queryString);
final Collection<String>[] cq = cleanQuery(queryString);
this.queryHashes = Word.words2hashesHandles(cq[0]);
this.excludeHashes = Word.words2hashesHandles(cq[1]);
this.fullqueryHashes = Word.words2hashesHandles(cq[2]);
@ -378,11 +376,11 @@ public final class QueryParams {
private static String seps = "'.,/&_"; static {seps += '"';}
@SuppressWarnings("unchecked")
public static TreeSet<String>[] cleanQuery(String querystring) {
public static Collection<String>[] cleanQuery(String querystring) {
// returns three collections: a query collection, an exclude collection and a full-query collection
final TreeSet<String> query = new TreeSet<String>(NaturalOrder.naturalComparator);
final TreeSet<String> exclude = new TreeSet<String>(NaturalOrder.naturalComparator);
final TreeSet<String> fullquery = new TreeSet<String>(NaturalOrder.naturalComparator);
final Collection<String> query = new ArrayList<String>();
final Collection<String> exclude = new ArrayList<String>();
final Collection<String> fullquery = new ArrayList<String>();
if ((querystring != null) && (!querystring.isEmpty())) {
@ -401,22 +399,23 @@ public final class QueryParams {
final String[] queries = querystring.split(" ");
for (String quer : queries) {
if (quer.startsWith("-")) {
exclude.add(quer.substring(1));
String x = quer.substring(1);
if (!exclude.contains(x)) exclude.add(x);
} else {
while ((c = quer.indexOf('-')) >= 0) {
s = quer.substring(0, c);
l = s.length();
if (l >= Condenser.wordminsize) {query.add(s);}
if (l > 0) {fullquery.add(s);}
if (l >= Condenser.wordminsize && !query.contains(s)) {query.add(s);}
if (l > 0 && !fullquery.contains(s)) {fullquery.add(s);}
quer = quer.substring(c + 1);
}
l = quer.length();
if (l >= Condenser.wordminsize) {query.add(quer);}
if (l > 0) {fullquery.add(quer);}
if (l >= Condenser.wordminsize && !query.contains(quer)) {query.add(quer);}
if (l > 0 && !fullquery.contains(quer)) {fullquery.add(quer);}
}
}
}
return new TreeSet[]{query, exclude, fullquery};
return new Collection[]{query, exclude, fullquery};
}
public String queryString(final boolean encodeHTML) {
@ -438,7 +437,7 @@ public final class QueryParams {
}
}
public TreeSet<String>[] queryWords() {
public Collection<String>[] queryWords() {
return cleanQuery(this.queryString);
}
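Since cleanQuery() now returns ArrayLists, term order follows the input and deduplication happens via contains() instead of TreeSet semantics. A runnable sketch of the new contract (simplified: no quote/separator normalization, and Condenser.wordminsize is assumed to be 2 for the demo):

import java.util.ArrayList;
import java.util.Collection;

// Sketch of the new cleanQuery behavior: ArrayLists preserve input order,
// contains() replaces the old TreeSet dedup, '-' prefixes exclude, and
// hyphenated words contribute their parts.
public class CleanQueryDemo {
    static final int WORD_MIN_SIZE = 2; // stand-in for Condenser.wordminsize

    @SuppressWarnings("unchecked")
    public static Collection<String>[] cleanQuery(String querystring) {
        Collection<String> query = new ArrayList<String>();
        Collection<String> exclude = new ArrayList<String>();
        Collection<String> fullquery = new ArrayList<String>();
        for (String quer : querystring.toLowerCase().split(" ")) {
            if (quer.startsWith("-")) {
                String x = quer.substring(1);
                if (!exclude.contains(x)) exclude.add(x);
            } else {
                for (String s : quer.split("-")) {
                    if (s.length() >= WORD_MIN_SIZE && !query.contains(s)) query.add(s);
                    if (s.length() > 0 && !fullquery.contains(s)) fullquery.add(s);
                }
            }
        }
        return new Collection[]{query, exclude, fullquery};
    }

    public static void main(String[] args) {
        Collection<String>[] r = cleanQuery("red-apple pie -banana pie");
        System.out.println(r[0]); // [red, apple, pie]
        System.out.println(r[1]); // [banana]
        System.out.println(r[2]); // [red, apple, pie]
    }
}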
