*) fixed a bug which caused entries not to be deleted when deleting by URL on IndexCreateWWWLocalQueue_p.html (I hope this did not break anything else)

*) cleaned up the code a little bit

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7493 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
low012 14 years ago
parent d58071947a
commit c5051c4020

@ -28,8 +28,8 @@
// if the shell's current path is HTROOT
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import net.yacy.cora.protocol.RequestHeader;
@ -89,7 +89,7 @@ public class IndexCreateWWWGlobalQueue_p {
prop.put("crawler-queue", "0");
} else {
prop.put("crawler-queue", "1");
final ArrayList<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.LIMIT, showLimit);
final List<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.LIMIT, showLimit);
Request urle;
boolean dark = true;

@ -82,21 +82,20 @@ public class IndexCreateWWWLocalQueue_p {
final String pattern = post.get("pattern", ".*").trim();
final int option = post.getInt("option", INVALID);
if (pattern.equals(".*")) {
if (".*".equals(pattern)) {
c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE);
sb.crawlQueues.noticeURL.clear(NoticedURL.StackType.CORE);
try { sb.cleanProfiles(); } catch (final InterruptedException e) {/* ignore this */}
} else if (option > INVALID) {
Pattern compiledPattern = null;
try {
// compiling the regular expression
compiledPattern = Pattern.compile(pattern);
final Pattern compiledPattern = Pattern.compile(pattern);
if (option == PROFILE) {
// search and delete the crawl profile (_much_ faster, independent of queue size)
// XXX: what to do about the annoying LOST PROFILE messages in the log?
CrawlProfile entry;
for (byte[] handle: sb.crawler.getActive()) {
for (final byte[] handle: sb.crawler.getActive()) {
entry = sb.crawler.getActive(handle);
final String name = entry.name();
if (name.equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) ||
@ -113,7 +112,7 @@ public class IndexCreateWWWLocalQueue_p {
// iterating through the list of URLs
final Iterator<Request> iter = sb.crawlQueues.noticeURL.iterator(NoticedURL.StackType.CORE);
Request entry;
List<byte[]> removehashes = new ArrayList<byte[]>();
final List<byte[]> removehashes = new ArrayList<byte[]>();
while (iter.hasNext()) {
if ((entry = iter.next()) == null) continue;
String value = null;
@ -129,10 +128,10 @@ public class IndexCreateWWWLocalQueue_p {
default: value = null; break location;
}
if (value != null && compiledPattern.matcher(value).find()) removehashes.add(entry.url().hash());
if (value != null && compiledPattern.matcher(value).matches()) removehashes.add(entry.url().hash());
}
Log.logInfo("IndexCreateWWWLocalQueue", "created a remove list with " + removehashes.size() + " entries for pattern '" + pattern + "'");
for (byte[] b: removehashes) {
for (final byte[] b: removehashes) {
sb.crawlQueues.noticeURL.removeByURLHash(b);
}
}
@ -156,7 +155,7 @@ public class IndexCreateWWWLocalQueue_p {
prop.put("crawler-queue", "0");
} else {
prop.put("crawler-queue", "1");
final ArrayList<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.CORE, (int) (showLimit * 1.20));
final List<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.CORE, (int) (showLimit * 1.20));
Request urle;
boolean dark = true;

@ -25,8 +25,8 @@
// if the shell's current path is HTROOT
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import net.yacy.cora.protocol.RequestHeader;
@ -86,7 +86,7 @@ public class IndexCreateWWWRemoteQueue_p {
prop.put("crawler-queue", "0");
} else {
prop.put("crawler-queue", "1");
final ArrayList<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.REMOTE, showLimit);
final List<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.REMOTE, showLimit);
Request urle;
boolean dark = true;

@ -1,6 +1,6 @@
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import net.yacy.cora.protocol.RequestHeader;
@ -55,13 +55,13 @@ public class queues_p {
} else {
final Request[] w = sb.crawlQueues.activeWorkerEntries();
int count = 0;
for (int i = 0; i < w.length; i++) {
if (w[i] == null) continue;
prop.put("list-loader_"+count+"_profile", w[i].profileHandle());
initiator = sb.peers.getConnected((w[i].initiator() == null) ? "" : new String(w[i].initiator()));
for (final Request r : w) {
if (r == null) continue;
prop.put("list-loader_"+count+"_profile", r.profileHandle());
initiator = sb.peers.getConnected((r.initiator() == null) ? "" : new String(r.initiator()));
prop.putHTML("list-loader_"+count+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("list-loader_"+count+"_depth", w[i].depth());
prop.putXML("list-loader_"+count+"_url", w[i].url().toString());
prop.put("list-loader_"+count+"_depth", r.depth());
prop.putXML("list-loader_"+count+"_url", r.url().toString());
count++;
}
prop.put("list-loader", count);
@ -94,13 +94,11 @@ public class queues_p {
}
public static final void addNTable(final Switchboard sb, final serverObjects prop, final String tableName, final ArrayList<Request> crawlerList) {
public static final void addNTable(final Switchboard sb, final serverObjects prop, final String tableName, final List<Request> crawlerList) {
int showNum = 0;
Request urle;
yacySeed initiator;
for (int i = 0; i < crawlerList.size(); i++) {
urle = crawlerList.get(i);
for (final Request urle : crawlerList) {
if ((urle != null) && (urle.url() != null)) {
initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : new String(urle.initiator()));
prop.put(tableName + "_" + showNum + "_profile", urle.profileHandle());

@ -6,6 +6,10 @@
// Frankfurt, Germany, 2005
// created: 24.09.2005
//
//$LastChangedDate$
//$LastChangedRevision$
//$LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@ -28,6 +32,7 @@ import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
@ -45,6 +50,7 @@ import net.yacy.kelondro.util.ByteBuffer;
import de.anomic.crawler.retrieval.Request;
import de.anomic.http.client.Cache;
import java.util.concurrent.ConcurrentMap;
public class Balancer {
@ -54,9 +60,9 @@ public class Balancer {
private static final String localhost = "localhost";
// class variables
private final ConcurrentHashMap<String, HandleSet> domainStacks; // a map from host name to lists with url hashes
private final ConcurrentMap<String, HandleSet> domainStacks; // a map from host name to lists with url hashes
private final ConcurrentLinkedQueue<byte[]> top; // a list of url-hashes that shall be taken next
private final TreeMap<Long, byte[]> delayed;
private final SortedMap<Long, byte[]> delayed;
private final HandleSet ddc;
private final HandleSet double_push_check; // for debugging
private BufferedObjectIndex urlFileIndex;
@ -67,8 +73,8 @@ public class Balancer {
private int domStackInitSize;
public Balancer(
final File cachePath,
final String stackname,
final File cachePath,
final String stackname,
final long minimumLocalDelta,
final long minimumGlobalDelta,
final boolean useTailCache,
@ -189,16 +195,16 @@ public class Balancer {
final Iterator<byte[]> j = top.iterator();
byte[] urlhash;
while (j.hasNext()) {
urlhash = j.next();
if (urlHashes.has(urlhash)) j.remove();
urlhash = j.next();
if (urlHashes.has(urlhash)) j.remove();
}
// remove from delayed
synchronized (this.delayed) {
final Iterator<Map.Entry<Long, byte[]>> k = this.delayed.entrySet().iterator();
while (k.hasNext()) {
if (urlHashes.has(k.next().getValue())) k.remove();
}
final Iterator<Map.Entry<Long, byte[]>> k = this.delayed.entrySet().iterator();
while (k.hasNext()) {
if (urlHashes.has(k.next().getValue())) k.remove();
}
}
// iterate through the domain stacks
@ -206,7 +212,7 @@ public class Balancer {
HandleSet stack;
while (q.hasNext()) {
stack = q.next().getValue();
for (byte[] handle: urlHashes) stack.remove(handle);
for (final byte[] handle: urlHashes) stack.remove(handle);
if (stack.isEmpty()) q.remove();
}
@ -234,7 +240,7 @@ public class Balancer {
private boolean domainStacksNotEmpty() {
if (domainStacks == null) return false;
synchronized (domainStacks) {
for (HandleSet l: domainStacks.values()) {
for (final HandleSet l: domainStacks.values()) {
if (!l.isEmpty()) return true;
}
}
@ -288,15 +294,15 @@ public class Balancer {
return;
}
domainList.remove(urlhash);
if (domainList.size() == 0) domainStacks.remove(host);
if (domainList.isEmpty()) domainStacks.remove(host);
}
private byte[] nextFromDelayed() {
if (this.delayed.isEmpty()) return null;
final Long first = this.delayed.firstKey();
if (first.longValue() < System.currentTimeMillis()) {
return this.delayed.remove(first);
}
if (this.delayed.isEmpty()) return null;
final Long first = this.delayed.firstKey();
if (first.longValue() < System.currentTimeMillis()) {
return this.delayed.remove(first);
}
return null;
}
@ -465,37 +471,37 @@ public class Balancer {
byte[] besturlhash = null;
String besthost = null;
while (i.hasNext()) {
entry = i.next();
// clean up empty entries
if (entry.getValue().isEmpty()) {
i.remove();
continue;
}
byte[] n = entry.getValue().removeOne();
if (n == null) continue;
if (delay) {
final long w = Latency.waitingRemainingGuessed(entry.getKey(), minimumLocalDelta, minimumGlobalDelta);
if (w > maximumwaiting) {
if (w < smallestWaiting) {
smallestWaiting = w;
besturlhash = n;
besthost = entry.getKey();
}
entry.getValue().put(n); // put entry back
continue;
}
}
this.top.add(n);
if (entry.getValue().isEmpty()) i.remove();
entry = i.next();
// clean up empty entries
if (entry.getValue().isEmpty()) {
i.remove();
continue;
}
byte[] n = entry.getValue().removeOne();
if (n == null) continue;
if (delay) {
final long w = Latency.waitingRemainingGuessed(entry.getKey(), minimumLocalDelta, minimumGlobalDelta);
if (w > maximumwaiting) {
if (w < smallestWaiting) {
smallestWaiting = w;
besturlhash = n;
besthost = entry.getKey();
}
entry.getValue().put(n); // put entry back
continue;
}
}
this.top.add(n);
if (entry.getValue().isEmpty()) i.remove();
}
// if we could not find any entry, then take the best we have seen so far
if (acceptonebest && !this.top.isEmpty() && besturlhash != null) {
removeHashFromDomainStacks(besthost, besturlhash);
this.top.add(besturlhash);
removeHashFromDomainStacks(besthost, besturlhash);
this.top.add(besturlhash);
}
}
@ -525,21 +531,21 @@ public class Balancer {
this.domStackInitSize = this.domainStacks.size();
}
public ArrayList<Request> top(int count) {
final ArrayList<Request> cel = new ArrayList<Request>();
public List<Request> top(int count) {
final List<Request> cel = new ArrayList<Request>();
if (count == 0) return cel;
byte[][] ta = new byte[Math.min(count, top.size())][];
ta = top.toArray(ta);
for (byte[] n: ta) {
for (final byte[] n: ta) {
if (n == null) break;
try {
final Row.Entry rowEntry = urlFileIndex.get(n);
if (rowEntry == null) continue;
final Request crawlEntry = new Request(rowEntry);
cel.add(crawlEntry);
count--;
if (count <= 0) break;
} catch (IOException e) {}
final Row.Entry rowEntry = urlFileIndex.get(n);
if (rowEntry == null) continue;
final Request crawlEntry = new Request(rowEntry);
cel.add(crawlEntry);
count--;
if (count <= 0) break;
} catch (IOException e) {}
}
int depth = 0;
@ -565,7 +571,7 @@ public class Balancer {
if (cel.size() < count) try {
List<Row.Entry> list = urlFileIndex.top(count - cel.size());
for (Row.Entry entry: list) cel.add(new Request(entry));
for (final Row.Entry entry: list) cel.add(new Request(entry));
} catch (IOException e) { }
return cel;
}

@ -1,10 +1,13 @@
// plasmaNURL.java
// NoticedURL.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
// last major change: 09.08.2004
//
//$LastChangedDate$
//$LastChangedRevision$
//$LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
@ -26,9 +29,9 @@ package de.anomic.crawler;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
@ -52,7 +55,7 @@ public class NoticedURL {
private Balancer noloadStack; // links that are not passed to a loader; the index will be generated from the Request entry
public NoticedURL(
final File cachePath,
final File cachePath,
final boolean useTailCache,
final boolean exceed134217727) {
Log.logInfo("NoticedURL", "CREATING STACKS at " + cachePath.toString());
@ -107,11 +110,13 @@ public class NoticedURL {
}
}
protected void finalize() {
@Override
protected void finalize() throws Throwable {
if ((coreStack != null) || (limitStack != null) || (remoteStack != null)) {
Log.logWarning("plasmaCrawlNURL", "NURL stack closed by finalizer");
close();
}
super.finalize();
}
public boolean notEmpty() {
@ -195,13 +200,14 @@ public class NoticedURL {
*/
public boolean removeByURLHash(final byte[] urlhashBytes) {
try {
HandleSet urlHashes = Base64Order.enhancedCoder.getHandleSet(12, 1);
final HandleSet urlHashes = Base64Order.enhancedCoder.getHandleSet(12, 1);
urlHashes.put(urlhashBytes);
try {return noloadStack.remove(urlHashes) > 0;} catch (final IOException e) {}
try {return coreStack.remove(urlHashes) > 0;} catch (final IOException e) {}
try {return limitStack.remove(urlHashes) > 0;} catch (final IOException e) {}
try {return remoteStack.remove(urlHashes) > 0;} catch (final IOException e) {}
return false;
boolean ret = false;
try {ret |= noloadStack.remove(urlHashes) > 0;} catch (final IOException e) {}
try {ret |= coreStack.remove(urlHashes) > 0;} catch (final IOException e) {}
try {ret |= limitStack.remove(urlHashes) > 0;} catch (final IOException e) {}
try {ret |= remoteStack.remove(urlHashes) > 0;} catch (final IOException e) {}
return ret;
} catch (RowSpaceExceededException e) {
Log.logException(e);
return false;
@ -217,7 +223,7 @@ public class NoticedURL {
return removed;
}
public ArrayList<Request> top(final StackType stackType, final int count) {
public List<Request> top(final StackType stackType, final int count) {
switch (stackType) {
case CORE: return top(coreStack, count);
case LIMIT: return top(limitStack, count);
@ -279,12 +285,10 @@ public class NoticedURL {
return null;
}
private ArrayList<Request> top(final Balancer balancer, int count) {
private List<Request> top(final Balancer balancer, int count) {
// this is a FILO (first in, last out) - top
if (count > balancer.size()) count = balancer.size();
ArrayList<Request> list;
list = balancer.top(count);
return list;
return balancer.top(count);
}
public Iterator<Request> iterator(final StackType stackType) {

Loading…
Cancel
Save