From f7803a6ce45920a2ea438fdd2e45ccd2f07ed16e Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 21 Feb 2007 16:23:31 +0000 Subject: [PATCH] enhanced crawl balancer - new domains now get a chance to get crawled early - less IO operations - new balancing method - better dump order at shutdown time - bugfixes regarding not found url hashes (no more superfluous cache kill) - domain access time is now shared over all balancer stacks - viewing the stack does no more disturbish the balancing algorithm that much - intelligent selection of best next domain using domain access times - extra double-check (to double-check the double-check) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3384 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexCreateWWWLocalQueue_p.java | 2 +- htroot/env/templates/header.template | 2 +- source/de/anomic/kelondro/kelondroStack.java | 29 +- .../dbImport/plasmaCrawlNURLImporter.java | 2 +- .../de/anomic/plasma/plasmaCrawlBalancer.java | 369 ++++++++++++------ source/de/anomic/plasma/plasmaCrawlNURL.java | 123 +++--- .../de/anomic/plasma/plasmaCrawlStacker.java | 1 - .../de/anomic/plasma/plasmaSwitchboard.java | 12 +- 8 files changed, 360 insertions(+), 180 deletions(-) diff --git a/htroot/IndexCreateWWWLocalQueue_p.java b/htroot/IndexCreateWWWLocalQueue_p.java index 5d5042949..607d0bc37 100644 --- a/htroot/IndexCreateWWWLocalQueue_p.java +++ b/htroot/IndexCreateWWWLocalQueue_p.java @@ -103,7 +103,7 @@ public class IndexCreateWWWLocalQueue_p { Iterator iter = switchboard.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE); while (iter.hasNext()) { String value = null; - String nextHash = new String((byte[]) iter.next()); + String nextHash = (String) iter.next(); Entry entry = null; try { entry = switchboard.noticeURL.getEntry(nextHash); diff --git a/htroot/env/templates/header.template b/htroot/env/templates/header.template index bf7c9aa14..bb809290a 100644 --- a/htroot/env/templates/header.template +++ b/htroot/env/templates/header.template @@ -63,7 +63,7 @@
  • Deutsches Forum
  • Newsletters
  • Download YaCy
  • -
  • YaCy Wiki
  • +
  • YaCy Project Wiki
  • Contact YaCy Developer
  • diff --git a/source/de/anomic/kelondro/kelondroStack.java b/source/de/anomic/kelondro/kelondroStack.java index 14a5b2d48..10e7b43d9 100644 --- a/source/de/anomic/kelondro/kelondroStack.java +++ b/source/de/anomic/kelondro/kelondroStack.java @@ -284,10 +284,37 @@ public final class kelondroStack extends kelondroRecords { } public Iterator iterator() { - // iterates the elements in an ordered way. returns Node - type Objects + // iterates the elements in an ordered way. + // returns Node - type Objects return new Counter(); } + public Iterator keyIterator() { + // iterates byte[] - objects + return new keyIterator(iterator()); + } + + public class keyIterator implements Iterator { + + Iterator ni; + + public keyIterator(Iterator i) { + ni = i; + } + + public boolean hasNext() { + return ni.hasNext(); + } + + public Object next() { + return ((kelondroRecords.Node) ni.next()).getKey(); + } + + public void remove() { + } + + } + public int imp(File file, String separator) throws IOException { // imports a value-separated file, returns number of records that have been read RandomAccessFile f = null; diff --git a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java index 2385b5345..7e7683424 100644 --- a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java @@ -178,7 +178,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor if (!this.sb.noticeURL.existsInStack(nextHash)) { plasmaCrawlNURL.Entry ne = this.sb.noticeURL.newEntry(nextEntry); ne.store(); - this.sb.noticeURL.push((stackTypes[i] != -1) ? stackTypes[i] : plasmaCrawlNURL.STACK_TYPE_CORE, ne.url().getHost(), ne.hash()); + this.sb.noticeURL.push((stackTypes[i] != -1) ? stackTypes[i] : plasmaCrawlNURL.STACK_TYPE_CORE, ne.hash()); } // removing hash from the import db diff --git a/source/de/anomic/plasma/plasmaCrawlBalancer.java b/source/de/anomic/plasma/plasmaCrawlBalancer.java index 1aba9b9ba..c94fd37e3 100644 --- a/source/de/anomic/plasma/plasmaCrawlBalancer.java +++ b/source/de/anomic/plasma/plasmaCrawlBalancer.java @@ -43,178 +43,325 @@ package de.anomic.plasma; import java.io.File; import java.io.IOException; +import java.io.UnsupportedEncodingException; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; +import java.util.LinkedList; import java.util.Map; +import java.util.TreeMap; import de.anomic.kelondro.kelondroBase64Order; -import de.anomic.kelondro.kelondroRecords; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroStack; +import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacySeedDB; public class plasmaCrawlBalancer { - private kelondroStack stack; - private HashMap domainStacks; - private HashMap domainAccess; + // a shared domainAccess map for all balancers + private static final Map domainAccess = Collections.synchronizedMap(new HashMap()); + + // definition of payload for fileStack + private static final kelondroRow payload = new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength, kelondroBase64Order.enhancedCoder, 0); + + // class variables + private ArrayList ramStack; // a list that is flused first + private kelondroStack fileStack; // a file with url hashes + private HashMap domainStacks; // a map from domain name part to Lists with url hashs + private HashSet ramIndex; // an index is needed externally, we provide that internally public plasmaCrawlBalancer(File stackFile) { - stack = kelondroStack.open(stackFile, new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength, kelondroBase64Order.enhancedCoder, 0)); + fileStack = kelondroStack.open(stackFile, payload); domainStacks = new HashMap(); - domainAccess = new HashMap(); + ramStack = new ArrayList(); + ramIndex = makeIndex(); } - public void close() { - try { flushSome(domainStacks.size()); } catch (IOException e) {} - try { stack.close(); } catch (IOException e) {} - stack = null; + public synchronized void close() { + ramIndex = null; + while (sizeDomainStacks() > 0) flushOnceDomStacks(true); + try { flushAllRamStack(); } catch (IOException e) {} + try { fileStack.close(); } catch (IOException e) {} + fileStack = null; } - public void reset() { - synchronized (domainStacks) { - stack = kelondroStack.reset(stack); - domainStacks.clear(); + public void finalize() { + if (fileStack != null) close(); + } + + public synchronized void clear() { + fileStack = kelondroStack.reset(fileStack); + domainStacks.clear(); + ramStack.clear(); + ramIndex = new HashSet(); + } + + private HashSet makeIndex() { + HashSet index = new HashSet(); // TODO: replace with kelondroIndex + + // take all elements from the file stack + try { + Iterator i = fileStack.keyIterator(); // iterates byte[] - objects + while (i.hasNext()) index.add(new String((byte[]) i.next(), "UTF-8")); + } catch (UnsupportedEncodingException e) {} + + // take elements from the ram stack + for (int i = 0; i < ramStack.size(); i++) index.add(ramStack.get(i)); + + // take elememts from domain stacks + Iterator i = domainStacks.entrySet().iterator(); + Map.Entry entry; + LinkedList list; + Iterator ii; + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + list = (LinkedList) entry.getValue(); + ii = list.iterator(); + while (ii.hasNext()) index.add(ii.next()); } + + return index; + } + + public boolean has(String urlhash) { + return ramIndex.contains(urlhash); } public Iterator iterator() { - // iterates byte[] - objects - return new KeyIterator(stack.iterator()); + return ramIndex.iterator(); } - public int size() { - return stack.size() + sizeDomainStacks(); + public synchronized int size() { + int componentsize = fileStack.size() + ramStack.size() + sizeDomainStacks(); + //assert componentsize == ramIndex.size() : "componentsize = " + componentsize + ", ramIndex.size() = " + ramIndex.size(); + if (componentsize != ramIndex.size()) { + serverLog.logWarning("PLASMA BALANCER", "size operation wrong - componentsize = " + componentsize + ", ramIndex.size() = " + ramIndex.size()); + } + return componentsize; } private int sizeDomainStacks() { if (domainStacks == null) return 0; int sum = 0; - synchronized (domainStacks) { - Iterator i = domainStacks.values().iterator(); - while (i.hasNext()) sum += ((ArrayList) i.next()).size(); - } + Iterator i = domainStacks.values().iterator(); + while (i.hasNext()) sum += ((LinkedList) i.next()).size(); return sum; } - private void flushOnce() throws IOException { + private void flushOnceDomStacks(boolean ram) { // takes one entry from every domain stack and puts it on the file stack - synchronized (domainStacks) { - Iterator i = domainStacks.entrySet().iterator(); - Map.Entry entry; - ArrayList list; - while (i.hasNext()) { - entry = (Map.Entry) i.next(); - list = (ArrayList) entry.getValue(); - if (list.size() != 0) { - stack.push(stack.row().newEntry(new byte[][]{(byte[]) list.remove(0)})); + if (domainStacks.size() == 0) return; + Iterator i = domainStacks.entrySet().iterator(); + Map.Entry entry; + LinkedList list; + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + list = (LinkedList) entry.getValue(); + if (list.size() != 0) { + if (ram) { + ramStack.add(list.removeFirst()); + } else try { + fileStack.push(fileStack.row().newEntry(new byte[][]{((String) list.removeFirst()).getBytes()})); + } catch (IOException e) { + e.printStackTrace(); } - if (list.size() == 0) i.remove(); } + if (list.size() == 0) i.remove(); } } - private void flushSome(int count) throws IOException { - while ((domainStacks.size() > 0) && (count-- > 0)) flushOnce(); + private void flushAllRamStack() throws IOException { + // this flushes only the ramStack to the fileStack, but does not flush the domainStacks + for (int i = 0; i < ramStack.size() / 2; i++) { + fileStack.push(fileStack.row().newEntry(new byte[][]{((String) ramStack.get(i)).getBytes()})); + fileStack.push(fileStack.row().newEntry(new byte[][]{((String) ramStack.get(ramStack.size() - i - 1)).getBytes()})); + } + if (ramStack.size() % 2 == 1) + fileStack.push(fileStack.row().newEntry(new byte[][]{((String) ramStack.get(ramStack.size() / 2)).getBytes()})); } - public void add(String domain, byte[] hash) throws IOException { - synchronized (domainStacks) { - ArrayList domainList = (ArrayList) domainStacks.get(domain); - if (domainList == null) { - // create new list - domainList = new ArrayList(); - domainList.add(hash); - domainStacks.put(domain, domainList); - } else { - // extend existent domain list - domainList.add(hash); - } + public synchronized void add(String urlhash) throws IOException { + assert urlhash != null; + if (ramIndex.contains(urlhash)) { + serverLog.logWarning("PLASMA BALANCER", "double-check has failed for urlhash " + urlhash + " - fixed"); + return; + } + String dom = urlhash.substring(6); + LinkedList domainList = (LinkedList) domainStacks.get(dom); + if (domainList == null) { + // create new list + domainList = new LinkedList(); + domainList.addLast(urlhash); + domainStacks.put(dom, domainList); + } else { + // extend existent domain list + domainList.add(urlhash); } // check size of domainStacks and flush - if ((domainStacks.size() > 20) || (sizeDomainStacks() > 400)) { - flushOnce(); + if ((domainStacks.size() > 20) || (sizeDomainStacks() > 1000)) { + flushOnceDomStacks(false); } + + // add to index + ramIndex.add(urlhash); } - public String get(long minimumDelta) throws IOException { + public synchronized String get(long minimumDelta, long maximumAge) throws IOException { // returns an url-hash from the stack and ensures minimum delta times - synchronized (domainStacks) { - if ((stack.size() == 0) && (domainStacks.size() > 0)) flushOnce(); - if (stack.size() == 0) return null; + // we have 3 sources to choose from: the ramStack, the domainStacks and the fileStack + + String result = null; // the result + + // 1st: check ramStack + if (ramStack.size() > 0) { + result = (String) ramStack.remove(0); + } + + // 2nd-a: check domainStacks for latest arrivals + if (result == null) { + // we select specific domains that have not been used for a long time + // i.e. 60 seconds. Latest arrivals that have not yet been crawled + // fit also in that scheme + Iterator i = domainStacks.entrySet().iterator(); + Map.Entry entry; + String domhash; + long delta, maxdelta = 0; + String maxhash = null; + LinkedList domlist; + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + domhash = (String) entry.getKey(); + delta = lastAccessDelta(domhash); + if (delta == Integer.MAX_VALUE) { + // a brand new domain - we take it + domlist = (LinkedList) entry.getValue(); + result = (String) domlist.removeFirst(); + if (domlist.size() == 0) i.remove(); + break; + } + if (delta > maxdelta) { + maxdelta = delta; + maxhash = domhash; + } + } + if (maxdelta > maximumAge) { + // success - we found an entry from a domain that has not been used for a long time + domlist = (LinkedList) domainStacks.get(maxhash); + result = (String) domlist.removeFirst(); + if (domlist.size() == 0) domainStacks.remove(maxhash); + } + } + + // 2nd-b: check domainStacks for best match between stack size and retrieval time + if (result == null) { + // we order all domains by the number of entries per domain + // then we iterate through these domains in descending entry order + // and that that one, that has a delta > minimumDelta + Iterator i = domainStacks.entrySet().iterator(); + Map.Entry entry; + String domhash; + LinkedList domlist; + TreeMap hitlist = new TreeMap(); + int count = 0; + // first collect information about sizes of the domain lists + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + domhash = (String) entry.getKey(); + domlist = (LinkedList) entry.getValue(); + hitlist.put(new Integer(domlist.size() * 100 + count++), domhash); + } - String entry = null; - kelondroRow.Entry topentry = stack.top(); - if (topentry == null) return null; - String top = new String(topentry.getColBytes(0)); - - // check if the time after retrieval of last hash from same - // domain is not shorter than the minimumDelta - long delta = lastAccessDelta(top); - if (delta > minimumDelta) { - // the entry from top is fine - entry = new String(stack.pop().getColBytes(0)); - } else { - // try entry from bottom - entry = new String(stack.pot().getColBytes(0)); - delta = lastAccessDelta(entry); + // now iterate in descending order an fetch that one, + // that is acceptable by the minimumDelta constraint + long delta; + String maxhash = null; + while (hitlist.size() > 0) { + domhash = (String) hitlist.remove(hitlist.lastKey()); + if (maxhash == null) maxhash = domhash; // remember first entry + delta = lastAccessDelta(domhash); + if (delta > minimumDelta) { + domlist = (LinkedList) domainStacks.get(domhash); + result = (String) domlist.removeFirst(); + if (domlist.size() == 0) domainStacks.remove(domhash); + break; + } } - if (delta < minimumDelta) { - // force a busy waiting here - // in best case, this should never happen if the balancer works propertly - // this is only to protect against the worst case, where the crawler could - // behave in a DoS-manner - long sleeptime = minimumDelta - delta; - try {synchronized(this) { this.wait(sleeptime); }} catch (InterruptedException e) {} + // if we did yet not choose any entry, we simply take that one with the most entries + if ((result == null) && (maxhash != null)) { + domlist = (LinkedList) domainStacks.get(maxhash); + result = (String) domlist.removeFirst(); + if (domlist.size() == 0) domainStacks.remove(maxhash); } - domainAccess.put(entry.substring(6), new Long(System.currentTimeMillis())); - return entry; - } - } - - private long lastAccessDelta(String urlhash) { - assert urlhash != null; - Long lastAccess = (Long) domainAccess.get(urlhash.substring(6)); - if (lastAccess == null) return Long.MAX_VALUE; // never accessed - return System.currentTimeMillis() - lastAccess.longValue(); - } - - public byte[] top(int dist) throws IOException { - flushSome(1 + dist - stack.size()); // flush only that much as we need to display - synchronized (domainStacks) { - return stack.top(dist).getColBytes(0); } - } - - public void clear() throws IOException { - synchronized (domainStacks) { - domainStacks.clear(); - stack = kelondroStack.reset(stack); - } - } - - public class KeyIterator implements Iterator { - - Iterator ni; - public KeyIterator(Iterator i) { - ni = i; + // 3rd: take entry from file + if ((result == null) && (fileStack.size() > 0)) { + kelondroRow.Entry topentry = fileStack.top(); + if (topentry != null) { + String top = new String(topentry.getColBytes(0)); + + // check if the time after retrieval of last hash from same + // domain is not shorter than the minimumDelta + long delta = lastAccessDelta(top); + if (delta > minimumDelta) { + // the entry from top is fine + result = new String(fileStack.pop().getColBytes(0)); + } else { + // try entry from bottom + result = new String(fileStack.pot().getColBytes(0)); + delta = lastAccessDelta(result); + } + } } - public boolean hasNext() { - return ni.hasNext(); + // check case where we did not found anything + if (result == null) { + serverLog.logSevere("PLASMA BALANCER", "get() was not able to find a valid urlhash - total size = " + size() + ", fileStack.size() = " + fileStack.size() + ", ramStack.size() = " + ramStack.size() + ", domainStacks.size() = " + domainStacks.size()); + return null; } - public Object next() { - return ((kelondroRecords.Node) ni.next()).getKey(); + // finally: check minimumDelta and if necessary force a sleep + long delta = lastAccessDelta(result); + if (delta < minimumDelta) { + // force a busy waiting here + // in best case, this should never happen if the balancer works propertly + // this is only to protect against the worst case, where the crawler could + // behave in a DoS-manner + long sleeptime = minimumDelta - delta; + try {synchronized(this) { this.wait(sleeptime); }} catch (InterruptedException e) {} } - public void remove() { + // update statistical data + domainAccess.put(result.substring(6), new Long(System.currentTimeMillis())); + ramIndex.remove(result); + return result; + } + + private long lastAccessDelta(String hash) { + assert hash != null; + Long lastAccess = (Long) domainAccess.get((hash.length() > 6) ? hash.substring(6) : hash); + if (lastAccess == null) return Long.MAX_VALUE; // never accessed + return System.currentTimeMillis() - lastAccess.longValue(); + } + + public synchronized String top(int dist) { + int availableInRam = ramStack.size() + sizeDomainStacks(); + if ((availableInRam < dist) && (fileStack.size() > (dist - availableInRam))) { + // flush some entries from disc to domain stacks + try { + for (int i = 0; i < (dist - availableInRam); i++) { + ramStack.add(new String(fileStack.pop().getColBytes(0))); + } + } catch (IOException e) {} } - + while ((sizeDomainStacks() > 0) && (ramStack.size() <= dist)) flushOnceDomStacks(true); // flush only that much as we need to display + if (dist >= ramStack.size()) return null; + return (String) ramStack.get(dist); } } diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java index 29bbcd142..c58223ff7 100644 --- a/source/de/anomic/plasma/plasmaCrawlNURL.java +++ b/source/de/anomic/plasma/plasmaCrawlNURL.java @@ -76,7 +76,9 @@ public class plasmaCrawlNURL { public static final int STACK_TYPE_MOVIE = 12; // put on movie stack public static final int STACK_TYPE_MUSIC = 13; // put on music stack - private static final long minimumDelta = 500; // the minimum time difference between access of the same domain + private static final long minimumDelta = 500; // the minimum time difference between access of the same domain + private static final long maximumDomAge = 60000; // the maximum age of a domain until it is used for another crawl attempt + /** * column length definition for the {@link plasmaURL#urlIndexFile} DB */ @@ -105,7 +107,7 @@ public class plasmaCrawlNURL { private kelondroStack movieStack; // links pointing to movie resources private kelondroStack musicStack; // links pointing to music resources - private final HashSet stackIndex; // to find out if a specific link is already on any stack + private final HashSet imageStackIndex, movieStackIndex, musicStackIndex; // to find out if a specific link is already on any stack private File cacheStacksPath; private int bufferkb; private long preloadTime; @@ -141,7 +143,9 @@ public class plasmaCrawlNURL { musicStack = kelondroStack.open(musicStackFile, rowdef); // init stack Index - stackIndex = new HashSet(); + imageStackIndex = new HashSet(); + movieStackIndex = new HashSet(); + musicStackIndex = new HashSet(); (initThead = new initStackIndex()).start(); } @@ -234,45 +238,21 @@ public class plasmaCrawlNURL { public class initStackIndex extends Thread { public void run() { Iterator i; - try { - i = coreStack.iterator(); - while (i.hasNext()) stackIndex.add(new String((byte[]) i.next(), "UTF-8")); - } catch (Exception e) { - coreStack.reset(); - } - try { - i = limitStack.iterator(); - while (i.hasNext()) stackIndex.add(new String((byte[]) i.next(), "UTF-8")); - } catch (Exception e) { - limitStack.reset(); - } - try { - i = overhangStack.iterator(); - while (i.hasNext()) stackIndex.add(new String((byte[]) i.next(), "UTF-8")); - } catch (Exception e) { - overhangStack.reset(); - } - try { - i = remoteStack.iterator(); - while (i.hasNext()) stackIndex.add(new String((byte[]) i.next(), "UTF-8")); - } catch (Exception e) { - remoteStack.reset(); - } try { i = imageStack.iterator(); - while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey(), "UTF-8")); + while (i.hasNext()) imageStackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey(), "UTF-8")); } catch (Exception e) { imageStack = kelondroStack.reset(imageStack); } try { i = movieStack.iterator(); - while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey(), "UTF-8")); + while (i.hasNext()) movieStackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey(), "UTF-8")); } catch (Exception e) { movieStack = kelondroStack.reset(movieStack); } try { i = musicStack.iterator(); - while (i.hasNext()) stackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey(), "UTF-8")); + while (i.hasNext()) musicStackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey(), "UTF-8")); } catch (Exception e) { musicStack = kelondroStack.reset(musicStack); } @@ -315,7 +295,14 @@ public class plasmaCrawlNURL { } public boolean existsInStack(String urlhash) { - return stackIndex.contains(urlhash); + return + coreStack.has(urlhash) || + limitStack.has(urlhash) || + overhangStack.has(urlhash) || + remoteStack.has(urlhash) || + imageStackIndex.contains(urlhash) || + movieStackIndex.contains(urlhash) || + musicStackIndex.contains(urlhash); } public synchronized Entry newEntry(String initiator, URL url, Date loaddate, @@ -340,19 +327,35 @@ public class plasmaCrawlNURL { ); } - public void push(int stackType, String domain, String hash) { + public void push(int stackType, String urlhash) { try { switch (stackType) { - case STACK_TYPE_CORE: coreStack.add(domain, hash.getBytes()); break; - case STACK_TYPE_LIMIT: limitStack.add(domain, hash.getBytes()); break; - case STACK_TYPE_OVERHANG: overhangStack.add(domain, hash.getBytes()); break; - case STACK_TYPE_REMOTE: remoteStack.add(domain, hash.getBytes()); break; - case STACK_TYPE_IMAGE: imageStack.push(imageStack.row().newEntry(new byte[][] {hash.getBytes()})); break; - case STACK_TYPE_MOVIE: movieStack.push(movieStack.row().newEntry(new byte[][] {hash.getBytes()})); break; - case STACK_TYPE_MUSIC: musicStack.push(musicStack.row().newEntry(new byte[][] {hash.getBytes()})); break; + case STACK_TYPE_CORE: + coreStack.add(urlhash); + break; + case STACK_TYPE_LIMIT: + limitStack.add(urlhash); + break; + case STACK_TYPE_OVERHANG: + overhangStack.add(urlhash); + break; + case STACK_TYPE_REMOTE: + remoteStack.add(urlhash); + break; + case STACK_TYPE_IMAGE: + imageStack.push(imageStack.row().newEntry(new byte[][] {urlhash.getBytes()})); + imageStackIndex.add(urlhash); + break; + case STACK_TYPE_MOVIE: + movieStack.push(movieStack.row().newEntry(new byte[][] {urlhash.getBytes()})); + movieStackIndex.add(urlhash); + break; + case STACK_TYPE_MUSIC: + musicStack.push(musicStack.row().newEntry(new byte[][] {urlhash.getBytes()})); + musicStackIndex.add(urlhash); + break; default: break; } - stackIndex.add(hash); } catch (IOException er) {} } @@ -370,14 +373,15 @@ public class plasmaCrawlNURL { } public Iterator iterator(int stackType) { + // returns an iterator of String objects switch (stackType) { case STACK_TYPE_CORE: return coreStack.iterator(); case STACK_TYPE_LIMIT: return limitStack.iterator(); case STACK_TYPE_OVERHANG: return overhangStack.iterator(); case STACK_TYPE_REMOTE: return remoteStack.iterator(); - case STACK_TYPE_IMAGE: return imageStack.iterator(); - case STACK_TYPE_MOVIE: return movieStack.iterator(); - case STACK_TYPE_MUSIC: return musicStack.iterator(); + case STACK_TYPE_IMAGE: return imageStackIndex.iterator(); + case STACK_TYPE_MOVIE: return movieStackIndex.iterator(); + case STACK_TYPE_MUSIC: return musicStackIndex.iterator(); default: return null; } } @@ -398,15 +402,14 @@ public class plasmaCrawlNURL { public void shift(int fromStack, int toStack) { try { Entry entry = pop(fromStack); - push(toStack, entry.url.getHost(), entry.hash()); + push(toStack, entry.hash()); } catch (IOException e) { return; } } public void clear(int stackType) { - try { - switch (stackType) { + switch (stackType) { case STACK_TYPE_CORE: coreStack.clear(); break; case STACK_TYPE_LIMIT: limitStack.clear(); break; case STACK_TYPE_OVERHANG: overhangStack.clear(); break; @@ -416,14 +419,15 @@ public class plasmaCrawlNURL { case STACK_TYPE_MUSIC: musicStack = kelondroStack.reset(musicStack); break; default: return; } - } catch (IOException e) {} } private Entry pop(kelondroStack stack) throws IOException { // this is a filo - pop if (stack.size() > 0) { Entry e = new Entry(new String(stack.pop().getColBytes(0))); - stackIndex.remove(e.hash); + imageStackIndex.remove(e.hash); + movieStackIndex.remove(e.hash); + musicStackIndex.remove(e.hash); return e; } else { throw new IOException("crawl stack is empty"); @@ -433,10 +437,12 @@ public class plasmaCrawlNURL { private Entry pop(plasmaCrawlBalancer balancer) throws IOException { // this is a filo - pop if (balancer.size() > 0) { - String hash = balancer.get(minimumDelta); + String hash = balancer.get(minimumDelta, maximumDomAge); if (hash == null) throw new IOException("hash is null"); Entry e = new Entry(hash); - stackIndex.remove(e.hash); + imageStackIndex.remove(e.hash); + movieStackIndex.remove(e.hash); + musicStackIndex.remove(e.hash); return e; } else { throw new IOException("balancer stack is empty"); @@ -462,15 +468,16 @@ public class plasmaCrawlNURL { // this is a filo - top if (count > balancer.size()) count = balancer.size(); ArrayList list = new ArrayList(count); - for (int i = 0; i < count; i++) { - try { - byte[] hash = balancer.top(i); - list.add(new Entry(new String(hash))); - } catch (IOException e) { - continue; - } + for (int i = 0; i < count; i++) { + try { + String urlhash = balancer.top(i); + if (urlhash == null) break; + list.add(new Entry(urlhash)); + } catch (IOException e) { + break; } - return (Entry[])list.toArray(new Entry[list.size()]); + } + return (Entry[])list.toArray(new Entry[list.size()]); } public synchronized Entry getEntry(String hash) throws IOException { diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index f6fabcfdb..0b48dc758 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -452,7 +452,6 @@ public final class plasmaCrawlStacker { this.sb.noticeURL.push( ((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT : ((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/, - nexturl.getHost(), ne.hash()); return null; } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 2d1eb9700..7e2d76eca 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1890,8 +1890,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser processLocalCrawling(urlEntry, profile, stats); return true; } catch (IOException e) { - log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage()); - noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_CORE); + log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e); + if (e.getMessage().indexOf("hash is null") > 0) noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_CORE); } } return true; @@ -1975,8 +1975,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser return true; } catch (IOException e) { - log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage()); - noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_LIMIT); + log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e); + if (e.getMessage().indexOf("hash is null") > 0) noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_LIMIT); return true; // if we return a false here we will block everything } } @@ -2041,8 +2041,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser processLocalCrawling(urlEntry, profile, stats); return true; } catch (IOException e) { - log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage()); - noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_REMOTE); + log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e); + if (e.getMessage().indexOf("hash is null") > 0) noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_REMOTE); return true; } }