From f27f9ecf159a28cb8676ea484ef895c967dd10b6 Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 12 Dec 2005 14:11:59 +0000 Subject: [PATCH] * activated write buffer for databases. This should increase IO performance and reduce HD activity * bugfixes for new exception-on-failure policy * bugfixes for new IOChunks * new Object pool for database write-buffer git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1204 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../kelondro/kelondroAbstractIOChunks.java | 11 +- .../anomic/kelondro/kelondroAbstractRA.java | 11 +- .../kelondro/kelondroBufferedIOChunks.java | 61 ++++--- .../anomic/kelondro/kelondroObjectSpace.java | 80 +++++++++ .../de/anomic/kelondro/kelondroRecords.java | 14 +- .../de/anomic/plasma/plasmaCrawlBalancer.java | 6 +- source/de/anomic/plasma/plasmaCrawlNURL.java | 97 +++++----- .../de/anomic/plasma/plasmaSwitchboard.java | 167 ++++++++---------- 8 files changed, 273 insertions(+), 174 deletions(-) create mode 100644 source/de/anomic/kelondro/kelondroObjectSpace.java diff --git a/source/de/anomic/kelondro/kelondroAbstractIOChunks.java b/source/de/anomic/kelondro/kelondroAbstractIOChunks.java index 0571aad3f..ee5820d86 100644 --- a/source/de/anomic/kelondro/kelondroAbstractIOChunks.java +++ b/source/de/anomic/kelondro/kelondroAbstractIOChunks.java @@ -59,11 +59,14 @@ public abstract class kelondroAbstractIOChunks { // derived methods: public void readFully(long pos, byte[] b, int off, int len) throws IOException { - final int r = read(pos, b, off, len); - if (r < 0) return; // read exceeded EOF - if (r < len) { + if (len < 0) throw new IndexOutOfBoundsException("length is negative:" + len); + if (b.length < off + len) throw new IndexOutOfBoundsException("bounds do not fit: b.length=" + b.length + ", off=" + off + ", len=" + len); + while (len > 0) { + int r = read(pos, b, off, len); + if (r < 0) throw new IOException("EOF"); // read exceeded EOF pos += r; - readFully(pos, b, off + r, len - r); + off += r; + len -= r; } } diff --git a/source/de/anomic/kelondro/kelondroAbstractRA.java b/source/de/anomic/kelondro/kelondroAbstractRA.java index 5f4e045fc..e48a76d84 100644 --- a/source/de/anomic/kelondro/kelondroAbstractRA.java +++ b/source/de/anomic/kelondro/kelondroAbstractRA.java @@ -71,9 +71,14 @@ abstract class kelondroAbstractRA implements kelondroRA { // derived methods: public void readFully(byte[] b, int off, int len) throws IOException { - final int r = read(b, off, len); - if (r < 0) return; // read exceeded EOF - if (r < len) { readFully(b, off + r, len - r); } + if (len < 0) throw new IndexOutOfBoundsException("length is negative:" + len); + if (b.length < off + len) throw new IndexOutOfBoundsException("bounds do not fit: b.length=" + b.length + ", off=" + off + ", len=" + len); + while (len > 0) { + int r = read(b, off, len); + if (r < 0) throw new IOException("EOF"); // read exceeded EOF + off += r; + len -= r; + } } public byte readByte() throws IOException { diff --git a/source/de/anomic/kelondro/kelondroBufferedIOChunks.java b/source/de/anomic/kelondro/kelondroBufferedIOChunks.java index 4cf587449..2acfa2184 100644 --- a/source/de/anomic/kelondro/kelondroBufferedIOChunks.java +++ b/source/de/anomic/kelondro/kelondroBufferedIOChunks.java @@ -52,16 +52,19 @@ import java.util.Map; public final class kelondroBufferedIOChunks extends kelondroAbstractIOChunks implements kelondroIOChunks { protected kelondroRA ra; - private int bufferkb; + private int bufferMaxSize, bufferCurrSize; private long commitTimeout; private HashMap buffer; private long lastCommit = 0; + private static final int overhead = 40; + public kelondroBufferedIOChunks(kelondroRA ra, String name, int bufferkb, long commitTimeout) { this.name = name; this.ra = ra; - this.bufferkb = bufferkb; + this.bufferMaxSize = 1024 * bufferkb; + this.bufferCurrSize = 0; this.commitTimeout = commitTimeout; this.buffer = new HashMap(); this.lastCommit = System.currentTimeMillis(); @@ -71,35 +74,33 @@ public final class kelondroBufferedIOChunks extends kelondroAbstractIOChunks imp assert (b.length >= off + len): "read pos=" + pos + ", b.length=" + b.length + ", off=" + off + ", len=" + len; // check commit time - if (this.lastCommit + this.commitTimeout > System.currentTimeMillis()) { + if ((bufferCurrSize > bufferMaxSize) || + (this.lastCommit + this.commitTimeout < System.currentTimeMillis())) { commit(); this.lastCommit = System.currentTimeMillis(); } - + // do the read - if ((off == 0) && (b.length == len)) { - synchronized (this.buffer) { - byte[] bb = (byte[]) buffer.get(new Long(pos)); - if (bb == null) { - synchronized (this.ra) { - this.ra.seek(pos); - return ra.read(b, off, len); - } + synchronized (this.buffer) { + byte[] bb = (byte[]) buffer.get(new Long(pos)); + if (bb == null) { + // entry not known, read direktly from IO + synchronized (this.ra) { + this.ra.seek(pos + off); + return ra.read(b, off, len); + } + } else { + // use buffered entry + if (bb.length >= off + len) { + // the bufferd entry is long enough + System.arraycopy(bb, off, b, off, len); + return len; } else { - if (bb.length >= len) { - System.arraycopy(bb, 0, b, off, len); - return len; - } else { - System.arraycopy(bb, 0, b, off, bb.length); - return bb.length; - } + // the entry is not long enough. transmit only a part + System.arraycopy(bb, off, b, off, bb.length - off); + return bb.length - off; } } - } else { - byte[] bb = new byte[len]; - int r = read(pos + off, bb, 0, len); - System.arraycopy(bb, 0, b, off, r); - return r; } } @@ -107,14 +108,16 @@ public final class kelondroBufferedIOChunks extends kelondroAbstractIOChunks imp assert (b.length >= off + len): "write pos=" + pos + ", b.length=" + b.length + ", b='" + new String(b) + "', off=" + off + ", len=" + len; // do the write into buffer - byte[] bb = new byte[len]; + byte[] bb = kelondroObjectSpace.alloc(len); System.arraycopy(b, off, bb, 0, len); synchronized (buffer) { - buffer.put(new Long(pos), bb); + buffer.put(new Long(pos + off), bb); + bufferCurrSize += overhead + pos + off; } // check commit time - if (this.lastCommit + this.commitTimeout > System.currentTimeMillis()) { + if ((bufferCurrSize > bufferMaxSize) || + (this.lastCommit + this.commitTimeout < System.currentTimeMillis())) { commit(); this.lastCommit = System.currentTimeMillis(); } @@ -134,9 +137,11 @@ public final class kelondroBufferedIOChunks extends kelondroAbstractIOChunks imp b = (byte[]) entry.getValue(); this.ra.seek(pos); this.ra.write(b); + kelondroObjectSpace.recycle(b); } } buffer.clear(); + bufferCurrSize = 0; } } @@ -152,5 +157,5 @@ public final class kelondroBufferedIOChunks extends kelondroAbstractIOChunks imp if (this.ra != null) this.close(); super.finalize(); } - + } diff --git a/source/de/anomic/kelondro/kelondroObjectSpace.java b/source/de/anomic/kelondro/kelondroObjectSpace.java new file mode 100644 index 000000000..756e0a61e --- /dev/null +++ b/source/de/anomic/kelondro/kelondroObjectSpace.java @@ -0,0 +1,80 @@ +// kelondroObjectSpace.java +// ------------------------ +// part of The Kelondro Database +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2005 +// created: 12.12.2004 +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. + +package de.anomic.kelondro; + +import java.util.ArrayList; +import java.util.HashMap; + +public class kelondroObjectSpace { + + private static final int minSize = 10; + private static HashMap objects = new HashMap(); + + public static byte[] alloc(int len) { + if (len < minSize) return new byte[len]; + synchronized (objects) { + ArrayList buf = (ArrayList) objects.get(new Integer(len)); + if ((buf == null) || (buf.size() == 0)) return new byte[len]; + return (byte[]) buf.remove(buf.size() - 1); + } + } + + public static void recycle(byte[] b) { + if (b.length < minSize) { + b = null; + return; + } + synchronized (objects) { + final Integer i = new Integer(b.length); + ArrayList buf = (ArrayList) objects.get(i); + if (buf == null) { + buf = new ArrayList(); + buf.add(b); + objects.put(i, buf); + } else { + buf.add(b); + } + } + b = null; + } + +} diff --git a/source/de/anomic/kelondro/kelondroRecords.java b/source/de/anomic/kelondro/kelondroRecords.java index 77a513c32..a2c0a0f7a 100644 --- a/source/de/anomic/kelondro/kelondroRecords.java +++ b/source/de/anomic/kelondro/kelondroRecords.java @@ -72,6 +72,7 @@ import java.io.File; import java.io.IOException; import java.util.HashMap; import java.util.NoSuchElementException; +import java.util.Random; import java.util.StringTokenizer; import java.util.Map; import java.util.Iterator; @@ -151,6 +152,9 @@ public class kelondroRecords { // optional logger protected Logger theLogger = null; + + // Random. This is used to shift flush-times of write-buffers to differrent time + private static Random random = new Random(System.currentTimeMillis()); private class usageControl { private int USEDC; // counter of used elements @@ -219,8 +223,8 @@ public class kelondroRecords { int[] columns, int FHandles, int txtProps, int txtPropWidth) throws IOException { // create new Chunked IO - //this.entryFile = new kelondroBufferedIOChunks(ra, ra.name(), 1024, 5000); - this.entryFile = new kelondroRAIOChunks(ra, ra.name()); + this.entryFile = new kelondroBufferedIOChunks(ra, ra.name(), 1024, 8000 + random.nextLong() % 2000); + //this.entryFile = new kelondroRAIOChunks(ra, ra.name()); // store dynamic run-time data this.overhead = ohbytec + 4 * ohhandlec; @@ -276,6 +280,8 @@ public class kelondroRecords { for (int i = 0; i < this.TXTPROPS.length; i++) { entryFile.write(POS_TXTPROPS + TXTPROPW * i, ea); } + + this.entryFile.commit(); } public void setLogger(Logger newLogger) { @@ -320,8 +326,8 @@ public class kelondroRecords { private void init(kelondroRA ra) throws IOException { // read from Chunked IO - //this.entryFile = new kelondroBufferedIOChunks(ra, ra.name(), 1024, 5000); - this.entryFile = new kelondroRAIOChunks(ra, ra.name()); + this.entryFile = new kelondroBufferedIOChunks(ra, ra.name(), 1024, 8000 + random.nextLong() % 2000); + //this.entryFile = new kelondroRAIOChunks(ra, ra.name()); // read dynamic variables that are back-ups of stored values in file; // read/defined on instantiation diff --git a/source/de/anomic/plasma/plasmaCrawlBalancer.java b/source/de/anomic/plasma/plasmaCrawlBalancer.java index 012032f6c..463bb9e13 100644 --- a/source/de/anomic/plasma/plasmaCrawlBalancer.java +++ b/source/de/anomic/plasma/plasmaCrawlBalancer.java @@ -58,7 +58,11 @@ public class plasmaCrawlBalancer { public plasmaCrawlBalancer(File stackFile, long buffersize) throws IOException { if (stackFile.exists()) - stack = new kelondroStack(stackFile, buffersize); + try { + stack = new kelondroStack(stackFile, buffersize); + } catch (IOException e) { + stack = new kelondroStack(stackFile, buffersize, new int[] {plasmaURL.urlHashLength}); + } else stack = new kelondroStack(stackFile, buffersize, new int[] {plasmaURL.urlHashLength}); domainStacks = new HashMap(); diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java index 108e7dd9a..5122934c7 100644 --- a/source/de/anomic/plasma/plasmaCrawlNURL.java +++ b/source/de/anomic/plasma/plasmaCrawlNURL.java @@ -120,9 +120,27 @@ public class plasmaCrawlNURL extends plasmaURL { limitStack = new plasmaCrawlBalancer(limitStackFile, 0); overhangStack = new plasmaCrawlBalancer(overhangStackFile, 0); remoteStack = new plasmaCrawlBalancer(remoteStackFile, 0); - if (imageStackFile.exists()) imageStack = new kelondroStack(imageStackFile, 0); else imageStack = new kelondroStack(imageStackFile, 0, new int[] {plasmaURL.urlHashLength}); - if (movieStackFile.exists()) movieStack = new kelondroStack(movieStackFile, 0); else movieStack = new kelondroStack(movieStackFile, 0, new int[] {plasmaURL.urlHashLength}); - if (musicStackFile.exists()) musicStack = new kelondroStack(musicStackFile, 0); else musicStack = new kelondroStack(musicStackFile, 0, new int[] {plasmaURL.urlHashLength}); + if (imageStackFile.exists()) try { + imageStack = new kelondroStack(imageStackFile, 0); + } catch (IOException e) { + imageStack = new kelondroStack(imageStackFile, 0, new int[] {plasmaURL.urlHashLength}); + } else { + imageStack = new kelondroStack(imageStackFile, 0, new int[] {plasmaURL.urlHashLength}); + } + if (movieStackFile.exists()) try { + movieStack = new kelondroStack(movieStackFile, 0); + } catch (IOException e) { + movieStack = new kelondroStack(movieStackFile, 0, new int[] {plasmaURL.urlHashLength}); + } else { + movieStack = new kelondroStack(movieStackFile, 0, new int[] {plasmaURL.urlHashLength}); + } + if (musicStackFile.exists()) try { + musicStack = new kelondroStack(musicStackFile, 0); + } catch (IOException e) { + musicStack = new kelondroStack(musicStackFile, 0, new int[] {plasmaURL.urlHashLength}); + } else { + musicStack = new kelondroStack(musicStackFile, 0, new int[] {plasmaURL.urlHashLength}); + } // init stack Index stackIndex = new HashSet(); @@ -267,7 +285,7 @@ public class plasmaCrawlNURL extends plasmaURL { } } - public Entry pop(int stackType) { + public Entry pop(int stackType) throws IOException { switch (stackType) { case STACK_TYPE_CORE: return pop(coreStack); case STACK_TYPE_LIMIT: return pop(limitStack); @@ -281,9 +299,12 @@ public class plasmaCrawlNURL extends plasmaURL { } public void shift(int fromStack, int toStack) { - Entry entry = pop(fromStack); - if (entry.url() == null) return; - push(toStack, entry.url.getHost(), entry.hash()); + try { + Entry entry = pop(fromStack); + push(toStack, entry.url.getHost(), entry.hash()); + } catch (IOException e) { + return; + } } public void clear(int stackType) { @@ -301,33 +322,25 @@ public class plasmaCrawlNURL extends plasmaURL { } catch (IOException e) {} } - private Entry pop(kelondroStack stack) { + private Entry pop(kelondroStack stack) throws IOException { // this is a filo - pop - try { - if (stack.size() > 0) { - Entry e = new Entry(new String(stack.pop()[0])); - stackIndex.remove(e.hash); - return e; - } else { - return null; - } - } catch (IOException e) { - return null; + if (stack.size() > 0) { + Entry e = new Entry(new String(stack.pop()[0])); + stackIndex.remove(e.hash); + return e; + } else { + throw new IOException("crawl stack is empty"); } } - private Entry pop(plasmaCrawlBalancer balancer) { + private Entry pop(plasmaCrawlBalancer balancer) throws IOException { // this is a filo - pop - try { - if (balancer.size() > 0) { - Entry e = new Entry(new String((byte[]) balancer.get()[1])); - stackIndex.remove(e.hash); - return e; - } else { - return null; - } - } catch (IOException e) { - return null; + if (balancer.size() > 0) { + Entry e = new Entry(new String((byte[]) balancer.get()[1])); + stackIndex.remove(e.hash); + return e; + } else { + throw new IOException("balancer stack is empty"); } } @@ -335,32 +348,30 @@ public class plasmaCrawlNURL extends plasmaURL { // this is a filo - top if (count > stack.size()) count = stack.size(); ArrayList list = new ArrayList(count); - try { - for (int i = 0; i < count; i++) { + for (int i = 0; i < count; i++) { + try { byte[] hash = stack.top(i)[0]; - if (hash == null) continue; list.add(new Entry(new String(hash))); + } catch (IOException e) { + continue; } - return (Entry[])list.toArray(new Entry[list.size()]); - } catch (IOException e) { - return null; } + return (Entry[]) list.toArray(new Entry[list.size()]); } private Entry[] top(plasmaCrawlBalancer balancer, int count) { // this is a filo - top if (count > balancer.size()) count = balancer.size(); ArrayList list = new ArrayList(count); - try { for (int i = 0; i < count; i++) { - byte[] hash = balancer.top(i); - if (hash == null) continue; - list.add(new Entry(new String(hash))); + try { + byte[] hash = balancer.top(i); + list.add(new Entry(new String(hash))); + } catch (IOException e) { + continue; + } } return (Entry[])list.toArray(new Entry[list.size()]); - } catch (IOException e) { - return null; - } } public synchronized Entry getEntry(String hash) throws IOException { @@ -460,7 +471,7 @@ public class plasmaCrawlNURL extends plasmaURL { //} } else { // show that we found nothing - throw new IOException("hash not found"); + throw new IOException("NURL: hash " + hash + " not found"); //this.url = null; } } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 0a91db140..6fbb6b4b6 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -925,7 +925,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } public boolean coreCrawlJob() { - try { if (urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) == 0) { //log.logDebug("CoreCrawl: queue is empty"); return false; @@ -958,46 +957,31 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // do a local crawl - String stats = null; - boolean validEntry = false; + String stats = "LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; plasmaCrawlNURL.Entry urlEntry = null; - do { + try { urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE); - stats = "LOCALCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; - - // if the queue is empty - if (urlEntry == null) return false; - - // if we have received a new entry - if ((urlEntry.url() == null) || (urlEntry.url().toString().length() < 10)) { - log.logInfo(stats + ": URL with hash " + ((urlEntry.hash()==null)?"Unknown":urlEntry.hash()) + " already removed from queue."); - validEntry = false; - } else { - validEntry = true; + String profileHandle = urlEntry.profileHandle(); + // System.out.println("DEBUG plasmaSwitchboard.processCrawling: + // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url()); + if (profileHandle == null) { + log.logSevere(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); + return true; } - } while(!validEntry); - - String profileHandle = urlEntry.profileHandle(); - //System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url()); - if (profileHandle == null) { - log.logSevere(stats + ": NULL PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); + plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle); + if (profile == null) { + log.logSevere(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); + return true; + } + log.logFine("LOCALCRAWL: URL=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + profile.generalFilter() + + ", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false"))); + + processLocalCrawling(urlEntry, profile, stats); return true; - } - plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle); - if (profile == null) { - log.logSevere(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); + } catch (IOException e) { + log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage()); return true; } - log.logFine("LOCALCRAWL: URL=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + - ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + profile.generalFilter() + - ", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false"))); - - processLocalCrawling(urlEntry, profile, stats); - return true; - } catch (Exception e) { - e.printStackTrace(); - return false; - } } public int limitCrawlTriggerJobSize() { @@ -1049,37 +1033,37 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // start a global crawl, if possible - plasmaCrawlNURL.Entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT); - String stats = "REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; - if (urlEntry.url() == null) { - log.logSevere(stats + ": urlEntry.url() == null"); - return true; - } - String profileHandle = urlEntry.profileHandle(); - //System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url()); - plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle); - if (profile == null) { - log.logSevere(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); + String stats = "REMOTECRAWLTRIGGER[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; + try { + plasmaCrawlNURL.Entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT); + String profileHandle = urlEntry.profileHandle(); + // System.out.println("DEBUG plasmaSwitchboard.processCrawling: + // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url()); + plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle); + if (profile == null) { + log.logSevere(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); + return true; + } + log.logFine("plasmaSwitchboard.limitCrawlTriggerJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + + profile.generalFilter() + ", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false"))); + + boolean tryRemote = ((urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) != 0) || (sbQueue.size() != 0)) && + (profile.remoteIndexing()) && + (urlEntry.initiator() != null) && + (!(urlEntry.initiator().equals(plasmaURL.dummyHash))) && + ((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())); + if (tryRemote) { + boolean success = processRemoteCrawlTrigger(urlEntry); + if (success) return true; + } + + processLocalCrawling(urlEntry, profile, stats); return true; + } catch (IOException e) { + log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage()); + return true; // if we return a false here we will block everything } - log.logFine("plasmaSwitchboard.limitCrawlTriggerJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + - ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + profile.generalFilter() + - ", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false"))); - - boolean tryRemote = - ((urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) != 0) || (sbQueue.size() != 0)) /* should do ourself */ && - (profile.remoteIndexing()) /* granted */ && - (urlEntry.initiator() != null) && (!(urlEntry.initiator().equals(plasmaURL.dummyHash))) /* not proxy */ && - ((yacyCore.seedDB.mySeed.isSenior()) || - (yacyCore.seedDB.mySeed.isPrincipal())) /* qualified */; - - if (tryRemote) { - boolean success = processRemoteCrawlTrigger(urlEntry); - if (success) return true; - } - - processLocalCrawling(urlEntry, profile, stats); - return true; } public int remoteTriggeredCrawlJobSize() { @@ -1111,26 +1095,29 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // we don't want to crawl a global URL globally, since WE are the global part. (from this point of view) - plasmaCrawlNURL.Entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE); - String stats = "REMOTETRIGGEREDCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; - if (urlEntry.url() == null) { - log.logSevere(stats + ": urlEntry.url() == null"); - return false; - } - String profileHandle = urlEntry.profileHandle(); - //System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url()); - plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle); - - if (profile == null) { - log.logSevere(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); - return false; + String stats = "REMOTETRIGGEREDCRAWL[" + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; + try { + plasmaCrawlNURL.Entry urlEntry = urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE); + String profileHandle = urlEntry.profileHandle(); + // System.out.println("DEBUG plasmaSwitchboard.processCrawling: + // profileHandle = " + profileHandle + ", urlEntry.url = " + + // urlEntry.url()); + plasmaCrawlProfile.entry profile = profiles.getEntry(profileHandle); + + if (profile == null) { + log.logSevere(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' (must be internal error) for URL " + urlEntry.url()); + return false; + } + log.logFine("plasmaSwitchboard.remoteTriggeredCrawlJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + + profile.generalFilter() + ", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false"))); + + processLocalCrawling(urlEntry, profile, stats); + return true; + } catch (IOException e) { + log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage()); + return true; } - log.logFine("plasmaSwitchboard.remoteTriggeredCrawlJob: url=" + urlEntry.url() + ", initiator=" + urlEntry.initiator() + - ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.generalDepth() + ", filter=" + profile.generalFilter() + - ", permission=" + ((yacyCore.seedDB == null) ? "undefined" : (((yacyCore.seedDB.mySeed.isSenior()) || (yacyCore.seedDB.mySeed.isPrincipal())) ? "true" : "false"))); - - processLocalCrawling(urlEntry, profile, stats); - return true; } private void processResourceStack(plasmaSwitchboardQueue.Entry entry) { @@ -1219,14 +1206,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser nexturlstring = htmlFilterContentScraper.urlNormalform(null, nexturlstring); sbStackCrawlThread.enqueue(nexturlstring, entry.url().toString(), initiatorHash, (String) e.getValue(), docDate, entry.depth() + 1, entry.profile()); - - // rejectReason = stackCrawl(nexturlstring, entry.normalizedURLString(), initiatorHash, (String) e.getValue(), loadDate, entry.depth() + 1, entry.profile()); - // if (rejectReason == null) { - // c++; - // } else { - // urlPool.errorURL.newEntry(new URL(nexturlstring), entry.normalizedURLString(), entry.initiator(), yacyCore.seedDB.mySeed.hash, - // (String) e.getValue(), rejectReason, new bitfield(plasmaURL.urlFlagLength), false); - // } + + // rejectReason = stackCrawl(nexturlstring, entry.normalizedURLString(), initiatorHash, (String) e.getValue(), loadDate, entry.depth() + 1, entry.profile()); + // if (rejectReason == null) { c++; } else { + // urlPool.errorURL.newEntry(new URL(nexturlstring), entry.normalizedURLString(), entry.initiator(), yacyCore.seedDB.mySeed.hash, + // (String) e.getValue(), rejectReason, new bitfield(plasmaURL.urlFlagLength), false); + // } } log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.normalizedURLString() + ", NEW CRAWL STACK SIZE IS " + urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE));