From 5f72be2a951f6fa045b6db4bde8787102eef5168 Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 24 Jul 2006 15:25:47 +0000 Subject: [PATCH] some redesign of EURL storage * store() is now called explicitly * more urls are written to the EURL table * the EURL stack does not store the complete entry any more, now only the URL hash git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2323 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexCreateIndexingQueue_p.java | 32 +++++++------ htroot/IndexCreate_p.java | 13 ++++-- htroot/yacy/crawlReceipt.java | 5 +- source/de/anomic/plasma/plasmaCrawlEURL.java | 46 ++++++++----------- source/de/anomic/plasma/plasmaCrawlLURL.java | 2 + source/de/anomic/plasma/plasmaCrawlNURL.java | 37 ++++++++------- .../de/anomic/plasma/plasmaCrawlStacker.java | 12 +++-- .../de/anomic/plasma/plasmaCrawlWorker.java | 7 +-- .../de/anomic/plasma/plasmaSwitchboard.java | 6 ++- 9 files changed, 89 insertions(+), 71 deletions(-) diff --git a/htroot/IndexCreateIndexingQueue_p.java b/htroot/IndexCreateIndexingQueue_p.java index b11893069..b3fd74b86 100644 --- a/htroot/IndexCreateIndexingQueue_p.java +++ b/htroot/IndexCreateIndexingQueue_p.java @@ -196,20 +196,24 @@ public class IndexCreateIndexingQueue_p { plasmaCrawlEURL.Entry entry; yacySeed initiatorSeed, executorSeed; int j=0; - for ( int i = switchboard.urlPool.errorURL.stackSize() - 1; i >= (switchboard.urlPool.errorURL.stackSize() - showRejectedCount); i--) { - entry = switchboard.urlPool.errorURL.getStack(i); - initiatorHash = entry.initiator(); - executorHash = entry.executor(); - url = entry.url().toString(); - initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash); - executorSeed = yacyCore.seedDB.getConnected(executorHash); - prop.put("rejected_list_"+j+"_initiator", ((initiatorSeed == null) ? "proxy" : wikiCode.replaceHTML(initiatorSeed.getName()))); - prop.put("rejected_list_"+j+"_executor", ((executorSeed == null) ? 
"proxy" : wikiCode.replaceHTML(executorSeed.getName()))); - prop.put("rejected_list_"+j+"_url", wikiCode.replaceHTML(url)); - prop.put("rejected_list_"+j+"_failreason", entry.failreason()); - prop.put("rejected_list_"+j+"_dark", ((dark) ? 1 : 0)); - dark = !dark; - j++; + for (int i = switchboard.urlPool.errorURL.stackSize() - 1; i >= (switchboard.urlPool.errorURL.stackSize() - showRejectedCount); i--) { + try { + entry = switchboard.urlPool.errorURL.stackPopEntry(i); + initiatorHash = entry.initiator(); + executorHash = entry.executor(); + url = entry.url().toString(); + initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash); + executorSeed = yacyCore.seedDB.getConnected(executorHash); + prop.put("rejected_list_"+j+"_initiator", ((initiatorSeed == null) ? "proxy" : wikiCode.replaceHTML(initiatorSeed.getName()))); + prop.put("rejected_list_"+j+"_executor", ((executorSeed == null) ? "proxy" : wikiCode.replaceHTML(executorSeed.getName()))); + prop.put("rejected_list_"+j+"_url", wikiCode.replaceHTML(url)); + prop.put("rejected_list_"+j+"_failreason", entry.failreason()); + prop.put("rejected_list_"+j+"_dark", ((dark) ? 
1 : 0)); + dark = !dark; + j++; + } catch (IOException e) { + e.printStackTrace(); + } } prop.put("rejected_list", j); } diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java index f18ce87b2..e5f9e968b 100644 --- a/htroot/IndexCreate_p.java +++ b/htroot/IndexCreate_p.java @@ -59,6 +59,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterOutputStream; import de.anomic.http.httpHeader; import de.anomic.index.indexURL; +import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlProfile; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverFileUtils; @@ -195,8 +196,10 @@ public class IndexCreate_p { prop.put("error_crawlingURL", wikiCode.replaceHTML(((String) post.get("crawlingURL")))); prop.put("error_reasonString", reasonString); - switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, - crawlingStartURL.getHost(), reasonString, new bitfield(indexURL.urlFlagLength), false); + plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, + crawlingStartURL.getHost(), reasonString, new bitfield(indexURL.urlFlagLength)); + ee.store(); + switchboard.urlPool.errorURL.stackPushEntry(ee); } } catch (Exception e) { // mist @@ -259,8 +262,10 @@ public class IndexCreate_p { if (rejectReason == null) { c++; } else { - switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, - (String) e.getValue(), rejectReason, new bitfield(indexURL.urlFlagLength), false); + plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, + (String) e.getValue(), rejectReason, new bitfield(indexURL.urlFlagLength)); + ee.store(); + switchboard.urlPool.errorURL.stackPushEntry(ee); } } diff --git 
a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index 218fcb6b3..297557f64 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -52,6 +52,7 @@ import de.anomic.http.httpHeader; import de.anomic.index.indexURL; import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlLURL; +import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -148,7 +149,9 @@ public final class crawlReceipt { } else { try { plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash); - switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(indexURL.urlFlagLength), false); + plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(indexURL.urlFlagLength)); + ee.store(); + switchboard.urlPool.errorURL.stackPushEntry(ee); switchboard.urlPool.noticeURL.remove(receivedUrlhash); } catch (IOException e) { diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java index 4c53d661c..19f220cf3 100644 --- a/source/de/anomic/plasma/plasmaCrawlEURL.java +++ b/source/de/anomic/plasma/plasmaCrawlEURL.java @@ -48,7 +48,6 @@ import java.io.IOException; import de.anomic.net.URL; import java.util.Date; -import java.util.HashMap; import java.util.LinkedList; import java.util.Iterator; @@ -91,29 +90,24 @@ public class plasmaCrawlEURL extends indexURL { } public synchronized Entry newEntry(URL url, String referrer, String initiator, String executor, - String name, String failreason, bitfield flags, boolean retry) { + String name, String failreason, bitfield flags) { if ((referrer == null) || (referrer.length() < urlHashLength)) referrer = dummyHash; if ((initiator == null) || (initiator.length() < 
urlHashLength)) initiator = dummyHash; if ((executor == null) || (executor.length() < urlHashLength)) executor = dummyHash; if (failreason == null) failreason = "unknown"; - - // create a stack entry - HashMap map = new HashMap(); - map.put("url", url); - map.put("referrer", referrer); - map.put("initiator", initiator); - map.put("executor", executor); - map.put("name", name); - map.put("failreason", failreason); - map.put("flags", flags); - rejectedStack.add(map); - Entry e = new Entry(url, referrer, initiator, executor, name, failreason, flags); - - // put in table - if (retry) e.store(); - return e; + return new Entry(url, referrer, initiator, executor, name, failreason, flags); } + public synchronized void stackPushEntry(Entry e) { + rejectedStack.add(e.hash); + } + + public Entry stackPopEntry(int pos) throws IOException { + String urlhash = (String) rejectedStack.get(pos); + if (urlhash == null) return null; + return new Entry(urlhash); + } + public synchronized Entry getEntry(String hash) throws IOException { return new Entry(hash); } @@ -134,12 +128,6 @@ public class plasmaCrawlEURL extends indexURL { return rejectedStack.size(); } - public Entry getStack(int pos) { - HashMap m = (HashMap) rejectedStack.get(pos); - return new Entry((URL) m.get("url"), (String) m.get("referrer"), (String) m.get("initiator"), (String) m.get("executor"), - (String) m.get("name"), (String) m.get("failreason"), (bitfield) m.get("flags")); - } - public class Entry { private String hash; // the url's hash @@ -153,10 +141,11 @@ public class plasmaCrawlEURL extends indexURL { private int trycount; // number of tryings private String failreason; // string describing reason for load fail private bitfield flags; // extra space + private boolean stored; public Entry(URL url, String referrer, String initiator, String executor, String name, String failreason, bitfield flags) { - // create new entry and store it into database + // create new entry this.hash = urlHash(url); this.referrer = 
(referrer == null) ? dummyHash : referrer; this.initiator = initiator; @@ -168,6 +157,7 @@ public class plasmaCrawlEURL extends indexURL { this.trycount = 0; this.failreason = failreason; this.flags = flags; + this.stored = false; } public Entry(String hash) throws IOException { @@ -183,10 +173,12 @@ public class plasmaCrawlEURL extends indexURL { if (entry != null) { insertEntry(entry); } + this.stored = true; } public Entry(kelondroRow.Entry entry) throws IOException { insertEntry(entry); + this.stored = false; } private void insertEntry(kelondroRow.Entry entry) throws IOException { @@ -205,8 +197,9 @@ public class plasmaCrawlEURL extends indexURL { return; } - private void store() { + public void store() { // stores the values from the object variables into the database + if (this.stored) return; String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, urlDateLength); String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, urlDateLength); @@ -227,6 +220,7 @@ public class plasmaCrawlEURL extends indexURL { this.flags.getBytes() }; urlHashCache.put(urlHashCache.row().newEntry(entry)); + this.stored = true; } catch (IOException e) { System.out.println("INTERNAL ERROR AT plasmaEURL:url2hash:" + e.toString()); } diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 1f03a03a0..7c7b496d3 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -463,11 +463,13 @@ public final class plasmaCrawlLURL extends indexURL { kelondroRow.Entry entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes()); if (entry == null) throw new IOException("url hash " + urlHash + " not found in LURL"); insertEntry(entry, searchedWord); + this.stored = true; } public Entry(kelondroRow.Entry entry, indexURLEntry searchedWord) throws IOException { assert (entry != null); insertEntry(entry, word); + 
this.stored = false; } private void insertEntry(kelondroRow.Entry entry, indexURLEntry searchedWord) throws IOException { diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java index 81e376aab..987048cb9 100644 --- a/source/de/anomic/plasma/plasmaCrawlNURL.java +++ b/source/de/anomic/plasma/plasmaCrawlNURL.java @@ -460,6 +460,7 @@ public class plasmaCrawlNURL extends indexURL { private int forkfactor; // sum of anchors of all ancestors private bitfield flags; private int handle; + private boolean stored;; public Entry(String initiator, URL url, @@ -484,24 +485,10 @@ public class plasmaCrawlNURL extends indexURL { this.forkfactor = forkfactor; this.flags = new bitfield(urlFlagLength); this.handle = 0; + this.stored = false; store(); } - public String toString() { - StringBuffer str = new StringBuffer(); - str.append("hash: ").append(hash==null ? "null" : hash).append(" | ") - .append("initiator: ").append(initiator==null?"null":initiator).append(" | ") - .append("url: ").append(url==null?"null":url.toString()).append(" | ") - .append("referrer: ").append((referrer == null) ? dummyHash : referrer).append(" | ") - .append("name: ").append((name == null) ? "null" : name).append(" | ") - .append("loaddate: ").append((loaddate == null) ? new Date() : loaddate).append(" | ") - .append("profile: ").append(profileHandle==null?"null":profileHandle).append(" | ") - .append("depth: ").append(Integer.toString(depth)).append(" | ") - .append("forkfactor: ").append(Integer.toString(forkfactor)).append(" | ") - .append("flags: ").append((flags==null) ? 
"null" : flags.toString()); - return str.toString(); - } - public Entry(String hash) throws IOException { // generates an plasmaNURLEntry using the url hash // to speed up the access, the url-hashes are buffered @@ -525,6 +512,7 @@ public class plasmaCrawlNURL extends indexURL { this.forkfactor = (int) entry.getColLongB64E(9); this.flags = new bitfield(entry.getColBytes(10)); this.handle = Integer.parseInt(entry.getColString(11, null), 16); + this.stored = true; return; //} catch (MalformedURLException e) { // throw new IOException("plasmaCrawlNURL/Entry: " + e); @@ -536,8 +524,9 @@ public class plasmaCrawlNURL extends indexURL { } } - private void store() { + public void store() { // stores the values from the object variables into the database + if (this.stored) return; String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength); // store the hash in the hash cache try { @@ -557,6 +546,7 @@ public class plasmaCrawlNURL extends indexURL { normalizeHandle(this.handle).getBytes() }; urlHashCache.put(urlHashCache.row().newEntry(entry)); + this.stored = true; } catch (IOException e) { serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaNURL:store:" + e.toString() + ", resetting NURL-DB"); e.printStackTrace(); @@ -568,6 +558,21 @@ public class plasmaCrawlNURL extends indexURL { } } + public String toString() { + StringBuffer str = new StringBuffer(); + str.append("hash: ").append(hash==null ? "null" : hash).append(" | ") + .append("initiator: ").append(initiator==null?"null":initiator).append(" | ") + .append("url: ").append(url==null?"null":url.toString()).append(" | ") + .append("referrer: ").append((referrer == null) ? dummyHash : referrer).append(" | ") + .append("name: ").append((name == null) ? "null" : name).append(" | ") + .append("loaddate: ").append((loaddate == null) ? 
new Date() : loaddate).append(" | ") + .append("profile: ").append(profileHandle==null?"null":profileHandle).append(" | ") + .append("depth: ").append(Integer.toString(depth)).append(" | ") + .append("forkfactor: ").append(Integer.toString(forkfactor)).append(" | ") + .append("flags: ").append((flags==null) ? "null" : flags.toString()); + return str.toString(); + } + /** * return a url-hash, based on the md5 algorithm * the result is a String of 12 bytes within a 72-bit space diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 4e38a73ca..4cb0ec56b 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -64,6 +64,7 @@ import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroTree; +import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.server.serverSemaphore; import de.anomic.server.logging.serverLog; import de.anomic.tools.bitfield; @@ -393,7 +394,7 @@ public final class plasmaCrawlStacker { this.log.logSevere("URL '" + nexturlString + "' can neither be crawled local nor global."); } - this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */ + plasmaCrawlNURL.Entry ee = this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */ nexturl, /* url clear text string */ loadDate, /* load date */ referrerHash, /* last url in crawling queue */ @@ -405,7 +406,7 @@ public final class plasmaCrawlStacker { ((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT : ((local) ? 
plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/ ); - + ee.store(); return null; } @@ -937,16 +938,17 @@ public final class plasmaCrawlStacker { String rejectReason = dequeue(this.theMsg); if (rejectReason != null) { - plasmaCrawlStacker.this.sb.urlPool.errorURL.newEntry( + plasmaCrawlEURL.Entry ee = sb.urlPool.errorURL.newEntry( new URL(this.theMsg.url()), this.theMsg.referrerHash(), this.theMsg.initiatorHash(), yacyCore.seedDB.mySeed.hash, this.theMsg.name, rejectReason, - new bitfield(indexURL.urlFlagLength), - false + new bitfield(indexURL.urlFlagLength) ); + ee.store(); + sb.urlPool.errorURL.stackPushEntry(ee); } } catch (Exception e) { plasmaCrawlStacker.this.log.logWarning("Error while processing stackCrawl entry.\n" + diff --git a/source/de/anomic/plasma/plasmaCrawlWorker.java b/source/de/anomic/plasma/plasmaCrawlWorker.java index b406e5428..6970e887a 100644 --- a/source/de/anomic/plasma/plasmaCrawlWorker.java +++ b/source/de/anomic/plasma/plasmaCrawlWorker.java @@ -312,16 +312,17 @@ public final class plasmaCrawlWorker extends Thread { String hostlow = host.toLowerCase(); if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, path)) { log.logInfo("CRAWLER Rejecting URL '" + url.toString() + "'. 
URL is in blacklist."); - sb.urlPool.errorURL.newEntry( + plasmaCrawlEURL.Entry ee = sb.urlPool.errorURL.newEntry( url, referer, initiator, yacyCore.seedDB.mySeed.hash, name, "denied_(url_in_blacklist)", - new bitfield(indexURL.urlFlagLength), - true + new bitfield(indexURL.urlFlagLength) ); + ee.store(); + sb.urlPool.errorURL.stackPushEntry(ee); return null; } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index dd7e6a2c7..bae16e95d 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1563,10 +1563,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } else { log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason); - urlPool.errorURL.newEntry(entry.url(), referrerHash, + plasmaCrawlEURL.Entry ee = urlPool.errorURL.newEntry(entry.url(), referrerHash, ((entry.proxy()) ? indexURL.dummyHash : entry.initiator()), yacyCore.seedDB.mySeed.hash, - descr, noIndexReason, new bitfield(indexURL.urlFlagLength), true); + descr, noIndexReason, new bitfield(indexURL.urlFlagLength)); + ee.store(); + urlPool.errorURL.stackPushEntry(ee); if ((processCase == 6) && (initiator != null)) { yacyClient.crawlReceipt(initiator, "crawl", "rejected", noIndexReason, null, ""); }