From 3286b1f498db79ee05deead6eed9b45be0e1c2e1 Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 17 Mar 2006 10:16:07 +0000 Subject: [PATCH] re-organisation of lurl-creation and -stacking this was necessary to prevent useless write to the database in case of blacklist appearance of the url git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1905 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/htdocsdefault/dir.java | 16 +++++++----- htroot/yacy/crawlReceipt.java | 3 ++- htroot/yacy/search.java | 1 - htroot/yacy/transferURL.java | 4 +-- .../plasma/dbImport/plasmaDbImporter.java | 3 ++- source/de/anomic/plasma/plasmaCrawlLURL.java | 26 ++++++++++++++----- .../de/anomic/plasma/plasmaSwitchboard.java | 17 +++++++----- source/de/anomic/plasma/plasmaWordIndex.java | 7 +---- source/de/anomic/yacy/yacyClient.java | 3 ++- source/yacy.java | 6 +++-- 10 files changed, 53 insertions(+), 33 deletions(-) diff --git a/htroot/htdocsdefault/dir.java b/htroot/htdocsdefault/dir.java index e938ac0ae..98bc19e26 100644 --- a/htroot/htdocsdefault/dir.java +++ b/htroot/htdocsdefault/dir.java @@ -456,10 +456,8 @@ public class dir { try { final URL url = new URL(urlstring); final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes())); - final plasmaCrawlLURL.Entry newEntry = switchboard.urlPool.loadedURL.addEntry( + final plasmaCrawlLURL.Entry newEntry = switchboard.urlPool.loadedURL.newEntry( url, "YaCyShare: " + descr, new Date(), new Date(), - "____________", /*initiator*/ - yacyCore.seedDB.mySeed.hash, /*executor*/ "AAAAAAAAAAAA", /*referrer*/ 0, /*copycount*/ false, /*localneed*/ @@ -467,10 +465,16 @@ public class dir { "**", /*language*/ plasmaWordIndexEntry.DT_SHARE, /*doctype*/ phrase.length(), /*size*/ - condenser.RESULT_NUMB_WORDS, - 5 /*process case*/ + condenser.RESULT_NUMB_WORDS ); - + newEntry.store(); + switchboard.urlPool.loadedURL.stackEntry( + newEntry, + "____________", /*initiator*/ + yacyCore.seedDB.mySeed.hash, /*executor*/ + 5 /*process case*/ + ); + final String urlHash = newEntry.hash(); /*final int words =*/ switchboard.wordIndex.addPageIndex(url, urlHash, new Date(), phrase.length() + descr.length() + 13, null, condenser, "**", plasmaWordIndexEntry.DT_SHARE, 0, 0); } catch (IOException e) {} diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index 1d117b7ad..2f982ef46 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -129,7 +129,8 @@ public final class crawlReceipt { "\n\tURL properties: "+ propStr); } else { // put new entry into database - switchboard.urlPool.loadedURL.addEntry(entry, youare, iam, 1); + entry.store(); + switchboard.urlPool.loadedURL.stackEntry(entry, youare, iam, 1); // generating url hash String newUrlHash = plasmaURL.urlHash(entry.url()); diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 11521decf..b3674f486 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -88,7 +88,6 @@ public final class search { // tell all threads to do nothing for a specific time - sb.wordIndex.intermission(2 * duetime); sb.intermissionAllThreads(2 * duetime); // store accessing peer diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index 5609dfe6c..96f8b7c82 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -97,12 +97,12 @@ public final class transferURL { if ((lEntry != null) && (lEntry.url() != null)) { if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.isListed( lEntry.url().getHost().toLowerCase(), lEntry.url().getPath()))) { - sb.urlPool.loadedURL.remove(lEntry.hash()); int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash()); yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs"); lEntry = null; } else { - sb.urlPool.loadedURL.addEntry(lEntry, iam, iam, 3); + lEntry.store(); + sb.urlPool.loadedURL.stackEntry(lEntry, iam, iam, 3); yacyCore.log.logFine("transferURL: received URL '" + lEntry.url() + "' from peer " + otherPeerName); diff --git a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java index be4a12754..254cfd3a3 100644 --- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java @@ -163,7 +163,8 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.getEntry(urlHash, null); /* write it into the home url db */ - this.homeUrlDB.newEntry(urlEntry); + plasmaCrawlLURL.Entry homeEntry = this.homeUrlDB.newEntry(urlEntry); + homeEntry.store(); importedUrlBuffer.add(urlHash); this.urlCounter++; diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 7a0121b5b..929e759f6 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -131,6 +131,7 @@ public final class plasmaCrawlLURL extends plasmaURL { gcrawlResultStack = new LinkedList(); } + /* public synchronized Entry addEntry(URL url, String descr, Date moddate, Date loaddate, String initiatorHash, String executorHash, String referrerHash, int copyCount, boolean localNeed, @@ -150,8 +151,9 @@ public final class plasmaCrawlLURL extends plasmaURL { } return e; } - - public synchronized void addEntry(Entry e, String initiatorHash, String executorHash, int stackType) { + */ + + public synchronized void stackEntry(Entry e, String initiatorHash, String executorHash, int stackType) { if (e == null) { return; } try { if (initiatorHash == null) { initiatorHash = dummyHash; } @@ -205,6 +207,14 @@ public final class plasmaCrawlLURL extends plasmaURL { } } + public synchronized Entry newEntry(URL url, String descr, Date moddate, Date loaddate, + String referrerHash, int copyCount, boolean localNeed, + int quality, String language, char doctype, + int size, int wordCount) { + Entry e = new Entry(url, descr, moddate, loaddate, referrerHash, copyCount, localNeed, quality, language, doctype, size, wordCount); + return e; + } + public int getStackSize(int stack) { switch (stack) { case 1: return externResultStack.size(); @@ -400,7 +410,8 @@ public final class plasmaCrawlLURL extends plasmaURL { private int wordCount; private String snippet; private plasmaWordIndexEntry word; // this is only used if the url is transported via remote search requests - + private boolean stored = false; + // more needed attributes: // - author / copyright owner // - keywords @@ -427,7 +438,7 @@ public final class plasmaCrawlLURL extends plasmaURL { this.wordCount = wordCount; this.snippet = null; this.word = null; - store(); + this.stored = false; } public Entry(String urlHash, plasmaWordIndexEntry searchedWord) throws IOException { @@ -441,6 +452,7 @@ public final class plasmaCrawlLURL extends plasmaURL { this.urlHash = urlHash; byte[][] entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes()); if (entry == null) throw new IOException("url hash " + urlHash + " not found in LURL"); + this.stored = true; try { if (entry != null) { this.url = new URL(new String(entry[1], "UTF-8").trim()); @@ -491,15 +503,16 @@ public final class plasmaCrawlLURL extends plasmaURL { this.snippet = prop.getProperty("snippet", ""); if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null); this.word = (prop.containsKey("word")) ? new plasmaWordIndexEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null; - store(); + this.stored = false; //} } catch (Exception e) { serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/2: " + e.toString(), e); } } - private void store() { + public void store() { // Check if there is a more recent Entry already in the DB + if (this.stored) return; Entry oldEntry; try { if (exists(urlHash)) { @@ -553,6 +566,7 @@ public final class plasmaCrawlLURL extends plasmaURL { kelondroBase64Order.enhancedCoder.encodeLong(wordCount, urlWordCountLength).getBytes(), }; urlHashCache.put(entry); + this.stored = true; } catch (Exception e) { serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaCrawlLURL:store:" + e.toString(), e); } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 63b59ba66..700f2c5de 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1402,20 +1402,23 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser //log.logDebug("Create LURL-Entry for '" + entry.normalizedURLString() + "', " + // "responseHeader=" + entry.responseHeader().toString()); - plasmaCrawlLURL.Entry newEntry = urlPool.loadedURL.addEntry( + plasmaCrawlLURL.Entry newEntry = urlPool.loadedURL.newEntry( entry.url(), descr, docDate, new Date(), - initiatorHash, - yacyCore.seedDB.mySeed.hash, referrerHash, 0, true, condenser.RESULT_WORD_ENTROPHY, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()), (int) entry.size(), - condenser.RESULT_NUMB_WORDS, + condenser.RESULT_NUMB_WORDS + ); + newEntry.store(); + urlPool.loadedURL.stackEntry( + newEntry, + initiatorHash, + yacyCore.seedDB.mySeed.hash, processCase ); - String urlHash = newEntry.hash(); if (((processCase == 4) || (processCase == 5) || (processCase == 6)) && (entry.profile().localIndexing())) { @@ -1729,7 +1732,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if ((lurl != null) && (lurl.length() != 0)) { String propStr = crypt.simpleDecode(lurl, (String) page.get("key")); plasmaCrawlLURL.Entry entry = urlPool.loadedURL.newEntry(propStr, true); - urlPool.loadedURL.addEntry(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt? + entry.store(); + urlPool.loadedURL.stackEntry(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt? urlPool.noticeURL.remove(entry.hash()); log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + "). URL IS CONSIDERED AS 'LOADED!'"); return true; @@ -1763,7 +1767,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser plasmaSearchTimingProfile remoteTiming) { // tell all threads to do nothing for a specific time - wordIndex.intermission(2 * query.maximumTime); intermissionAllThreads(2 * query.maximumTime); serverObjects prop = new serverObjects(); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 8a615616b..579b58334 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -261,7 +261,6 @@ public final class plasmaWordIndex { outlinksSame, outlinksOther, true); addEntry(wordHash, ientry, System.currentTimeMillis(), false); - //addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), ientry), System.currentTimeMillis(), false); } // System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + // condenser.getWords().size() + " words, flushed " + c + " entries"); @@ -324,7 +323,7 @@ public final class plasmaWordIndex { public int size() { return java.lang.Math.max(assortmentCluster.sizeTotal(), - java.lang.Math.max(backend.size(), ramCache.wSize() + ramCache.kSize())); + java.lang.Math.max(backend.size(), ramCache.size())); } public int indexSize(String wordHash) { @@ -341,10 +340,6 @@ public final class plasmaWordIndex { return size; } - public void intermission(long pause) { - //this.ramCache.intermission(pause); - } - public void close(int waitingBoundSeconds) { ramCache.close(waitingBoundSeconds); assortmentCluster.close(); diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 384426319..b6ad658c7 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -466,10 +466,11 @@ public final class yacyClient { // get one single search result urlEntry = urlManager.newEntry((String) result.get("resource" + n), true); if (urlEntry != null && blacklist.isListed(urlEntry.url().getHost().toLowerCase(), urlEntry.url().getPath())) { continue; } // block with backlist + urlEntry.store(); int urlLength = urlEntry.url().toString().length(); int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length; - urlManager.addEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2); + urlManager.stackEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2); // save the url entry final plasmaWordIndexEntry entry; if (urlEntry.word() == null) { diff --git a/source/yacy.java b/source/yacy.java index 6febb9d4c..015d746cd 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -883,7 +883,8 @@ public final class yacy { // importing the new url plasmaCrawlLURL.Entry urlEntry = importUrlDB.getEntry(urlHash, null); urlCounter++; - homeUrlDB.newEntry(urlEntry); + plasmaCrawlLURL.Entry homeEntry = homeUrlDB.newEntry(urlEntry); + homeEntry.store(); if (urlCounter % 500 == 0) { log.logFine(urlCounter + " URLs processed so far."); @@ -985,7 +986,8 @@ public final class yacy { if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try { plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash, null); urlCounter++; - /*plasmaCrawlLURL.Entry newEntry =*/ minimizedUrlDB.newEntry(urlEntry); + plasmaCrawlLURL.Entry newEntry = minimizedUrlDB.newEntry(urlEntry); + newEntry.store(); if (urlCounter % 500 == 0) { log.logInfo(urlCounter + " URLs found so far."); }