diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index 0c75a8cd7..4b4f02392 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -136,7 +136,7 @@ public class Bookmarks { if (bookmark == null) { // try to get the bookmark from the LURL database try { - plasmaCrawlLURL.Entry urlentry = switchboard.urlPool.loadedURL.getEntry(urlHash, null); + plasmaCrawlLURL.Entry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null); prop.put("mode_edit", 0); // create mode prop.put("mode_title", urlentry.descr()); prop.put("mode_description", urlentry.descr()); diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 070c15ddb..c570e1520 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -212,7 +212,7 @@ public class IndexControl_p { if (post.containsKey("urlhashdelete")) { try { - plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash, null); + plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null); URL url = entry.url(); urlstring = url.toNormalform(); prop.put("urlstring", ""); @@ -262,7 +262,7 @@ public class IndexControl_p { while (urlIter.hasNext()) { iEntry = (indexEntry) urlIter.next(); try { - lurl = switchboard.urlPool.loadedURL.getEntry(iEntry.urlHash(), null); + lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null); if (lurl.toString() == null) { unknownURLEntries.add(iEntry.urlHash()); urlIter.remove(); @@ -311,7 +311,7 @@ public class IndexControl_p { URL url = new URL(urlstring); urlhash = indexURL.urlHash(url); prop.put("urlhash", urlhash); - plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash, null); + plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null); prop.put("result", genUrlProfile(switchboard, entry, urlhash)); } catch (MalformedURLException e) { prop.put("urlstring", "bad url: " + urlstring); @@ -324,7 +324,7 @@ public class IndexControl_p { if (post.containsKey("urlhashsearch")) { try { - plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash, null); + plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null); URL url = entry.url(); urlstring = url.toString(); prop.put("urlstring", urlstring); @@ -387,7 +387,7 @@ public class IndexControl_p { URL url = entry.url(); String referrer = null; try { - referrer = switchboard.urlPool.loadedURL.getEntry(entry.referrerHash(), null).url().toString(); + referrer = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null).url().toString(); } catch (IOException e) { referrer = ""; } @@ -444,7 +444,7 @@ public class IndexControl_p { xi = (indexEntry) en.next(); uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())}; try { - us = switchboard.urlPool.loadedURL.getEntry(uh[0], null).url().toString(); + us = switchboard.urlPool.loadedURL.load(uh[0], null).url().toString(); tm.put(us, uh); } catch (IOException e) { tm.put(uh[0], uh); diff --git a/htroot/PerformanceQueues_p.html b/htroot/PerformanceQueues_p.html index 8dbc4277e..45a29508d 100644 --- a/htroot/PerformanceQueues_p.html +++ b/htroot/PerformanceQueues_p.html @@ -75,6 +75,14 @@ Changes take effect immediately DHT Description + + URLs in RAM cache: + #[urlCacheSize]# + + This is the size of the URL cache. Its purpose is to buffer incoming URLs + in case of search result transmission and during DHT transfer. 
+ + Words in RAM cache: #[wordCacheWSize]# diff --git a/htroot/PerformanceQueues_p.java b/htroot/PerformanceQueues_p.java index fa827de07..a04275431 100644 --- a/htroot/PerformanceQueues_p.java +++ b/htroot/PerformanceQueues_p.java @@ -258,6 +258,7 @@ public class PerformanceQueues_p { } // table cache settings + prop.put("urlCacheSize", switchboard.urlPool.loadedURL.writeCacheSize()); prop.put("wordCacheWSize", switchboard.wordIndex.wSize()); prop.put("wordCacheKSize", switchboard.wordIndex.kSize()); prop.put("maxURLinWCache", "" + switchboard.wordIndex.maxURLinWCache()); diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index f1b7b06a5..bcfecfcc6 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -107,7 +107,7 @@ public class ViewFile { // getting the urlEntry that belongs to the url hash Entry urlEntry = null; try { - urlEntry = sb.urlPool.loadedURL.getEntry(urlHash, null); + urlEntry = sb.urlPool.loadedURL.load(urlHash, null); } catch (IOException e) { prop.put("error",2); prop.put("viewMode",VIEW_MODE_NO_TEXT); diff --git a/htroot/htdocsdefault/dir.java b/htroot/htdocsdefault/dir.java index 4eb87d196..2855ea38e 100644 --- a/htroot/htdocsdefault/dir.java +++ b/htroot/htdocsdefault/dir.java @@ -368,8 +368,8 @@ public class dir { phrase.length(), /*size*/ condenser.RESULT_NUMB_WORDS ); - newEntry.store(); - switchboard.urlPool.loadedURL.stackEntry( + switchboard.urlPool.loadedURL.store(newEntry, false); + switchboard.urlPool.loadedURL.stack( newEntry, "____________", /*initiator*/ yacyCore.seedDB.mySeed.hash, /*executor*/ diff --git a/htroot/yacy/crawlOrder.java b/htroot/yacy/crawlOrder.java index 984604229..4eb2ce83a 100644 --- a/htroot/yacy/crawlOrder.java +++ b/htroot/yacy/crawlOrder.java @@ -250,7 +250,7 @@ public final class crawlOrder { reason = reasonString; // send lurl-Entry as response try { - plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(indexURL.urlHash(url), null); + plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null); response = "double"; switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare); lurl = crypt.simpleEncode(entry.toString()); diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index 297557f64..9df0d85e3 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -128,10 +128,10 @@ public final class crawlReceipt { if ((entry == null)||(entry.url()==null)) { log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT for hash " + receivedUrlhash + " from peer " + iam + "\n\tURL properties: "+ propStr); - } else { + } else try { // put new entry into database - entry.store(); - switchboard.urlPool.loadedURL.stackEntry(entry, youare, iam, 1); + switchboard.urlPool.loadedURL.store(entry, false); + switchboard.urlPool.loadedURL.stack(entry, youare, iam, 1); // generating url hash String newUrlHash = indexURL.urlHash(entry.url()); @@ -142,6 +142,8 @@ public final class crawlReceipt { switchboard.urlPool.noticeURL.remove(oldUrlHash); log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + receivedUrlhash + ":" + entry.url()); + } catch (IOException e) { + e.printStackTrace(); } // ready for more diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index 828d35b80..c08ad1bc4 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -45,6 +45,8 @@ // You must compile this file with // javac -classpath .:../classes transferRWI.java +import java.io.IOException; + import 
de.anomic.http.httpHeader; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaCrawlLURL; @@ -103,13 +105,13 @@ public final class transferURL { yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs"); lEntry = null; blocked++; - } else { - lEntry.store(); - sb.urlPool.loadedURL.stackEntry(lEntry, iam, iam, 3); - yacyCore.log.logFine("transferURL: received URL '" - + lEntry.url() + "' from peer " - + otherPeerName); + } else try { + sb.urlPool.loadedURL.store(lEntry, true); + sb.urlPool.loadedURL.stack(lEntry, iam, iam, 3); + yacyCore.log.logFine("transferURL: received URL '" + lEntry.url() + "' from peer " + otherPeerName); received++; + } catch (IOException e) { + e.printStackTrace(); } } else { yacyCore.log.logWarning("transferURL: received invalid URL from peer " + otherPeerName + diff --git a/source/de/anomic/index/indexURL.java b/source/de/anomic/index/indexURL.java index 7d2eb4823..c413c1e5d 100644 --- a/source/de/anomic/index/indexURL.java +++ b/source/de/anomic/index/indexURL.java @@ -28,13 +28,17 @@ package de.anomic.index; import java.io.IOException; import de.anomic.net.URL; + import java.net.MalformedURLException; import java.text.SimpleDateFormat; import java.util.HashMap; +import java.util.Iterator; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroIndex; +import de.anomic.kelondro.kelondroRAMIndex; import de.anomic.kelondro.kelondroTree; +import de.anomic.kelondro.kelondroRow; import de.anomic.server.serverCodings; import de.anomic.yacy.yacySeedDB; @@ -406,24 +410,57 @@ public class indexURL { // the class object - protected kelondroIndex urlHashCache; + protected kelondroIndex urlIndexFile = null; + protected kelondroRAMIndex urlIndexCache = null; public indexURL() { - urlHashCache = null; + urlIndexFile = null; + urlIndexCache = null; } public int size() { try { - return urlHashCache.size(); + return urlIndexFile.size() + ((urlIndexCache == null) ? 
0 : urlIndexCache.size()); } catch (IOException e) { return 0; } } + public void store(kelondroRow.Entry entry, boolean cached) throws IOException { + if ((cached) && (urlIndexCache != null)) + synchronized (urlIndexCache) { + urlIndexCache.put(entry); + } + else + urlIndexFile.put(entry); + } + + public void flushCacheSome() { + if (urlIndexCache == null) return; + if (urlIndexCache.size() == 0) return; + int flush = Math.max(1, urlIndexCache.size() / 10); + while (flush-- > 0) flushCacheOnce(); + } + + public void flushCacheOnce() { + if (urlIndexCache == null) return; + if (urlIndexCache.size() == 0) return; + synchronized (urlIndexCache) { + Iterator i = urlIndexCache.rows(true, false, null); + try { + urlIndexFile.put((kelondroRow.Entry) i.next()); + i.remove(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + public boolean remove(String hash) { if (hash == null) return false; try { - urlHashCache.remove(hash.getBytes()); + urlIndexFile.remove(hash.getBytes()); + if (urlIndexCache != null) synchronized (urlIndexCache) {urlIndexCache.remove(hash.getBytes());} return true; } catch (IOException e) { return false; @@ -431,26 +468,38 @@ public class indexURL { } public void close() throws IOException { - if (urlHashCache != null) urlHashCache.close(); + while ((urlIndexCache != null) && (urlIndexCache.size() > 0)) flushCacheOnce(); + if (urlIndexFile != null) { + urlIndexFile.close(); + urlIndexFile = null; + } + if (urlIndexCache != null) { + urlIndexCache.close(); + urlIndexCache = null; + } } + public int writeCacheSize() { + return (urlIndexCache == null) ? 0 : urlIndexCache.size(); + } + public int cacheNodeChunkSize() { - if (urlHashCache instanceof kelondroTree) return ((kelondroTree) urlHashCache).cacheNodeChunkSize(); + if (urlIndexFile instanceof kelondroTree) return ((kelondroTree) urlIndexFile).cacheNodeChunkSize(); return 0; } public int[] cacheNodeStatus() { - if (urlHashCache instanceof kelondroTree) return ((kelondroTree) urlHashCache).cacheNodeStatus(); + if (urlIndexFile instanceof kelondroTree) return ((kelondroTree) urlIndexFile).cacheNodeStatus(); return new int[]{0,0,0,0,0,0,0,0,0,0}; } public int cacheObjectChunkSize() { - if (urlHashCache instanceof kelondroTree) return ((kelondroTree) urlHashCache).cacheObjectChunkSize(); + if (urlIndexFile instanceof kelondroTree) return ((kelondroTree) urlIndexFile).cacheObjectChunkSize(); return 0; } public long[] cacheObjectStatus() { - if (urlHashCache instanceof kelondroTree) return ((kelondroTree) urlHashCache).cacheObjectStatus(); + if (urlIndexFile instanceof kelondroTree) return ((kelondroTree) urlIndexFile).cacheObjectStatus(); return new long[]{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; } diff --git a/source/de/anomic/index/indexURLEntry.java b/source/de/anomic/index/indexURLEntry.java index 308387138..268e7f44e 100644 --- a/source/de/anomic/index/indexURLEntry.java +++ b/source/de/anomic/index/indexURLEntry.java @@ -258,13 +258,14 @@ public class indexURLEntry implements Cloneable, indexEntry { } static void normalize(indexURLEntry t, indexEntry min, indexEntry max) { + if (1 + max.worddistance() - min.worddistance() == 0) System.out.println("min = " + min.toPropertyForm() + "\nmax=" + max.toPropertyForm()); t.entry.setCol(col_hitcount , (t.hitcount() == 0) ? 0 : 1 + 255 * (t.hitcount() - min.hitcount() ) / (1 + max.hitcount() - min.hitcount())); t.entry.setCol(col_wordcount , (t.wordcount() == 0) ? 
0 : 1 + 255 * (t.wordcount() - min.wordcount() ) / (1 + max.wordcount() - min.wordcount())); t.entry.setCol(col_phrasecount , (t.phrasecount() == 0) ? 0 : 1 + 255 * (t.phrasecount() - min.phrasecount() ) / (1 + max.phrasecount() - min.phrasecount())); t.entry.setCol(col_posintext , (t.posintext() == 0) ? 0 : 1 + 255 * (t.posintext() - min.posintext() ) / (1 + max.posintext() - min.posintext())); t.entry.setCol(col_posinphrase , (t.posinphrase() == 0) ? 0 : 1 + 255 * (t.posinphrase() - min.posinphrase() ) / (1 + max.posinphrase() - min.posinphrase())); t.entry.setCol(col_posofphrase , (t.posofphrase() == 0) ? 0 : 1 + 255 * (t.posofphrase() - min.posofphrase() ) / (1 + max.posofphrase() - min.posofphrase())); - t.entry.setCol(col_worddistance , (t.worddistance() == 0) ? 0 : 1 + 255 * (t.worddistance() - min.worddistance()) / (1 + max.worddistance() - min.worddistance())); + t.entry.setCol(col_worddistance , (t.worddistance() == 0) ? 0 : 1 + 255 * (t.worddistance() - min.worddistance()) / (1 + max.worddistance() - min.worddistance())); // FIXME: there is a division by zero here, which can only happen if the normalization did not work. t.entry.setCol(col_lastModified , (t.lastModified() == 0) ? 0 : 1 + 255 * (t.lastModified() - min.lastModified()) / (1 + max.lastModified() - min.lastModified())); t.entry.setCol(col_quality , (t.quality() == 0) ? 0 : 1 + 255 * (t.quality() - min.quality() ) / (1 + max.quality() - min.quality())); } diff --git a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java index 40cb34aa1..535eeb04c 100644 --- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java @@ -156,11 +156,10 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { // we need to import the url try { // getting the url entry - plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.getEntry(urlHash, null); + plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.load(urlHash, null); /* write it into the home url db */ - plasmaCrawlLURL.Entry homeEntry = this.homeUrlDB.newEntry(urlEntry); - homeEntry.store(); + this.homeUrlDB.store(urlEntry, false); importedUrlBuffer.add(urlHash); this.urlCounter++; diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java index 933b5c127..084fd4b7f 100644 --- a/source/de/anomic/plasma/plasmaCrawlEURL.java +++ b/source/de/anomic/plasma/plasmaCrawlEURL.java @@ -144,7 +144,7 @@ public class plasmaCrawlEURL extends indexURL { String newCacheName = "urlErr3.table"; cachePath.mkdirs(); try { - urlHashCache = new kelondroFlexTable(cachePath, newCacheName, bufferkb * 0x400, preloadTime, rowdef, kelondroBase64Order.enhancedCoder); + urlIndexFile = new kelondroFlexTable(cachePath, newCacheName, bufferkb * 0x400, preloadTime, rowdef, kelondroBase64Order.enhancedCoder); } catch (IOException e) { e.printStackTrace(); System.exit(-1); @@ -152,7 +152,7 @@ public class plasmaCrawlEURL extends indexURL { } else { File oldCacheFile = new File(cachePath, "urlErr0.db"); oldCacheFile.getParentFile().mkdirs(); - urlHashCache = kelondroTree.open(oldCacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, rowdef); + urlIndexFile = kelondroTree.open(oldCacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, rowdef); } } @@ -181,7 +181,7 @@ public class plasmaCrawlEURL extends indexURL { public boolean exists(String urlHash) { try { - return
(urlHashCache.get(urlHash.getBytes()) != null); + return (urlIndexFile.get(urlHash.getBytes()) != null); } catch (IOException e) { return false; } @@ -236,7 +236,7 @@ public class plasmaCrawlEURL extends indexURL { // - look into the filed properties // if the url cannot be found, this returns null this.hash = hash; - kelondroRow.Entry entry = urlHashCache.get(hash.getBytes()); + kelondroRow.Entry entry = urlIndexFile.get(hash.getBytes()); if (entry != null) { insertEntry(entry); } @@ -288,7 +288,7 @@ public class plasmaCrawlEURL extends indexURL { this.failreason.getBytes(), this.flags.getBytes() }; - urlHashCache.put(urlHashCache.row().newEntry(entry)); + urlIndexFile.put(urlIndexFile.row().newEntry(entry)); this.stored = true; } catch (IOException e) { System.out.println("INTERNAL ERROR AT plasmaEURL:url2hash:" + e.toString()); @@ -346,7 +346,7 @@ public class plasmaCrawlEURL extends indexURL { boolean error = false; public kiter(boolean up, boolean rotating, String firstHash) throws IOException { - i = urlHashCache.rows(up, rotating, (firstHash == null) ? null : firstHash.getBytes()); + i = urlIndexFile.rows(up, rotating, (firstHash == null) ? null : firstHash.getBytes()); error = false; } diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index f5add6e87..1501ade5d 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -70,6 +70,8 @@ import de.anomic.index.indexEntry; import de.anomic.index.indexURL; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBase64Order; +import de.anomic.kelondro.kelondroNaturalOrder; +import de.anomic.kelondro.kelondroRAMIndex; import de.anomic.kelondro.kelondroTree; import de.anomic.kelondro.kelondroRow; import de.anomic.plasma.plasmaHTCache; @@ -117,7 +119,8 @@ public final class plasmaCrawlLURL extends indexURL { cacheFile.getParentFile().mkdirs(); try { - urlHashCache = new kelondroTree(cacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, rowdef); + urlIndexFile = new kelondroTree(cacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, rowdef); + urlIndexCache = new kelondroRAMIndex(kelondroNaturalOrder.naturalOrder, rowdef); } catch (IOException e) { e.printStackTrace(); System.exit(-1); @@ -132,7 +135,7 @@ public final class plasmaCrawlLURL extends indexURL { gcrawlResultStack = new LinkedList(); } - public synchronized void stackEntry(Entry e, String initiatorHash, String executorHash, int stackType) { + public synchronized void stack(Entry e, String initiatorHash, String executorHash, int stackType) { if (e == null) { return; } try { if (initiatorHash == null) { initiatorHash = dummyHash; } @@ -157,27 +160,58 @@ public final class plasmaCrawlLURL extends indexURL { gcrawlResultStack.add(urlHash + initiatorHash + executorHash); } - public Entry getEntry(String hash, indexEntry searchedWord) throws IOException { - return new Entry(hash, searchedWord); + public Entry load(String urlHash, indexEntry searchedWord) throws IOException { + // generates an plasmaLURLEntry using the url hash + // to speed up the access, the url-hashes are buffered + // in the hash cache. 
+ // we have two options to find the url: + // - look into the hash cache + // - look into the filed properties + // if the url cannot be found, this returns null + kelondroRow.Entry entry = urlIndexFile.get(urlHash.getBytes()); + if (entry == null) entry = urlIndexCache.get(urlHash.getBytes()); + if (entry == null) return null; + return new Entry(entry, searchedWord); } - public synchronized Entry newEntry(Entry oldEntry) { - if (oldEntry == null) return null; - return new Entry( - oldEntry.url(), - oldEntry.descr(), - oldEntry.moddate(), - oldEntry.loaddate(), - oldEntry.referrerHash(), - oldEntry.copyCount(), - oldEntry.local(), - oldEntry.quality(), - oldEntry.language(), - oldEntry.doctype(), - oldEntry.size(), - oldEntry.wordCount()); + public void store(Entry entry, boolean cached) throws IOException { + // Check if there is a more recent Entry already in the DB + if (entry.stored) return; + Entry oldEntry; + try { + if (exists(entry.urlHash)) { + oldEntry = load(entry.urlHash, null); + } else { + oldEntry = null; + } + } catch (Exception e) { + oldEntry = null; + } + if ((oldEntry != null) && (entry.isOlder(oldEntry))) { + // the fetched oldEntry is better, so return its properties instead of the new ones + // this.urlHash = oldEntry.urlHash; // unnecessary, should be the same + // this.url = oldEntry.url; // unnecessary, should be the same + entry.descr = oldEntry.descr; + entry.moddate = oldEntry.moddate; + entry.loaddate = oldEntry.loaddate; + entry.referrerHash = oldEntry.referrerHash; + entry.copyCount = oldEntry.copyCount; + entry.flags = oldEntry.flags; + entry.quality = oldEntry.quality; + entry.language = oldEntry.language; + entry.doctype = oldEntry.doctype; + entry.size = oldEntry.size; + entry.wordCount = oldEntry.wordCount; + // this.snippet // not read from db + // this.word // not read from db + entry.stored = true; + return; // this did not need to be stored, but is updated + } + + super.store(entry.toRowEntry(), cached); + entry.stored = true; } - + public synchronized Entry newEntry(String propStr, boolean setGlobal) { if (propStr.startsWith("{") && propStr.endsWith("}")) { return new Entry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)), setGlobal); @@ -281,7 +315,7 @@ public final class plasmaCrawlLURL extends indexURL { public boolean exists(String urlHash) { try { - if (urlHashCache.get(urlHash.getBytes()) != null) { + if (urlIndexFile.get(urlHash.getBytes()) != null) { return true; } else { return false; @@ -343,7 +377,7 @@ public final class plasmaCrawlLURL extends indexURL { urlHash = getUrlHash(tabletype, i); // serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash); try { - urle = getEntry(urlHash, null); + urle = load(urlHash, null); // serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString()); initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash); executorSeed = yacyCore.seedDB.getConnected(executorHash); @@ -374,7 +408,7 @@ public final class plasmaCrawlLURL extends indexURL { prop.put("table_indexed", cnt); return prop; } - + public class Entry { private URL url; @@ -426,29 +460,8 @@ public final class plasmaCrawlLURL extends indexURL { this.word = null; this.stored = false; } - - public Entry(String urlHash, indexEntry searchedWord) throws IOException { - // generates an plasmaLURLEntry using the url hash - // to speed up the access, the url-hashes are buffered - // in the hash cache. 
- // we have two options to find the url: - // - look into the hash cache - // - look into the filed properties - // if the url cannot be found, this returns null - this.urlHash = urlHash; - kelondroRow.Entry entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes()); - if (entry == null) throw new IOException("url hash " + urlHash + " not found in LURL"); - insertEntry(entry, searchedWord); - this.stored = true; - } public Entry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException { - assert (entry != null); - insertEntry(entry, word); - this.stored = false; - } - - private void insertEntry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException { try { this.urlHash = entry.getColString(0, null); this.url = new URL(entry.getColString(1, "UTF-8").trim()); @@ -505,48 +518,12 @@ public final class plasmaCrawlLURL extends indexURL { serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/2: " + e.toString(), e); } } + + public kelondroRow.Entry toRowEntry() throws IOException { + final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, urlDateLength); + final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength); - public void store() { - // Check if there is a more recent Entry already in the DB - if (this.stored) return; - Entry oldEntry; - try { - if (exists(urlHash)) { - oldEntry = new Entry(urlHash, null); - } else { - oldEntry = null; - } - } catch (Exception e) { - oldEntry = null; - } - if ((oldEntry != null) && (isOlder(oldEntry))) { - // the fetched oldEntry is better, so return its properties instead of the new ones - // this.urlHash = oldEntry.urlHash; // unnecessary, should be the same - // this.url = oldEntry.url; // unnecessary, should be the same - this.descr = oldEntry.descr; - this.moddate = oldEntry.moddate; - this.loaddate = oldEntry.loaddate; - this.referrerHash = oldEntry.referrerHash; - this.copyCount = oldEntry.copyCount; - this.flags = oldEntry.flags; - this.quality = oldEntry.quality; - this.language = oldEntry.language; - this.doctype = oldEntry.doctype; - this.size = oldEntry.size; - this.wordCount = oldEntry.wordCount; - // this.snippet // not read from db - // this.word // not read from db - return; - } - - // stores the values from the object variables into the database - final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, urlDateLength); - final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength); - - // store the hash in the hash cache - try { - // even if the entry exists, we simply overwrite it - final byte[][] entry = new byte[][] { + final byte[][] entry = new byte[][] { urlHash.getBytes(), url.toString().getBytes(), descr.getBytes(), // null? 
@@ -560,13 +537,8 @@ public final class plasmaCrawlLURL extends indexURL { new byte[] {(byte) doctype}, kelondroBase64Order.enhancedCoder.encodeLong(size, urlSizeLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(wordCount, urlWordCountLength).getBytes(), - }; - urlHashCache.put(urlHashCache.row().newEntry(entry)); - //serverLog.logFine("PLASMA","STORED new LURL " + url.toString()); - this.stored = true; - } catch (Exception e) { - serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaCrawlLURL:store:" + e.toString(), e); - } + }; + return urlIndexFile.row().newEntry(entry); } public String hash() { @@ -751,7 +723,7 @@ public final class plasmaCrawlLURL extends indexURL { boolean error = false; public kiter(boolean up, boolean rotating, String firstHash) throws IOException { - i = urlHashCache.rows(up, rotating, (firstHash == null) ? null : firstHash.getBytes()); + i = urlIndexFile.rows(up, rotating, (firstHash == null) ? null : firstHash.getBytes()); error = false; } @@ -817,7 +789,7 @@ public final class plasmaCrawlLURL extends indexURL { String oldUrlStr = null; try { // getting the url data as byte array - kelondroRow.Entry entry = urlHashCache.get(urlHash.getBytes()); + kelondroRow.Entry entry = urlIndexFile.get(urlHash.getBytes()); // getting the wrong url string oldUrlStr = entry.getColString(1, null).trim(); @@ -834,7 +806,7 @@ public final class plasmaCrawlLURL extends indexURL { if (res.statusCode == 200) { entry.setCol(1, newUrl.toString().getBytes()); - urlHashCache.put(entry); + urlIndexFile.put(entry); log.logInfo("UrlDB-Entry with urlHash '" + urlHash + "' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr); } else { remove(urlHash); diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java index 34b754d1c..9b6b8794a 100644 --- a/source/de/anomic/plasma/plasmaCrawlNURL.java +++ b/source/de/anomic/plasma/plasmaCrawlNURL.java @@ -75,7 +75,7 @@ public class plasmaCrawlNURL extends indexURL { public static final int STACK_TYPE_MUSIC = 13; // put on music stack /** - * column length definition for the {@link plasmaURL#urlHashCache} DB + * column length definition for the {@link plasmaURL#urlIndexFile} DB */ public final static kelondroRow rowdef = new kelondroRow( "String urlhash-" + urlHashLength + ", " + // the url's hash @@ -153,7 +153,7 @@ public class plasmaCrawlNURL extends indexURL { String newCacheName = "urlNotice4.table"; cacheStacksPath.mkdirs(); try { - urlHashCache = new kelondroFlexTable(cacheStacksPath, newCacheName, bufferkb * 0x400, preloadTime, rowdef, kelondroBase64Order.enhancedCoder); + urlIndexFile = new kelondroFlexTable(cacheStacksPath, newCacheName, bufferkb * 0x400, preloadTime, rowdef, kelondroBase64Order.enhancedCoder); } catch (IOException e) { e.printStackTrace(); System.exit(-1); @@ -161,14 +161,14 @@ public class plasmaCrawlNURL extends indexURL { } else { File oldCacheFile = new File(cacheStacksPath, "urlNotice1.db"); oldCacheFile.getParentFile().mkdirs(); - urlHashCache = kelondroTree.open(oldCacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, rowdef); + urlIndexFile = kelondroTree.open(oldCacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, rowdef); } } private void resetHashCache() { - if (urlHashCache != null) { - try {urlHashCache.close();} catch (IOException e) {} - urlHashCache = null; + if (urlIndexFile != null) { + try {urlIndexFile.close();} catch (IOException e) {} + urlIndexFile = null; File cacheFile = new 
File(cacheStacksPath, "urlNotice1.db"); cacheFile.delete(); } @@ -176,7 +176,7 @@ public class plasmaCrawlNURL extends indexURL { } public void close() { - try {urlHashCache.close();} catch (IOException e) {} + try {urlIndexFile.close();} catch (IOException e) {} coreStack.close(); limitStack.close(); overhangStack.close(); @@ -475,7 +475,7 @@ public class plasmaCrawlNURL extends indexURL { // if the url cannot be found, this returns null this.hash = hash; if (hash == null) throw new IOException("hash is null"); - kelondroRow.Entry entry = urlHashCache.get(hash.getBytes()); + kelondroRow.Entry entry = urlIndexFile.get(hash.getBytes()); if (entry != null) { insertEntry(entry); this.stored = true; @@ -532,9 +532,9 @@ public class plasmaCrawlNURL extends indexURL { this.flags.getBytes(), normalizeHandle(this.handle).getBytes() }; - if (urlHashCache == null) System.out.println("urlHashCache is NULL"); - if ((urlHashCache != null) && (urlHashCache.row() == null)) System.out.println("row() is NULL"); - urlHashCache.put(urlHashCache.row().newEntry(entry)); + if (urlIndexFile == null) System.out.println("urlHashCache is NULL"); + if ((urlIndexFile != null) && (urlIndexFile.row() == null)) System.out.println("row() is NULL"); + urlIndexFile.put(urlIndexFile.row().newEntry(entry)); this.stored = true; } catch (IOException e) { serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaNURL:store:" + e.toString() + ", resetting NURL-DB"); @@ -614,7 +614,7 @@ public class plasmaCrawlNURL extends indexURL { boolean error = false; public kiter(boolean up, boolean rotating, String firstHash) throws IOException { - i = urlHashCache.rows(up, rotating, (firstHash == null) ? null : firstHash.getBytes()); + i = urlIndexFile.rows(up, rotating, (firstHash == null) ? null : firstHash.getBytes()); error = false; } diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 103962769..7e5b58062 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -380,7 +380,7 @@ public final class plasmaCrawlStacker { String dbocc = this.sb.urlPool.exists(nexturlhash); plasmaCrawlLURL.Entry oldEntry = null; if (dbocc != null) try { - oldEntry = this.sb.urlPool.loadedURL.getEntry(nexturlhash, null); + oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null); } catch (IOException e) {} boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder()); diff --git a/source/de/anomic/plasma/plasmaDHTChunk.java b/source/de/anomic/plasma/plasmaDHTChunk.java index 9f952030d..bd44b7f8a 100644 --- a/source/de/anomic/plasma/plasmaDHTChunk.java +++ b/source/de/anomic/plasma/plasmaDHTChunk.java @@ -228,7 +228,7 @@ public class plasmaDHTChunk { while ((urlIter.hasNext()) && (maxcount > refcount)) { iEntry = (indexEntry) urlIter.next(); try { - lurl = lurls.getEntry(iEntry.urlHash(), iEntry); + lurl = lurls.load(iEntry.urlHash(), iEntry); if ((lurl == null) || (lurl.url() == null)) { //yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash()); notBoundCounter++; diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index bb760f184..da1fa3b91 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -220,8 +220,7 @@ public final class 
plasmaSearchEvent extends Thread implements Runnable { searchResult.add(rcGlobal, preorderTime); preorderTime = preorderTime - (System.currentTimeMillis() - pst); if (preorderTime < 0) preorderTime = 200; - plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking); - preorder.addContainer(searchResult, preorderTime); + plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, searchResult, preorderTime); profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_PRESORT); profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_PRESORT, rcLocal.size()); @@ -244,9 +243,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable { entry = preorder.next(); // find the url entry try { - page = urlStore.getEntry(entry.urlHash(), entry); + page = urlStore.load(entry.urlHash(), entry); // add a result - acc.addResult(entry, page); + if (page != null) acc.addResult(entry, page); } catch (IOException e) { // result was not found } @@ -279,8 +278,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { profileLocal.startTimer(); if (maxtime < 0) maxtime = 200; - plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking); - preorder.addContainer(rcLocal, maxtime); + plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, rcLocal, maxtime); profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_PRESORT); profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_PRESORT, rcLocal.size()); @@ -301,9 +299,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable { entry = preorder.next(); // find the url entry try { - page = urlStore.getEntry(entry.urlHash(), entry); + page = urlStore.load(entry.urlHash(), entry); // add a result - acc.addResult(entry, page); + if (page != null) acc.addResult(entry, page); } catch (IOException e) { // result was not found } diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java index a74bcdda7..c3a552f71 100644 --- a/source/de/anomic/plasma/plasmaSearchPreOrder.java +++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java @@ -63,6 +63,43 @@ public final class plasmaSearchPreOrder { private plasmaSearchQuery query; private plasmaSearchRankingProfile ranking; + public plasmaSearchPreOrder() { + this.entryMin = null; + this.entryMax = null; + this.pageAcc = new TreeMap(); + this.query = null; + this.ranking = null; + } + + public plasmaSearchPreOrder(plasmaSearchQuery query, plasmaSearchRankingProfile ranking, indexContainer container, long maxTime) { + this.query = query; + this.ranking = ranking; + + long limitTime = (maxTime < 0) ? 
Long.MAX_VALUE : System.currentTimeMillis() + maxTime; + indexEntry iEntry; + + // first pass: find min/max to obtain limits for normalization + Iterator i = container.entries(); + int count = 0; + this.entryMin = null; + this.entryMax = null; + while (i.hasNext()) { + if (System.currentTimeMillis() > limitTime) break; + iEntry = (indexEntry) i.next(); + if (this.entryMin == null) this.entryMin = (indexEntry) iEntry.clone(); else this.entryMin.min(iEntry); + if (this.entryMax == null) this.entryMax = (indexEntry) iEntry.clone(); else this.entryMax.max(iEntry); + count++; + } + + // second pass: normalize entries and get ranking + i = container.entries(); + this.pageAcc = new TreeMap(); + for (int j = 0; j < count; j++) { + iEntry = (indexEntry) i.next(); + pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax)), 16) + iEntry.urlHash(), iEntry); + } + } + public static void loadYBR(File rankingPath, int count) { // load ranking tables if (rankingPath.exists()) { @@ -99,17 +136,11 @@ public final class plasmaSearchPreOrder { useYBR = usage; } - public plasmaSearchPreOrder(plasmaSearchQuery query, plasmaSearchRankingProfile ranking) { - entryMin = null; - entryMax = null; - this.pageAcc = new TreeMap(); - this.query = query; - this.ranking = ranking; - } - public plasmaSearchPreOrder cloneSmart() { // clones only the top structure - plasmaSearchPreOrder theClone = new plasmaSearchPreOrder(query, ranking); + plasmaSearchPreOrder theClone = new plasmaSearchPreOrder(); + theClone.query = this.query; + theClone.ranking = this.ranking; theClone.pageAcc = (TreeMap) this.pageAcc.clone(); return theClone; } @@ -123,29 +154,6 @@ public final class plasmaSearchPreOrder { return (indexEntry) pageAcc.remove(top); } - public void addContainer(indexContainer container, long maxTime) { - long limitTime = (maxTime < 0) ? 
Long.MAX_VALUE : System.currentTimeMillis() + maxTime; - indexEntry iEntry; - - // first pass: find min/max to obtain limits for normalization - Iterator i = container.entries(); - int count = 0; - while (i.hasNext()) { - if (System.currentTimeMillis() > limitTime) break; - iEntry = (indexEntry) i.next(); - if (entryMin == null) entryMin = (indexEntry) iEntry.clone(); else entryMin.min(iEntry); - if (entryMax == null) entryMax = (indexEntry) iEntry.clone(); else entryMax.max(iEntry); - count++; - } - - // second pass: normalize entries and get ranking - i = container.entries(); - for (int j = 0; j < count; j++) { - iEntry = (indexEntry) i.next(); - pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(iEntry.generateNormalized(entryMin, entryMax)), 16) + iEntry.urlHash(), iEntry); - } - } - public indexEntry[] getNormalizer() { return new indexEntry[] {entryMin, entryMax}; } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 49f3e15f9..5b36c4af9 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -987,6 +987,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // flush some entries from the RAM cache // (new permanent cache flushing) wordIndex.flushCacheSome(sbQueue.size() != 0); + urlPool.loadedURL.flushCacheSome(); boolean doneSomething = false; @@ -1560,8 +1561,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser /* ======================================================================== * STORE URL TO LOADED-URL-DB * ======================================================================== */ - newEntry.store(); - urlPool.loadedURL.stackEntry( + urlPool.loadedURL.store(newEntry, false); + urlPool.loadedURL.stack( newEntry, // loaded url db entry initiatorPeerHash, // initiator peer hash yacyCore.seedDB.mySeed.hash, // executor peer hash @@ -1942,8 +1943,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if ((lurl != null) && (lurl.length() != 0)) { String propStr = crypt.simpleDecode(lurl, (String) page.get("key")); plasmaCrawlLURL.Entry entry = urlPool.loadedURL.newEntry(propStr, true); - entry.store(); - urlPool.loadedURL.stackEntry(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt? + urlPool.loadedURL.store(entry, false); + urlPool.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt? urlPool.noticeURL.remove(entry.hash()); log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + "). 
URL IS CONSIDERED AS 'LOADED!'"); return true; @@ -2157,7 +2158,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // determine the url string try { - plasmaCrawlLURL.Entry entry = urlPool.loadedURL.getEntry(urlhash, null); + plasmaCrawlLURL.Entry entry = urlPool.loadedURL.load(urlhash, null); URL url = entry.url(); if (url == null) return 0; diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java index eda6f0e90..0f33d875e 100644 --- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java +++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java @@ -329,7 +329,7 @@ public class plasmaSwitchboardQueue { if (referrerURL == null) { if ((referrerHash == null) || (referrerHash.equals(indexURL.dummyHash))) return null; try { - referrerURL = lurls.getEntry(referrerHash, null).url(); + referrerURL = lurls.load(referrerHash, null).url(); } catch (IOException e) { referrerURL = null; return null; diff --git a/source/de/anomic/plasma/plasmaURLPool.java b/source/de/anomic/plasma/plasmaURLPool.java index f2589915c..8f41adbd7 100644 --- a/source/de/anomic/plasma/plasmaURLPool.java +++ b/source/de/anomic/plasma/plasmaURLPool.java @@ -82,7 +82,7 @@ public class plasmaURLPool { plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash); if (ne != null) return ne.url(); try { - plasmaCrawlLURL.Entry le = loadedURL.getEntry(urlhash, null); + plasmaCrawlLURL.Entry le = loadedURL.load(urlhash, null); if (le != null) return le.url(); } catch (IOException e) {} plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 08bb390de..acaaa2504 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -692,7 +692,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { // System.out.println("Wordhash: "+wordHash+" UrlHash: // "+entry.getUrlHash()); try { - url = lurl.getEntry(entry.urlHash(), null).url(); + url = lurl.load(entry.urlHash(), null).url(); if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url) == true)) { urlHashs.add(entry.urlHash()); } diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 3f2cdd90a..b11e12ece 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -348,7 +348,13 @@ public final class yacyClient { // yacyCore.log("DEBUG QUERY: query=" + querystr + "; result = " + result.toString()); if ((result == null) || (result.size() == 0)) return -1; final String resp = (String) result.get("response"); - if (resp == null) { return -1; } else { return Integer.parseInt(resp); } + if (resp == null) { + return -1; + } else try { + return Integer.parseInt(resp); + } catch (NumberFormatException e) { + return -1; + } } catch (IOException e) { yacyCore.log.logSevere("yacyClient.queryUrlCount error asking peer '" + target.getName() + "':" + e.toString()); return -1; @@ -477,15 +483,16 @@ public final class yacyClient { // get one single search result urlEntry = urlManager.newEntry((String) result.get("resource" + n), true); if ((urlEntry == null) || (blacklist.isListed(plasmaURLPattern.BLACKLIST_SEARCH, urlEntry.url()))) { continue; } // block with backlist - urlEntry.store(); - int urlLength = urlEntry.url().toString().length(); - int urlComps = 
htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length; - - urlManager.stackEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2); + urlManager.store(urlEntry, true); + urlManager.stack(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2); + // save the url entry final indexEntry entry; if (urlEntry.word() == null) { // the old way to define words + int urlLength = urlEntry.url().toString().length(); + int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length; + entry = new indexURLEntry( urlEntry.hash(), urlLength, urlComps, diff --git a/source/yacy.java b/source/yacy.java index 3dc42e518..9de684fa6 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -727,10 +727,9 @@ public final class yacy { iEntry = (indexEntry) wordIdxEntries.next(); String urlHash = iEntry.urlHash(); if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try { - plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash, null); + plasmaCrawlLURL.Entry urlEntry = currentUrlDB.load(urlHash, null); urlCounter++; - plasmaCrawlLURL.Entry newEntry = minimizedUrlDB.newEntry(urlEntry); - newEntry.store(); + minimizedUrlDB.store(urlEntry, false); if (urlCounter % 500 == 0) { log.logInfo(urlCounter + " URLs found so far."); }
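Note on the central change in this patch: the per-Entry store() method is replaced by indexURL.store(entry, cached), backed by a new kelondroRAMIndex write cache (urlIndexCache) sitting in front of the on-disk table (urlIndexFile). Entries arriving over the network (transferURL, the search result import in yacyClient) are buffered with cached == true and drained a few rows per server cycle by flushCacheSome(), while load() falls back to the cache when a hash is not yet on disk. The following self-contained sketch only illustrates that pattern; UrlWriteCacheSketch, its String-to-String maps and the example hashes are invented placeholders for the real kelondroTree/kelondroRAMIndex and kelondroRow.Entry types.

import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;

// Toy model of the write-cache pattern added to indexURL above: entries
// received from remote peers go into a RAM map first and are drained into
// the (slow) persistent table a few rows per server cycle.
public class UrlWriteCacheSketch {

    private final Map<String, String> ramCache = new LinkedHashMap<String, String>();   // stands in for kelondroRAMIndex
    private final Map<String, String> fileIndex = new LinkedHashMap<String, String>();  // stands in for kelondroTree/kelondroFlexTable

    // mirrors indexURL.store(entry, cached): cached == true buffers in RAM
    public synchronized void store(String urlHash, String row, boolean cached) {
        if (cached) ramCache.put(urlHash, row); else fileIndex.put(urlHash, row);
    }

    // mirrors plasmaCrawlLURL.load(): look into the file first, then into the RAM cache
    public synchronized String load(String urlHash) {
        String row = fileIndex.get(urlHash);
        return (row != null) ? row : ramCache.get(urlHash);
    }

    // mirrors flushCacheSome(): move about a tenth of the buffered rows to disk
    public synchronized void flushCacheSome() {
        int flush = Math.max(1, ramCache.size() / 10);
        Iterator<Map.Entry<String, String>> i = ramCache.entrySet().iterator();
        while (flush-- > 0 && i.hasNext()) {
            Map.Entry<String, String> e = i.next();
            fileIndex.put(e.getKey(), e.getValue());
            i.remove();
        }
    }

    public static void main(String[] args) {
        UrlWriteCacheSketch cache = new UrlWriteCacheSketch();
        cache.store("hash0001", "http://example.net/a", true);   // buffered, as in transferURL
        cache.store("hash0002", "http://example.net/b", false);  // written directly, as in crawlReceipt / local crawl
        cache.flushCacheSome();                                   // called once per pass of the job loop
        System.out.println(cache.load("hash0001") + " / " + cache.load("hash0002"));
    }
}

Draining only about a tenth of the buffer per call keeps each pass cheap, which matches how plasmaSwitchboard now invokes urlPool.loadedURL.flushCacheSome() right next to wordIndex.flushCacheSome() in its job loop, and how close() keeps calling flushCacheOnce() until the cache is empty before shutting the file down.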
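The plasmaSearchPreOrder change folds the old addContainer() into the constructor: one pass over the index container collects per-attribute minima and maxima, a second pass scales every entry into the 0..255 range and files it into a TreeMap under a ranking key. A rough stand-alone sketch of that two-pass scheme, assuming the Hit class, the single wordDistance attribute and the hex key are hypothetical simplifications of indexEntry, generateNormalized() and ranking.preRanking():

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

// Sketch of the two-pass pre-order now run in the plasmaSearchPreOrder constructor.
public class PreOrderSketch {
    static class Hit { String urlHash; int wordDistance; Hit(String h, int d) { urlHash = h; wordDistance = d; } }

    public static void main(String[] args) {
        List<Hit> hits = Arrays.asList(new Hit("aaaa", 2), new Hit("bbbb", 9), new Hit("cccc", 5));

        // first pass: find min/max to obtain limits for normalization
        int min = Integer.MAX_VALUE, max = Integer.MIN_VALUE;
        for (Hit h : hits) { min = Math.min(min, h.wordDistance); max = Math.max(max, h.wordDistance); }

        // second pass: normalize each entry into 1..255 and order it by the
        // normalized score concatenated with the url hash
        TreeMap<String, Hit> pageAcc = new TreeMap<String, Hit>();
        for (Hit h : hits) {
            int normalized = (h.wordDistance == 0) ? 0 : 1 + 255 * (h.wordDistance - min) / (1 + max - min);
            pageAcc.put(String.format("%08x", normalized) + h.urlHash, h);
        }
        for (Map.Entry<String, Hit> e : pageAcc.entrySet()) System.out.println(e.getKey());
    }
}

The debug println added to indexURLEntry.normalize() watches the same arithmetic: it dumps the min/max pair whenever the divisor 1 + max.worddistance() - min.worddistance() collapses to zero, the situation flagged by the FIXME comment in that hunk.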