added write cache for LURLs

This was necessary to speed up the index receive process during global search


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2498 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 8a0e35618b
commit 4866868c0e

@ -136,7 +136,7 @@ public class Bookmarks {
if (bookmark == null) {
// try to get the bookmark from the LURL database
try {
plasmaCrawlLURL.Entry urlentry = switchboard.urlPool.loadedURL.getEntry(urlHash, null);
plasmaCrawlLURL.Entry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null);
prop.put("mode_edit", 0); // create mode
prop.put("mode_title", urlentry.descr());
prop.put("mode_description", urlentry.descr());

@ -212,7 +212,7 @@ public class IndexControl_p {
if (post.containsKey("urlhashdelete")) {
try {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash, null);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
URL url = entry.url();
urlstring = url.toNormalform();
prop.put("urlstring", "");
@ -262,7 +262,7 @@ public class IndexControl_p {
while (urlIter.hasNext()) {
iEntry = (indexEntry) urlIter.next();
try {
lurl = switchboard.urlPool.loadedURL.getEntry(iEntry.urlHash(), null);
lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null);
if (lurl.toString() == null) {
unknownURLEntries.add(iEntry.urlHash());
urlIter.remove();
@ -311,7 +311,7 @@ public class IndexControl_p {
URL url = new URL(urlstring);
urlhash = indexURL.urlHash(url);
prop.put("urlhash", urlhash);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash, null);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
prop.put("result", genUrlProfile(switchboard, entry, urlhash));
} catch (MalformedURLException e) {
prop.put("urlstring", "bad url: " + urlstring);
@ -324,7 +324,7 @@ public class IndexControl_p {
if (post.containsKey("urlhashsearch")) {
try {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash, null);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
URL url = entry.url();
urlstring = url.toString();
prop.put("urlstring", urlstring);
@ -387,7 +387,7 @@ public class IndexControl_p {
URL url = entry.url();
String referrer = null;
try {
referrer = switchboard.urlPool.loadedURL.getEntry(entry.referrerHash(), null).url().toString();
referrer = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null).url().toString();
} catch (IOException e) {
referrer = "<unknown>";
}
@ -444,7 +444,7 @@ public class IndexControl_p {
xi = (indexEntry) en.next();
uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())};
try {
us = switchboard.urlPool.loadedURL.getEntry(uh[0], null).url().toString();
us = switchboard.urlPool.loadedURL.load(uh[0], null).url().toString();
tm.put(us, uh);
} catch (IOException e) {
tm.put(uh[0], uh);

@ -75,6 +75,14 @@ Changes take effect immediately</td>
<td class=small>DHT</td>
<td class=small>Description</td>
</tr>
<tr valign="top" class="TableCellDark">
<td class=small>URLs in RAM cache:</td>
<td class=small colspan="2" align="center">#[urlCacheSize]#</td>
<td class=small>
This is the size of the URL cache. Its purpose is to buffer incoming URLs
during search result transmission and DHT transfers.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td class=small>Words in RAM cache:</td>
<td class=small>#[wordCacheWSize]#</td>

@ -258,6 +258,7 @@ public class PerformanceQueues_p {
}
// table cache settings
prop.put("urlCacheSize", switchboard.urlPool.loadedURL.writeCacheSize());
prop.put("wordCacheWSize", switchboard.wordIndex.wSize());
prop.put("wordCacheKSize", switchboard.wordIndex.kSize());
prop.put("maxURLinWCache", "" + switchboard.wordIndex.maxURLinWCache());

@ -107,7 +107,7 @@ public class ViewFile {
// getting the urlEntry that belongs to the url hash
Entry urlEntry = null;
try {
urlEntry = sb.urlPool.loadedURL.getEntry(urlHash, null);
urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
} catch (IOException e) {
prop.put("error",2);
prop.put("viewMode",VIEW_MODE_NO_TEXT);

@ -368,8 +368,8 @@ public class dir {
phrase.length(), /*size*/
condenser.RESULT_NUMB_WORDS
);
newEntry.store();
switchboard.urlPool.loadedURL.stackEntry(
switchboard.urlPool.loadedURL.store(newEntry, false);
switchboard.urlPool.loadedURL.stack(
newEntry,
"____________", /*initiator*/
yacyCore.seedDB.mySeed.hash, /*executor*/

@ -250,7 +250,7 @@ public final class crawlOrder {
reason = reasonString;
// send lurl-Entry as response
try {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(indexURL.urlHash(url), null);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null);
response = "double";
switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare);
lurl = crypt.simpleEncode(entry.toString());

@ -128,10 +128,10 @@ public final class crawlReceipt {
if ((entry == null)||(entry.url()==null)) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT for hash " + receivedUrlhash + " from peer " + iam +
"\n\tURL properties: "+ propStr);
} else {
} else try {
// put new entry into database
entry.store();
switchboard.urlPool.loadedURL.stackEntry(entry, youare, iam, 1);
switchboard.urlPool.loadedURL.store(entry, false);
switchboard.urlPool.loadedURL.stack(entry, youare, iam, 1);
// generating url hash
String newUrlHash = indexURL.urlHash(entry.url());
@ -142,6 +142,8 @@ public final class crawlReceipt {
switchboard.urlPool.noticeURL.remove(oldUrlHash);
log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + receivedUrlhash + ":" + entry.url());
} catch (IOException e) {
e.printStackTrace();
}
// ready for more

@ -45,6 +45,8 @@
// You must compile this file with
// javac -classpath .:../classes transferRWI.java
import java.io.IOException;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaCrawlLURL;
@ -103,13 +105,13 @@ public final class transferURL {
yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
lEntry = null;
blocked++;
} else {
lEntry.store();
sb.urlPool.loadedURL.stackEntry(lEntry, iam, iam, 3);
yacyCore.log.logFine("transferURL: received URL '"
+ lEntry.url() + "' from peer "
+ otherPeerName);
} else try {
sb.urlPool.loadedURL.store(lEntry, true);
sb.urlPool.loadedURL.stack(lEntry, iam, iam, 3);
yacyCore.log.logFine("transferURL: received URL '" + lEntry.url() + "' from peer " + otherPeerName);
received++;
} catch (IOException e) {
e.printStackTrace();
}
} else {
yacyCore.log.logWarning("transferURL: received invalid URL from peer " + otherPeerName +

@ -28,13 +28,17 @@ package de.anomic.index;
import java.io.IOException;
import de.anomic.net.URL;
import java.net.MalformedURLException;
import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.Iterator;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRAMIndex;
import de.anomic.kelondro.kelondroTree;
import de.anomic.kelondro.kelondroRow;
import de.anomic.server.serverCodings;
import de.anomic.yacy.yacySeedDB;
@ -406,24 +410,57 @@ public class indexURL {
// the class object
protected kelondroIndex urlHashCache;
protected kelondroIndex urlIndexFile = null;
protected kelondroRAMIndex urlIndexCache = null;
public indexURL() {
urlHashCache = null;
urlIndexFile = null;
urlIndexCache = null;
}
public int size() {
try {
return urlHashCache.size();
return urlIndexFile.size() + ((urlIndexCache == null) ? 0 : urlIndexCache.size());
} catch (IOException e) {
return 0;
}
}
/**
 * Stores a raw table row either into the RAM write cache or directly into
 * the on-disk index file.
 *
 * @param entry  the row to persist
 * @param cached if true and the write cache exists, the row is buffered in
 *               RAM (to be flushed later by flushCacheOnce/flushCacheSome);
 *               otherwise it is written straight to urlIndexFile
 * @throws IOException if the direct write to urlIndexFile fails
 */
public void store(kelondroRow.Entry entry, boolean cached) throws IOException {
// urlIndexCache may be null (not every constructor path creates it), so the
// cached branch is only taken when the cache actually exists
if ((cached) && (urlIndexCache != null))
synchronized (urlIndexCache) {
urlIndexCache.put(entry);
}
else
urlIndexFile.put(entry);
}
/**
 * Flushes a portion (about 10%, at least one entry) of the RAM write cache
 * to the on-disk index. Intended to be called periodically so the cache
 * drains gradually instead of in one large burst.
 */
public void flushCacheSome() {
// nothing to do if there is no cache or it is empty
if (urlIndexCache == null) return;
if (urlIndexCache.size() == 0) return;
// flush one tenth of the cache, but always at least one entry
int flush = Math.max(1, urlIndexCache.size() / 10);
while (flush-- > 0) flushCacheOnce();
}
/**
 * Moves exactly one entry from the RAM write cache to the on-disk index
 * file. The move is done under the cache lock so concurrent store/load
 * calls see a consistent cache.
 */
public void flushCacheOnce() {
// nothing to flush if the cache does not exist or is empty
if (urlIndexCache == null) return;
if (urlIndexCache.size() == 0) return;
synchronized (urlIndexCache) {
// take the first row from the cache iterator ...
Iterator i = urlIndexCache.rows(true, false, null);
try {
// ... write it to disk, then remove it from the cache
urlIndexFile.put((kelondroRow.Entry) i.next());
i.remove();
} catch (IOException e) {
// NOTE(review): a failed disk write is only logged; the entry stays in
// the cache and will be retried on the next flush
e.printStackTrace();
}
}
}
public boolean remove(String hash) {
if (hash == null) return false;
try {
urlHashCache.remove(hash.getBytes());
urlIndexFile.remove(hash.getBytes());
if (urlIndexCache != null) synchronized (urlIndexCache) {urlIndexCache.remove(hash.getBytes());}
return true;
} catch (IOException e) {
return false;
@ -431,26 +468,38 @@ public class indexURL {
}
public void close() throws IOException {
if (urlHashCache != null) urlHashCache.close();
while ((urlIndexCache != null) && (urlIndexCache.size() > 0)) flushCacheOnce();
if (urlIndexFile != null) {
urlIndexFile.close();
urlIndexFile = null;
}
if (urlIndexCache != null) {
urlIndexCache.close();
urlIndexCache = null;
}
}
/**
 * @return the number of entries currently buffered in the RAM write cache,
 *         or 0 if no cache was created
 */
public int writeCacheSize() {
return (urlIndexCache == null) ? 0 : urlIndexCache.size();
}
public int cacheNodeChunkSize() {
if (urlHashCache instanceof kelondroTree) return ((kelondroTree) urlHashCache).cacheNodeChunkSize();
if (urlIndexFile instanceof kelondroTree) return ((kelondroTree) urlIndexFile).cacheNodeChunkSize();
return 0;
}
public int[] cacheNodeStatus() {
if (urlHashCache instanceof kelondroTree) return ((kelondroTree) urlHashCache).cacheNodeStatus();
if (urlIndexFile instanceof kelondroTree) return ((kelondroTree) urlIndexFile).cacheNodeStatus();
return new int[]{0,0,0,0,0,0,0,0,0,0};
}
public int cacheObjectChunkSize() {
if (urlHashCache instanceof kelondroTree) return ((kelondroTree) urlHashCache).cacheObjectChunkSize();
if (urlIndexFile instanceof kelondroTree) return ((kelondroTree) urlIndexFile).cacheObjectChunkSize();
return 0;
}
public long[] cacheObjectStatus() {
if (urlHashCache instanceof kelondroTree) return ((kelondroTree) urlHashCache).cacheObjectStatus();
if (urlIndexFile instanceof kelondroTree) return ((kelondroTree) urlIndexFile).cacheObjectStatus();
return new long[]{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
}

@ -258,13 +258,14 @@ public class indexURLEntry implements Cloneable, indexEntry {
}
static void normalize(indexURLEntry t, indexEntry min, indexEntry max) {
if (1 + max.worddistance() - min.worddistance() == 0) System.out.println("min = " + min.toPropertyForm() + "\nmax=" + max.toPropertyForm());
t.entry.setCol(col_hitcount , (t.hitcount() == 0) ? 0 : 1 + 255 * (t.hitcount() - min.hitcount() ) / (1 + max.hitcount() - min.hitcount()));
t.entry.setCol(col_wordcount , (t.wordcount() == 0) ? 0 : 1 + 255 * (t.wordcount() - min.wordcount() ) / (1 + max.wordcount() - min.wordcount()));
t.entry.setCol(col_phrasecount , (t.phrasecount() == 0) ? 0 : 1 + 255 * (t.phrasecount() - min.phrasecount() ) / (1 + max.phrasecount() - min.phrasecount()));
t.entry.setCol(col_posintext , (t.posintext() == 0) ? 0 : 1 + 255 * (t.posintext() - min.posintext() ) / (1 + max.posintext() - min.posintext()));
t.entry.setCol(col_posinphrase , (t.posinphrase() == 0) ? 0 : 1 + 255 * (t.posinphrase() - min.posinphrase() ) / (1 + max.posinphrase() - min.posinphrase()));
t.entry.setCol(col_posofphrase , (t.posofphrase() == 0) ? 0 : 1 + 255 * (t.posofphrase() - min.posofphrase() ) / (1 + max.posofphrase() - min.posofphrase()));
t.entry.setCol(col_worddistance , (t.worddistance() == 0) ? 0 : 1 + 255 * (t.worddistance() - min.worddistance()) / (1 + max.worddistance() - min.worddistance()));
t.entry.setCol(col_worddistance , (t.worddistance() == 0) ? 0 : 1 + 255 * (t.worddistance() - min.worddistance()) / (1 + max.worddistance() - min.worddistance())); // FIXME: hier gibts ein division by zero, was nur sein kann wenn die Normalisierung nicht geklappt hat.
t.entry.setCol(col_lastModified , (t.lastModified() == 0) ? 0 : 1 + 255 * (t.lastModified() - min.lastModified()) / (1 + max.lastModified() - min.lastModified()));
t.entry.setCol(col_quality , (t.quality() == 0) ? 0 : 1 + 255 * (t.quality() - min.quality() ) / (1 + max.quality() - min.quality()));
}

@ -156,11 +156,10 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
// we need to import the url
try {
// getting the url entry
plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.getEntry(urlHash, null);
plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.load(urlHash, null);
/* write it into the home url db */
plasmaCrawlLURL.Entry homeEntry = this.homeUrlDB.newEntry(urlEntry);
homeEntry.store();
this.homeUrlDB.store(urlEntry, false);
importedUrlBuffer.add(urlHash);
this.urlCounter++;

@ -144,7 +144,7 @@ public class plasmaCrawlEURL extends indexURL {
String newCacheName = "urlErr3.table";
cachePath.mkdirs();
try {
urlHashCache = new kelondroFlexTable(cachePath, newCacheName, bufferkb * 0x400, preloadTime, rowdef, kelondroBase64Order.enhancedCoder);
urlIndexFile = new kelondroFlexTable(cachePath, newCacheName, bufferkb * 0x400, preloadTime, rowdef, kelondroBase64Order.enhancedCoder);
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
@ -152,7 +152,7 @@ public class plasmaCrawlEURL extends indexURL {
} else {
File oldCacheFile = new File(cachePath, "urlErr0.db");
oldCacheFile.getParentFile().mkdirs();
urlHashCache = kelondroTree.open(oldCacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, rowdef);
urlIndexFile = kelondroTree.open(oldCacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, rowdef);
}
}
@ -181,7 +181,7 @@ public class plasmaCrawlEURL extends indexURL {
public boolean exists(String urlHash) {
try {
return (urlHashCache.get(urlHash.getBytes()) != null);
return (urlIndexFile.get(urlHash.getBytes()) != null);
} catch (IOException e) {
return false;
}
@ -236,7 +236,7 @@ public class plasmaCrawlEURL extends indexURL {
// - look into the filed properties
// if the url cannot be found, this returns null
this.hash = hash;
kelondroRow.Entry entry = urlHashCache.get(hash.getBytes());
kelondroRow.Entry entry = urlIndexFile.get(hash.getBytes());
if (entry != null) {
insertEntry(entry);
}
@ -288,7 +288,7 @@ public class plasmaCrawlEURL extends indexURL {
this.failreason.getBytes(),
this.flags.getBytes()
};
urlHashCache.put(urlHashCache.row().newEntry(entry));
urlIndexFile.put(urlIndexFile.row().newEntry(entry));
this.stored = true;
} catch (IOException e) {
System.out.println("INTERNAL ERROR AT plasmaEURL:url2hash:" + e.toString());
@ -346,7 +346,7 @@ public class plasmaCrawlEURL extends indexURL {
boolean error = false;
public kiter(boolean up, boolean rotating, String firstHash) throws IOException {
i = urlHashCache.rows(up, rotating, (firstHash == null) ? null : firstHash.getBytes());
i = urlIndexFile.rows(up, rotating, (firstHash == null) ? null : firstHash.getBytes());
error = false;
}

@ -70,6 +70,8 @@ import de.anomic.index.indexEntry;
import de.anomic.index.indexURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroRAMIndex;
import de.anomic.kelondro.kelondroTree;
import de.anomic.kelondro.kelondroRow;
import de.anomic.plasma.plasmaHTCache;
@ -117,7 +119,8 @@ public final class plasmaCrawlLURL extends indexURL {
cacheFile.getParentFile().mkdirs();
try {
urlHashCache = new kelondroTree(cacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, rowdef);
urlIndexFile = new kelondroTree(cacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, rowdef);
urlIndexCache = new kelondroRAMIndex(kelondroNaturalOrder.naturalOrder, rowdef);
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
@ -132,7 +135,7 @@ public final class plasmaCrawlLURL extends indexURL {
gcrawlResultStack = new LinkedList();
}
public synchronized void stackEntry(Entry e, String initiatorHash, String executorHash, int stackType) {
public synchronized void stack(Entry e, String initiatorHash, String executorHash, int stackType) {
if (e == null) { return; }
try {
if (initiatorHash == null) { initiatorHash = dummyHash; }
@ -157,27 +160,58 @@ public final class plasmaCrawlLURL extends indexURL {
gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
}
public Entry getEntry(String hash, indexEntry searchedWord) throws IOException {
return new Entry(hash, searchedWord);
/**
 * Loads a plasmaLURL Entry for the given url hash.
 * The lookup first checks the on-disk index file and then falls back to the
 * RAM write cache, so entries that have been stored but not yet flushed are
 * still found.
 *
 * @param urlHash      the hash of the url to look up
 * @param searchedWord the word entry this url was found for (may be null)
 * @return the Entry, or null if the hash is unknown
 * @throws IOException on index access failure
 */
public Entry load(String urlHash, indexEntry searchedWord) throws IOException {
// generates a plasmaLURLEntry using the url hash
// we have two options to find the url:
// - look into the filed properties (disk index)
// - look into the RAM write cache
// if the url cannot be found, this returns null
kelondroRow.Entry entry = urlIndexFile.get(urlHash.getBytes());
// guard against a missing write cache: urlIndexCache is only created in one
// constructor path, and siblings (size, remove, writeCacheSize) all check
// for null before using it. Access under the same lock store()/remove() use.
if ((entry == null) && (urlIndexCache != null)) {
    synchronized (urlIndexCache) {
        entry = urlIndexCache.get(urlHash.getBytes());
    }
}
if (entry == null) return null;
return new Entry(entry, searchedWord);
}
public synchronized Entry newEntry(Entry oldEntry) {
if (oldEntry == null) return null;
return new Entry(
oldEntry.url(),
oldEntry.descr(),
oldEntry.moddate(),
oldEntry.loaddate(),
oldEntry.referrerHash(),
oldEntry.copyCount(),
oldEntry.local(),
oldEntry.quality(),
oldEntry.language(),
oldEntry.doctype(),
oldEntry.size(),
oldEntry.wordCount());
/**
 * Stores an Entry, but only if no more recent version already exists in the
 * database. If an existing entry is newer, this Entry's fields are updated
 * from it instead and nothing is written.
 *
 * @param entry  the url entry to persist
 * @param cached if true the row is buffered in the RAM write cache instead
 *               of being written directly to disk
 * @throws IOException if converting or writing the row fails
 */
public void store(Entry entry, boolean cached) throws IOException {
// Check if there is a more recent Entry already in the DB
if (entry.stored) return;
Entry oldEntry;
try {
if (exists(entry.urlHash)) {
oldEntry = load(entry.urlHash, null);
} else {
oldEntry = null;
}
} catch (Exception e) {
// any lookup failure is treated as "no previous entry"
oldEntry = null;
}
if ((oldEntry != null) && (entry.isOlder(oldEntry))) {
// the fetched oldEntry is better, so return its properties instead of the new ones
// this.urlHash = oldEntry.urlHash; // unnecessary, should be the same
// this.url = oldEntry.url; // unnecessary, should be the same
entry.descr = oldEntry.descr;
entry.moddate = oldEntry.moddate;
entry.loaddate = oldEntry.loaddate;
entry.referrerHash = oldEntry.referrerHash;
entry.copyCount = oldEntry.copyCount;
entry.flags = oldEntry.flags;
entry.quality = oldEntry.quality;
entry.language = oldEntry.language;
entry.doctype = oldEntry.doctype;
entry.size = oldEntry.size;
entry.wordCount = oldEntry.wordCount;
// this.snippet // not read from db
// this.word // not read from db
// mark as stored so a later call becomes a no-op
entry.stored = true;
return; // this did not need to be stored, but is updated
}
// the new entry wins: write it via the shared cache-aware store
super.store(entry.toRowEntry(), cached);
entry.stored = true;
}
public synchronized Entry newEntry(String propStr, boolean setGlobal) {
if (propStr.startsWith("{") && propStr.endsWith("}")) {
return new Entry(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)), setGlobal);
@ -281,7 +315,7 @@ public final class plasmaCrawlLURL extends indexURL {
public boolean exists(String urlHash) {
try {
if (urlHashCache.get(urlHash.getBytes()) != null) {
if (urlIndexFile.get(urlHash.getBytes()) != null) {
return true;
} else {
return false;
@ -343,7 +377,7 @@ public final class plasmaCrawlLURL extends indexURL {
urlHash = getUrlHash(tabletype, i);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash);
try {
urle = getEntry(urlHash, null);
urle = load(urlHash, null);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString());
initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash);
executorSeed = yacyCore.seedDB.getConnected(executorHash);
@ -374,7 +408,7 @@ public final class plasmaCrawlLURL extends indexURL {
prop.put("table_indexed", cnt);
return prop;
}
public class Entry {
private URL url;
@ -426,29 +460,8 @@ public final class plasmaCrawlLURL extends indexURL {
this.word = null;
this.stored = false;
}
public Entry(String urlHash, indexEntry searchedWord) throws IOException {
// generates an plasmaLURLEntry using the url hash
// to speed up the access, the url-hashes are buffered
// in the hash cache.
// we have two options to find the url:
// - look into the hash cache
// - look into the filed properties
// if the url cannot be found, this returns null
this.urlHash = urlHash;
kelondroRow.Entry entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes());
if (entry == null) throw new IOException("url hash " + urlHash + " not found in LURL");
insertEntry(entry, searchedWord);
this.stored = true;
}
public Entry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException {
assert (entry != null);
insertEntry(entry, word);
this.stored = false;
}
private void insertEntry(kelondroRow.Entry entry, indexEntry searchedWord) throws IOException {
try {
this.urlHash = entry.getColString(0, null);
this.url = new URL(entry.getColString(1, "UTF-8").trim());
@ -505,48 +518,12 @@ public final class plasmaCrawlLURL extends indexURL {
serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/2: " + e.toString(), e);
}
}
public kelondroRow.Entry toRowEntry() throws IOException {
final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, urlDateLength);
final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength);
public void store() {
// Check if there is a more recent Entry already in the DB
if (this.stored) return;
Entry oldEntry;
try {
if (exists(urlHash)) {
oldEntry = new Entry(urlHash, null);
} else {
oldEntry = null;
}
} catch (Exception e) {
oldEntry = null;
}
if ((oldEntry != null) && (isOlder(oldEntry))) {
// the fetched oldEntry is better, so return its properties instead of the new ones
// this.urlHash = oldEntry.urlHash; // unnecessary, should be the same
// this.url = oldEntry.url; // unnecessary, should be the same
this.descr = oldEntry.descr;
this.moddate = oldEntry.moddate;
this.loaddate = oldEntry.loaddate;
this.referrerHash = oldEntry.referrerHash;
this.copyCount = oldEntry.copyCount;
this.flags = oldEntry.flags;
this.quality = oldEntry.quality;
this.language = oldEntry.language;
this.doctype = oldEntry.doctype;
this.size = oldEntry.size;
this.wordCount = oldEntry.wordCount;
// this.snippet // not read from db
// this.word // not read from db
return;
}
// stores the values from the object variables into the database
final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, urlDateLength);
final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength);
// store the hash in the hash cache
try {
// even if the entry exists, we simply overwrite it
final byte[][] entry = new byte[][] {
final byte[][] entry = new byte[][] {
urlHash.getBytes(),
url.toString().getBytes(),
descr.getBytes(), // null?
@ -560,13 +537,8 @@ public final class plasmaCrawlLURL extends indexURL {
new byte[] {(byte) doctype},
kelondroBase64Order.enhancedCoder.encodeLong(size, urlSizeLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(wordCount, urlWordCountLength).getBytes(),
};
urlHashCache.put(urlHashCache.row().newEntry(entry));
//serverLog.logFine("PLASMA","STORED new LURL " + url.toString());
this.stored = true;
} catch (Exception e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaCrawlLURL:store:" + e.toString(), e);
}
};
return urlIndexFile.row().newEntry(entry);
}
public String hash() {
@ -751,7 +723,7 @@ public final class plasmaCrawlLURL extends indexURL {
boolean error = false;
public kiter(boolean up, boolean rotating, String firstHash) throws IOException {
i = urlHashCache.rows(up, rotating, (firstHash == null) ? null : firstHash.getBytes());
i = urlIndexFile.rows(up, rotating, (firstHash == null) ? null : firstHash.getBytes());
error = false;
}
@ -817,7 +789,7 @@ public final class plasmaCrawlLURL extends indexURL {
String oldUrlStr = null;
try {
// getting the url data as byte array
kelondroRow.Entry entry = urlHashCache.get(urlHash.getBytes());
kelondroRow.Entry entry = urlIndexFile.get(urlHash.getBytes());
// getting the wrong url string
oldUrlStr = entry.getColString(1, null).trim();
@ -834,7 +806,7 @@ public final class plasmaCrawlLURL extends indexURL {
if (res.statusCode == 200) {
entry.setCol(1, newUrl.toString().getBytes());
urlHashCache.put(entry);
urlIndexFile.put(entry);
log.logInfo("UrlDB-Entry with urlHash '" + urlHash + "' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr);
} else {
remove(urlHash);

@ -75,7 +75,7 @@ public class plasmaCrawlNURL extends indexURL {
public static final int STACK_TYPE_MUSIC = 13; // put on music stack
/**
* column length definition for the {@link plasmaURL#urlHashCache} DB
* column length definition for the {@link plasmaURL#urlIndexFile} DB
*/
public final static kelondroRow rowdef = new kelondroRow(
"String urlhash-" + urlHashLength + ", " + // the url's hash
@ -153,7 +153,7 @@ public class plasmaCrawlNURL extends indexURL {
String newCacheName = "urlNotice4.table";
cacheStacksPath.mkdirs();
try {
urlHashCache = new kelondroFlexTable(cacheStacksPath, newCacheName, bufferkb * 0x400, preloadTime, rowdef, kelondroBase64Order.enhancedCoder);
urlIndexFile = new kelondroFlexTable(cacheStacksPath, newCacheName, bufferkb * 0x400, preloadTime, rowdef, kelondroBase64Order.enhancedCoder);
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
@ -161,14 +161,14 @@ public class plasmaCrawlNURL extends indexURL {
} else {
File oldCacheFile = new File(cacheStacksPath, "urlNotice1.db");
oldCacheFile.getParentFile().mkdirs();
urlHashCache = kelondroTree.open(oldCacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, rowdef);
urlIndexFile = kelondroTree.open(oldCacheFile, bufferkb * 0x400, preloadTime, kelondroTree.defaultObjectCachePercent, rowdef);
}
}
private void resetHashCache() {
if (urlHashCache != null) {
try {urlHashCache.close();} catch (IOException e) {}
urlHashCache = null;
if (urlIndexFile != null) {
try {urlIndexFile.close();} catch (IOException e) {}
urlIndexFile = null;
File cacheFile = new File(cacheStacksPath, "urlNotice1.db");
cacheFile.delete();
}
@ -176,7 +176,7 @@ public class plasmaCrawlNURL extends indexURL {
}
public void close() {
try {urlHashCache.close();} catch (IOException e) {}
try {urlIndexFile.close();} catch (IOException e) {}
coreStack.close();
limitStack.close();
overhangStack.close();
@ -475,7 +475,7 @@ public class plasmaCrawlNURL extends indexURL {
// if the url cannot be found, this returns null
this.hash = hash;
if (hash == null) throw new IOException("hash is null");
kelondroRow.Entry entry = urlHashCache.get(hash.getBytes());
kelondroRow.Entry entry = urlIndexFile.get(hash.getBytes());
if (entry != null) {
insertEntry(entry);
this.stored = true;
@ -532,9 +532,9 @@ public class plasmaCrawlNURL extends indexURL {
this.flags.getBytes(),
normalizeHandle(this.handle).getBytes()
};
if (urlHashCache == null) System.out.println("urlHashCache is NULL");
if ((urlHashCache != null) && (urlHashCache.row() == null)) System.out.println("row() is NULL");
urlHashCache.put(urlHashCache.row().newEntry(entry));
if (urlIndexFile == null) System.out.println("urlHashCache is NULL");
if ((urlIndexFile != null) && (urlIndexFile.row() == null)) System.out.println("row() is NULL");
urlIndexFile.put(urlIndexFile.row().newEntry(entry));
this.stored = true;
} catch (IOException e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaNURL:store:" + e.toString() + ", resetting NURL-DB");
@ -614,7 +614,7 @@ public class plasmaCrawlNURL extends indexURL {
boolean error = false;
public kiter(boolean up, boolean rotating, String firstHash) throws IOException {
i = urlHashCache.rows(up, rotating, (firstHash == null) ? null : firstHash.getBytes());
i = urlIndexFile.rows(up, rotating, (firstHash == null) ? null : firstHash.getBytes());
error = false;
}

@ -380,7 +380,7 @@ public final class plasmaCrawlStacker {
String dbocc = this.sb.urlPool.exists(nexturlhash);
plasmaCrawlLURL.Entry oldEntry = null;
if (dbocc != null) try {
oldEntry = this.sb.urlPool.loadedURL.getEntry(nexturlhash, null);
oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null);
} catch (IOException e) {}
boolean recrawl = (oldEntry != null) &&
(((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());

@ -228,7 +228,7 @@ public class plasmaDHTChunk {
while ((urlIter.hasNext()) && (maxcount > refcount)) {
iEntry = (indexEntry) urlIter.next();
try {
lurl = lurls.getEntry(iEntry.urlHash(), iEntry);
lurl = lurls.load(iEntry.urlHash(), iEntry);
if ((lurl == null) || (lurl.url() == null)) {
//yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash());
notBoundCounter++;

@ -220,8 +220,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
searchResult.add(rcGlobal, preorderTime);
preorderTime = preorderTime - (System.currentTimeMillis() - pst);
if (preorderTime < 0) preorderTime = 200;
plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking);
preorder.addContainer(searchResult, preorderTime);
plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, searchResult, preorderTime);
profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_PRESORT);
profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_PRESORT, rcLocal.size());
@ -244,9 +243,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
entry = preorder.next();
// find the url entry
try {
page = urlStore.getEntry(entry.urlHash(), entry);
page = urlStore.load(entry.urlHash(), entry);
// add a result
acc.addResult(entry, page);
if (page != null) acc.addResult(entry, page);
} catch (IOException e) {
// result was not found
}
@ -279,8 +278,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
profileLocal.startTimer();
if (maxtime < 0) maxtime = 200;
plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking);
preorder.addContainer(rcLocal, maxtime);
plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, rcLocal, maxtime);
profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_PRESORT);
profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_PRESORT, rcLocal.size());
@ -301,9 +299,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
entry = preorder.next();
// find the url entry
try {
page = urlStore.getEntry(entry.urlHash(), entry);
page = urlStore.load(entry.urlHash(), entry);
// add a result
acc.addResult(entry, page);
if (page != null) acc.addResult(entry, page);
} catch (IOException e) {
// result was not found
}

@ -63,6 +63,43 @@ public final class plasmaSearchPreOrder {
private plasmaSearchQuery query;
private plasmaSearchRankingProfile ranking;
/**
 * Creates an empty pre-order with no query, ranking, or normalization
 * limits; presumably used as a blank shell to be filled field-by-field
 * (e.g. by cloneSmart) — TODO confirm against callers.
 */
public plasmaSearchPreOrder() {
this.entryMin = null;
this.entryMax = null;
this.pageAcc = new TreeMap();
this.query = null;
this.ranking = null;
}
/**
 * Builds a ranked pre-order of all index entries in the given container.
 * Works in two passes: the first pass computes per-attribute minima and
 * maxima (for normalization), the second pass normalizes each entry and
 * inserts it into pageAcc keyed by its pre-ranking value.
 *
 * @param query    the search query this order belongs to
 * @param ranking  the ranking profile used to compute the pre-ranking key
 * @param container the index entries to order
 * @param maxTime  time budget in ms for the min/max pass; negative means
 *                 unlimited
 */
public plasmaSearchPreOrder(plasmaSearchQuery query, plasmaSearchRankingProfile ranking, indexContainer container, long maxTime) {
this.query = query;
this.ranking = ranking;
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
indexEntry iEntry;
// first pass: find min/max to obtain limits for normalization
Iterator i = container.entries();
int count = 0;
this.entryMin = null;
this.entryMax = null;
while (i.hasNext()) {
// respect the time budget; entries not visited here are also skipped
// in the second pass (count bounds both loops)
if (System.currentTimeMillis() > limitTime) break;
iEntry = (indexEntry) i.next();
if (this.entryMin == null) this.entryMin = (indexEntry) iEntry.clone(); else this.entryMin.min(iEntry);
if (this.entryMax == null) this.entryMax = (indexEntry) iEntry.clone(); else this.entryMax.max(iEntry);
count++;
}
// second pass: normalize entries and get ranking
// NOTE(review): assumes container.entries() yields the same order both
// times, so the first `count` entries match those scanned above — confirm
i = container.entries();
this.pageAcc = new TreeMap();
for (int j = 0; j < count; j++) {
iEntry = (indexEntry) i.next();
// key = hex-encoded pre-ranking + url hash, so equal rankings stay unique
pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax)), 16) + iEntry.urlHash(), iEntry);
}
}
public static void loadYBR(File rankingPath, int count) {
// load ranking tables
if (rankingPath.exists()) {
@ -99,17 +136,11 @@ public final class plasmaSearchPreOrder {
useYBR = usage;
}
// Lightweight pre-order bound to a query/ranking profile but with no entries yet;
// normalization bounds stay null until containers are added.
public plasmaSearchPreOrder(plasmaSearchQuery query, plasmaSearchRankingProfile ranking) {
entryMin = null;
entryMax = null;
this.pageAcc = new TreeMap();   // empty ranked accumulator
this.query = query;
this.ranking = ranking;
}
public plasmaSearchPreOrder cloneSmart() {
// clones only the top structure
plasmaSearchPreOrder theClone = new plasmaSearchPreOrder(query, ranking);
plasmaSearchPreOrder theClone = new plasmaSearchPreOrder();
theClone.query = this.query;
theClone.ranking = this.ranking;
theClone.pageAcc = (TreeMap) this.pageAcc.clone();
return theClone;
}
@ -123,29 +154,6 @@ public final class plasmaSearchPreOrder {
return (indexEntry) pageAcc.remove(top);
}
// Merges another index container into this pre-order using the same two-pass
// scheme as the constructor: pass 1 widens the shared entryMin/entryMax bounds,
// pass 2 normalizes and files each entry into pageAcc under its ranking key.
// maxTime < 0 disables the deadline; otherwise pass 1 stops when time runs out
// and pass 2 handles only the 'count' entries that were scanned.
public void addContainer(indexContainer container, long maxTime) {
long limitTime = (maxTime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxTime;
indexEntry iEntry;
// first pass: find min/max to obtain limits for normalization
Iterator i = container.entries();
int count = 0;
while (i.hasNext()) {
if (System.currentTimeMillis() > limitTime) break;
iEntry = (indexEntry) i.next();
// seed the bounds from a clone of the first entry, then fold subsequent ones in
if (entryMin == null) entryMin = (indexEntry) iEntry.clone(); else entryMin.min(iEntry);
if (entryMax == null) entryMax = (indexEntry) iEntry.clone(); else entryMax.max(iEntry);
count++;
}
// second pass: normalize entries and get ranking
// NOTE: re-iterates the container and processes exactly 'count' entries —
// assumes container.entries() yields a stable order between the two passes
i = container.entries();
for (int j = 0; j < count; j++) {
iEntry = (indexEntry) i.next();
// key = fixed-width hex pre-ranking (numeric sort order) + url hash (tie-breaker)
pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(iEntry.generateNormalized(entryMin, entryMax)), 16) + iEntry.urlHash(), iEntry);
}
}
// Exposes the current normalization bounds as a two-element array:
// [0] = minimum entry, [1] = maximum entry (either may be null if nothing was added).
public indexEntry[] getNormalizer() {
    indexEntry[] bounds = new indexEntry[2];
    bounds[0] = entryMin;
    bounds[1] = entryMax;
    return bounds;
}

@ -987,6 +987,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// flush some entries from the RAM cache
// (new permanent cache flushing)
wordIndex.flushCacheSome(sbQueue.size() != 0);
urlPool.loadedURL.flushCacheSome();
boolean doneSomething = false;
@ -1560,8 +1561,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
/* ========================================================================
* STORE URL TO LOADED-URL-DB
* ======================================================================== */
newEntry.store();
urlPool.loadedURL.stackEntry(
urlPool.loadedURL.store(newEntry, false);
urlPool.loadedURL.stack(
newEntry, // loaded url db entry
initiatorPeerHash, // initiator peer hash
yacyCore.seedDB.mySeed.hash, // executor peer hash
@ -1942,8 +1943,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.newEntry(propStr, true);
entry.store();
urlPool.loadedURL.stackEntry(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt?
urlPool.loadedURL.store(entry, false);
urlPool.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt?
urlPool.noticeURL.remove(entry.hash());
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + "). URL IS CONSIDERED AS 'LOADED!'");
return true;
@ -2157,7 +2158,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// determine the url string
try {
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.getEntry(urlhash, null);
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.load(urlhash, null);
URL url = entry.url();
if (url == null)
return 0;

@ -329,7 +329,7 @@ public class plasmaSwitchboardQueue {
if (referrerURL == null) {
if ((referrerHash == null) || (referrerHash.equals(indexURL.dummyHash))) return null;
try {
referrerURL = lurls.getEntry(referrerHash, null).url();
referrerURL = lurls.load(referrerHash, null).url();
} catch (IOException e) {
referrerURL = null;
return null;

@ -82,7 +82,7 @@ public class plasmaURLPool {
plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
if (ne != null) return ne.url();
try {
plasmaCrawlLURL.Entry le = loadedURL.getEntry(urlhash, null);
plasmaCrawlLURL.Entry le = loadedURL.load(urlhash, null);
if (le != null) return le.url();
} catch (IOException e) {}
plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);

@ -692,7 +692,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
// System.out.println("Wordhash: "+wordHash+" UrlHash:
// "+entry.getUrlHash());
try {
url = lurl.getEntry(entry.urlHash(), null).url();
url = lurl.load(entry.urlHash(), null).url();
if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url) == true)) {
urlHashs.add(entry.urlHash());
}

@ -348,7 +348,13 @@ public final class yacyClient {
// yacyCore.log("DEBUG QUERY: query=" + querystr + "; result = " + result.toString());
if ((result == null) || (result.size() == 0)) return -1;
final String resp = (String) result.get("response");
if (resp == null) { return -1; } else { return Integer.parseInt(resp); }
if (resp == null) {
return -1;
} else try {
return Integer.parseInt(resp);
} catch (NumberFormatException e) {
return -1;
}
} catch (IOException e) {
yacyCore.log.logSevere("yacyClient.queryUrlCount error asking peer '" + target.getName() + "':" + e.toString());
return -1;
@ -477,15 +483,16 @@ public final class yacyClient {
// get one single search result
urlEntry = urlManager.newEntry((String) result.get("resource" + n), true);
if ((urlEntry == null) || (blacklist.isListed(plasmaURLPattern.BLACKLIST_SEARCH, urlEntry.url()))) { continue; } // block with backlist
urlEntry.store();
int urlLength = urlEntry.url().toString().length();
int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length;
urlManager.stackEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
urlManager.store(urlEntry, true);
urlManager.stack(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
// save the url entry
final indexEntry entry;
if (urlEntry.word() == null) {
// the old way to define words
int urlLength = urlEntry.url().toString().length();
int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length;
entry = new indexURLEntry(
urlEntry.hash(),
urlLength, urlComps,

@ -727,10 +727,9 @@ public final class yacy {
iEntry = (indexEntry) wordIdxEntries.next();
String urlHash = iEntry.urlHash();
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash, null);
plasmaCrawlLURL.Entry urlEntry = currentUrlDB.load(urlHash, null);
urlCounter++;
plasmaCrawlLURL.Entry newEntry = minimizedUrlDB.newEntry(urlEntry);
newEntry.store();
minimizedUrlDB.store(urlEntry, false);
if (urlCounter % 500 == 0) {
log.logInfo(urlCounter + " URLs found so far.");
}

Loading…
Cancel
Save