re-organisation of lurl-creation and -stacking

This was necessary to prevent useless writes to the database
when the URL turns out to be blacklisted.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1905 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 289da326e5
commit 3286b1f498

@ -456,10 +456,8 @@ public class dir {
try {
final URL url = new URL(urlstring);
final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()));
final plasmaCrawlLURL.Entry newEntry = switchboard.urlPool.loadedURL.addEntry(
final plasmaCrawlLURL.Entry newEntry = switchboard.urlPool.loadedURL.newEntry(
url, "YaCyShare: " + descr, new Date(), new Date(),
"____________", /*initiator*/
yacyCore.seedDB.mySeed.hash, /*executor*/
"AAAAAAAAAAAA", /*referrer*/
0, /*copycount*/
false, /*localneed*/
@ -467,7 +465,13 @@ public class dir {
"**", /*language*/
plasmaWordIndexEntry.DT_SHARE, /*doctype*/
phrase.length(), /*size*/
condenser.RESULT_NUMB_WORDS,
condenser.RESULT_NUMB_WORDS
);
newEntry.store();
switchboard.urlPool.loadedURL.stackEntry(
newEntry,
"____________", /*initiator*/
yacyCore.seedDB.mySeed.hash, /*executor*/
5 /*process case*/
);

@ -129,7 +129,8 @@ public final class crawlReceipt {
"\n\tURL properties: "+ propStr);
} else {
// put new entry into database
switchboard.urlPool.loadedURL.addEntry(entry, youare, iam, 1);
entry.store();
switchboard.urlPool.loadedURL.stackEntry(entry, youare, iam, 1);
// generating url hash
String newUrlHash = plasmaURL.urlHash(entry.url());

@ -88,7 +88,6 @@ public final class search {
// tell all threads to do nothing for a specific time
sb.wordIndex.intermission(2 * duetime);
sb.intermissionAllThreads(2 * duetime);
// store accessing peer

@ -97,12 +97,12 @@ public final class transferURL {
if ((lEntry != null) && (lEntry.url() != null)) {
if ((blockBlacklist) &&
(plasmaSwitchboard.urlBlacklist.isListed( lEntry.url().getHost().toLowerCase(), lEntry.url().getPath()))) {
sb.urlPool.loadedURL.remove(lEntry.hash());
int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
lEntry = null;
} else {
sb.urlPool.loadedURL.addEntry(lEntry, iam, iam, 3);
lEntry.store();
sb.urlPool.loadedURL.stackEntry(lEntry, iam, iam, 3);
yacyCore.log.logFine("transferURL: received URL '"
+ lEntry.url() + "' from peer "
+ otherPeerName);

@ -163,7 +163,8 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.getEntry(urlHash, null);
/* write it into the home url db */
this.homeUrlDB.newEntry(urlEntry);
plasmaCrawlLURL.Entry homeEntry = this.homeUrlDB.newEntry(urlEntry);
homeEntry.store();
importedUrlBuffer.add(urlHash);
this.urlCounter++;

@ -131,6 +131,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
gcrawlResultStack = new LinkedList();
}
/*
public synchronized Entry addEntry(URL url, String descr, Date moddate, Date loaddate,
String initiatorHash, String executorHash,
String referrerHash, int copyCount, boolean localNeed,
@ -150,8 +151,9 @@ public final class plasmaCrawlLURL extends plasmaURL {
}
return e;
}
*/
public synchronized void addEntry(Entry e, String initiatorHash, String executorHash, int stackType) {
public synchronized void stackEntry(Entry e, String initiatorHash, String executorHash, int stackType) {
if (e == null) { return; }
try {
if (initiatorHash == null) { initiatorHash = dummyHash; }
@ -205,6 +207,14 @@ public final class plasmaCrawlLURL extends plasmaURL {
}
}
/**
 * Builds a new {@link Entry} from the given URL attributes and hands it back
 * to the caller. This method does nothing with the entry beyond constructing
 * and returning it.
 *
 * @param url          the URL the entry describes
 * @param descr        description text for the URL
 * @param moddate      modification date passed to the entry
 * @param loaddate     load date passed to the entry
 * @param referrerHash hash of the referrer passed to the entry
 * @param copyCount    copy count passed to the entry
 * @param localNeed    local-need flag passed to the entry
 * @param quality      quality value passed to the entry
 * @param language     language code passed to the entry
 * @param doctype      document type character passed to the entry
 * @param size         size value passed to the entry
 * @param wordCount    word count passed to the entry
 * @return the freshly constructed entry
 */
public synchronized Entry newEntry(URL url, String descr, Date moddate, Date loaddate,
String referrerHash, int copyCount, boolean localNeed,
int quality, String language, char doctype,
int size, int wordCount) {
return new Entry(
url, descr, moddate, loaddate,
referrerHash, copyCount, localNeed,
quality, language, doctype,
size, wordCount);
}
public int getStackSize(int stack) {
switch (stack) {
case 1: return externResultStack.size();
@ -400,6 +410,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
private int wordCount;
private String snippet;
private plasmaWordIndexEntry word; // this is only used if the url is transported via remote search requests
private boolean stored = false;
// more needed attributes:
// - author / copyright owner
@ -427,7 +438,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
this.wordCount = wordCount;
this.snippet = null;
this.word = null;
store();
this.stored = false;
}
public Entry(String urlHash, plasmaWordIndexEntry searchedWord) throws IOException {
@ -441,6 +452,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
this.urlHash = urlHash;
byte[][] entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes());
if (entry == null) throw new IOException("url hash " + urlHash + " not found in LURL");
this.stored = true;
try {
if (entry != null) {
this.url = new URL(new String(entry[1], "UTF-8").trim());
@ -491,15 +503,16 @@ public final class plasmaCrawlLURL extends plasmaURL {
this.snippet = prop.getProperty("snippet", "");
if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null);
this.word = (prop.containsKey("word")) ? new plasmaWordIndexEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null;
store();
this.stored = false;
//}
} catch (Exception e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/2: " + e.toString(), e);
}
}
private void store() {
public void store() {
// Check if there is a more recent Entry already in the DB
if (this.stored) return;
Entry oldEntry;
try {
if (exists(urlHash)) {
@ -553,6 +566,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
kelondroBase64Order.enhancedCoder.encodeLong(wordCount, urlWordCountLength).getBytes(),
};
urlHashCache.put(entry);
this.stored = true;
} catch (Exception e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaCrawlLURL:store:" + e.toString(), e);
}

@ -1402,20 +1402,23 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
//log.logDebug("Create LURL-Entry for '" + entry.normalizedURLString() + "', " +
// "responseHeader=" + entry.responseHeader().toString());
plasmaCrawlLURL.Entry newEntry = urlPool.loadedURL.addEntry(
plasmaCrawlLURL.Entry newEntry = urlPool.loadedURL.newEntry(
entry.url(), descr, docDate, new Date(),
initiatorHash,
yacyCore.seedDB.mySeed.hash,
referrerHash,
0, true,
condenser.RESULT_WORD_ENTROPHY,
plasmaWordIndexEntry.language(entry.url()),
plasmaWordIndexEntry.docType(document.getMimeType()),
(int) entry.size(),
condenser.RESULT_NUMB_WORDS,
condenser.RESULT_NUMB_WORDS
);
newEntry.store();
urlPool.loadedURL.stackEntry(
newEntry,
initiatorHash,
yacyCore.seedDB.mySeed.hash,
processCase
);
String urlHash = newEntry.hash();
if (((processCase == 4) || (processCase == 5) || (processCase == 6)) && (entry.profile().localIndexing())) {
@ -1729,7 +1732,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.newEntry(propStr, true);
urlPool.loadedURL.addEntry(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt?
entry.store();
urlPool.loadedURL.stackEntry(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt?
urlPool.noticeURL.remove(entry.hash());
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + "). URL IS CONSIDERED AS 'LOADED!'");
return true;
@ -1763,7 +1767,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaSearchTimingProfile remoteTiming) {
// tell all threads to do nothing for a specific time
wordIndex.intermission(2 * query.maximumTime);
intermissionAllThreads(2 * query.maximumTime);
serverObjects prop = new serverObjects();

@ -261,7 +261,6 @@ public final class plasmaWordIndex {
outlinksSame, outlinksOther,
true);
addEntry(wordHash, ientry, System.currentTimeMillis(), false);
//addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), ientry), System.currentTimeMillis(), false);
}
// System.out.println("DEBUG: plasmaSearch.addPageIndex: added " +
// condenser.getWords().size() + " words, flushed " + c + " entries");
@ -324,7 +323,7 @@ public final class plasmaWordIndex {
public int size() {
return java.lang.Math.max(assortmentCluster.sizeTotal(),
java.lang.Math.max(backend.size(), ramCache.wSize() + ramCache.kSize()));
java.lang.Math.max(backend.size(), ramCache.size()));
}
public int indexSize(String wordHash) {
@ -341,10 +340,6 @@ public final class plasmaWordIndex {
return size;
}
/**
 * Currently a no-op: the only statement in the body, a call that would pass
 * the pause on to {@code this.ramCache.intermission(pause)}, is commented out.
 *
 * @param pause requested pause length; not used by the current body
 */
public void intermission(long pause) {
//this.ramCache.intermission(pause);
}
public void close(int waitingBoundSeconds) {
ramCache.close(waitingBoundSeconds);
assortmentCluster.close();

@ -466,10 +466,11 @@ public final class yacyClient {
// get one single search result
urlEntry = urlManager.newEntry((String) result.get("resource" + n), true);
if (urlEntry != null && blacklist.isListed(urlEntry.url().getHost().toLowerCase(), urlEntry.url().getPath())) { continue; } // block with backlist
urlEntry.store();
int urlLength = urlEntry.url().toString().length();
int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length;
urlManager.addEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
urlManager.stackEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
// save the url entry
final plasmaWordIndexEntry entry;
if (urlEntry.word() == null) {

@ -883,7 +883,8 @@ public final class yacy {
// importing the new url
plasmaCrawlLURL.Entry urlEntry = importUrlDB.getEntry(urlHash, null);
urlCounter++;
homeUrlDB.newEntry(urlEntry);
plasmaCrawlLURL.Entry homeEntry = homeUrlDB.newEntry(urlEntry);
homeEntry.store();
if (urlCounter % 500 == 0) {
log.logFine(urlCounter + " URLs processed so far.");
@ -985,7 +986,8 @@ public final class yacy {
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash, null);
urlCounter++;
/*plasmaCrawlLURL.Entry newEntry =*/ minimizedUrlDB.newEntry(urlEntry);
plasmaCrawlLURL.Entry newEntry = minimizedUrlDB.newEntry(urlEntry);
newEntry.store();
if (urlCounter % 500 == 0) {
log.logInfo(urlCounter + " URLs found so far.");
}

Loading…
Cancel
Save