re-organisation of lurl-creation and -stacking

this was necessary to prevent useless writes to the database
when the url turns out to be blacklisted

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1905 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 289da326e5
commit 3286b1f498

@ -456,10 +456,8 @@ public class dir {
try { try {
final URL url = new URL(urlstring); final URL url = new URL(urlstring);
final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes())); final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()));
final plasmaCrawlLURL.Entry newEntry = switchboard.urlPool.loadedURL.addEntry( final plasmaCrawlLURL.Entry newEntry = switchboard.urlPool.loadedURL.newEntry(
url, "YaCyShare: " + descr, new Date(), new Date(), url, "YaCyShare: " + descr, new Date(), new Date(),
"____________", /*initiator*/
yacyCore.seedDB.mySeed.hash, /*executor*/
"AAAAAAAAAAAA", /*referrer*/ "AAAAAAAAAAAA", /*referrer*/
0, /*copycount*/ 0, /*copycount*/
false, /*localneed*/ false, /*localneed*/
@ -467,7 +465,13 @@ public class dir {
"**", /*language*/ "**", /*language*/
plasmaWordIndexEntry.DT_SHARE, /*doctype*/ plasmaWordIndexEntry.DT_SHARE, /*doctype*/
phrase.length(), /*size*/ phrase.length(), /*size*/
condenser.RESULT_NUMB_WORDS, condenser.RESULT_NUMB_WORDS
);
newEntry.store();
switchboard.urlPool.loadedURL.stackEntry(
newEntry,
"____________", /*initiator*/
yacyCore.seedDB.mySeed.hash, /*executor*/
5 /*process case*/ 5 /*process case*/
); );

@ -129,7 +129,8 @@ public final class crawlReceipt {
"\n\tURL properties: "+ propStr); "\n\tURL properties: "+ propStr);
} else { } else {
// put new entry into database // put new entry into database
switchboard.urlPool.loadedURL.addEntry(entry, youare, iam, 1); entry.store();
switchboard.urlPool.loadedURL.stackEntry(entry, youare, iam, 1);
// generating url hash // generating url hash
String newUrlHash = plasmaURL.urlHash(entry.url()); String newUrlHash = plasmaURL.urlHash(entry.url());

@ -88,7 +88,6 @@ public final class search {
// tell all threads to do nothing for a specific time // tell all threads to do nothing for a specific time
sb.wordIndex.intermission(2 * duetime);
sb.intermissionAllThreads(2 * duetime); sb.intermissionAllThreads(2 * duetime);
// store accessing peer // store accessing peer

@ -97,12 +97,12 @@ public final class transferURL {
if ((lEntry != null) && (lEntry.url() != null)) { if ((lEntry != null) && (lEntry.url() != null)) {
if ((blockBlacklist) && if ((blockBlacklist) &&
(plasmaSwitchboard.urlBlacklist.isListed( lEntry.url().getHost().toLowerCase(), lEntry.url().getPath()))) { (plasmaSwitchboard.urlBlacklist.isListed( lEntry.url().getHost().toLowerCase(), lEntry.url().getPath()))) {
sb.urlPool.loadedURL.remove(lEntry.hash());
int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash()); int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs"); yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
lEntry = null; lEntry = null;
} else { } else {
sb.urlPool.loadedURL.addEntry(lEntry, iam, iam, 3); lEntry.store();
sb.urlPool.loadedURL.stackEntry(lEntry, iam, iam, 3);
yacyCore.log.logFine("transferURL: received URL '" yacyCore.log.logFine("transferURL: received URL '"
+ lEntry.url() + "' from peer " + lEntry.url() + "' from peer "
+ otherPeerName); + otherPeerName);

@ -163,7 +163,8 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.getEntry(urlHash, null); plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.getEntry(urlHash, null);
/* write it into the home url db */ /* write it into the home url db */
this.homeUrlDB.newEntry(urlEntry); plasmaCrawlLURL.Entry homeEntry = this.homeUrlDB.newEntry(urlEntry);
homeEntry.store();
importedUrlBuffer.add(urlHash); importedUrlBuffer.add(urlHash);
this.urlCounter++; this.urlCounter++;

@ -131,6 +131,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
gcrawlResultStack = new LinkedList(); gcrawlResultStack = new LinkedList();
} }
/*
public synchronized Entry addEntry(URL url, String descr, Date moddate, Date loaddate, public synchronized Entry addEntry(URL url, String descr, Date moddate, Date loaddate,
String initiatorHash, String executorHash, String initiatorHash, String executorHash,
String referrerHash, int copyCount, boolean localNeed, String referrerHash, int copyCount, boolean localNeed,
@ -150,8 +151,9 @@ public final class plasmaCrawlLURL extends plasmaURL {
} }
return e; return e;
} }
*/
public synchronized void addEntry(Entry e, String initiatorHash, String executorHash, int stackType) { public synchronized void stackEntry(Entry e, String initiatorHash, String executorHash, int stackType) {
if (e == null) { return; } if (e == null) { return; }
try { try {
if (initiatorHash == null) { initiatorHash = dummyHash; } if (initiatorHash == null) { initiatorHash = dummyHash; }
@ -205,6 +207,14 @@ public final class plasmaCrawlLURL extends plasmaURL {
} }
} }
/**
 * Builds a new {@link Entry} from the given attributes and hands it back to the caller.
 * This method only constructs the object; it does not itself call store() or stackEntry().
 * All arguments are forwarded unchanged, and in the same order, to the Entry constructor.
 *
 * @return the freshly constructed entry
 */
public synchronized Entry newEntry(URL url, String descr, Date moddate, Date loaddate,
                                   String referrerHash, int copyCount, boolean localNeed,
                                   int quality, String language, char doctype,
                                   int size, int wordCount) {
    return new Entry(
            url, descr, moddate, loaddate,
            referrerHash, copyCount, localNeed,
            quality, language, doctype,
            size, wordCount);
}
public int getStackSize(int stack) { public int getStackSize(int stack) {
switch (stack) { switch (stack) {
case 1: return externResultStack.size(); case 1: return externResultStack.size();
@ -400,6 +410,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
private int wordCount; private int wordCount;
private String snippet; private String snippet;
private plasmaWordIndexEntry word; // this is only used if the url is transported via remote search requests private plasmaWordIndexEntry word; // this is only used if the url is transported via remote search requests
private boolean stored = false;
// more needed attributes: // more needed attributes:
// - author / copyright owner // - author / copyright owner
@ -427,7 +438,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
this.wordCount = wordCount; this.wordCount = wordCount;
this.snippet = null; this.snippet = null;
this.word = null; this.word = null;
store(); this.stored = false;
} }
public Entry(String urlHash, plasmaWordIndexEntry searchedWord) throws IOException { public Entry(String urlHash, plasmaWordIndexEntry searchedWord) throws IOException {
@ -441,6 +452,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
this.urlHash = urlHash; this.urlHash = urlHash;
byte[][] entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes()); byte[][] entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes());
if (entry == null) throw new IOException("url hash " + urlHash + " not found in LURL"); if (entry == null) throw new IOException("url hash " + urlHash + " not found in LURL");
this.stored = true;
try { try {
if (entry != null) { if (entry != null) {
this.url = new URL(new String(entry[1], "UTF-8").trim()); this.url = new URL(new String(entry[1], "UTF-8").trim());
@ -491,15 +503,16 @@ public final class plasmaCrawlLURL extends plasmaURL {
this.snippet = prop.getProperty("snippet", ""); this.snippet = prop.getProperty("snippet", "");
if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null); if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null);
this.word = (prop.containsKey("word")) ? new plasmaWordIndexEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null; this.word = (prop.containsKey("word")) ? new plasmaWordIndexEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null;
store(); this.stored = false;
//} //}
} catch (Exception e) { } catch (Exception e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/2: " + e.toString(), e); serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/2: " + e.toString(), e);
} }
} }
private void store() { public void store() {
// Check if there is a more recent Entry already in the DB // Check if there is a more recent Entry already in the DB
if (this.stored) return;
Entry oldEntry; Entry oldEntry;
try { try {
if (exists(urlHash)) { if (exists(urlHash)) {
@ -553,6 +566,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
kelondroBase64Order.enhancedCoder.encodeLong(wordCount, urlWordCountLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(wordCount, urlWordCountLength).getBytes(),
}; };
urlHashCache.put(entry); urlHashCache.put(entry);
this.stored = true;
} catch (Exception e) { } catch (Exception e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaCrawlLURL:store:" + e.toString(), e); serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaCrawlLURL:store:" + e.toString(), e);
} }

@ -1402,20 +1402,23 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
//log.logDebug("Create LURL-Entry for '" + entry.normalizedURLString() + "', " + //log.logDebug("Create LURL-Entry for '" + entry.normalizedURLString() + "', " +
// "responseHeader=" + entry.responseHeader().toString()); // "responseHeader=" + entry.responseHeader().toString());
plasmaCrawlLURL.Entry newEntry = urlPool.loadedURL.addEntry( plasmaCrawlLURL.Entry newEntry = urlPool.loadedURL.newEntry(
entry.url(), descr, docDate, new Date(), entry.url(), descr, docDate, new Date(),
initiatorHash,
yacyCore.seedDB.mySeed.hash,
referrerHash, referrerHash,
0, true, 0, true,
condenser.RESULT_WORD_ENTROPHY, condenser.RESULT_WORD_ENTROPHY,
plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.language(entry.url()),
plasmaWordIndexEntry.docType(document.getMimeType()), plasmaWordIndexEntry.docType(document.getMimeType()),
(int) entry.size(), (int) entry.size(),
condenser.RESULT_NUMB_WORDS, condenser.RESULT_NUMB_WORDS
);
newEntry.store();
urlPool.loadedURL.stackEntry(
newEntry,
initiatorHash,
yacyCore.seedDB.mySeed.hash,
processCase processCase
); );
String urlHash = newEntry.hash(); String urlHash = newEntry.hash();
if (((processCase == 4) || (processCase == 5) || (processCase == 6)) && (entry.profile().localIndexing())) { if (((processCase == 4) || (processCase == 5) || (processCase == 6)) && (entry.profile().localIndexing())) {
@ -1729,7 +1732,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if ((lurl != null) && (lurl.length() != 0)) { if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key")); String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.newEntry(propStr, true); plasmaCrawlLURL.Entry entry = urlPool.loadedURL.newEntry(propStr, true);
urlPool.loadedURL.addEntry(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt? entry.store();
urlPool.loadedURL.stackEntry(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt?
urlPool.noticeURL.remove(entry.hash()); urlPool.noticeURL.remove(entry.hash());
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + "). URL IS CONSIDERED AS 'LOADED!'"); log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + "). URL IS CONSIDERED AS 'LOADED!'");
return true; return true;
@ -1763,7 +1767,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaSearchTimingProfile remoteTiming) { plasmaSearchTimingProfile remoteTiming) {
// tell all threads to do nothing for a specific time // tell all threads to do nothing for a specific time
wordIndex.intermission(2 * query.maximumTime);
intermissionAllThreads(2 * query.maximumTime); intermissionAllThreads(2 * query.maximumTime);
serverObjects prop = new serverObjects(); serverObjects prop = new serverObjects();

@ -261,7 +261,6 @@ public final class plasmaWordIndex {
outlinksSame, outlinksOther, outlinksSame, outlinksOther,
true); true);
addEntry(wordHash, ientry, System.currentTimeMillis(), false); addEntry(wordHash, ientry, System.currentTimeMillis(), false);
//addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), ientry), System.currentTimeMillis(), false);
} }
// System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + // System.out.println("DEBUG: plasmaSearch.addPageIndex: added " +
// condenser.getWords().size() + " words, flushed " + c + " entries"); // condenser.getWords().size() + " words, flushed " + c + " entries");
@ -324,7 +323,7 @@ public final class plasmaWordIndex {
public int size() { public int size() {
return java.lang.Math.max(assortmentCluster.sizeTotal(), return java.lang.Math.max(assortmentCluster.sizeTotal(),
java.lang.Math.max(backend.size(), ramCache.wSize() + ramCache.kSize())); java.lang.Math.max(backend.size(), ramCache.size()));
} }
public int indexSize(String wordHash) { public int indexSize(String wordHash) {
@ -341,10 +340,6 @@ public final class plasmaWordIndex {
return size; return size;
} }
/**
 * Currently a no-op: the only statement in the body, a forwarding call to
 * ramCache.intermission(pause), is commented out.
 *
 * @param pause value that the disabled call would have forwarded to the ram cache;
 *              unused while that call stays commented out
 */
public void intermission(long pause) {
//this.ramCache.intermission(pause);
}
public void close(int waitingBoundSeconds) { public void close(int waitingBoundSeconds) {
ramCache.close(waitingBoundSeconds); ramCache.close(waitingBoundSeconds);
assortmentCluster.close(); assortmentCluster.close();

@ -466,10 +466,11 @@ public final class yacyClient {
// get one single search result // get one single search result
urlEntry = urlManager.newEntry((String) result.get("resource" + n), true); urlEntry = urlManager.newEntry((String) result.get("resource" + n), true);
if (urlEntry != null && blacklist.isListed(urlEntry.url().getHost().toLowerCase(), urlEntry.url().getPath())) { continue; } // block with backlist if (urlEntry != null && blacklist.isListed(urlEntry.url().getHost().toLowerCase(), urlEntry.url().getPath())) { continue; } // block with backlist
urlEntry.store();
int urlLength = urlEntry.url().toString().length(); int urlLength = urlEntry.url().toString().length();
int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length; int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length;
urlManager.addEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2); urlManager.stackEntry(urlEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2);
// save the url entry // save the url entry
final plasmaWordIndexEntry entry; final plasmaWordIndexEntry entry;
if (urlEntry.word() == null) { if (urlEntry.word() == null) {

@ -883,7 +883,8 @@ public final class yacy {
// importing the new url // importing the new url
plasmaCrawlLURL.Entry urlEntry = importUrlDB.getEntry(urlHash, null); plasmaCrawlLURL.Entry urlEntry = importUrlDB.getEntry(urlHash, null);
urlCounter++; urlCounter++;
homeUrlDB.newEntry(urlEntry); plasmaCrawlLURL.Entry homeEntry = homeUrlDB.newEntry(urlEntry);
homeEntry.store();
if (urlCounter % 500 == 0) { if (urlCounter % 500 == 0) {
log.logFine(urlCounter + " URLs processed so far."); log.logFine(urlCounter + " URLs processed so far.");
@ -985,7 +986,8 @@ public final class yacy {
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try { if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash, null); plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash, null);
urlCounter++; urlCounter++;
/*plasmaCrawlLURL.Entry newEntry =*/ minimizedUrlDB.newEntry(urlEntry); plasmaCrawlLURL.Entry newEntry = minimizedUrlDB.newEntry(urlEntry);
newEntry.store();
if (urlCounter % 500 == 0) { if (urlCounter % 500 == 0) {
log.logInfo(urlCounter + " URLs found so far."); log.logInfo(urlCounter + " URLs found so far.");
} }

Loading…
Cancel
Save