some ranking enhancements

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1460 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 0f84591d57
commit b946e28e61

@ -472,7 +472,7 @@ public class dir {
); );
final String urlHash = newEntry.hash(); final String urlHash = newEntry.hash();
/*final int words =*/ switchboard.wordIndex.addPageIndex(url, urlHash, new Date(), condenser, "**", plasmaWordIndexEntry.DT_SHARE); /*final int words =*/ switchboard.wordIndex.addPageIndex(url, urlHash, new Date(), phrase.length() + descr.length() + 13, condenser, "**", plasmaWordIndexEntry.DT_SHARE);
} catch (IOException e) {} } catch (IOException e) {}
} }

@ -135,7 +135,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
String initiatorHash, String executorHash, String initiatorHash, String executorHash,
String referrerHash, int copyCount, boolean localNeed, String referrerHash, int copyCount, boolean localNeed,
int quality, String language, char doctype, int quality, String language, char doctype,
long size, int wordCount, int stackType) { int size, int wordCount, int stackType) {
Entry e = new Entry(url, descr, moddate, loaddate, referrerHash, copyCount, localNeed, quality, language, doctype, size, wordCount); Entry e = new Entry(url, descr, moddate, loaddate, referrerHash, copyCount, localNeed, quality, language, doctype, size, wordCount);
if (initiatorHash == null) { initiatorHash = dummyHash; } if (initiatorHash == null) { initiatorHash = dummyHash; }
if (executorHash == null) { executorHash = dummyHash; } if (executorHash == null) { executorHash = dummyHash; }
@ -396,12 +396,12 @@ public final class plasmaCrawlLURL extends plasmaURL {
private int quality; private int quality;
private String language; private String language;
private char doctype; private char doctype;
private long size; private int size;
private int wordCount; private int wordCount;
private String snippet; private String snippet;
private plasmaWordIndexEntry word; private plasmaWordIndexEntry word;
public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, long size, int wordCount) { public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, int size, int wordCount) {
// create new entry and store it into database // create new entry and store it into database
this.urlHash = urlHash(url); this.urlHash = urlHash(url);
this.url = url; this.url = url;
@ -444,7 +444,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[8], "UTF-8")); this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[8], "UTF-8"));
this.language = new String(entry[9], "UTF-8"); this.language = new String(entry[9], "UTF-8");
this.doctype = (char) entry[10][0]; this.doctype = (char) entry[10][0];
this.size = kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[11], "UTF-8")); this.size = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[11], "UTF-8"));
this.wordCount = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[12], "UTF-8")); this.wordCount = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[12], "UTF-8"));
this.snippet = null; this.snippet = null;
this.word = searchedWord; this.word = searchedWord;
@ -477,7 +477,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(prop.getProperty("q", "")); this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(prop.getProperty("q", ""));
this.language = prop.getProperty("lang", "uk"); this.language = prop.getProperty("lang", "uk");
this.doctype = prop.getProperty("dt", "t").charAt(0); this.doctype = prop.getProperty("dt", "t").charAt(0);
this.size = Long.parseLong(prop.getProperty("size", "0")); this.size = Integer.parseInt(prop.getProperty("size", "0"));
this.wordCount = Integer.parseInt(prop.getProperty("wc", "0")); this.wordCount = Integer.parseInt(prop.getProperty("wc", "0"));
this.snippet = prop.getProperty("snippet", ""); this.snippet = prop.getProperty("snippet", "");
if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null); if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null);
@ -570,7 +570,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
return language; return language;
} }
public long size() { public int size() {
return size; return size;
} }

@ -132,14 +132,14 @@ public final class plasmaSearchPreOrder {
long factor = 4096L*4096L; long factor = 4096L*4096L;
for (int i = 0; i < 3; i++) { for (int i = 0; i < 3; i++) {
if (query.order[i].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking = factor * indexEntry.getQuality() / 64L; if (query.order[i].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += factor * indexEntry.getQuality() / 64L;
else if (query.order[i].equals(plasmaSearchQuery.ORDER_DATE)) ranking = factor * indexEntry.getVirtualAge() / 64L; else if (query.order[i].equals(plasmaSearchQuery.ORDER_DATE)) ranking += factor * indexEntry.getVirtualAge() / 64L;
else if (query.order[i].equals(plasmaSearchQuery.ORDER_YBR)) ranking = factor * ybr_p(indexEntry.getUrlHash()); else if (query.order[i].equals(plasmaSearchQuery.ORDER_YBR)) ranking += factor * ybr_p(indexEntry.getUrlHash());
factor = factor / 4096L; factor = factor / 4096L;
} }
int wordpos = indexEntry.posintext(); int wordpos = indexEntry.posintext();
if (wordpos == 0) wordpos = 1000; if (wordpos == 0) wordpos = 1000;
ranking = ranking + 1000 - wordpos + indexEntry.hitcount(); ranking = ranking + 4096L*4096L * (1000 - wordpos + indexEntry.hitcount() - 2 * indexEntry.worddistance());
pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), indexEntry); pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), indexEntry);
} }

@ -159,7 +159,7 @@ public final class plasmaSearchResult {
} }
int wordpos = indexEntry.posintext(); int wordpos = indexEntry.posintext();
if (wordpos == 0) wordpos = 1000; if (wordpos == 0) wordpos = 1000;
ranking = ranking + 1000 - wordpos + indexEntry.hitcount(); ranking = ranking + 4096L*4096L * (1000 - wordpos + indexEntry.hitcount() - 2 * indexEntry.worddistance());
// apply 'common-sense' heuristic using references // apply 'common-sense' heuristic using references
for (int j = 0; j < urlcomps.length; j++) if (commonSense.contains(urlcomps[j])) ranking += 10L*4096L*4096L / urlcomps.length; for (int j = 0; j < urlcomps.length; j++) if (commonSense.contains(urlcomps[j])) ranking += 10L*4096L*4096L / urlcomps.length;

@ -820,25 +820,25 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
return false; return false;
} }
synchronized (sbQueue) {
if (sbQueue.size() == 0) {
// log.logDebug("DEQUEUE: queue is empty");
return false; // nothing to do
}
if (wordIndex.wordCacheRAMSize() + 1000 > (int) getConfigLong("wordCacheMaxLow", 8000)) { if (wordIndex.wordCacheRAMSize() + 1000 > (int) getConfigLong("wordCacheMaxLow", 8000)) {
log.logFine("deQueue: word index ram cache too full (" + ((int) getConfigLong("wordCacheMaxLow", 8000) - wordIndex.wordCacheRAMSize()) + log.logFine("deQueue: word index ram cache too full (" + ((int) getConfigLong("wordCacheMaxLow", 8000) - wordIndex.wordCacheRAMSize()) + " slots left); dismissed to omit ram flush lock");
" slots left); dismissed to omit ram flush lock");
return false; return false;
} }
int stackCrawlQueueSize; int stackCrawlQueueSize;
if ((stackCrawlQueueSize = sbStackCrawlThread.size()) >= stackCrawlSlots) { if ((stackCrawlQueueSize = sbStackCrawlThread.size()) >= stackCrawlSlots) {
log.logFine("deQueue: too many processes in stack crawl thread queue, dismissed to protect emergency case (" + log.logFine("deQueue: too many processes in stack crawl thread queue, dismissed to protect emergency case (" + "stackCrawlQueue=" + stackCrawlQueueSize + ")");
"stackCrawlQueue=" + stackCrawlQueueSize + ")");
return false; return false;
} }
plasmaSwitchboardQueue.Entry nextentry; plasmaSwitchboardQueue.Entry nextentry;
synchronized (sbQueue) {
if (sbQueue.size() == 0) {
//log.logDebug("DEQUEUE: queue is empty");
return false; // nothing to do
}
// if we were interrupted we should return now // if we were interrupted we should return now
if (Thread.currentThread().isInterrupted()) return false; if (Thread.currentThread().isInterrupted()) return false;
@ -856,13 +856,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logSevere("IOError in plasmaSwitchboard.deQueue: " + e.getMessage(), e); log.logSevere("IOError in plasmaSwitchboard.deQueue: " + e.getMessage(), e);
return false; return false;
} }
}
synchronized (this.indexingTasksInProcess) { synchronized (this.indexingTasksInProcess) {
this.indexingTasksInProcess.put(nextentry.urlHash(),nextentry); this.indexingTasksInProcess.put(nextentry.urlHash(), nextentry);
} }
processResourceStack(nextentry); processResourceStack(nextentry);
}
return true; return true;
} }
@ -1288,7 +1288,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
condenser.RESULT_INFORMATION_VALUE, condenser.RESULT_INFORMATION_VALUE,
plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.language(entry.url()),
plasmaWordIndexEntry.docType(document.getMimeType()), plasmaWordIndexEntry.docType(document.getMimeType()),
entry.size(), (int) entry.size(),
condenser.RESULT_NUMB_WORDS, condenser.RESULT_NUMB_WORDS,
processCase processCase
); );
@ -1309,7 +1309,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (((storagePeerHash = getConfig("storagePeerHash",null))== null) || if (((storagePeerHash = getConfig("storagePeerHash",null))== null) ||
(storagePeerHash.trim().length() == 0) || (storagePeerHash.trim().length() == 0) ||
((seed = yacyCore.seedDB.getConnected(storagePeerHash))==null)){ ((seed = yacyCore.seedDB.getConnected(storagePeerHash))==null)){
words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType())); words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()));
} else { } else {
HashMap urlCache = new HashMap(1); HashMap urlCache = new HashMap(1);
urlCache.put(newEntry.hash(),newEntry); urlCache.put(newEntry.hash(),newEntry);
@ -1341,7 +1341,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
wordStat.posInPhrase, wordStat.posInPhrase,
wordStat.numOfPhrase, wordStat.numOfPhrase,
0, 0,
newEntry.size(),
docDate.getTime(), docDate.getTime(),
System.currentTimeMillis(),
quality, language, doctype, true); quality, language, doctype, true);
wordIdxEntity.addEntry(wordIdxEntry); wordIdxEntity.addEntry(wordIdxEntry);
tmpEntities.add(wordIdxEntity); tmpEntities.add(wordIdxEntity);
@ -1354,7 +1356,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntity[])tmpEntities.toArray(new plasmaWordIndexEntity[tmpEntities.size()]),urlCache,true,120000); String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntity[])tmpEntities.toArray(new plasmaWordIndexEntity[tmpEntities.size()]),urlCache,true,120000);
if (error != null) { if (error != null) {
words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType())); words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()));
} }
// cleanup // cleanup

@ -49,20 +49,15 @@ package de.anomic.plasma;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.TreeSet;
import java.util.HashSet; import java.util.HashSet;
import java.util.Set; import java.util.Set;
import java.util.Date; import java.util.Date;
import java.net.URL; import java.net.URL;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB;
public final class plasmaWordIndex { public final class plasmaWordIndex {
@ -139,7 +134,7 @@ public final class plasmaWordIndex {
return ((long) microDateDays) * ((long) day); return ((long) microDateDays) * ((long) day);
} }
public int addPageIndex(URL url, String urlHash, Date urlModified, plasmaCondenser condenser, String language, char doctype) { public int addPageIndex(URL url, String urlHash, Date urlModified, int size, plasmaCondenser condenser, String language, char doctype) {
// this is called by the switchboard to put in a new page into the index // this is called by the switchboard to put in a new page into the index
// use all the words in one condenser object to simultaneously create index // use all the words in one condenser object to simultaneously create index
// entries // entries
@ -172,7 +167,9 @@ public final class plasmaWordIndex {
wprop.posInPhrase, wprop.posInPhrase,
wprop.numOfPhrase, wprop.numOfPhrase,
0, 0,
size,
urlModified.getTime(), urlModified.getTime(),
System.currentTimeMillis(),
quality, language, doctype, true); quality, language, doctype, true);
addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), ientry), false); addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), ientry), false);
} }

@ -107,10 +107,16 @@ public final class plasmaWordIndexEntry {
public static final int AP_H4 = 4; // h4-tag public static final int AP_H4 = 4; // h4-tag
public static final int AP_H5 = 5; // h5-tag public static final int AP_H5 = 5; // h5-tag
public static final int AP_H6 = 6; // h6-tag public static final int AP_H6 = 6; // h6-tag
public static final int AP_ANCHOR = 7; // anchor description public static final int AP_TEXT = 7; // word appears in text (used to check validation of other appearances against spam)
public static final int AP_URL = 8; // word inside an url public static final int AP_URL = 8; // word inside an url
public static final int AP_IMG = 9; // tag inside image references public static final int AP_IMG = 9; // tag inside image references
public static final int AP_TAG = 10; // for tagged indexing (i.e. using mp3 tags) public static final int AP_TAG = 10; // for tagged indexing (i.e. using mp3 tags)
public static final int AP_ANCHOR = 11; // anchor description
// URL attributes
public static final int UA_LOCAL = 0; // URL was crawled locally
public static final int UA_TILDE = 1; // tilde appears in URL
public static final int UA_REDIRECT = 2; // The URL is a redirection
// local flag attributes // local flag attributes
public static final char LT_LOCAL = 'L'; public static final char LT_LOCAL = 'L';
@ -202,26 +208,34 @@ public final class plasmaWordIndexEntry {
// the class instantiation can only be done by a plasmaStore method // the class instantiation can only be done by a plasmaStore method
// therefore they are all public // therefore they are all public
public plasmaWordIndexEntry(String urlHash, public plasmaWordIndexEntry(String urlHash,
int hitcount, // how often appears this word in the text int hitcount, //*how often appears this word in the text
int wordcount, // total number of words int wordcount, //*total number of words
int phrasecount, // total number of phrases int phrasecount, //*total number of phrases
int posintext, // position of word in all words int posintext, //*position of word in all words
int posinphrase, // position of word in its phrase int posinphrase, //*position of word in its phrase
int posofphrase, // number of the phrase where word appears int posofphrase, //*number of the phrase where word appears
int distance, // word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultaneous search). If stored, this shows that the result was obtained by remote search int distance, //*word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultaneous search). If stored, this shows that the result was obtained by remote search
long lastmodified, // last-modified time of the document where word appears int sizeOfPage, // # of bytes of the page
int quality, // long lastmodified, //*last-modified time of the document where word appears
String language, // long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
char doctype, // int quality, //*the entropy value
boolean local) { String language, //*(guessed) language of document
char doctype, //*type of document
boolean local //*flag shows that this index was generated locally; otherwise it's from a remote peer
) {
// more needed attributes: // more needed attributes:
// - long: update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short // - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag etc
// - boolean: appears in title, appears in header, anchor-descr, image-tag etc // - boolean: URL attributes
// - int: url-length (shorter are better) // - int: url-length (shorter are better)
// - int: url-number of components / length of path // - int: url-number of components / length of path
// - int: length of description tag / title tag (longer are better) // - int: length of description tag / title tag (longer are better)
// - int: number of chapters // - int: number of chapters
// - int: # of outlinks to same domain
// - int: # of outlinks to outside domain
// - int: length of description
// - int: length of title
// - int: # of keywords
if ((language == null) || (language.length() != plasmaURL.urlLanguageLength)) language = "uk"; if ((language == null) || (language.length() != plasmaURL.urlLanguageLength)) language = "uk";
this.urlHash = urlHash; this.urlHash = urlHash;

@ -469,7 +469,9 @@ public final class yacyClient {
urlEntry.hash(), urlEntry.hash(),
urlEntry.wordCount(), urlEntry.wordCount(),
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
urlEntry.size(),
urlEntry.moddate().getTime(), urlEntry.moddate().getTime(),
System.currentTimeMillis(),
urlEntry.quality(), urlEntry.quality(),
urlEntry.language(), urlEntry.language(),
urlEntry.doctype(), urlEntry.doctype(),

Loading…
Cancel
Save