some ranking enhancements

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1460 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 0f84591d57
commit b946e28e61

@ -472,7 +472,7 @@ public class dir {
);
final String urlHash = newEntry.hash();
/*final int words =*/ switchboard.wordIndex.addPageIndex(url, urlHash, new Date(), condenser, "**", plasmaWordIndexEntry.DT_SHARE);
/*final int words =*/ switchboard.wordIndex.addPageIndex(url, urlHash, new Date(), phrase.length() + descr.length() + 13, condenser, "**", plasmaWordIndexEntry.DT_SHARE);
} catch (IOException e) {}
}

@ -135,7 +135,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
String initiatorHash, String executorHash,
String referrerHash, int copyCount, boolean localNeed,
int quality, String language, char doctype,
long size, int wordCount, int stackType) {
int size, int wordCount, int stackType) {
Entry e = new Entry(url, descr, moddate, loaddate, referrerHash, copyCount, localNeed, quality, language, doctype, size, wordCount);
if (initiatorHash == null) { initiatorHash = dummyHash; }
if (executorHash == null) { executorHash = dummyHash; }
@ -396,12 +396,12 @@ public final class plasmaCrawlLURL extends plasmaURL {
private int quality;
private String language;
private char doctype;
private long size;
private int size;
private int wordCount;
private String snippet;
private plasmaWordIndexEntry word;
public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, long size, int wordCount) {
public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, int size, int wordCount) {
// create new entry and store it into database
this.urlHash = urlHash(url);
this.url = url;
@ -444,7 +444,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[8], "UTF-8"));
this.language = new String(entry[9], "UTF-8");
this.doctype = (char) entry[10][0];
this.size = kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[11], "UTF-8"));
this.size = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[11], "UTF-8"));
this.wordCount = (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(entry[12], "UTF-8"));
this.snippet = null;
this.word = searchedWord;
@ -477,7 +477,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(prop.getProperty("q", ""));
this.language = prop.getProperty("lang", "uk");
this.doctype = prop.getProperty("dt", "t").charAt(0);
this.size = Long.parseLong(prop.getProperty("size", "0"));
this.size = Integer.parseInt(prop.getProperty("size", "0"));
this.wordCount = Integer.parseInt(prop.getProperty("wc", "0"));
this.snippet = prop.getProperty("snippet", "");
if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null);
@ -570,7 +570,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
return language;
}
public long size() {
public int size() {
return size;
}

@ -132,14 +132,14 @@ public final class plasmaSearchPreOrder {
long factor = 4096L*4096L;
for (int i = 0; i < 3; i++) {
if (query.order[i].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking = factor * indexEntry.getQuality() / 64L;
else if (query.order[i].equals(plasmaSearchQuery.ORDER_DATE)) ranking = factor * indexEntry.getVirtualAge() / 64L;
else if (query.order[i].equals(plasmaSearchQuery.ORDER_YBR)) ranking = factor * ybr_p(indexEntry.getUrlHash());
if (query.order[i].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += factor * indexEntry.getQuality() / 64L;
else if (query.order[i].equals(plasmaSearchQuery.ORDER_DATE)) ranking += factor * indexEntry.getVirtualAge() / 64L;
else if (query.order[i].equals(plasmaSearchQuery.ORDER_YBR)) ranking += factor * ybr_p(indexEntry.getUrlHash());
factor = factor / 4096L;
}
int wordpos = indexEntry.posintext();
if (wordpos == 0) wordpos = 1000;
ranking = ranking + 1000 - wordpos + indexEntry.hitcount();
ranking = ranking + 4096L*4096L * (1000 - wordpos + indexEntry.hitcount() - 2 * indexEntry.worddistance());
pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), indexEntry);
}

@ -159,7 +159,7 @@ public final class plasmaSearchResult {
}
int wordpos = indexEntry.posintext();
if (wordpos == 0) wordpos = 1000;
ranking = ranking + 1000 - wordpos + indexEntry.hitcount();
ranking = ranking + 4096L*4096L * (1000 - wordpos + indexEntry.hitcount() - 2 * indexEntry.worddistance());
// apply 'common-sense' heuristic using references
for (int j = 0; j < urlcomps.length; j++) if (commonSense.contains(urlcomps[j])) ranking += 10L*4096L*4096L / urlcomps.length;

@ -820,25 +820,25 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
return false;
}
synchronized (sbQueue) {
if (sbQueue.size() == 0) {
// log.logDebug("DEQUEUE: queue is empty");
return false; // nothing to do
}
if (wordIndex.wordCacheRAMSize() + 1000 > (int) getConfigLong("wordCacheMaxLow", 8000)) {
log.logFine("deQueue: word index ram cache too full (" + ((int) getConfigLong("wordCacheMaxLow", 8000) - wordIndex.wordCacheRAMSize()) +
" slots left); dismissed to omit ram flush lock");
log.logFine("deQueue: word index ram cache too full (" + ((int) getConfigLong("wordCacheMaxLow", 8000) - wordIndex.wordCacheRAMSize()) + " slots left); dismissed to omit ram flush lock");
return false;
}
int stackCrawlQueueSize;
if ((stackCrawlQueueSize = sbStackCrawlThread.size()) >= stackCrawlSlots) {
log.logFine("deQueue: too many processes in stack crawl thread queue, dismissed to protect emergency case (" +
"stackCrawlQueue=" + stackCrawlQueueSize + ")");
log.logFine("deQueue: too many processes in stack crawl thread queue, dismissed to protect emergency case (" + "stackCrawlQueue=" + stackCrawlQueueSize + ")");
return false;
}
plasmaSwitchboardQueue.Entry nextentry;
synchronized (sbQueue) {
if (sbQueue.size() == 0) {
//log.logDebug("DEQUEUE: queue is empty");
return false; // nothing to do
}
// if we were interrupted we should return now
if (Thread.currentThread().isInterrupted()) return false;
@ -856,13 +856,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logSevere("IOError in plasmaSwitchboard.deQueue: " + e.getMessage(), e);
return false;
}
}
synchronized (this.indexingTasksInProcess) {
this.indexingTasksInProcess.put(nextentry.urlHash(), nextentry);
}
processResourceStack(nextentry);
}
return true;
}
@ -1288,7 +1288,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
condenser.RESULT_INFORMATION_VALUE,
plasmaWordIndexEntry.language(entry.url()),
plasmaWordIndexEntry.docType(document.getMimeType()),
entry.size(),
(int) entry.size(),
condenser.RESULT_NUMB_WORDS,
processCase
);
@ -1309,7 +1309,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (((storagePeerHash = getConfig("storagePeerHash",null))== null) ||
(storagePeerHash.trim().length() == 0) ||
((seed = yacyCore.seedDB.getConnected(storagePeerHash))==null)){
words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()));
words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()));
} else {
HashMap urlCache = new HashMap(1);
urlCache.put(newEntry.hash(),newEntry);
@ -1341,7 +1341,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
wordStat.posInPhrase,
wordStat.numOfPhrase,
0,
newEntry.size(),
docDate.getTime(),
System.currentTimeMillis(),
quality, language, doctype, true);
wordIdxEntity.addEntry(wordIdxEntry);
tmpEntities.add(wordIdxEntity);
@ -1354,7 +1356,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntity[])tmpEntities.toArray(new plasmaWordIndexEntity[tmpEntities.size()]),urlCache,true,120000);
if (error != null) {
words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()));
words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()));
}
// cleanup

@ -49,20 +49,15 @@ package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import java.util.HashSet;
import java.util.Set;
import java.util.Date;
import java.net.URL;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB;
public final class plasmaWordIndex {
@ -139,7 +134,7 @@ public final class plasmaWordIndex {
return ((long) microDateDays) * ((long) day);
}
public int addPageIndex(URL url, String urlHash, Date urlModified, plasmaCondenser condenser, String language, char doctype) {
public int addPageIndex(URL url, String urlHash, Date urlModified, int size, plasmaCondenser condenser, String language, char doctype) {
// this is called by the switchboard to put a new page into the index;
// use all the words in one condenser object to simultaneously create index
// entries
@ -172,7 +167,9 @@ public final class plasmaWordIndex {
wprop.posInPhrase,
wprop.numOfPhrase,
0,
size,
urlModified.getTime(),
System.currentTimeMillis(),
quality, language, doctype, true);
addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), ientry), false);
}

@ -107,10 +107,16 @@ public final class plasmaWordIndexEntry {
public static final int AP_H4 = 4; // h4-tag
public static final int AP_H5 = 5; // h5-tag
public static final int AP_H6 = 6; // h6-tag
public static final int AP_ANCHOR = 7; // anchor description
public static final int AP_TEXT = 7; // word appears in text (used to check validation of other appearances against spam)
public static final int AP_URL = 8; // word inside an url
public static final int AP_IMG = 9; // tag inside image references
public static final int AP_TAG = 10; // for tagged indexeing (i.e. using mp3 tags)
public static final int AP_ANCHOR = 11; // anchor description
// URL attributes
public static final int UA_LOCAL = 0; // URL was crawled locally
public static final int UA_TILDE = 1; // tilde appears in URL
public static final int UA_REDIRECT = 2; // The URL is a redirection
// local flag attributes
public static final char LT_LOCAL = 'L';
@ -202,26 +208,34 @@ public final class plasmaWordIndexEntry {
// the class instantiation can only be done by a plasmaStore method
// therefore they are all public
public plasmaWordIndexEntry(String urlHash,
int hitcount, // how often appears this word in the text
int wordcount, // total number of words
int phrasecount, // total number of phrases
int posintext, // position of word in all words
int posinphrase, // position of word in its phrase
int posofphrase, // number of the phrase where word appears
int distance, // word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search
long lastmodified, // last-modified time of the document where word appears
int quality, //
String language, //
char doctype, //
boolean local) {
int hitcount, //*how often appears this word in the text
int wordcount, //*total number of words
int phrasecount, //*total number of phrases
int posintext, //*position of word in all words
int posinphrase, //*position of word in its phrase
int posofphrase, //*number of the phrase where word appears
int distance, //*word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search
int sizeOfPage, // # of bytes of the page
long lastmodified, //*last-modified time of the document where word appears
long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
int quality, //*the entropy value
String language, //*(guessed) language of document
char doctype, //*type of document
boolean local //*flag shows that this index was generated locally; othervise its from a remote peer
) {
// more needed attributes:
// - long: update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
// - boolean: appears in title, appears in header, anchor-descr, image-tag etc
// - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag etc
// - boolean: URL attributes
// - int: url-length (shorter are better)
// - int: url-number of components / length of path
// - int: length of description tag / title tag (longer are better)
// - int: number of chapters
// - int: # of outlinks to same domain
// - int: # of outlinks to outside domain
// - int: length of description
// - int: length of title
// - int: # of keywords
if ((language == null) || (language.length() != plasmaURL.urlLanguageLength)) language = "uk";
this.urlHash = urlHash;

@ -469,7 +469,9 @@ public final class yacyClient {
urlEntry.hash(),
urlEntry.wordCount(),
0, 0, 0, 0, 0, 0,
urlEntry.size(),
urlEntry.moddate().getTime(),
System.currentTimeMillis(),
urlEntry.quality(),
urlEntry.language(),
urlEntry.doctype(),

Loading…
Cancel
Save