added word-position to ranking (this is only a first step)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1395 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent bb2095fe39
commit fc4ae899f7

@ -113,40 +113,6 @@ public final class plasmaCondenser {
// key is a String (the word), value is a wordStatProp Object
return words.entrySet().iterator();
}
/*
public int wordCount(String word) {
// number of occurrences of one word
// if the word did not occur, this simply returns 0
wordStatProp sp = (wordStatProp) words.get(word);
if (sp == null) return 0;
return sp.count;
}
public int wordPositionInText(String word) {
// position of word in text
// if the word is unknown (does not exist), the method returns 0
wordStatProp sp = (wordStatProp) words.get(word);
if (sp == null) return 0;
return sp.posInText;
}
public int wordPositionInPhrase(String word) {
// position of the word within its phrase (sp.posInPhrase)
// if the word is unknown (does not exist), the method returns 0
wordStatProp sp = (wordStatProp) words.get(word);
if (sp == null) return 0;
return sp.posInPhrase;
}
public int wordNumberOfPhrase(String word) {
// phrase number recorded for the word (sp.numOfPhrase)
// if the word is unknown (does not exist), the method returns 0
wordStatProp sp = (wordStatProp) words.get(word);
if (sp == null) return 0;
return sp.numOfPhrase;
}
*/
public static class wordStatProp {
// object carries statistics for words and sentences

@ -137,7 +137,9 @@ public final class plasmaSearchPreOrder {
else if (query.order[i].equals(plasmaSearchQuery.ORDER_YBR)) ranking = factor * ybr_p(indexEntry.getUrlHash());
factor = factor / 4096L;
}
int wordpos = indexEntry.posintext();
if (wordpos == 0) wordpos = 1000;
ranking = ranking + 1000 - wordpos + indexEntry.hitcount();
pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), indexEntry);
}

@ -157,7 +157,10 @@ public final class plasmaSearchResult {
else if (query.order[j].equals(plasmaSearchQuery.ORDER_YBR)) ranking += factor * plasmaSearchPreOrder.ybr_p(indexEntry.getUrlHash());
factor = factor / 4096L;
}
int wordpos = indexEntry.posintext();
if (wordpos == 0) wordpos = 1000;
ranking = ranking + 1000 - wordpos + indexEntry.hitcount();
// apply 'common-sense' heuristic using references
for (int j = 0; j < urlcomps.length; j++) if (commonSense.contains(urlcomps[j])) ranking += 10L*4096L*4096L / urlcomps.length;
for (int j = 0; j < descrcomps.length; j++) if (commonSense.contains(descrcomps[j])) ranking += 10L*4096L*4096L / descrcomps.length;

@ -73,13 +73,13 @@ public final class plasmaWordIndexEntry {
private final String urlHash;
// discrete values
private int hitcount; // words in file
private int wordcount;
private int phrasecount;
private int hitcount; // number of occurrences of this word in the file
private int wordcount; // number of all words in the file
private int phrasecount; // number of all phrases in the file
private int posintext; // first position of the word in text as number of word; 0=unknown or irrelevant position
private int posinphrase; // position within a phrase of the word
private int posofphrase; // position of the phrase in the text as count of sentences; 0=unknown; 1=path; 2=keywords; 3=headline; >4: in text
private int worddistance;
private int worddistance;// distance between the words, only used if the index is artificial (from a conjunction)
private long lastModified;// calculated by using last-modified
private int quality; // result of a heuristic on the source file
private byte[] language; // essentially the country code (the TLD as heuristic), two letters lowercase only
@ -100,16 +100,16 @@ public final class plasmaWordIndexEntry {
public static final char DT_UNKNOWN = 'u';
// appearance locations: (used for flags)
public static final int AP_TITLE = 0; // title tag from html header
public static final int AP_H1 = 1; // h0-tag
public static final int AP_H2 = 2;
public static final int AP_H3 = 3;
public static final int AP_H4 = 4;
public static final int AP_H5 = 5;
public static final int AP_H6 = 6;
public static final int AP_ANCHOR = 7; // anchor description
public static final int AP_URL = 8; // word inside an url
public static final int AP_IMG = 9; // tag inside image references
public static final int AP_TITLE = 0; // title tag from html header
public static final int AP_H1 = 1; // h1-tag
public static final int AP_H2 = 2; // h2-tag
public static final int AP_H3 = 3; // h3-tag
public static final int AP_H4 = 4; // h4-tag
public static final int AP_H5 = 5; // h5-tag
public static final int AP_H6 = 6; // h6-tag
public static final int AP_ANCHOR = 7; // anchor description
public static final int AP_URL = 8; // word inside an url
public static final int AP_IMG = 9; // tag inside image references
public static final int AP_TAG = 10; // for tagged indexing (e.g. using mp3 tags)
// local flag attributes
@ -254,6 +254,9 @@ public final class plasmaWordIndexEntry {
this.worddistance = (code.length() >= 19) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(18, 20)) : 0;
this.wordcount = (code.length() >= 21) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(20, 22)) : 0;
this.phrasecount = (code.length() >= 23) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(22, 24)) : 0;
if (hitcount == 0) hitcount = 1;
if (wordcount == 0) wordcount = 1000;
if (phrasecount == 0) phrasecount = 100;
}
public plasmaWordIndexEntry(String external) {
@ -335,7 +338,7 @@ public final class plasmaWordIndexEntry {
// accessor for the quality value stored in this index entry
public int getQuality() {
    return this.quality;
}
// passes this entry's lastModified value through plasmaWordIndex.microDateDays
// and returns the result
public int getVirtualAge() {
    final long modified = this.lastModified;
    return plasmaWordIndex.microDateDays(modified);
}
// accessor for the lastModified value stored in this index entry
public long getLastModified() {
    return this.lastModified;
}
// accessor for the hitcount value stored in this index entry
public int getCount() {
    return this.hitcount;
}
// accessor for the hitcount value stored in this index entry
public int hitcount() {
    return this.hitcount;
}
// accessor for the posintext value stored in this index entry
public int posintext() {
    return this.posintext;
}
// accessor for the posinphrase value stored in this index entry
public int posinphrase() {
    return this.posinphrase;
}
// accessor for the posofphrase value stored in this index entry
public int posofphrase() {
    return this.posofphrase;
}

Loading…
Cancel
Save