From fc4ae899f75507cac43c4f38a379535bd674af3a Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 22 Jan 2006 02:16:09 +0000 Subject: [PATCH] added word-position to ranking (this is only a first step) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1395 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/plasma/plasmaCondenser.java | 34 ------------------- .../anomic/plasma/plasmaSearchPreOrder.java | 4 ++- .../de/anomic/plasma/plasmaSearchResult.java | 5 ++- .../anomic/plasma/plasmaWordIndexEntry.java | 33 ++++++++++-------- 4 files changed, 25 insertions(+), 51 deletions(-) diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index e0cbbc7a8..b912a4101 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -113,40 +113,6 @@ public final class plasmaCondenser { // key is a String (the word), value is a wordStatProp Object return words.entrySet().iterator(); } - - /* - public int wordCount(String word) { - // number of occurrences of one word - // if the word did not occur, this simply returns 0 - wordStatProp sp = (wordStatProp) words.get(word); - if (sp == null) return 0; - return sp.count; - } - - public int wordPositionInText(String word) { - // position of word in text - // if unknown and word does not exist, the method returns 0 - wordStatProp sp = (wordStatProp) words.get(word); - if (sp == null) return 0; - return sp.posInText; - } - - public int wordPositionInPhrase(String word) { - // position of word in text - // if unknown and word does not exist, the method returns 0 - wordStatProp sp = (wordStatProp) words.get(word); - if (sp == null) return 0; - return sp.posInPhrase; - } - - public int wordNumberOfPhrase(String word) { - // position of word in text - // if unknown and word does not exist, the method returns 0 - wordStatProp sp = (wordStatProp) words.get(word); - if (sp == null) return 0; - return sp.numOfPhrase; - } - */ public static class wordStatProp { // object carries statistics for words and sentences diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java index b48fc4720..c7011ba08 100644 --- a/source/de/anomic/plasma/plasmaSearchPreOrder.java +++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java @@ -137,7 +137,9 @@ public final class plasmaSearchPreOrder { else if (query.order[i].equals(plasmaSearchQuery.ORDER_YBR)) ranking = factor * ybr_p(indexEntry.getUrlHash()); factor = factor / 4096L; } - + int wordpos = indexEntry.posintext(); + if (wordpos == 0) wordpos = 1000; + ranking = ranking + 1000 - wordpos + indexEntry.hitcount(); pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), indexEntry); } diff --git a/source/de/anomic/plasma/plasmaSearchResult.java b/source/de/anomic/plasma/plasmaSearchResult.java index 7cd7023d2..1e53c4502 100644 --- a/source/de/anomic/plasma/plasmaSearchResult.java +++ b/source/de/anomic/plasma/plasmaSearchResult.java @@ -157,7 +157,10 @@ public final class plasmaSearchResult { else if (query.order[j].equals(plasmaSearchQuery.ORDER_YBR)) ranking += factor * plasmaSearchPreOrder.ybr_p(indexEntry.getUrlHash()); factor = factor / 4096L; } - + int wordpos = indexEntry.posintext(); + if (wordpos == 0) wordpos = 1000; + ranking = ranking + 1000 - wordpos + indexEntry.hitcount(); + // apply 'common-sense' heuristic using references for (int j = 0; j < urlcomps.length; j++) if (commonSense.contains(urlcomps[j])) ranking += 10L*4096L*4096L / urlcomps.length; for (int j = 0; j < descrcomps.length; j++) if (commonSense.contains(descrcomps[j])) ranking += 10L*4096L*4096L / descrcomps.length; diff --git a/source/de/anomic/plasma/plasmaWordIndexEntry.java b/source/de/anomic/plasma/plasmaWordIndexEntry.java index 55ef694b6..989443b9d 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntry.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntry.java @@ -73,13 +73,13 @@ public final class plasmaWordIndexEntry { private final String urlHash; // discrete values - private int hitcount; // words in file - private int wordcount; - private int phrasecount; + private int hitcount; // number of this words in file + private int wordcount; // number of all words in the file + private int phrasecount; // number of all phrases in the file private int posintext; // first position of the word in text as number of word; 0=unknown or irrelevant position private int posinphrase; // position within a phrase of the word private int posofphrase; // position of the phrase in the text as count of sentences; 0=unknown; 1=path; 2=keywords; 3=headline; >4: in text - private int worddistance; + private int worddistance;// distance between the words, only used if the index is artificial (from a conjunction) private long lastModified;// calculated by using last-modified private int quality; // result of a heuristic on the source file private byte[] language; // essentially the country code (the TLD as heuristic), two letters lowercase only @@ -100,16 +100,16 @@ public final class plasmaWordIndexEntry { public static final char DT_UNKNOWN = 'u'; // appearance locations: (used for flags) - public static final int AP_TITLE = 0; // title tag from html header - public static final int AP_H1 = 1; // h0-tag - public static final int AP_H2 = 2; - public static final int AP_H3 = 3; - public static final int AP_H4 = 4; - public static final int AP_H5 = 5; - public static final int AP_H6 = 6; - public static final int AP_ANCHOR = 7; // anchor description - public static final int AP_URL = 8; // word inside an url - public static final int AP_IMG = 9; // tag inside image references + public static final int AP_TITLE = 0; // title tag from html header + public static final int AP_H1 = 1; // h1-tag + public static final int AP_H2 = 2; // h2-tag + public static final int AP_H3 = 3; // h3-tag + public static final int AP_H4 = 4; // h4-tag + public static final int AP_H5 = 5; // h5-tag + public static final int AP_H6 = 6; // h6-tag + public static final int AP_ANCHOR = 7; // anchor description + public static final int AP_URL = 8; // word inside an url + public static final int AP_IMG = 9; // tag inside image references public static final int AP_TAG = 10; // for tagged indexeing (i.e. using mp3 tags) // local flag attributes @@ -254,6 +254,9 @@ public final class plasmaWordIndexEntry { this.worddistance = (code.length() >= 19) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(18, 20)) : 0; this.wordcount = (code.length() >= 21) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(20, 22)) : 0; this.phrasecount = (code.length() >= 23) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(22, 24)) : 0; + if (hitcount == 0) hitcount = 1; + if (wordcount == 0) wordcount = 1000; + if (phrasecount == 0) phrasecount = 100; } public plasmaWordIndexEntry(String external) { @@ -335,7 +338,7 @@ public final class plasmaWordIndexEntry { public int getQuality() { return quality; } public int getVirtualAge() { return plasmaWordIndex.microDateDays(lastModified); } public long getLastModified() { return lastModified; } - public int getCount() { return hitcount; } + public int hitcount() { return hitcount; } public int posintext() { return posintext; } public int posinphrase() { return posinphrase; } public int posofphrase() { return posofphrase; }