Refactor rwi reference word position and word distance calculation

used for rwi ranking.
Main changes:  
- introduce a posintext() method to access the stored value. This also reduces memory allocation for the position array in WordReferenceRow (index access)
- use the positions() array for joined references on multi-word queries if needed (otherwise allow positions() to be null)
- adjust assignments and the min(), max() and distance() calculations accordingly
pull/88/head
reger 9 years ago
parent f0639d810c
commit 3c7220bc7b

@ -519,7 +519,7 @@ public class IndexControlRWIs_p {
prop.putNum("genUrlList_urlList_" + i + "_urlExists_lother", entry.word().lother()); prop.putNum("genUrlList_urlList_" + i + "_urlExists_lother", entry.word().lother());
prop.putNum("genUrlList_urlList_" + i + "_urlExists_hitcount", entry.word().hitcount()); prop.putNum("genUrlList_urlList_" + i + "_urlExists_hitcount", entry.word().hitcount());
prop.putNum("genUrlList_urlList_" + i + "_urlExists_worddistance", 0); prop.putNum("genUrlList_urlList_" + i + "_urlExists_worddistance", 0);
prop.putNum("genUrlList_urlList_" + i + "_urlExists_pos", entry.word().minposition()); prop.putNum("genUrlList_urlList_" + i + "_urlExists_pos", entry.word().posintext());
prop.putNum("genUrlList_urlList_" + i + "_urlExists_phrase", entry.word().posofphrase()); prop.putNum("genUrlList_urlList_" + i + "_urlExists_phrase", entry.word().posofphrase());
prop.putNum("genUrlList_urlList_" + i + "_urlExists_posinphrase", entry.word().posinphrase()); prop.putNum("genUrlList_urlList_" + i + "_urlExists_posinphrase", entry.word().posinphrase());
prop.putNum("genUrlList_urlList_" + i + "_urlExists_urlcomps", entry.word().urlcomps()); prop.putNum("genUrlList_urlList_" + i + "_urlExists_urlcomps", entry.word().urlcomps());

@ -156,17 +156,12 @@ public class CitationReference implements Reference, Serializable {
} }
@Override @Override
public int maxposition() { public Collection<Integer> positions() {
throw new UnsupportedOperationException();
}
@Override
public int minposition() {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@Override @Override
public Collection<Integer> positions() { public int posintext() {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }

@ -183,4 +183,9 @@ public final class NavigationReferenceRow extends AbstractReference implements N
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@Override
public int posintext() {
throw new UnsupportedOperationException();
}
} }

@ -168,4 +168,8 @@ public class NavigationReferenceVars extends AbstractReference implements Navig
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@Override
public int posintext() {
throw new UnsupportedOperationException(); }
} }

@ -26,7 +26,6 @@
package net.yacy.kelondro.data.word; package net.yacy.kelondro.data.word;
import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import net.yacy.cora.date.MicroDate; import net.yacy.cora.date.MicroDate;
@ -250,7 +249,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
@Override @Override
public long lastModified() { public long lastModified() {
return MicroDate.reverseMicroDateDays((int) this.entry.getColLong(col_lastModified)); return MicroDate.reverseMicroDateDays(this.entry.getColLong(col_lastModified));
} }
@Override @Override
@ -259,18 +258,24 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
} }
/** /**
* First position of word in text. * @return first positon of word in text
*/
@Override
public int posintext() {
int pos = (int) this.entry.getColLong(col_posintext);
return pos;
}
/**
* positions() is used to remember word positions for each query word of an * positions() is used to remember word positions for each query word of an
* multi word search query. As we currently don't include a separate posintext() * multi word search query.
* function, we use positions to make the posintext value available. * WordReferenceRow is for one WordReference and has no means to return multiple positions
* @return Collection with one element * but is required by the interface.
* @return null
*/ */
@Override @Override
public Collection<Integer> positions() { public Collection<Integer> positions() {
int pos = (int) this.entry.getColLong(col_posintext); return null;
ArrayList<Integer> arr = new ArrayList<Integer>(1);
arr.add(pos);
return arr;
} }
@Override @Override

@ -62,11 +62,12 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
private final char type; private final char type;
private int hitcount, // how often appears this word in the text private int hitcount, // how often appears this word in the text
llocal, lother, phrasesintext, llocal, lother, phrasesintext,
posintext, // word position in text
posinphrase, posofphrase, posinphrase, posofphrase,
urlcomps, urllength, urlcomps, urllength,
wordsintext, wordsintitle; wordsintext, wordsintitle;
private int virtualAge; private int virtualAge;
private final Queue<Integer> positions; private Queue<Integer> positions; // word positons of joined references
private double termFrequency; private double termFrequency;
private final boolean local; private final boolean local;
@ -78,6 +79,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
final int hitcount, // how often appears this word in the text final int hitcount, // how often appears this word in the text
final int wordcount, // total number of words final int wordcount, // total number of words
final int phrasecount, // total number of phrases final int phrasecount, // total number of phrases
final int posintext, // first position of word in text
final Queue<Integer> ps, // positions of words that are joined into the reference final Queue<Integer> ps, // positions of words that are joined into the reference
final int posinphrase, // position of word in its phrase final int posinphrase, // position of word in its phrase
final int posofphrase, // number of the phrase where word appears final int posofphrase, // number of the phrase where word appears
@ -100,9 +102,15 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
this.llocal = outlinksSame; this.llocal = outlinksSame;
this.lother = outlinksOther; this.lother = outlinksOther;
this.phrasesintext = phrasecount; this.phrasesintext = phrasecount;
this.positions = new LinkedBlockingQueue<Integer>();
if (!ps.isEmpty()) for (final Integer i: ps) this.positions.add(i); if (ps != null && !ps.isEmpty()) {
this.positions = new LinkedBlockingQueue<Integer>();
for (final Integer i : ps) this.positions.add(i);
} else {
this.positions = null;
}
this.posinphrase = posinphrase; this.posinphrase = posinphrase;
this.posintext = posintext;
this.posofphrase = posofphrase; this.posofphrase = posofphrase;
this.urlcomps = urlComps; this.urlcomps = urlComps;
this.urllength = urlLength; this.urllength = urlLength;
@ -124,9 +132,15 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
this.llocal = e.llocal(); this.llocal = e.llocal();
this.lother = e.lother(); this.lother = e.lother();
this.phrasesintext = e.phrasesintext(); this.phrasesintext = e.phrasesintext();
this.positions = new LinkedBlockingQueue<Integer>();
if (!e.positions().isEmpty()) for (final Integer i: e.positions()) this.positions.add(i); if (e.positions() != null && !e.positions().isEmpty()) {
this.positions = new LinkedBlockingQueue<Integer>();
for (final Integer i: e.positions()) this.positions.add(i);
} else {
this.positions = null;
}
this.posinphrase = e.posinphrase(); this.posinphrase = e.posinphrase();
this.posintext = e.posintext();
this.posofphrase = e.posofphrase(); this.posofphrase = e.posofphrase();
this.urlcomps = e.urlcomps(); this.urlcomps = e.urlcomps();
this.urllength = e.urllength(); this.urllength = e.urllength();
@ -152,6 +166,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
this.phrasesintext = 0; this.phrasesintext = 0;
this.positions = null; this.positions = null;
this.posinphrase = 0; this.posinphrase = 0;
this.posintext = 0;
this.posofphrase = 0; this.posofphrase = 0;
this.urlcomps = 0; this.urlcomps = 0;
this.urllength = 0; this.urllength = 0;
@ -172,6 +187,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
this.hitcount, this.hitcount,
this.wordsintext, this.wordsintext,
this.phrasesintext, this.phrasesintext,
this.posintext,
this.positions, this.positions,
this.posinphrase, this.posinphrase,
this.posofphrase, this.posofphrase,
@ -234,6 +250,20 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
return this.posinphrase; return this.posinphrase;
} }
/**
* First word position in text.
* @return min position
*/
@Override
public int posintext() {
return this.posintext;
}
/**
* Word positions for joined references (for multi word queries).
* @see posintext()
* @return the word positions of the joined references
*/
@Override @Override
public Collection<Integer> positions() { public Collection<Integer> positions() {
return this.positions; return this.positions;
@ -253,7 +283,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
this.hitcount, // how often appears this word in the text this.hitcount, // how often appears this word in the text
this.wordsintext, // total number of words this.wordsintext, // total number of words
this.phrasesintext, // total number of phrases this.phrasesintext, // total number of phrases
this.positions.isEmpty() ? 0 : minposition(), // position of word in all words (WordReferenceRow stores first position in text, minpos also important for joined references) this.posintext, // position of word in all words (WordReferenceRow stores first position in text)
this.posinphrase, // position of word in its phrase this.posinphrase, // position of word in its phrase
this.posofphrase, // number of the phrase where word appears this.posofphrase, // number of the phrase where word appears
this.lastModified, // last-modified time of the document where word appears this.lastModified, // last-modified time of the document where word appears
@ -336,21 +366,19 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
if (virtualAge() > (v = other.virtualAge())) this.virtualAge = v; if (virtualAge() > (v = other.virtualAge())) this.virtualAge = v;
if (this.wordsintext > (v = other.wordsintext)) this.wordsintext = v; if (this.wordsintext > (v = other.wordsintext)) this.wordsintext = v;
if (this.phrasesintext > (v = other.phrasesintext)) this.phrasesintext = v; if (this.phrasesintext > (v = other.phrasesintext)) this.phrasesintext = v;
if (this.posintext > (v = other.posintext)) this.posintext = v;
int minpos = min(this.positions, other.positions); // calculate and remember min distance
if (minpos != Integer.MAX_VALUE) { if (this.positions != null || other.positions != null) {
int odist = other.distance(); int odist = other.distance();
int dist = this.distance(); int dist = this.distance();
this.positions.clear(); // we want only the min
this.positions.add(minpos);
// handle distance for multi word queries
// distance is calculated from positions, must be at least 2 positions for calculation
if (odist > 0 && odist < dist) { if (odist > 0 && odist < dist) {
this.positions.add(minpos + odist); if (this.positions == null) {
} else if (dist > 0) { this.positions = new LinkedBlockingQueue<Integer>();
this.positions.add(minpos + dist); } else {
} else if (odist > 0) { this.positions.clear();
this.positions.add(minpos + odist); }
this.positions.add(this.posintext + odist);
} }
} }
@ -375,19 +403,19 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
if (virtualAge() < (v = other.virtualAge())) this.virtualAge = v; if (virtualAge() < (v = other.virtualAge())) this.virtualAge = v;
if (this.wordsintext < (v = other.wordsintext)) this.wordsintext = v; if (this.wordsintext < (v = other.wordsintext)) this.wordsintext = v;
if (this.phrasesintext < (v = other.phrasesintext)) this.phrasesintext = v; if (this.phrasesintext < (v = other.phrasesintext)) this.phrasesintext = v;
if (this.posintext < (v = other.posintext)) this.posintext = v;
int maxpos = max(this.positions, other.positions); // calculate and remember max distance
if (maxpos != Integer.MIN_VALUE) { if (this.positions != null || other.positions != null) {
int odist = other.distance(); int odist = other.distance();
int dist = this.distance(); int dist = this.distance();
this.positions.clear(); if (odist > 0 && odist > dist) {
this.positions.add(maxpos); if (this.positions == null) {
// handle distance for multi word queries this.positions = new LinkedBlockingQueue<Integer>();
// distance is calculated from positions, must be at least 2 positions for calculation } else {
if (odist > dist) { this.positions.clear();
this.positions.add(maxpos - odist); // special cas for max, to not be altered by the pos for distance use pos before maxpos }
} else if (dist > 0) { this.positions.add(this.posintext + odist);
this.positions.add(maxpos - dist);
} }
} }
@ -404,18 +432,27 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
/** /**
* joins two entries into one entry * joins two entries into one entry
* *
* Main usage is on multi word searches to combine the position values for distance ranking, * Main usage is on multi word searches to combine the position values for ranking and word distance calculation,
* A Join is valid for the same url. * A Join is valid for the same url.
* @param r WordReference * @param r WordReference
*/ */
@Override @Override
public void join(final Reference r) { public void join(final Reference r) {
// combine the distance
final WordReference oe = (WordReference) r; final WordReference oe = (WordReference) r;
this.positions.addAll(oe.positions()); // choose min posintext (for > 0)
if (this.posintext > 0 && oe.posintext() > 0) {
if (this.posintext > oe.posintext()) {
this.addPosition(this.posintext); // remember larger position (for distance calculation)
this.posintext = oe.posintext();
} else {
this.addPosition(oe.posintext()); // remember other position (for distance calculation)
}
} else if (this.posintext == 0) {
this.posintext = oe.posintext();
}
// join phrase // join phrase
// this.posinphrase = (this.posofphrase == oe.posofphrase()) ? Math.min(this.posinphrase, oe.posinphrase()) : 0; // this.posinphrase = (this.posofphrase == oe.posofphrase()) ? Math.min(this.posinphrase, oe.posinphrase()) : 0;
// this.posofphrase = Math.min(this.posofphrase, oe.posofphrase()); // this.posofphrase = Math.min(this.posofphrase, oe.posofphrase());
@ -465,8 +502,13 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
return o1.compareTo(o2); return o1.compareTo(o2);
} }
/**
* Add a position for word distance calculation to the list if position > 0
* @param position
*/
public void addPosition(final int position) { public void addPosition(final int position) {
this.positions.add(position); if (this.positions == null && position > 0) this.positions = new LinkedBlockingQueue<Integer>();
if (position > 0) this.positions.add(position);
} }
/** /**
@ -474,7 +516,6 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
* @param container * @param container
* @return a blocking queue filled with WordReferenceVars that is still filled when the object is returned * @return a blocking queue filled with WordReferenceVars that is still filled when the object is returned
*/ */
public static BlockingQueue<WordReferenceVars> transform(final ReferenceContainer<WordReference> container, final long maxtime, final boolean local) { public static BlockingQueue<WordReferenceVars> transform(final ReferenceContainer<WordReference> container, final long maxtime, final boolean local) {
final LinkedBlockingQueue<WordReferenceVars> vars = new LinkedBlockingQueue<WordReferenceVars>(); final LinkedBlockingQueue<WordReferenceVars> vars = new LinkedBlockingQueue<WordReferenceVars>();
if (container.size() <= 100) { if (container.size() <= 100) {

@ -26,102 +26,24 @@
package net.yacy.kelondro.rwi; package net.yacy.kelondro.rwi;
import java.util.Collection;
import java.util.Iterator; import java.util.Iterator;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
public abstract class AbstractReference implements Reference { public abstract class AbstractReference implements Reference {
protected static int max(Collection<Integer> a, Collection<Integer> b) {
if (a == null || a.isEmpty()) return max(b);
if (b == null || b.isEmpty()) return max(a);
int ma = max(a);
int mb = max(b);
if (ma == Integer.MIN_VALUE) return mb;
if (mb == Integer.MIN_VALUE) return ma;
return Math.max(ma, mb);
}
protected static int min(Collection<Integer> a, Collection<Integer> b) {
assert a != null;
if (a == null || a.isEmpty()) return min(b);
if (b == null || b.isEmpty()) return min(a);
int ma = min(a);
int mb = min(b);
if (ma == Integer.MAX_VALUE) return mb;
if (mb == Integer.MAX_VALUE) return ma;
return Math.min(ma, mb);
}
private static int max(Collection<Integer> a) {
if (a == null || a.isEmpty()) return Integer.MIN_VALUE;
Iterator<Integer> i = a.iterator();
/*
expirienced concurrency issue with this short cut 2016-09-06
on i.next w/o test of hasNext before
java.util.NoSuchElementException at java.util.concurrent.LinkedBlockingQueue$Itr.next(LinkedBlockingQueue.java:828)
if (a.size() == 1) return i.next();
if (a.size() == 2) return Math.max(i.next(), i.next());
int r = i.next();
*/
int r = Integer.MIN_VALUE;
int s;
while (i.hasNext()) {
s = i.next();
if (s > r) r = s;
}
return r;
}
private static int min(Collection<Integer> a) {
if (a == null || a.isEmpty()) return Integer.MAX_VALUE;
Iterator<Integer> i = a.iterator();
/* concurrency issue (see max())
if (a.size() == 1) return i.next();
if (a.size() == 2) return Math.min(i.next(), i.next());
int r = i.next();
*/
int r = Integer.MAX_VALUE;
int s;
while (i.hasNext()) {
s = i.next();
if (s < r) r = s;
}
return r;
}
/**
* max position of search query words for multi word queries
* @return
*/
@Override
public int maxposition() {
return max(positions());
}
/**
* min word position of search query words for multi word queries
* @return
*/
@Override
public int minposition() {
return min(positions());
}
/** /**
* The average distance (in words) between search query terms for multi word searches. * The average distance (in words) between search query terms for multi word searches.
* @return word distance * @return word distance
*/ */
@Override @Override
public int distance() { public int distance() {
if (positions().size() < 2) return 0; // check if positions have been joined
if (positions() == null || positions().isEmpty()) return 0;
int d = 0; int d = 0;
Iterator<Integer> i = positions().iterator(); Iterator<Integer> i = positions().iterator();
// int s0 = i.next(), s1; // concurrency issue see max() int s0 = posintext(); // init with own positon
int s0 = -1, s1; int s1;
while (i.hasNext()) { while (i.hasNext()) {
s1 = i.next(); s1 = i.next();
if (s0 > 0) d += Math.abs(s0 - s1); if (s0 > 0) d += Math.abs(s0 - s1);
@ -130,11 +52,8 @@ public abstract class AbstractReference implements Reference {
// despite first line checks for size < 2 Arithmetic exception div by zero occured (1.91/9278 2016-10-19) // despite first line checks for size < 2 Arithmetic exception div by zero occured (1.91/9278 2016-10-19)
// added d == 0 condition as protection for this (which was in all above tests the case) // added d == 0 condition as protection for this (which was in all above tests the case)
try { try {
return d == 0 ? 0 : d / (positions().size() - 1); return d == 0 ? 0 : d / positions().size();
} catch (ArithmeticException ex) { } catch (ArithmeticException ex) {
// TODO: in peer to peer normalization of rwi queue modifies concurrently positions resulting in div by 0 exception
// with effect of above check position() < 2 is false but now true, it also results in changing ranking results until normalization is finished
// see related/causing code ReferenceOrder.normalizewith() and WordReferenceVars.max()/WordReferenceVars.min() -> refacturing of posintext, distance, min, max needed
ConcurrentLog.fine("AbstractReference", "word distance calculation:" + ex.getMessage()); ConcurrentLog.fine("AbstractReference", "word distance calculation:" + ex.getMessage());
return 0; return 0;
} }

@ -58,29 +58,24 @@ public interface Reference {
public void join(final Reference oe); public void join(final Reference oe);
/** /**
* Positions or search query words for the referenced result url * First positon of word in text.
* This is only valid for multi word search queries. * Word position is not calculated for titles, for all title words it defaults to 0.
* The positions contain the first word position for every search query word * @return min word position starting at 1 (0 if undefined)
* which has been joined (by join() )
* @return list with word position
*/ */
public Collection<Integer> positions(); public int posintext();
/** /**
* max position of search query words (for multi word queries) * Positions of search query words for the referenced result url
* @return * This is only valid for multi word search queries.
*/ * The positions contain the first word position for every joined search query word
public int maxposition(); * which has been joined (by join() )
* @return list with word position (excl. the posintext of this reference) or null
/**
* min word position of search query words (for multi word queries)
* @return
*/ */
public int minposition(); public Collection<Integer> positions();
/** /**
* The average distance (in words) between search query terms for multi word searches. * The average distance (in words) between search query terms for multi word searches.
* The distance is calculated from positions() * The distance is calculated from posintext() and positions()
* @return word distance * @return word distance
*/ */
public int distance(); public int distance();

@ -30,7 +30,6 @@ package net.yacy.peers.graphics;
import java.io.File; import java.io.File;
import java.io.Serializable; import java.io.Serializable;
import java.text.ParseException; import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
@ -451,7 +450,12 @@ public class WebStructureGraph {
@Override @Override
public Collection<Integer> positions() { public Collection<Integer> positions() {
return new ArrayList<Integer>(0); return null;
}
@Override
public int posintext() {
throw new UnsupportedOperationException();
} }
} }

@ -221,7 +221,6 @@ public class ReferenceOrder {
* @return a ranking: the higher the number, the better is the ranking * @return a ranking: the higher the number, the better is the ranking
*/ */
public long cardinal(final WordReference t) { public long cardinal(final WordReference t) {
//return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
// the normalizedEntry must be a normalized indexEntry // the normalizedEntry must be a normalized indexEntry
assert this.min != null; assert this.min != null;
assert this.max != null; assert this.max != null;
@ -230,17 +229,15 @@ public class ReferenceOrder {
final Bitfield flags = t.flags(); final Bitfield flags = t.flags();
final long tf = ((this.max.termFrequency() == this.min.termFrequency()) ? 0 : (((int)(((t.termFrequency()-this.min.termFrequency())*256.0)/(this.max.termFrequency() - this.min.termFrequency())))) << this.ranking.coeff_termfrequency); final long tf = ((this.max.termFrequency() == this.min.termFrequency()) ? 0 : (((int)(((t.termFrequency()-this.min.termFrequency())*256.0)/(this.max.termFrequency() - this.min.termFrequency())))) << this.ranking.coeff_termfrequency);
//System.out.println("tf(" + t.urlHash + ") = " + Math.floor(1000 * t.termFrequency()) + ", min = " + Math.floor(1000 * min.termFrequency()) + ", max = " + Math.floor(1000 * max.termFrequency()) + ", tf-normed = " + tf); //System.out.println("tf(" + t.urlHash + ") = " + Math.floor(1000 * t.termFrequency()) + ", min = " + Math.floor(1000 * min.termFrequency()) + ", max = " + Math.floor(1000 * max.termFrequency()) + ", tf-normed = " + tf);
final int maxmaxpos = this.max.maxposition(); // returns Integer.MIN_VALUE if positions empty
final int minminpos = this.min.minposition();
final long r = final long r =
((256 - DigestURL.domLengthNormalized(t.urlhash())) << this.ranking.coeff_domlength) ((256 - DigestURL.domLengthNormalized(t.urlhash())) << this.ranking.coeff_domlength)
+ ((this.max.urlcomps() == this.min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - this.min.urlcomps() ) << 8) / (this.max.urlcomps() - this.min.urlcomps()) )) << this.ranking.coeff_urlcomps) + ((this.max.urlcomps() == this.min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - this.min.urlcomps() ) << 8) / (this.max.urlcomps() - this.min.urlcomps()) )) << this.ranking.coeff_urlcomps)
+ ((this.max.urllength() == this.min.urllength() ) ? 0 : (256 - (((t.urllength() - this.min.urllength() ) << 8) / (this.max.urllength() - this.min.urllength()) )) << this.ranking.coeff_urllength) + ((this.max.urllength() == this.min.urllength() ) ? 0 : (256 - (((t.urllength() - this.min.urllength() ) << 8) / (this.max.urllength() - this.min.urllength()) )) << this.ranking.coeff_urllength)
+ ((maxmaxpos == minminpos || maxmaxpos < 0) ? 0 : (256 - (((t.minposition() - minminpos) << 8) / (maxmaxpos - minminpos))) << this.ranking.coeff_posintext) + ((this.max.posintext() == this.min.posintext()) ? 0 : (256 - (((t.posintext() - this.min.posintext() ) << 8) / (this.max.posintext() - this.min.posintext()) )) << this.ranking.coeff_posintext)
+ ((this.max.posofphrase() == this.min.posofphrase()) ? 0 : (256 - (((t.posofphrase() - this.min.posofphrase() ) << 8) / (this.max.posofphrase() - this.min.posofphrase()) )) << this.ranking.coeff_posofphrase) + ((this.max.posofphrase() == this.min.posofphrase()) ? 0 : (256 - (((t.posofphrase() - this.min.posofphrase() ) << 8) / (this.max.posofphrase() - this.min.posofphrase()) )) << this.ranking.coeff_posofphrase)
+ ((this.max.posinphrase() == this.min.posinphrase()) ? 0 : (256 - (((t.posinphrase() - this.min.posinphrase() ) << 8) / (this.max.posinphrase() - this.min.posinphrase()) )) << this.ranking.coeff_posinphrase) + ((this.max.posinphrase() == this.min.posinphrase()) ? 0 : (256 - (((t.posinphrase() - this.min.posinphrase() ) << 8) / (this.max.posinphrase() - this.min.posinphrase()) )) << this.ranking.coeff_posinphrase)
+ ((this.max.distance() == this.min.distance() ) ? 0 : (256 - (((t.distance() - this.min.distance() ) << 8) / (this.max.distance() - this.min.distance()) )) << this.ranking.coeff_worddistance) + ((this.max.distance() == this.min.distance() ) ? 0 : (256 - (((t.distance() - this.min.distance() ) << 8) / (this.max.distance() - this.min.distance()) )) << this.ranking.coeff_worddistance)
+ ((this.max.virtualAge() == this.min.virtualAge()) ? 0 : (((t.virtualAge() - this.min.virtualAge() ) << 8) / (this.max.virtualAge() - this.min.virtualAge()) ) << this.ranking.coeff_date) + ((this.max.virtualAge() == this.min.virtualAge()) ? 0 : (((t.virtualAge() - this.min.virtualAge() ) << 8) / (this.max.virtualAge() - this.min.virtualAge()) ) << this.ranking.coeff_date)
+ ((this.max.wordsintitle() == this.min.wordsintitle()) ? 0 : (((t.wordsintitle() - this.min.wordsintitle() ) << 8) / (this.max.wordsintitle() - this.min.wordsintitle()) ) << this.ranking.coeff_wordsintitle) + ((this.max.wordsintitle() == this.min.wordsintitle()) ? 0 : (((t.wordsintitle() - this.min.wordsintitle() ) << 8) / (this.max.wordsintitle() - this.min.wordsintitle()) ) << this.ranking.coeff_wordsintitle)
+ ((this.max.wordsintext() == this.min.wordsintext()) ? 0 : (((t.wordsintext() - this.min.wordsintext() ) << 8) / (this.max.wordsintext() - this.min.wordsintext()) ) << this.ranking.coeff_wordsintext) + ((this.max.wordsintext() == this.min.wordsintext()) ? 0 : (((t.wordsintext() - this.min.wordsintext() ) << 8) / (this.max.wordsintext() - this.min.wordsintext()) ) << this.ranking.coeff_wordsintext)
+ ((this.max.phrasesintext() == this.min.phrasesintext()) ? 0 : (((t.phrasesintext()- this.min.phrasesintext() ) << 8) / (this.max.phrasesintext()- this.min.phrasesintext()) ) << this.ranking.coeff_phrasesintext) + ((this.max.phrasesintext() == this.min.phrasesintext()) ? 0 : (((t.phrasesintext()- this.min.phrasesintext() ) << 8) / (this.max.phrasesintext()- this.min.phrasesintext()) ) << this.ranking.coeff_phrasesintext)

@ -57,36 +57,39 @@ public class WordReferenceVarsTest {
ientry.setWord(word); ientry.setWord(word);
WordReferenceVars wvMin = new WordReferenceVars(ientry, true); WordReferenceVars wvMin = new WordReferenceVars(ientry, true);
wvMin.addPosition(10); // add position for distance testing
WordReferenceVars wvMax = wvMin.clone();
// create a other reference // create a other reference
WordReferenceVars wvOther = new WordReferenceVars(ientry, true); WordReferenceVars wvOther = wvMin.clone();
word.posInText = maxposintext;
ientry.setWord(word);
WordReferenceVars wvMax = new WordReferenceVars(ientry, true);
wvMin.addPosition(10); // add position for distance testing
wvMax.addPosition(maxposintext); // add position for distance testing
wvOther.addPosition(maxposintext); // add position (max) for distance testing wvOther.addPosition(maxposintext); // add position (max) for distance testing
// test min for posintext and distance // test min for posintext and distance
wvMin.min(wvOther); wvMin.min(wvOther);
assertEquals("min posintext", minposintext, wvMin.minposition()); assertEquals("min posintext", minposintext, wvMin.posintext());
assertEquals("min distance", 5, wvMin.distance()); assertEquals("min distance", 5, wvMin.distance());
wvMin.min(wvOther); // test repeated call doesn't change result wvMin.min(wvOther); // test repeated call doesn't change result
assertEquals("min posintext (repeat)", minposintext, wvMin.minposition()); assertEquals("min posintext (repeat)", minposintext, wvMin.posintext());
assertEquals("min distance (repeat)", 5, wvMin.distance()); assertEquals("min distance (repeat)", 5, wvMin.distance());
// test max for posintext and distance // test max for posintext and distance
wvMax.max(wvOther); wvMax.max(wvOther);
assertEquals("max posintext", maxposintext, wvMax.maxposition()); assertEquals("max posintext", maxposintext, wvMax.posintext());
assertEquals("max distance", maxposintext - minposintext, wvMax.distance()); assertEquals("max distance", maxposintext - minposintext, wvMax.distance());
wvMax.max(wvOther); // test repeated calls don't change result wvMax.max(wvOther); // test repeated calls don't change result
wvMax.max(wvOther); wvMax.max(wvOther);
assertEquals("max posintext (repeat)", maxposintext, wvMax.maxposition()); assertEquals("max posintext (repeat)", maxposintext, wvMax.posintext());
assertEquals("max distance (repeat)", maxposintext - minposintext, wvMax.distance()); assertEquals("max distance (repeat)", maxposintext - minposintext, wvMax.distance());
// reverse test // reverse test
wvOther.max(wvMax); wvOther.max(wvMax);
assertEquals("max posintext (reverse)", maxposintext, wvOther.maxposition()); assertEquals("max posintext (reverse)", maxposintext, wvOther.posintext());
assertEquals("max distance (repeat)", maxposintext - minposintext, wvOther.distance()); assertEquals("max distance (repeat)", maxposintext - minposintext, wvOther.distance());
} }

@ -155,7 +155,7 @@ public class SegmentTest {
// creates one test url with this text in the rwi index // creates one test url with this text in the rwi index
DigestURL url = new DigestURL("http://test.org/test.html"); DigestURL url = new DigestURL("http://test.org/test.html");
storeTestDocTextToTermIndex(url, "One Two Three Four Five. This is a test text. One two three for five."); storeTestDocTextToTermIndex(url, "One Two Three Four Five. This is a test text. One two three for five");
// posintext 1 2 3 4 5 6 7 8 9 // posintext 1 2 3 4 5 6 7 8 9
// hitcount ("five") 1 1 2 // hitcount ("five") 1 1 2
// posofphrase |-------100------------| |------101---------| |--------102----------| // posofphrase |-------100------------| |------101---------| |--------102----------|
@ -171,7 +171,7 @@ public class SegmentTest {
// do the search // do the search
TermSearch<WordReference> result = index.termIndex.query(queryHashes, excludeHashes, urlselection, termFactory, Integer.MAX_VALUE); TermSearch<WordReference> result = index.termIndex.query(queryHashes, excludeHashes, urlselection, termFactory, Integer.MAX_VALUE);
// get the joined resutls // get the joined results
ReferenceContainer<WordReference> wc = result.joined(); ReferenceContainer<WordReference> wc = result.joined();
// we should have now one result (stored to index above) // we should have now one result (stored to index above)
@ -181,7 +181,7 @@ public class SegmentTest {
WordReference r = wc.getReference(url.hash()); WordReference r = wc.getReference(url.hash());
// min position of search word in text (posintext) // min position of search word in text (posintext)
assertEquals("minposition('five')", 5, r.minposition()); assertEquals("min posintext('five')", 5, r.posintext());
// occurence of search words in text // occurence of search words in text
assertEquals("hitcount('five')", 2, r.hitcount()); assertEquals("hitcount('five')", 2, r.hitcount());
@ -190,15 +190,6 @@ public class SegmentTest {
assertEquals("posofphrase", 100, r.posofphrase()); assertEquals("posofphrase", 100, r.posofphrase());
assertEquals("posinphrase", 5, r.posinphrase()); assertEquals("posinphrase", 5, r.posinphrase());
// currently the results are not as expected for a multi-word query
// (reason: Reference container is backed by ReferenceRow (which doen't hold positions of joined references) ergo can't return related results
System.out.println("-----------------");
System.out.println("positions=" + r.positions() + " (expected=5,8)");
// max position of search word in text
System.out.println("maxposition=" + r.maxposition() + " (expected=8)");
// for a multiword query distance expected to be the avg of search word positions in text
System.out.println("distance=" + r.distance() + " (expected=3)");
System.out.println("-----------------");
} }
} }

Loading…
Cancel
Save