Refactor rwi reference word position and word distance calculation

used for rwi ranking.
Main changes:  
- introduce a posintext() method to access the stored value directly. This also reduces memory allocation, because WordReferenceRow no longer builds a positions array just to expose the value (it is now read by column index)
- use the positions() array only for joined references on multi-word queries, if needed (otherwise positions() is allowed to be null)
- adjust the assignments and the min(), max() and distance() calculations accordingly
pull/88/head
reger 9 years ago
parent f0639d810c
commit 3c7220bc7b

@ -519,7 +519,7 @@ public class IndexControlRWIs_p {
prop.putNum("genUrlList_urlList_" + i + "_urlExists_lother", entry.word().lother());
prop.putNum("genUrlList_urlList_" + i + "_urlExists_hitcount", entry.word().hitcount());
prop.putNum("genUrlList_urlList_" + i + "_urlExists_worddistance", 0);
prop.putNum("genUrlList_urlList_" + i + "_urlExists_pos", entry.word().minposition());
prop.putNum("genUrlList_urlList_" + i + "_urlExists_pos", entry.word().posintext());
prop.putNum("genUrlList_urlList_" + i + "_urlExists_phrase", entry.word().posofphrase());
prop.putNum("genUrlList_urlList_" + i + "_urlExists_posinphrase", entry.word().posinphrase());
prop.putNum("genUrlList_urlList_" + i + "_urlExists_urlcomps", entry.word().urlcomps());

@ -156,17 +156,12 @@ public class CitationReference implements Reference, Serializable {
}
@Override
public int maxposition() {
throw new UnsupportedOperationException();
}
@Override
public int minposition() {
public Collection<Integer> positions() {
throw new UnsupportedOperationException();
}
@Override
public Collection<Integer> positions() {
public int posintext() {
throw new UnsupportedOperationException();
}

@ -183,4 +183,9 @@ public final class NavigationReferenceRow extends AbstractReference implements N
throw new UnsupportedOperationException();
}
@Override
public int posintext() {
throw new UnsupportedOperationException();
}
}

@ -168,4 +168,8 @@ public class NavigationReferenceVars extends AbstractReference implements Navig
throw new UnsupportedOperationException();
}
@Override
public int posintext() {
throw new UnsupportedOperationException(); }
}

@ -26,7 +26,6 @@
package net.yacy.kelondro.data.word;
import java.util.ArrayList;
import java.util.Collection;
import net.yacy.cora.date.MicroDate;
@ -250,7 +249,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
@Override
public long lastModified() {
return MicroDate.reverseMicroDateDays((int) this.entry.getColLong(col_lastModified));
return MicroDate.reverseMicroDateDays(this.entry.getColLong(col_lastModified));
}
@Override
@ -259,18 +258,24 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
}
/**
* First position of word in text.
* @return first positon of word in text
*/
@Override
public int posintext() {
int pos = (int) this.entry.getColLong(col_posintext);
return pos;
}
/**
* positions() is used to remember word positions for each query word of an
* multi word search query. As we currently don't include a separate posintext()
* function, we use positions to make the posintext value available.
* @return Collection with one element
* multi word search query.
* WordReferenceRow is for one WordReference and has no means to return multiple positions
* but is required by the interface.
* @return null
*/
@Override
public Collection<Integer> positions() {
int pos = (int) this.entry.getColLong(col_posintext);
ArrayList<Integer> arr = new ArrayList<Integer>(1);
arr.add(pos);
return arr;
return null;
}
@Override

@ -62,11 +62,12 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
private final char type;
private int hitcount, // how often appears this word in the text
llocal, lother, phrasesintext,
posintext, // word position in text
posinphrase, posofphrase,
urlcomps, urllength,
wordsintext, wordsintitle;
private int virtualAge;
private final Queue<Integer> positions;
private Queue<Integer> positions; // word positons of joined references
private double termFrequency;
private final boolean local;
@ -78,6 +79,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
final int hitcount, // how often appears this word in the text
final int wordcount, // total number of words
final int phrasecount, // total number of phrases
final int posintext, // first position of word in text
final Queue<Integer> ps, // positions of words that are joined into the reference
final int posinphrase, // position of word in its phrase
final int posofphrase, // number of the phrase where word appears
@ -100,9 +102,15 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
this.llocal = outlinksSame;
this.lother = outlinksOther;
this.phrasesintext = phrasecount;
if (ps != null && !ps.isEmpty()) {
this.positions = new LinkedBlockingQueue<Integer>();
if (!ps.isEmpty()) for (final Integer i: ps) this.positions.add(i);
for (final Integer i : ps) this.positions.add(i);
} else {
this.positions = null;
}
this.posinphrase = posinphrase;
this.posintext = posintext;
this.posofphrase = posofphrase;
this.urlcomps = urlComps;
this.urllength = urlLength;
@ -124,9 +132,15 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
this.llocal = e.llocal();
this.lother = e.lother();
this.phrasesintext = e.phrasesintext();
if (e.positions() != null && !e.positions().isEmpty()) {
this.positions = new LinkedBlockingQueue<Integer>();
if (!e.positions().isEmpty()) for (final Integer i: e.positions()) this.positions.add(i);
for (final Integer i: e.positions()) this.positions.add(i);
} else {
this.positions = null;
}
this.posinphrase = e.posinphrase();
this.posintext = e.posintext();
this.posofphrase = e.posofphrase();
this.urlcomps = e.urlcomps();
this.urllength = e.urllength();
@ -152,6 +166,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
this.phrasesintext = 0;
this.positions = null;
this.posinphrase = 0;
this.posintext = 0;
this.posofphrase = 0;
this.urlcomps = 0;
this.urllength = 0;
@ -172,6 +187,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
this.hitcount,
this.wordsintext,
this.phrasesintext,
this.posintext,
this.positions,
this.posinphrase,
this.posofphrase,
@ -234,6 +250,20 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
return this.posinphrase;
}
/**
* First word position in text.
* @return min position
*/
@Override
public int posintext() {
return this.posintext;
}
/**
* Word positions for joined references (for multi word queries).
* @see posintext()
* @return the word positions of the joined references
*/
@Override
public Collection<Integer> positions() {
return this.positions;
@ -253,7 +283,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
this.hitcount, // how often appears this word in the text
this.wordsintext, // total number of words
this.phrasesintext, // total number of phrases
this.positions.isEmpty() ? 0 : minposition(), // position of word in all words (WordReferenceRow stores first position in text, minpos also important for joined references)
this.posintext, // position of word in all words (WordReferenceRow stores first position in text)
this.posinphrase, // position of word in its phrase
this.posofphrase, // number of the phrase where word appears
this.lastModified, // last-modified time of the document where word appears
@ -336,21 +366,19 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
if (virtualAge() > (v = other.virtualAge())) this.virtualAge = v;
if (this.wordsintext > (v = other.wordsintext)) this.wordsintext = v;
if (this.phrasesintext > (v = other.phrasesintext)) this.phrasesintext = v;
if (this.posintext > (v = other.posintext)) this.posintext = v;
int minpos = min(this.positions, other.positions);
if (minpos != Integer.MAX_VALUE) {
// calculate and remember min distance
if (this.positions != null || other.positions != null) {
int odist = other.distance();
int dist = this.distance();
this.positions.clear(); // we want only the min
this.positions.add(minpos);
// handle distance for multi word queries
// distance is calculated from positions, must be at least 2 positions for calculation
if (odist > 0 && odist < dist) {
this.positions.add(minpos + odist);
} else if (dist > 0) {
this.positions.add(minpos + dist);
} else if (odist > 0) {
this.positions.add(minpos + odist);
if (this.positions == null) {
this.positions = new LinkedBlockingQueue<Integer>();
} else {
this.positions.clear();
}
this.positions.add(this.posintext + odist);
}
}
@ -375,19 +403,19 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
if (virtualAge() < (v = other.virtualAge())) this.virtualAge = v;
if (this.wordsintext < (v = other.wordsintext)) this.wordsintext = v;
if (this.phrasesintext < (v = other.phrasesintext)) this.phrasesintext = v;
if (this.posintext < (v = other.posintext)) this.posintext = v;
int maxpos = max(this.positions, other.positions);
if (maxpos != Integer.MIN_VALUE) {
// calculate and remember max distance
if (this.positions != null || other.positions != null) {
int odist = other.distance();
int dist = this.distance();
if (odist > 0 && odist > dist) {
if (this.positions == null) {
this.positions = new LinkedBlockingQueue<Integer>();
} else {
this.positions.clear();
this.positions.add(maxpos);
// handle distance for multi word queries
// distance is calculated from positions, must be at least 2 positions for calculation
if (odist > dist) {
this.positions.add(maxpos - odist); // special cas for max, to not be altered by the pos for distance use pos before maxpos
} else if (dist > 0) {
this.positions.add(maxpos - dist);
}
this.positions.add(this.posintext + odist);
}
}
@ -404,17 +432,26 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
/**
* joins two entries into one entry
*
* Main usage is on multi word searches to combine the position values for distance ranking,
* Main usage is on multi word searches to combine the position values for ranking and word distance calculation,
* A Join is valid for the same url.
* @param r WordReference
*/
@Override
public void join(final Reference r) {
// combine the distance
final WordReference oe = (WordReference) r;
this.positions.addAll(oe.positions());
// choose min posintext (for > 0)
if (this.posintext > 0 && oe.posintext() > 0) {
if (this.posintext > oe.posintext()) {
this.addPosition(this.posintext); // remember larger position (for distance calculation)
this.posintext = oe.posintext();
} else {
this.addPosition(oe.posintext()); // remember other position (for distance calculation)
}
} else if (this.posintext == 0) {
this.posintext = oe.posintext();
}
// join phrase
// this.posinphrase = (this.posofphrase == oe.posofphrase()) ? Math.min(this.posinphrase, oe.posinphrase()) : 0;
@ -465,8 +502,13 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
return o1.compareTo(o2);
}
/**
* Add a position for word distance calculation to the list if position > 0
* @param position
*/
public void addPosition(final int position) {
this.positions.add(position);
if (this.positions == null && position > 0) this.positions = new LinkedBlockingQueue<Integer>();
if (position > 0) this.positions.add(position);
}
/**
@ -474,7 +516,6 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
* @param container
* @return a blocking queue filled with WordReferenceVars that is still filled when the object is returned
*/
public static BlockingQueue<WordReferenceVars> transform(final ReferenceContainer<WordReference> container, final long maxtime, final boolean local) {
final LinkedBlockingQueue<WordReferenceVars> vars = new LinkedBlockingQueue<WordReferenceVars>();
if (container.size() <= 100) {

@ -26,102 +26,24 @@
package net.yacy.kelondro.rwi;
import java.util.Collection;
import java.util.Iterator;
import net.yacy.cora.util.ConcurrentLog;
public abstract class AbstractReference implements Reference {
protected static int max(Collection<Integer> a, Collection<Integer> b) {
if (a == null || a.isEmpty()) return max(b);
if (b == null || b.isEmpty()) return max(a);
int ma = max(a);
int mb = max(b);
if (ma == Integer.MIN_VALUE) return mb;
if (mb == Integer.MIN_VALUE) return ma;
return Math.max(ma, mb);
}
protected static int min(Collection<Integer> a, Collection<Integer> b) {
assert a != null;
if (a == null || a.isEmpty()) return min(b);
if (b == null || b.isEmpty()) return min(a);
int ma = min(a);
int mb = min(b);
if (ma == Integer.MAX_VALUE) return mb;
if (mb == Integer.MAX_VALUE) return ma;
return Math.min(ma, mb);
}
private static int max(Collection<Integer> a) {
if (a == null || a.isEmpty()) return Integer.MIN_VALUE;
Iterator<Integer> i = a.iterator();
/*
expirienced concurrency issue with this short cut 2016-09-06
on i.next w/o test of hasNext before
java.util.NoSuchElementException at java.util.concurrent.LinkedBlockingQueue$Itr.next(LinkedBlockingQueue.java:828)
if (a.size() == 1) return i.next();
if (a.size() == 2) return Math.max(i.next(), i.next());
int r = i.next();
*/
int r = Integer.MIN_VALUE;
int s;
while (i.hasNext()) {
s = i.next();
if (s > r) r = s;
}
return r;
}
private static int min(Collection<Integer> a) {
if (a == null || a.isEmpty()) return Integer.MAX_VALUE;
Iterator<Integer> i = a.iterator();
/* concurrency issue (see max())
if (a.size() == 1) return i.next();
if (a.size() == 2) return Math.min(i.next(), i.next());
int r = i.next();
*/
int r = Integer.MAX_VALUE;
int s;
while (i.hasNext()) {
s = i.next();
if (s < r) r = s;
}
return r;
}
/**
* max position of search query words for multi word queries
* @return
*/
@Override
public int maxposition() {
return max(positions());
}
/**
* min word position of search query words for multi word queries
* @return
*/
@Override
public int minposition() {
return min(positions());
}
/**
* The average distance (in words) between search query terms for multi word searches.
* @return word distance
*/
@Override
public int distance() {
if (positions().size() < 2) return 0;
// check if positions have been joined
if (positions() == null || positions().isEmpty()) return 0;
int d = 0;
Iterator<Integer> i = positions().iterator();
// int s0 = i.next(), s1; // concurrency issue see max()
int s0 = -1, s1;
int s0 = posintext(); // init with own positon
int s1;
while (i.hasNext()) {
s1 = i.next();
if (s0 > 0) d += Math.abs(s0 - s1);
@ -130,11 +52,8 @@ public abstract class AbstractReference implements Reference {
// despite first line checks for size < 2 Arithmetic exception div by zero occured (1.91/9278 2016-10-19)
// added d == 0 condition as protection for this (which was in all above tests the case)
try {
return d == 0 ? 0 : d / (positions().size() - 1);
return d == 0 ? 0 : d / positions().size();
} catch (ArithmeticException ex) {
// TODO: in peer to peer normalization of rwi queue modifies concurrently positions resulting in div by 0 exception
// with effect of above check position() < 2 is false but now true, it also results in changing ranking results until normalization is finished
// see related/causing code ReferenceOrder.normalizewith() and WordReferenceVars.max()/WordReferenceVars.min() -> refacturing of posintext, distance, min, max needed
ConcurrentLog.fine("AbstractReference", "word distance calculation:" + ex.getMessage());
return 0;
}

@ -58,29 +58,24 @@ public interface Reference {
public void join(final Reference oe);
/**
* Positions or search query words for the referenced result url
* This is only valid for multi word search queries.
* The positions contain the first word position for every search query word
* which has been joined (by join() )
* @return list with word position
* First positon of word in text.
* Word position is not calculated for titles, for all title words it defaults to 0.
* @return min word position starting at 1 (0 if undefined)
*/
public Collection<Integer> positions();
public int posintext();
/**
* max position of search query words (for multi word queries)
* @return
*/
public int maxposition();
/**
* min word position of search query words (for multi word queries)
* @return
* Positions of search query words for the referenced result url
* This is only valid for multi word search queries.
* The positions contain the first word position for every joined search query word
* which has been joined (by join() )
* @return list with word position (excl. the posintext of this reference) or null
*/
public int minposition();
public Collection<Integer> positions();
/**
* The average distance (in words) between search query terms for multi word searches.
* The distance is calculated from positions()
* The distance is calculated from posintext() and positions()
* @return word distance
*/
public int distance();

@ -30,7 +30,6 @@ package net.yacy.peers.graphics;
import java.io.File;
import java.io.Serializable;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
@ -451,7 +450,12 @@ public class WebStructureGraph {
@Override
public Collection<Integer> positions() {
return new ArrayList<Integer>(0);
return null;
}
@Override
public int posintext() {
throw new UnsupportedOperationException();
}
}

@ -221,7 +221,6 @@ public class ReferenceOrder {
* @return a ranking: the higher the number, the better is the ranking
*/
public long cardinal(final WordReference t) {
//return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
// the normalizedEntry must be a normalized indexEntry
assert this.min != null;
assert this.max != null;
@ -230,13 +229,11 @@ public class ReferenceOrder {
final Bitfield flags = t.flags();
final long tf = ((this.max.termFrequency() == this.min.termFrequency()) ? 0 : (((int)(((t.termFrequency()-this.min.termFrequency())*256.0)/(this.max.termFrequency() - this.min.termFrequency())))) << this.ranking.coeff_termfrequency);
//System.out.println("tf(" + t.urlHash + ") = " + Math.floor(1000 * t.termFrequency()) + ", min = " + Math.floor(1000 * min.termFrequency()) + ", max = " + Math.floor(1000 * max.termFrequency()) + ", tf-normed = " + tf);
final int maxmaxpos = this.max.maxposition(); // returns Integer.MIN_VALUE if positions empty
final int minminpos = this.min.minposition();
final long r =
((256 - DigestURL.domLengthNormalized(t.urlhash())) << this.ranking.coeff_domlength)
+ ((this.max.urlcomps() == this.min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - this.min.urlcomps() ) << 8) / (this.max.urlcomps() - this.min.urlcomps()) )) << this.ranking.coeff_urlcomps)
+ ((this.max.urllength() == this.min.urllength() ) ? 0 : (256 - (((t.urllength() - this.min.urllength() ) << 8) / (this.max.urllength() - this.min.urllength()) )) << this.ranking.coeff_urllength)
+ ((maxmaxpos == minminpos || maxmaxpos < 0) ? 0 : (256 - (((t.minposition() - minminpos) << 8) / (maxmaxpos - minminpos))) << this.ranking.coeff_posintext)
+ ((this.max.posintext() == this.min.posintext()) ? 0 : (256 - (((t.posintext() - this.min.posintext() ) << 8) / (this.max.posintext() - this.min.posintext()) )) << this.ranking.coeff_posintext)
+ ((this.max.posofphrase() == this.min.posofphrase()) ? 0 : (256 - (((t.posofphrase() - this.min.posofphrase() ) << 8) / (this.max.posofphrase() - this.min.posofphrase()) )) << this.ranking.coeff_posofphrase)
+ ((this.max.posinphrase() == this.min.posinphrase()) ? 0 : (256 - (((t.posinphrase() - this.min.posinphrase() ) << 8) / (this.max.posinphrase() - this.min.posinphrase()) )) << this.ranking.coeff_posinphrase)
+ ((this.max.distance() == this.min.distance() ) ? 0 : (256 - (((t.distance() - this.min.distance() ) << 8) / (this.max.distance() - this.min.distance()) )) << this.ranking.coeff_worddistance)

@ -57,36 +57,39 @@ public class WordReferenceVarsTest {
ientry.setWord(word);
WordReferenceVars wvMin = new WordReferenceVars(ientry, true);
wvMin.addPosition(10); // add position for distance testing
// create a other reference
WordReferenceVars wvOther = wvMin.clone();
WordReferenceVars wvMax = wvMin.clone();
word.posInText = maxposintext;
ientry.setWord(word);
WordReferenceVars wvMax = new WordReferenceVars(ientry, true);
// create a other reference
WordReferenceVars wvOther = new WordReferenceVars(ientry, true);
wvMin.addPosition(10); // add position for distance testing
wvMax.addPosition(maxposintext); // add position for distance testing
wvOther.addPosition(maxposintext); // add position (max) for distance testing
// test min for posintext and distance
wvMin.min(wvOther);
assertEquals("min posintext", minposintext, wvMin.minposition());
assertEquals("min posintext", minposintext, wvMin.posintext());
assertEquals("min distance", 5, wvMin.distance());
wvMin.min(wvOther); // test repeated call doesn't change result
assertEquals("min posintext (repeat)", minposintext, wvMin.minposition());
assertEquals("min posintext (repeat)", minposintext, wvMin.posintext());
assertEquals("min distance (repeat)", 5, wvMin.distance());
// test max for posintext and distance
wvMax.max(wvOther);
assertEquals("max posintext", maxposintext, wvMax.maxposition());
assertEquals("max posintext", maxposintext, wvMax.posintext());
assertEquals("max distance", maxposintext - minposintext, wvMax.distance());
wvMax.max(wvOther); // test repeated calls don't change result
wvMax.max(wvOther);
assertEquals("max posintext (repeat)", maxposintext, wvMax.maxposition());
assertEquals("max posintext (repeat)", maxposintext, wvMax.posintext());
assertEquals("max distance (repeat)", maxposintext - minposintext, wvMax.distance());
// reverse test
wvOther.max(wvMax);
assertEquals("max posintext (reverse)", maxposintext, wvOther.maxposition());
assertEquals("max posintext (reverse)", maxposintext, wvOther.posintext());
assertEquals("max distance (repeat)", maxposintext - minposintext, wvOther.distance());
}

@ -155,7 +155,7 @@ public class SegmentTest {
// creates one test url with this text in the rwi index
DigestURL url = new DigestURL("http://test.org/test.html");
storeTestDocTextToTermIndex(url, "One Two Three Four Five. This is a test text. One two three for five.");
storeTestDocTextToTermIndex(url, "One Two Three Four Five. This is a test text. One two three for five");
// posintext 1 2 3 4 5 6 7 8 9
// hitcount ("five") 1 1 2
// posofphrase |-------100------------| |------101---------| |--------102----------|
@ -171,7 +171,7 @@ public class SegmentTest {
// do the search
TermSearch<WordReference> result = index.termIndex.query(queryHashes, excludeHashes, urlselection, termFactory, Integer.MAX_VALUE);
// get the joined resutls
// get the joined results
ReferenceContainer<WordReference> wc = result.joined();
// we should have now one result (stored to index above)
@ -181,7 +181,7 @@ public class SegmentTest {
WordReference r = wc.getReference(url.hash());
// min position of search word in text (posintext)
assertEquals("minposition('five')", 5, r.minposition());
assertEquals("min posintext('five')", 5, r.posintext());
// occurence of search words in text
assertEquals("hitcount('five')", 2, r.hitcount());
@ -190,15 +190,6 @@ public class SegmentTest {
assertEquals("posofphrase", 100, r.posofphrase());
assertEquals("posinphrase", 5, r.posinphrase());
// currently the results are not as expected for a multi-word query
// (reason: Reference container is backed by ReferenceRow (which doen't hold positions of joined references) ergo can't return related results
System.out.println("-----------------");
System.out.println("positions=" + r.positions() + " (expected=5,8)");
// max position of search word in text
System.out.println("maxposition=" + r.maxposition() + " (expected=8)");
// for a multiword query distance expected to be the avg of search word positions in text
System.out.println("distance=" + r.distance() + " (expected=3)");
System.out.println("-----------------");
}
}

Loading…
Cancel
Save