You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
299 lines
17 KiB
299 lines
17 KiB
// ReferenceOrder.java
|
|
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
|
// first published 07.11.2007 on http://yacy.net
|
|
//
|
|
// This is a part of YaCy, a peer-to-peer based web search engine
|
|
//
|
|
// $LastChangedDate$
|
|
// $LastChangedRevision$
|
|
// $LastChangedBy$
|
|
//
|
|
// LICENSE
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
package net.yacy.search.ranking;
|
|
|
|
import java.util.Arrays;
|
|
import java.util.HashMap;
|
|
import java.util.Iterator;
|
|
import java.util.Map;
|
|
import java.util.concurrent.BlockingQueue;
|
|
import java.util.concurrent.LinkedBlockingQueue;
|
|
import java.util.concurrent.Semaphore;
|
|
|
|
import net.yacy.cora.document.encoding.ASCII;
|
|
import net.yacy.cora.document.id.DigestURL;
|
|
import net.yacy.cora.sorting.ConcurrentScoreMap;
|
|
import net.yacy.cora.util.ConcurrentLog;
|
|
import net.yacy.document.LargeNumberCache;
|
|
import net.yacy.document.Tokenizer;
|
|
import net.yacy.kelondro.data.meta.URIMetadataNode;
|
|
import net.yacy.kelondro.data.word.WordReference;
|
|
import net.yacy.kelondro.data.word.WordReferenceRow;
|
|
import net.yacy.kelondro.data.word.WordReferenceVars;
|
|
import net.yacy.kelondro.rwi.ReferenceContainer;
|
|
import net.yacy.kelondro.util.Bitfield;
|
|
|
|
|
|
public class ReferenceOrder {
|
|
|
|
private static int cores = Runtime.getRuntime().availableProcessors();
|
|
|
|
private int maxdomcount;
|
|
private WordReferenceVars min, max;
|
|
private final ConcurrentScoreMap<String> doms; // collected for "authority" heuristic
|
|
private final RankingProfile ranking;
|
|
private final String language;
|
|
|
|
public ReferenceOrder(final RankingProfile profile, final String language) {
|
|
this.min = null;
|
|
this.max = null;
|
|
this.ranking = profile;
|
|
this.doms = new ConcurrentScoreMap<String>();
|
|
this.maxdomcount = 0;
|
|
this.language = language;
|
|
}
|
|
|
|
public BlockingQueue<WordReferenceVars> normalizeWith(final ReferenceContainer<WordReference> container, long maxtime, final boolean local) {
|
|
final LinkedBlockingQueue<WordReferenceVars> out = new LinkedBlockingQueue<WordReferenceVars>();
|
|
int threads = cores;
|
|
if (container.size() < 100) threads = 2;
|
|
final Thread distributor = new NormalizeDistributor(container, out, threads, maxtime, local);
|
|
distributor.start();
|
|
|
|
// return the resulting queue while the processing queues are still working
|
|
return out;
|
|
}
|
|
|
|
private final class NormalizeDistributor extends Thread {
|
|
|
|
ReferenceContainer<WordReference> container;
|
|
LinkedBlockingQueue<WordReferenceVars> out;
|
|
private final int threads;
|
|
private final long maxtime;
|
|
private final boolean local;
|
|
|
|
public NormalizeDistributor(final ReferenceContainer<WordReference> container, final LinkedBlockingQueue<WordReferenceVars> out, final int threads, final long maxtime, final boolean local) {
|
|
super("ReferenceOrder.NormalizeDistributor");
|
|
this.container = container;
|
|
this.out = out;
|
|
this.threads = threads;
|
|
this.maxtime = maxtime;
|
|
this.local = local;
|
|
}
|
|
|
|
@Override
|
|
public void run() {
|
|
// transform the reference container into a stream of parsed entries
|
|
final BlockingQueue<WordReferenceVars> vars = WordReferenceVars.transform(this.container, this.maxtime, this.local);
|
|
|
|
// start the transformation threads
|
|
final Semaphore termination = new Semaphore(this.threads);
|
|
final NormalizeWorker[] worker = new NormalizeWorker[this.threads];
|
|
for (int i = 0; i < this.threads; i++) {
|
|
worker[i] = new NormalizeWorker(this.out, termination, this.maxtime);
|
|
worker[i].start();
|
|
}
|
|
|
|
// fill the queue
|
|
WordReferenceVars iEntry;
|
|
int p = 0;
|
|
long timeout = this.maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + this.maxtime;
|
|
try {
|
|
while ((iEntry = vars.take()) != WordReferenceVars.poison) {
|
|
worker[p % this.threads].add(iEntry);
|
|
p++;
|
|
if (System.currentTimeMillis() > timeout) {
|
|
ConcurrentLog.warn("NormalizeDistributor", "adding of decoded rows to workers ended with timeout = " + this.maxtime);
|
|
break;
|
|
}
|
|
}
|
|
} catch (final InterruptedException e) {
|
|
}
|
|
|
|
// insert poison to stop the queues
|
|
for (int i = 0; i < this.threads; i++) worker[i].add(WordReferenceVars.poison);
|
|
|
|
// wait for termination but not too long to make it possible that this
|
|
// is called from outside with a join to get some normalization results
|
|
// before going on
|
|
for (int i = 0; i < this.threads; i++) try {worker[i].join(100);} catch (final InterruptedException e) {}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* normalize ranking: find minimum and maximum of separate ranking criteria
|
|
*/
|
|
private class NormalizeWorker extends Thread {
|
|
|
|
private final BlockingQueue<WordReferenceVars> out;
|
|
private final Semaphore termination;
|
|
private final BlockingQueue<WordReferenceVars> decodedEntries;
|
|
private final long maxtime;
|
|
|
|
public NormalizeWorker(final BlockingQueue<WordReferenceVars> out, final Semaphore termination, long maxtime) {
|
|
super("ReferenceOrder.NormalizeWorker");
|
|
this.out = out;
|
|
this.termination = termination;
|
|
this.decodedEntries = new LinkedBlockingQueue<WordReferenceVars>();
|
|
this.maxtime = maxtime;
|
|
}
|
|
|
|
public void add(final WordReferenceVars entry) {
|
|
try {
|
|
this.decodedEntries.put(entry);
|
|
} catch (final InterruptedException e) {
|
|
}
|
|
}
|
|
|
|
@Override
|
|
public void run() {
|
|
try {
|
|
WordReferenceVars iEntry;
|
|
final Map<String, Integer> doms0 = new HashMap<String, Integer>();
|
|
String dom;
|
|
Integer count;
|
|
final Integer int1 = 1;
|
|
long timeout = this.maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + this.maxtime;
|
|
while ((iEntry = this.decodedEntries.take()) != WordReferenceVars.poison) {
|
|
// find min/max
|
|
if (ReferenceOrder.this.min == null) ReferenceOrder.this.min = iEntry.clone(); else ReferenceOrder.this.min.min(iEntry);
|
|
if (ReferenceOrder.this.max == null) ReferenceOrder.this.max = iEntry.clone(); else ReferenceOrder.this.max.max(iEntry);
|
|
this.out.put(iEntry); // must be after the min/max check to prevent that min/max is null in cardinal()
|
|
// update domcount
|
|
dom = iEntry.hosthash();
|
|
count = doms0.get(dom);
|
|
if (count == null) {
|
|
doms0.put(dom, int1);
|
|
} else {
|
|
doms0.put(dom, LargeNumberCache.valueOf(count.intValue() + 1));
|
|
}
|
|
|
|
if (System.currentTimeMillis() > timeout) {
|
|
ConcurrentLog.warn("NormalizeWorker", "normlization of decoded rows ended with timeout = " + this.maxtime);
|
|
break;
|
|
}
|
|
}
|
|
|
|
// update domain score
|
|
Map.Entry<String, Integer> entry;
|
|
final Iterator<Map.Entry<String, Integer>> di = doms0.entrySet().iterator();
|
|
while (di.hasNext()) {
|
|
entry = di.next();
|
|
ReferenceOrder.this.doms.inc(entry.getKey(), (entry.getValue()).intValue());
|
|
}
|
|
if (!ReferenceOrder.this.doms.isEmpty()) ReferenceOrder.this.maxdomcount = ReferenceOrder.this.doms.getMaxScore();
|
|
} catch (final InterruptedException e) {
|
|
ConcurrentLog.logException(e);
|
|
} catch (final Exception e) {
|
|
ConcurrentLog.logException(e);
|
|
} finally {
|
|
// insert poison to signal the termination to next queue
|
|
try {
|
|
this.termination.acquire();
|
|
if (this.termination.availablePermits() == 0) this.out.put(WordReferenceVars.poison);
|
|
} catch (final InterruptedException e) {}
|
|
}
|
|
}
|
|
}
|
|
|
|
public int authority(final String hostHash) {
|
|
assert hostHash.length() == 6;
|
|
return (this.doms.get(hostHash) << 8) / (1 + this.maxdomcount);
|
|
}
|
|
|
|
/**
|
|
* return the ranking of a given word entry
|
|
* @param t
|
|
* @return a ranking: the higher the number, the better is the ranking
|
|
*/
|
|
public long cardinal(final WordReference t) {
|
|
// the normalizedEntry must be a normalized indexEntry
|
|
assert this.min != null;
|
|
assert this.max != null;
|
|
assert t != null;
|
|
assert this.ranking != null;
|
|
final Bitfield flags = t.flags();
|
|
final long tf = ((this.max.termFrequency() == this.min.termFrequency()) ? 0 : (((int)(((t.termFrequency()-this.min.termFrequency())*256.0)/(this.max.termFrequency() - this.min.termFrequency())))) << this.ranking.coeff_termfrequency);
|
|
//System.out.println("tf(" + t.urlHash + ") = " + Math.floor(1000 * t.termFrequency()) + ", min = " + Math.floor(1000 * min.termFrequency()) + ", max = " + Math.floor(1000 * max.termFrequency()) + ", tf-normed = " + tf);
|
|
final long r =
|
|
((256 - DigestURL.domLengthNormalized(t.urlhash())) << this.ranking.coeff_domlength)
|
|
+ ((this.max.urlcomps() == this.min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - this.min.urlcomps() ) << 8) / (this.max.urlcomps() - this.min.urlcomps()) )) << this.ranking.coeff_urlcomps)
|
|
+ ((this.max.urllength() == this.min.urllength() ) ? 0 : (256 - (((t.urllength() - this.min.urllength() ) << 8) / (this.max.urllength() - this.min.urllength()) )) << this.ranking.coeff_urllength)
|
|
+ ((this.max.posintext() == this.min.posintext()) ? 0 : (256 - (((t.posintext() - this.min.posintext() ) << 8) / (this.max.posintext() - this.min.posintext()) )) << this.ranking.coeff_posintext)
|
|
+ ((this.max.posofphrase() == this.min.posofphrase()) ? 0 : (256 - (((t.posofphrase() - this.min.posofphrase() ) << 8) / (this.max.posofphrase() - this.min.posofphrase()) )) << this.ranking.coeff_posofphrase)
|
|
+ ((this.max.posinphrase() == this.min.posinphrase()) ? 0 : (256 - (((t.posinphrase() - this.min.posinphrase() ) << 8) / (this.max.posinphrase() - this.min.posinphrase()) )) << this.ranking.coeff_posinphrase)
|
|
+ ((this.max.distance() == this.min.distance() ) ? 0 : (256 - (((t.distance() - this.min.distance() ) << 8) / (this.max.distance() - this.min.distance()) )) << this.ranking.coeff_worddistance)
|
|
+ ((this.max.virtualAge() == this.min.virtualAge()) ? 0 : (((t.virtualAge() - this.min.virtualAge() ) << 8) / (this.max.virtualAge() - this.min.virtualAge()) ) << this.ranking.coeff_date)
|
|
+ ((this.max.wordsintitle() == this.min.wordsintitle()) ? 0 : (((t.wordsintitle() - this.min.wordsintitle() ) << 8) / (this.max.wordsintitle() - this.min.wordsintitle()) ) << this.ranking.coeff_wordsintitle)
|
|
+ ((this.max.wordsintext() == this.min.wordsintext()) ? 0 : (((t.wordsintext() - this.min.wordsintext() ) << 8) / (this.max.wordsintext() - this.min.wordsintext()) ) << this.ranking.coeff_wordsintext)
|
|
+ ((this.max.phrasesintext() == this.min.phrasesintext()) ? 0 : (((t.phrasesintext()- this.min.phrasesintext() ) << 8) / (this.max.phrasesintext()- this.min.phrasesintext()) ) << this.ranking.coeff_phrasesintext)
|
|
+ ((this.max.llocal() == this.min.llocal()) ? 0 : (((t.llocal() - this.min.llocal() ) << 8) / (this.max.llocal() - this.min.llocal()) ) << this.ranking.coeff_llocal)
|
|
+ ((this.max.lother() == this.min.lother()) ? 0 : (((t.lother() - this.min.lother() ) << 8) / (this.max.lother() - this.min.lother()) ) << this.ranking.coeff_lother)
|
|
+ ((this.max.hitcount() == this.min.hitcount()) ? 0 : (((t.hitcount() - this.min.hitcount() ) << 8) / (this.max.hitcount() - this.min.hitcount()) ) << this.ranking.coeff_hitcount)
|
|
+ tf
|
|
+ ((this.ranking.coeff_authority > 12) ? (authority(t.hosthash()) << this.ranking.coeff_authority) : 0)
|
|
+ ((flags.get(WordReferenceRow.flag_app_dc_identifier)) ? 255 << this.ranking.coeff_appurl : 0)
|
|
+ ((flags.get(WordReferenceRow.flag_app_dc_title)) ? 255 << this.ranking.coeff_app_dc_title : 0)
|
|
+ ((flags.get(WordReferenceRow.flag_app_dc_creator)) ? 255 << this.ranking.coeff_app_dc_creator : 0)
|
|
+ ((flags.get(WordReferenceRow.flag_app_dc_subject)) ? 255 << this.ranking.coeff_app_dc_subject : 0)
|
|
+ ((flags.get(WordReferenceRow.flag_app_dc_description)) ? 255 << this.ranking.coeff_app_dc_description : 0)
|
|
+ ((flags.get(WordReferenceRow.flag_app_emphasized)) ? 255 << this.ranking.coeff_appemph : 0)
|
|
+ ((flags.get(Tokenizer.flag_cat_indexof)) ? 255 << this.ranking.coeff_catindexof : 0)
|
|
+ ((flags.get(Tokenizer.flag_cat_hasimage)) ? 255 << this.ranking.coeff_cathasimage : 0)
|
|
+ ((flags.get(Tokenizer.flag_cat_hasaudio)) ? 255 << this.ranking.coeff_cathasaudio : 0)
|
|
+ ((flags.get(Tokenizer.flag_cat_hasvideo)) ? 255 << this.ranking.coeff_cathasvideo : 0)
|
|
+ ((flags.get(Tokenizer.flag_cat_hasapp)) ? 255 << this.ranking.coeff_cathasapp : 0)
|
|
+ ((Arrays.equals(t.getLanguage(), ASCII.getBytes(this.language))) ? 255 << this.ranking.coeff_language : 0);
|
|
|
|
//if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0;
|
|
|
|
return r; // the higher the number the better the ranking.
|
|
}
|
|
|
|
public long cardinal(final URIMetadataNode t) {
|
|
// the normalizedEntry must be a normalized indexEntry
|
|
assert t != null;
|
|
assert this.ranking != null;
|
|
final Bitfield flags = t.flags();
|
|
long r =
|
|
((256 - DigestURL.domLengthNormalized(t.hash())) << this.ranking.coeff_domlength)
|
|
// TODO: here we score currently absolute numbers (e.g. t.urllength() : (35 << coeff), in contrast rwi calculation is ((between min=0, max=255) << coeff) for each of the score factors
|
|
// + ((256 - (t.urllength() << 8)) << this.ranking.coeff_urllength) // TODO: this is for valid url always NEGATIVE
|
|
+ (t.virtualAge() << this.ranking.coeff_date)
|
|
+ (t.wordsintitle()<< this.ranking.coeff_wordsintitle)
|
|
+ (t.wordCount() << this.ranking.coeff_wordsintext)
|
|
+ (t.llocal() << this.ranking.coeff_llocal)
|
|
+ (t.lother() << this.ranking.coeff_lother)
|
|
//
|
|
+ ((this.ranking.coeff_authority > 12) ? (authority(t.hosthash()) << this.ranking.coeff_authority) : 0)
|
|
+ ((flags.get(WordReferenceRow.flag_app_dc_identifier)) ? 255 << this.ranking.coeff_appurl : 0)
|
|
+ ((flags.get(WordReferenceRow.flag_app_dc_title)) ? 255 << this.ranking.coeff_app_dc_title : 0)
|
|
+ ((flags.get(WordReferenceRow.flag_app_dc_creator)) ? 255 << this.ranking.coeff_app_dc_creator : 0)
|
|
+ ((flags.get(WordReferenceRow.flag_app_dc_subject)) ? 255 << this.ranking.coeff_app_dc_subject : 0)
|
|
+ ((flags.get(WordReferenceRow.flag_app_dc_description)) ? 255 << this.ranking.coeff_app_dc_description : 0)
|
|
+ ((flags.get(WordReferenceRow.flag_app_emphasized)) ? 255 << this.ranking.coeff_appemph : 0)
|
|
+ ((flags.get(Tokenizer.flag_cat_indexof)) ? 255 << this.ranking.coeff_catindexof : 0)
|
|
+ ((flags.get(Tokenizer.flag_cat_hasimage)) ? 255 << this.ranking.coeff_cathasimage : 0)
|
|
+ ((flags.get(Tokenizer.flag_cat_hasaudio)) ? 255 << this.ranking.coeff_cathasaudio : 0)
|
|
+ ((flags.get(Tokenizer.flag_cat_hasvideo)) ? 255 << this.ranking.coeff_cathasvideo : 0)
|
|
+ ((flags.get(Tokenizer.flag_cat_hasapp)) ? 255 << this.ranking.coeff_cathasapp : 0)
|
|
+ ((this.language.equals(t.language())) ? 255 << this.ranking.coeff_language : 0);
|
|
return r; // the higher the number the better the ranking.
|
|
}
|
|
|
|
}
|