// ReferenceOrder.java // (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 07.11.2007 on http://yacy.net // // This is a part of YaCy, a peer-to-peer based web search engine // // $LastChangedDate$ // $LastChangedRevision$ // $LastChangedBy$ // // LICENSE // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA package net.yacy.search.ranking; import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.Semaphore; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.sorting.ConcurrentScoreMap; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.LargeNumberCache; import net.yacy.document.Tokenizer; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.data.word.WordReferenceVars; import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.util.Bitfield; public class ReferenceOrder { private static int cores = Runtime.getRuntime().availableProcessors(); private int maxdomcount; private WordReferenceVars min, max; private final ConcurrentScoreMap doms; // collected for "authority" heuristic private final RankingProfile ranking; private final String language; public ReferenceOrder(final RankingProfile profile, final String language) { this.min = null; this.max = null; this.ranking = profile; this.doms = new ConcurrentScoreMap(); this.maxdomcount = 0; this.language = language; } public BlockingQueue normalizeWith(final ReferenceContainer container, long maxtime, final boolean local) { final LinkedBlockingQueue out = new LinkedBlockingQueue(); int threads = cores; if (container.size() < 100) threads = 2; final Thread distributor = new NormalizeDistributor(container, out, threads, maxtime, local); distributor.start(); // return the resulting queue while the processing queues are still working return out; } private final class NormalizeDistributor extends Thread { ReferenceContainer container; LinkedBlockingQueue out; private final int threads; private final long maxtime; private final boolean local; public NormalizeDistributor(final ReferenceContainer container, final LinkedBlockingQueue out, final int threads, final long maxtime, final boolean local) { super("ReferenceOrder.NormalizeDistributor"); this.container = container; this.out = out; this.threads = threads; this.maxtime = maxtime; this.local = local; } @Override public void run() { // transform the reference container into a stream of parsed entries final BlockingQueue vars = WordReferenceVars.transform(this.container, this.maxtime, this.local); // start the transformation threads final Semaphore termination = new Semaphore(this.threads); final NormalizeWorker[] worker = new NormalizeWorker[this.threads]; for (int i = 0; i < this.threads; i++) { worker[i] = new NormalizeWorker(this.out, termination, this.maxtime); worker[i].start(); } // fill the queue WordReferenceVars iEntry; int p = 0; long timeout = this.maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + this.maxtime; try { while ((iEntry = vars.take()) != WordReferenceVars.poison) { worker[p % this.threads].add(iEntry); p++; if (System.currentTimeMillis() > timeout) { ConcurrentLog.warn("NormalizeDistributor", "adding of decoded rows to workers ended with timeout = " + this.maxtime); break; } } } catch (final InterruptedException e) { } // insert poison to stop the queues for (int i = 0; i < this.threads; i++) worker[i].add(WordReferenceVars.poison); // wait for termination but not too long to make it possible that this // is called from outside with a join to get some normalization results // before going on for (int i = 0; i < this.threads; i++) try {worker[i].join(100);} catch (final InterruptedException e) {} } } /** * normalize ranking: find minimum and maximum of separate ranking criteria */ private class NormalizeWorker extends Thread { private final BlockingQueue out; private final Semaphore termination; private final BlockingQueue decodedEntries; private final long maxtime; public NormalizeWorker(final BlockingQueue out, final Semaphore termination, long maxtime) { super("ReferenceOrder.NormalizeWorker"); this.out = out; this.termination = termination; this.decodedEntries = new LinkedBlockingQueue(); this.maxtime = maxtime; } public void add(final WordReferenceVars entry) { try { this.decodedEntries.put(entry); } catch (final InterruptedException e) { } } @Override public void run() { try { WordReferenceVars iEntry; final Map doms0 = new HashMap(); String dom; Integer count; final Integer int1 = 1; long timeout = this.maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + this.maxtime; while ((iEntry = this.decodedEntries.take()) != WordReferenceVars.poison) { // find min/max if (ReferenceOrder.this.min == null) ReferenceOrder.this.min = iEntry.clone(); else ReferenceOrder.this.min.min(iEntry); if (ReferenceOrder.this.max == null) ReferenceOrder.this.max = iEntry.clone(); else ReferenceOrder.this.max.max(iEntry); this.out.put(iEntry); // must be after the min/max check to prevent that min/max is null in cardinal() // update domcount dom = iEntry.hosthash(); count = doms0.get(dom); if (count == null) { doms0.put(dom, int1); } else { doms0.put(dom, LargeNumberCache.valueOf(count.intValue() + 1)); } if (System.currentTimeMillis() > timeout) { ConcurrentLog.warn("NormalizeWorker", "normlization of decoded rows ended with timeout = " + this.maxtime); break; } } // update domain score Map.Entry entry; final Iterator> di = doms0.entrySet().iterator(); while (di.hasNext()) { entry = di.next(); ReferenceOrder.this.doms.inc(entry.getKey(), (entry.getValue()).intValue()); } if (!ReferenceOrder.this.doms.isEmpty()) ReferenceOrder.this.maxdomcount = ReferenceOrder.this.doms.getMaxScore(); } catch (final InterruptedException e) { ConcurrentLog.logException(e); } catch (final Exception e) { ConcurrentLog.logException(e); } finally { // insert poison to signal the termination to next queue try { this.termination.acquire(); if (this.termination.availablePermits() == 0) this.out.put(WordReferenceVars.poison); } catch (final InterruptedException e) {} } } } public int authority(final String hostHash) { assert hostHash.length() == 6; return (this.doms.get(hostHash) << 8) / (1 + this.maxdomcount); } /** * return the ranking of a given word entry * @param t * @return a ranking: the higher the number, the better is the ranking */ public long cardinal(final WordReference t) { // the normalizedEntry must be a normalized indexEntry assert this.min != null; assert this.max != null; assert t != null; assert this.ranking != null; final Bitfield flags = t.flags(); final long tf = ((this.max.termFrequency() == this.min.termFrequency()) ? 0 : (((int)(((t.termFrequency()-this.min.termFrequency())*256.0)/(this.max.termFrequency() - this.min.termFrequency())))) << this.ranking.coeff_termfrequency); //System.out.println("tf(" + t.urlHash + ") = " + Math.floor(1000 * t.termFrequency()) + ", min = " + Math.floor(1000 * min.termFrequency()) + ", max = " + Math.floor(1000 * max.termFrequency()) + ", tf-normed = " + tf); final long r = ((256 - DigestURL.domLengthNormalized(t.urlhash())) << this.ranking.coeff_domlength) + ((this.max.urlcomps() == this.min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - this.min.urlcomps() ) << 8) / (this.max.urlcomps() - this.min.urlcomps()) )) << this.ranking.coeff_urlcomps) + ((this.max.urllength() == this.min.urllength() ) ? 0 : (256 - (((t.urllength() - this.min.urllength() ) << 8) / (this.max.urllength() - this.min.urllength()) )) << this.ranking.coeff_urllength) + ((this.max.posintext() == this.min.posintext()) ? 0 : (256 - (((t.posintext() - this.min.posintext() ) << 8) / (this.max.posintext() - this.min.posintext()) )) << this.ranking.coeff_posintext) + ((this.max.posofphrase() == this.min.posofphrase()) ? 0 : (256 - (((t.posofphrase() - this.min.posofphrase() ) << 8) / (this.max.posofphrase() - this.min.posofphrase()) )) << this.ranking.coeff_posofphrase) + ((this.max.posinphrase() == this.min.posinphrase()) ? 0 : (256 - (((t.posinphrase() - this.min.posinphrase() ) << 8) / (this.max.posinphrase() - this.min.posinphrase()) )) << this.ranking.coeff_posinphrase) + ((this.max.distance() == this.min.distance() ) ? 0 : (256 - (((t.distance() - this.min.distance() ) << 8) / (this.max.distance() - this.min.distance()) )) << this.ranking.coeff_worddistance) + ((this.max.virtualAge() == this.min.virtualAge()) ? 0 : (((t.virtualAge() - this.min.virtualAge() ) << 8) / (this.max.virtualAge() - this.min.virtualAge()) ) << this.ranking.coeff_date) + ((this.max.wordsintitle() == this.min.wordsintitle()) ? 0 : (((t.wordsintitle() - this.min.wordsintitle() ) << 8) / (this.max.wordsintitle() - this.min.wordsintitle()) ) << this.ranking.coeff_wordsintitle) + ((this.max.wordsintext() == this.min.wordsintext()) ? 0 : (((t.wordsintext() - this.min.wordsintext() ) << 8) / (this.max.wordsintext() - this.min.wordsintext()) ) << this.ranking.coeff_wordsintext) + ((this.max.phrasesintext() == this.min.phrasesintext()) ? 0 : (((t.phrasesintext()- this.min.phrasesintext() ) << 8) / (this.max.phrasesintext()- this.min.phrasesintext()) ) << this.ranking.coeff_phrasesintext) + ((this.max.llocal() == this.min.llocal()) ? 0 : (((t.llocal() - this.min.llocal() ) << 8) / (this.max.llocal() - this.min.llocal()) ) << this.ranking.coeff_llocal) + ((this.max.lother() == this.min.lother()) ? 0 : (((t.lother() - this.min.lother() ) << 8) / (this.max.lother() - this.min.lother()) ) << this.ranking.coeff_lother) + ((this.max.hitcount() == this.min.hitcount()) ? 0 : (((t.hitcount() - this.min.hitcount() ) << 8) / (this.max.hitcount() - this.min.hitcount()) ) << this.ranking.coeff_hitcount) + tf + ((this.ranking.coeff_authority > 12) ? (authority(t.hosthash()) << this.ranking.coeff_authority) : 0) + ((flags.get(WordReferenceRow.flag_app_dc_identifier)) ? 255 << this.ranking.coeff_appurl : 0) + ((flags.get(WordReferenceRow.flag_app_dc_title)) ? 255 << this.ranking.coeff_app_dc_title : 0) + ((flags.get(WordReferenceRow.flag_app_dc_creator)) ? 255 << this.ranking.coeff_app_dc_creator : 0) + ((flags.get(WordReferenceRow.flag_app_dc_subject)) ? 255 << this.ranking.coeff_app_dc_subject : 0) + ((flags.get(WordReferenceRow.flag_app_dc_description)) ? 255 << this.ranking.coeff_app_dc_description : 0) + ((flags.get(WordReferenceRow.flag_app_emphasized)) ? 255 << this.ranking.coeff_appemph : 0) + ((flags.get(Tokenizer.flag_cat_indexof)) ? 255 << this.ranking.coeff_catindexof : 0) + ((flags.get(Tokenizer.flag_cat_hasimage)) ? 255 << this.ranking.coeff_cathasimage : 0) + ((flags.get(Tokenizer.flag_cat_hasaudio)) ? 255 << this.ranking.coeff_cathasaudio : 0) + ((flags.get(Tokenizer.flag_cat_hasvideo)) ? 255 << this.ranking.coeff_cathasvideo : 0) + ((flags.get(Tokenizer.flag_cat_hasapp)) ? 255 << this.ranking.coeff_cathasapp : 0) + ((Arrays.equals(t.getLanguage(), ASCII.getBytes(this.language))) ? 255 << this.ranking.coeff_language : 0); //if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0; return r; // the higher the number the better the ranking. } public long cardinal(final URIMetadataNode t) { // the normalizedEntry must be a normalized indexEntry assert t != null; assert this.ranking != null; final Bitfield flags = t.flags(); long r = ((256 - DigestURL.domLengthNormalized(t.hash())) << this.ranking.coeff_domlength) // TODO: here we score currently absolute numbers (e.g. t.urllength() : (35 << coeff), in contrast rwi calculation is ((between min=0, max=255) << coeff) for each of the score factors // + ((256 - (t.urllength() << 8)) << this.ranking.coeff_urllength) // TODO: this is for valid url always NEGATIVE + (t.virtualAge() << this.ranking.coeff_date) + (t.wordsintitle()<< this.ranking.coeff_wordsintitle) + (t.wordCount() << this.ranking.coeff_wordsintext) + (t.llocal() << this.ranking.coeff_llocal) + (t.lother() << this.ranking.coeff_lother) // + ((this.ranking.coeff_authority > 12) ? (authority(t.hosthash()) << this.ranking.coeff_authority) : 0) + ((flags.get(WordReferenceRow.flag_app_dc_identifier)) ? 255 << this.ranking.coeff_appurl : 0) + ((flags.get(WordReferenceRow.flag_app_dc_title)) ? 255 << this.ranking.coeff_app_dc_title : 0) + ((flags.get(WordReferenceRow.flag_app_dc_creator)) ? 255 << this.ranking.coeff_app_dc_creator : 0) + ((flags.get(WordReferenceRow.flag_app_dc_subject)) ? 255 << this.ranking.coeff_app_dc_subject : 0) + ((flags.get(WordReferenceRow.flag_app_dc_description)) ? 255 << this.ranking.coeff_app_dc_description : 0) + ((flags.get(WordReferenceRow.flag_app_emphasized)) ? 255 << this.ranking.coeff_appemph : 0) + ((flags.get(Tokenizer.flag_cat_indexof)) ? 255 << this.ranking.coeff_catindexof : 0) + ((flags.get(Tokenizer.flag_cat_hasimage)) ? 255 << this.ranking.coeff_cathasimage : 0) + ((flags.get(Tokenizer.flag_cat_hasaudio)) ? 255 << this.ranking.coeff_cathasaudio : 0) + ((flags.get(Tokenizer.flag_cat_hasvideo)) ? 255 << this.ranking.coeff_cathasvideo : 0) + ((flags.get(Tokenizer.flag_cat_hasapp)) ? 255 << this.ranking.coeff_cathasapp : 0) + ((this.language.equals(t.language())) ? 255 << this.ranking.coeff_language : 0); return r; // the higher the number the better the ranking. } }