diff --git a/build.properties b/build.properties index 9fcc34547..5c6e0b5d4 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.4 javacTarget=1.4 # Release Configuration -releaseVersion=0.541 +releaseVersion=0.542 releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz releaseFileParentDir=yacy diff --git a/source/de/anomic/index/indexRWIEntry.java b/source/de/anomic/index/indexRWIEntry.java index 7f9b4162a..345206463 100644 --- a/source/de/anomic/index/indexRWIEntry.java +++ b/source/de/anomic/index/indexRWIEntry.java @@ -194,7 +194,7 @@ public class indexRWIEntry implements Cloneable { } public int virtualAge() { - return plasmaWordIndex.microDateDays(lastModified()); + return (int) this.entry.getColLong(col_lastModified); // this is the time in MicroDateDays format } public long lastModified() { @@ -284,31 +284,35 @@ public class indexRWIEntry implements Cloneable { } public static final void min(indexRWIEntry t, indexRWIEntry other) { - if (t.hitcount() > other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount()); - if (t.wordsintext() > other.wordsintext()) t.entry.setCol(col_wordsInText, other.wordsintext()); - if (t.phrasesintext() > other.phrasesintext()) t.entry.setCol(col_phrasesInText, other.phrasesintext()); - if (t.posintext() > other.posintext()) t.entry.setCol(col_posintext, other.posintext()); - if (t.posinphrase() > other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase()); - if (t.posofphrase() > other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase()); - if (t.worddistance() > other.worddistance()) t.entry.setCol(col_worddistance, other.worddistance()); - if (t.lastModified() > other.lastModified()) t.entry.setCol(col_lastModified, other.lastModified()); - if (t.urllength() > other.urllength()) t.entry.setCol(col_urlLength, other.urllength()); - if (t.urlcomps() > other.urlcomps()) 
t.entry.setCol(col_urlComps, other.urlcomps()); - if (t.wordsintitle() > other.wordsintitle() ) t.entry.setCol(col_wordsInTitle, other.wordsintitle()); + int v; + long w; + if (t.hitcount() > (v = other.hitcount())) t.entry.setCol(col_hitcount, v); + if (t.wordsintext() > (v = other.wordsintext())) t.entry.setCol(col_wordsInText, v); + if (t.phrasesintext() > (v = other.phrasesintext())) t.entry.setCol(col_phrasesInText, v); + if (t.posintext() > (v = other.posintext())) t.entry.setCol(col_posintext, v); + if (t.posinphrase() > (v = other.posinphrase())) t.entry.setCol(col_posinphrase, v); + if (t.posofphrase() > (v = other.posofphrase())) t.entry.setCol(col_posofphrase, v); + if (t.worddistance() > (v = other.worddistance())) t.entry.setCol(col_worddistance, v); + if (t.lastModified() > (w = other.lastModified())) t.entry.setCol(col_lastModified, w); + if (t.urllength() > (v = other.urllength())) t.entry.setCol(col_urlLength, v); + if (t.urlcomps() > (v = other.urlcomps())) t.entry.setCol(col_urlComps, v); + if (t.wordsintitle() > (v = other.wordsintitle())) t.entry.setCol(col_wordsInTitle, v); } public static final void max(indexRWIEntry t, indexRWIEntry other) { - if (t.hitcount() < other.hitcount()) t.entry.setCol(col_hitcount, other.hitcount()); - if (t.wordsintext() < other.wordsintext()) t.entry.setCol(col_wordsInText, other.wordsintext()); - if (t.phrasesintext() < other.phrasesintext()) t.entry.setCol(col_phrasesInText, other.phrasesintext()); - if (t.posintext() < other.posintext()) t.entry.setCol(col_posintext, other.posintext()); - if (t.posinphrase() < other.posinphrase()) t.entry.setCol(col_posinphrase, other.posinphrase()); - if (t.posofphrase() < other.posofphrase()) t.entry.setCol(col_posofphrase, other.posofphrase()); - if (t.worddistance() < other.worddistance()) t.entry.setCol(col_worddistance, other.worddistance()); - if (t.lastModified() < other.lastModified()) t.entry.setCol(col_lastModified, other.lastModified()); - if 
(t.urllength() < other.urllength()) t.entry.setCol(col_urlLength, other.urllength()); - if (t.urlcomps() < other.urlcomps()) t.entry.setCol(col_urlComps, other.urlcomps()); - if (t.wordsintitle() < other.wordsintitle() ) t.entry.setCol(col_wordsInTitle, other.wordsintitle()); + int v; + long w; + if (t.hitcount() < (v = other.hitcount())) t.entry.setCol(col_hitcount, v); + if (t.wordsintext() < (v = other.wordsintext())) t.entry.setCol(col_wordsInText, v); + if (t.phrasesintext() < (v = other.phrasesintext())) t.entry.setCol(col_phrasesInText, v); + if (t.posintext() < (v = other.posintext())) t.entry.setCol(col_posintext, v); + if (t.posinphrase() < (v = other.posinphrase())) t.entry.setCol(col_posinphrase, v); + if (t.posofphrase() < (v = other.posofphrase())) t.entry.setCol(col_posofphrase, v); + if (t.worddistance() < (v = other.worddistance())) t.entry.setCol(col_worddistance, v); + if (t.lastModified() < (w = other.lastModified())) t.entry.setCol(col_lastModified, w); + if (t.urllength() < (v = other.urllength())) t.entry.setCol(col_urlLength, v); + if (t.urlcomps() < (v = other.urlcomps())) t.entry.setCol(col_urlComps, v); + if (t.wordsintitle() < (v = other.wordsintitle())) t.entry.setCol(col_wordsInTitle, v); } @@ -319,40 +323,6 @@ public class indexRWIEntry implements Cloneable { public void max(indexRWIEntry other) { max(this, other); } - - static void normalize(indexRWIEntry t, indexRWIEntry min, indexRWIEntry max) { - assert (t.urlHash().length() == 12) : "turlhash = " + t.urlHash(); - assert (min.urlHash().length() == 12) : "minurlhash = " + min.urlHash(); - assert (max.urlHash().length() == 12) : "maxurlhash = " + max.urlHash(); - if (1 + max.worddistance() - min.worddistance() == 0) System.out.println("min = " + min.toPropertyForm() + "\nmax=" + max.toPropertyForm()); - //System.out.println("Normalize:\nentry = " + t.toPropertyForm(true)); - //System.out.println("min = " + min.toPropertyForm(true)); - //System.out.println("max = " + 
max.toPropertyForm(true)); - t.entry.setCol(col_hitcount , (t.hitcount() == 0) ? 0 : 1 + 255 * (t.hitcount() - min.hitcount() ) / (1 + max.hitcount() - min.hitcount())); - t.entry.setCol(col_wordsInText , (t.wordsintext() == 0) ? 0 : 1 + 255 * (t.wordsintext() - min.wordsintext() ) / (1 + max.wordsintext() - min.wordsintext())); - t.entry.setCol(col_phrasesInText, (t.phrasesintext() == 0) ? 0 : 1 + 255 * (t.phrasesintext() - min.phrasesintext() ) / (1 + max.phrasesintext() - min.phrasesintext())); - t.entry.setCol(col_posintext , (t.posintext() == 0) ? 0 : 1 + 255 * (t.posintext() - min.posintext() ) / (1 + max.posintext() - min.posintext())); - t.entry.setCol(col_posinphrase , (t.posinphrase() == 0) ? 0 : 1 + 255 * (t.posinphrase() - min.posinphrase() ) / (1 + max.posinphrase() - min.posinphrase())); - t.entry.setCol(col_posofphrase , (t.posofphrase() == 0) ? 0 : 1 + 255 * (t.posofphrase() - min.posofphrase() ) / (1 + max.posofphrase() - min.posofphrase())); - t.entry.setCol(col_worddistance , (t.worddistance() == 0) ? 0 : 1 + 255 * (t.worddistance() - min.worddistance()) / (1 + max.worddistance() - min.worddistance())); // FIXME: hier gibts ein division by zero, was nur sein kann wenn die Normalisierung nicht geklappt hat. - t.entry.setCol(col_lastModified , (t.lastModified() == 0) ? 0 : 1 + 255 * (t.lastModified() - min.lastModified()) / (1 + max.lastModified() - min.lastModified())); - t.entry.setCol(col_urlLength , (t.urllength() == 0) ? 0 : 1 + 255 * (t.urllength() - min.urllength() ) / (1 + max.urllength() - min.urllength())); - t.entry.setCol(col_urlComps , (t.urlcomps() == 0) ? 0 : 1 + 255 * (t.urlcomps() - min.urlcomps() ) / (1 + max.urlcomps() - min.urlcomps())); - t.entry.setCol(col_wordsInTitle , (t.wordsintitle() == 0) ? 
0 : 1 + 255 * (t.wordsintitle() - min.wordsintitle()) / (1 + max.wordsintitle() - min.wordsintitle())); - - //System.out.println("out = " + t.toPropertyForm(true)); - } - - public void normalize(indexRWIEntry min, indexRWIEntry max) { - normalize(this, min, max); - } - - public indexRWIEntry generateNormalized(indexRWIEntry min, indexRWIEntry max) { - assert (this.urlHash().length() == 12) : "this.urlhash = " + this.urlHash(); - indexRWIEntry e = (indexRWIEntry) this.clone(); - e.normalize(min, max); - return e; - } public boolean isNewer(indexRWIEntry other) { if (other == null) return true; diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java index 05d920c85..803941058 100644 --- a/source/de/anomic/plasma/plasmaSearchPreOrder.java +++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java @@ -110,7 +110,7 @@ public final class plasmaSearchPreOrder { if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasimage)))) continue; if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasapp )))) continue; } - pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax), searchWords), 16) + iEntry.urlHash(), iEntry); + pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - this.ranking.preRanking(iEntry, this.entryMin, this.entryMax, searchWords), 16) + iEntry.urlHash(), iEntry); } this.filteredCount = pageAcc.size(); } diff --git a/source/de/anomic/plasma/plasmaSearchProcessing.java b/source/de/anomic/plasma/plasmaSearchProcessing.java index b64136ac7..57b874bae 100644 --- a/source/de/anomic/plasma/plasmaSearchProcessing.java +++ b/source/de/anomic/plasma/plasmaSearchProcessing.java @@ -380,7 +380,7 @@ public class plasmaSearchProcessing implements Cloneable { int minEntries = 
getTargetCount(plasmaSearchProcessing.PROCESS_POSTSORT); try { ordering: while (preorder.hasNext()) { - if ((System.currentTimeMillis() >= postorderLimitTime) || (acc.sizeFetched() >= minEntries)) break; + if ((System.currentTimeMillis() >= postorderLimitTime) || (acc.sizeFetched() >= 5 * minEntries)) break; preorderEntry = preorder.next(); entry = (indexRWIEntry) preorderEntry[0]; // load only urls if there was not yet a root url of that hash diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java index 56eb729a2..809110a22 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java @@ -89,7 +89,7 @@ public class plasmaSearchRankingProfile { private int coeff_domlength, coeff_ybr, coeff_date, coeff_wordsintitle, coeff_wordsintext, coeff_phrasesintext, coeff_llocal, coeff_lother, coeff_urllength, coeff_urlcomps, coeff_hitcount, - coeff_posintext, coeff_posofphrase, coeff_worddistance, + coeff_posintext, coeff_posofphrase, coeff_posinphrase, coeff_worddistance, coeff_appurl, coeff_appdescr, coeff_appauthor, coeff_apptags, coeff_appref, coeff_appemph, coeff_catindexof, coeff_cathasimage, coeff_cathasaudio, coeff_cathasvideo, coeff_cathasapp, coeff_urlcompintoplist, coeff_descrcompintoplist, coeff_prefer; @@ -110,6 +110,7 @@ public class plasmaSearchRankingProfile { coeff_hitcount = 5; coeff_posintext = 7; coeff_posofphrase = 6; + coeff_posinphrase = 1; coeff_worddistance = 15; coeff_appurl = 14; coeff_appdescr = 13; @@ -249,6 +250,44 @@ public class plasmaSearchRankingProfile { return new String(ext); } + public long preRanking(indexRWIEntry t, indexRWIEntry min, indexRWIEntry max, TreeSet searchedWords) { + // t is the raw (not pre-normalized) entry; each attribute is scaled against min/max inline below + long ranking = 0; + ranking += (256 - plasmaURL.domLengthNormalized(t.urlHash())) << coeff_domlength; + ranking += plasmaSearchPreOrder.ybr_p(t.urlHash()) << 
coeff_ybr; + ranking += (255 - (255 * (t.virtualAge() - min.virtualAge() ) / (1 + max.virtualAge() - min.virtualAge())) ) << coeff_date; + ranking += (255 * (t.wordsintitle() - min.wordsintitle() ) / (1 + max.wordsintitle() - min.wordsintitle())) << coeff_wordsintitle; + ranking += (255 * (t.wordsintext() - min.wordsintext() ) / (1 + max.wordsintext() - min.wordsintext())) << coeff_wordsintext; + ranking += (255 * (t.phrasesintext()- min.phrasesintext()) / (1 + max.phrasesintext()- min.phrasesintext())) << coeff_phrasesintext; + ranking += t.llocal() << coeff_llocal; + ranking += t.lother() << coeff_lother; + ranking += (255 - (255 * (t.urllength() - min.urllength() ) / (1 + max.urllength() - min.urllength())) ) << coeff_urllength; + ranking += (255 - (255 * (t.urlcomps() - min.urlcomps() ) / (1 + max.urlcomps() - min.urlcomps())) ) << coeff_urlcomps; + ranking += (255 * (t.hitcount() - min.hitcount() ) / (1 + max.hitcount() - min.hitcount())) << coeff_hitcount; + ranking += (255 - (255 * (t.posintext() - min.posintext() ) / (1 + max.posintext() - min.posintext())) ) << coeff_posintext; + ranking += (255 - (255 * (t.posofphrase() - min.posofphrase() ) / (1 + max.posofphrase() - min.posofphrase())) ) << coeff_posofphrase; + ranking += (255 - (255 * (t.posinphrase() - min.posinphrase() ) / (1 + max.posinphrase() - min.posinphrase())) ) << coeff_posinphrase; + ranking += (255 - (255 * (t.worddistance() - min.worddistance() ) / (1 + max.worddistance() - min.worddistance()))) << coeff_worddistance; + + kelondroBitfield flags = t.flags(); + ranking += (flags.get(indexRWIEntry.flag_app_url)) ? 256 << coeff_appurl : 0; + ranking += (flags.get(indexRWIEntry.flag_app_descr)) ? 256 << coeff_appdescr : 0; + ranking += (flags.get(indexRWIEntry.flag_app_author)) ? 256 << coeff_appauthor : 0; + ranking += (flags.get(indexRWIEntry.flag_app_tags)) ? 256 << coeff_apptags : 0; + ranking += (flags.get(indexRWIEntry.flag_app_reference)) ? 
256 << coeff_appref : 0; + ranking += (flags.get(indexRWIEntry.flag_app_emphasized)) ? 256 << coeff_appemph : 0; + ranking += (flags.get(plasmaCondenser.flag_cat_indexof)) ? 256 << coeff_catindexof : 0; + ranking += (flags.get(plasmaCondenser.flag_cat_hasimage)) ? 256 << coeff_cathasimage : 0; + ranking += (flags.get(plasmaCondenser.flag_cat_hasaudio)) ? 256 << coeff_cathasaudio : 0; + ranking += (flags.get(plasmaCondenser.flag_cat_hasvideo)) ? 256 << coeff_cathasvideo : 0; + ranking += (flags.get(plasmaCondenser.flag_cat_hasapp)) ? 256 << coeff_cathasapp : 0; + + ranking += (plasmaURL.probablyRootURL(t.urlHash())) ? 16 << coeff_urllength : 0; + ranking += (plasmaURL.probablyWordURL(t.urlHash(), searchedWords) != null) ? 256 << coeff_appurl : 0; + + return ranking; + } + /* public long preRanking(indexRWIEntry normalizedEntry, TreeSet searchedWords) { // the normalizedEntry must be a normalized indexEntry long ranking = 0; @@ -282,16 +321,10 @@ public class plasmaSearchRankingProfile { ranking += (plasmaURL.probablyRootURL(normalizedEntry.urlHash())) ? 16 << coeff_urllength : 0; ranking += (plasmaURL.probablyWordURL(normalizedEntry.urlHash(), searchedWords) != null) ? 256 << coeff_appurl : 0; - - /* - if (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord)) - System.out.println("DEBUG - hash " + normalizedEntry.urlHash() + " contains word " + searchedWord + ", weighted " + ((Integer) coeff.get(QUERYINURL)).intValue() + ", ranking = " + ranking); - else - System.out.println("DEBUG - hash " + normalizedEntry.urlHash() + " contains not word " + searchedWord + ", ranking = " + ranking); - */ + return ranking; } - + */ public long postRanking( long ranking, plasmaSearchQuery query,