diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 281213da3..56bdee44d 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -154,6 +154,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen */ public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"'; public static String[] urlComps(String normalizedURL) { + int p = normalizedURL.indexOf("//"); + if (p > 0) normalizedURL = normalizedURL.substring(p + 2); return normalizedURL.toLowerCase().split(splitrex); // word components of the url } diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index ae7481068..ad09ad462 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -266,8 +266,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable { // apply filter profileLocal.startTimer(); - //acc.removeRedundant(); - acc.removeDoubleDom(); + acc.removeRedundant(); + //acc.removeDoubleDom(); profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_FILTER); profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_FILTER, acc.sizeOrdered()); @@ -294,12 +294,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable { plasmaCrawlLURL.Entry page; Long preranking; Object[] preorderEntry; - int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT); try { while (preorder.hasNext()) { - //if ((acc.sizeFetched() >= 50) && ((acc.sizeFetched() >= minEntries) || (System.currentTimeMillis() >= postorderLimitTime))) break; - //if (acc.sizeFetched() >= minEntries) break; - if ((System.currentTimeMillis() >= postorderLimitTime) && (acc.sizeFetched() >= minEntries)) break; + if (System.currentTimeMillis() >= postorderLimitTime) break; preorderEntry = preorder.next(); entry = (indexEntry) preorderEntry[0]; preranking = (Long) preorderEntry[1]; @@ -322,8 +319,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable { // apply filter profileLocal.startTimer(); - //acc.removeRedundant(); - acc.removeDoubleDom(); + acc.removeRedundant(); + //acc.removeDoubleDom(); profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_FILTER); profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_FILTER, acc.sizeOrdered()); diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java index 1e506f8a2..1c0da89da 100644 --- a/source/de/anomic/plasma/plasmaSearchPreOrder.java +++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java @@ -96,7 +96,7 @@ public final class plasmaSearchPreOrder { this.pageAcc = new TreeMap(); for (int j = 0; j < count; j++) { iEntry = (indexEntry) i.next(); - pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax), query.words("")), 16) + iEntry.urlHash(), iEntry); + pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax), query.words("")), 16) + iEntry.urlHash(), iEntry); } } @@ -150,7 +150,7 @@ public final class plasmaSearchPreOrder { } public Object[] /*{indexEntry, Long}*/ next() { - String top = (String) pageAcc.lastKey(); + String top = (String) pageAcc.firstKey(); //System.out.println("preorder-key: " + top); Long preranking = new Long(Long.parseLong(top.substring(0, 16), 16)); return new Object[]{(indexEntry) pageAcc.remove(top), preranking}; diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java index 8535db997..650a3a4f7 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java @@ -170,16 +170,18 @@ public class plasmaSearchRankingProfile { ranking += normalizedEntry.quality() << ((Integer) coeff.get(ENTROPY)).intValue(); ranking += normalizedEntry.virtualAge() << ((Integer) coeff.get(DATE)).intValue(); ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.urlHash()) << ((Integer) coeff.get(YBR)).intValue(); - ranking += (normalizedEntry.posintext() == 0) ? 0 : (255 - normalizedEntry.posintext()) << ((Integer) coeff.get(POSINTEXT)).intValue(); - ranking += (normalizedEntry.worddistance() == 0) ? 0 : (255 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue(); + ranking += (normalizedEntry.posintext() == 0) ? 0 : (256 - normalizedEntry.posintext()) << ((Integer) coeff.get(POSINTEXT)).intValue(); + ranking += (normalizedEntry.worddistance() == 0) ? 0 : (256 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue(); ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue(); - ranking += (255 - indexURL.domLengthNormalized(normalizedEntry.urlHash())) << ((Integer) coeff.get(DOMLENGTH)).intValue(); + ranking += (256 - indexURL.domLengthNormalized(normalizedEntry.urlHash())) << ((Integer) coeff.get(DOMLENGTH)).intValue(); ranking += (indexURL.probablyRootURL(normalizedEntry.urlHash())) ? 16 << ((Integer) coeff.get(URLLENGTH)).intValue() : 0; ranking += (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord)) ? 256 << ((Integer) coeff.get(QUERYINURL)).intValue() : 0; + /* if (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord)) System.out.println("DEBUG - hash " + normalizedEntry.urlHash() + " contains word " + searchedWord + ", weighted " + ((Integer) coeff.get(QUERYINURL)).intValue() + ", ranking = " + ranking); else System.out.println("DEBUG - hash " + normalizedEntry.urlHash() + " contains not word " + searchedWord + ", ranking = " + ranking); + */ return ranking; } @@ -219,11 +221,11 @@ public class plasmaSearchRankingProfile { // prefer short urls ranking += (256 - page.url().toString().length()) << ((Integer) coeff.get(URLLENGTH)).intValue(); - ranking += (32 - urlcomps.length) << ((Integer) coeff.get(URLCOMPS)).intValue(); + ranking += (8 * Math.max(0, 32 - urlcomps.length)) << ((Integer) coeff.get(URLCOMPS)).intValue(); // prefer long descriptions - ranking += (255 * page.descr().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue(); - ranking += (255 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue(); + ranking += (256 * page.descr().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue(); + ranking += (256 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue(); return ranking; } diff --git a/source/de/anomic/plasma/plasmaSearchResult.java b/source/de/anomic/plasma/plasmaSearchResult.java index a4972c070..e7e43661c 100644 --- a/source/de/anomic/plasma/plasmaSearchResult.java +++ b/source/de/anomic/plasma/plasmaSearchResult.java @@ -100,7 +100,7 @@ public final class plasmaSearchResult { } public plasmaCrawlLURL.Entry nextElement() { - Object top = pageAcc.lastKey(); + Object top = pageAcc.firstKey(); //System.out.println("postorder-key: " + ((String) top)); return (plasmaCrawlLURL.Entry) pageAcc.remove(top); } @@ -154,7 +154,7 @@ public final class plasmaSearchResult { // insert value //System.out.println("Ranking " + ranking + ", YBR-" + plasmaSearchPreOrder.ybr(indexEntry.getUrlHash()) + " for URL " + page.url()); - pageAcc.put(serverCodings.encodeHex(ranking, 16) + page.hash(), page); + pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - ranking, 16) + page.hash(), page); } // flush memory