diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index 281213da3..56bdee44d 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -154,6 +154,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
*/
public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
public static String[] urlComps(String normalizedURL) {
+ int p = normalizedURL.indexOf("//"); // index of the first "//" in the url, or -1 if there is none
+ if (p > 0) normalizedURL = normalizedURL.substring(p + 2); // keep only the text after that "//"; a "//" at index 0 (or no "//" at all) leaves the string unchanged
return normalizedURL.toLowerCase().split(splitrex); // word components of the url
}
diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java
index ae7481068..ad09ad462 100644
--- a/source/de/anomic/plasma/plasmaSearchEvent.java
+++ b/source/de/anomic/plasma/plasmaSearchEvent.java
@@ -266,8 +266,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
// apply filter
profileLocal.startTimer();
- //acc.removeRedundant();
- acc.removeDoubleDom();
+ acc.removeRedundant();
+ //acc.removeDoubleDom();
profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_FILTER);
profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_FILTER, acc.sizeOrdered());
@@ -294,12 +294,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
plasmaCrawlLURL.Entry page;
Long preranking;
Object[] preorderEntry;
- int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT);
try {
while (preorder.hasNext()) {
- //if ((acc.sizeFetched() >= 50) && ((acc.sizeFetched() >= minEntries) || (System.currentTimeMillis() >= postorderLimitTime))) break;
- //if (acc.sizeFetched() >= minEntries) break;
- if ((System.currentTimeMillis() >= postorderLimitTime) && (acc.sizeFetched() >= minEntries)) break;
+ if (System.currentTimeMillis() >= postorderLimitTime) break;
preorderEntry = preorder.next();
entry = (indexEntry) preorderEntry[0];
preranking = (Long) preorderEntry[1];
@@ -322,8 +319,8 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
// apply filter
profileLocal.startTimer();
- //acc.removeRedundant();
- acc.removeDoubleDom();
+ acc.removeRedundant();
+ //acc.removeDoubleDom();
profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_FILTER);
profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_FILTER, acc.sizeOrdered());
diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java
index 1e506f8a2..1c0da89da 100644
--- a/source/de/anomic/plasma/plasmaSearchPreOrder.java
+++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java
@@ -96,7 +96,7 @@ public final class plasmaSearchPreOrder {
this.pageAcc = new TreeMap();
for (int j = 0; j < count; j++) {
iEntry = (indexEntry) i.next();
- pageAcc.put(serverCodings.encodeHex(this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax), query.words("")), 16) + iEntry.urlHash(), iEntry);
+ pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax), query.words("")), 16) + iEntry.urlHash(), iEntry);
}
}
@@ -150,7 +150,7 @@ public final class plasmaSearchPreOrder {
}
public Object[] /*{indexEntry, Long}*/ next() {
- String top = (String) pageAcc.lastKey();
+ String top = (String) pageAcc.firstKey(); // keys begin with the 16 hex digits of (Long.MAX_VALUE - preRanking), so the smallest key carries the largest pre-ranking
//System.out.println("preorder-key: " + top);
- Long preranking = new Long(Long.parseLong(top.substring(0, 16), 16));
+ Long preranking = new Long(Long.MAX_VALUE - Long.parseLong(top.substring(0, 16), 16)); // undo the key inversion so the returned Long is the pre-ranking itself, not Long.MAX_VALUE minus it
return new Object[]{(indexEntry) pageAcc.remove(top), preranking};
diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java
index 8535db997..650a3a4f7 100644
--- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java
+++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java
@@ -170,16 +170,18 @@ public class plasmaSearchRankingProfile {
ranking += normalizedEntry.quality() << ((Integer) coeff.get(ENTROPY)).intValue();
ranking += normalizedEntry.virtualAge() << ((Integer) coeff.get(DATE)).intValue();
ranking += plasmaSearchPreOrder.ybr_p(normalizedEntry.urlHash()) << ((Integer) coeff.get(YBR)).intValue();
- ranking += (normalizedEntry.posintext() == 0) ? 0 : (255 - normalizedEntry.posintext()) << ((Integer) coeff.get(POSINTEXT)).intValue();
- ranking += (normalizedEntry.worddistance() == 0) ? 0 : (255 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue();
+ ranking += (normalizedEntry.posintext() == 0) ? 0 : (256 - normalizedEntry.posintext()) << ((Integer) coeff.get(POSINTEXT)).intValue();
+ ranking += (normalizedEntry.worddistance() == 0) ? 0 : (256 - normalizedEntry.worddistance()) << ((Integer) coeff.get(WORDDISTANCE)).intValue();
ranking += (normalizedEntry.hitcount() == 0) ? 0 : normalizedEntry.hitcount() << ((Integer) coeff.get(HITCOUNT)).intValue();
- ranking += (255 - indexURL.domLengthNormalized(normalizedEntry.urlHash())) << ((Integer) coeff.get(DOMLENGTH)).intValue();
+ ranking += (256 - indexURL.domLengthNormalized(normalizedEntry.urlHash())) << ((Integer) coeff.get(DOMLENGTH)).intValue();
ranking += (indexURL.probablyRootURL(normalizedEntry.urlHash())) ? 16 << ((Integer) coeff.get(URLLENGTH)).intValue() : 0;
ranking += (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord)) ? 256 << ((Integer) coeff.get(QUERYINURL)).intValue() : 0;
+ /*
if (indexURL.probablyWordURL(normalizedEntry.urlHash(), searchedWord))
System.out.println("DEBUG - hash " + normalizedEntry.urlHash() + " contains word " + searchedWord + ", weighted " + ((Integer) coeff.get(QUERYINURL)).intValue() + ", ranking = " + ranking);
else
System.out.println("DEBUG - hash " + normalizedEntry.urlHash() + " contains not word " + searchedWord + ", ranking = " + ranking);
+ */
return ranking;
}
@@ -219,11 +221,11 @@ public class plasmaSearchRankingProfile {
// prefer short urls
ranking += (256 - page.url().toString().length()) << ((Integer) coeff.get(URLLENGTH)).intValue();
- ranking += (32 - urlcomps.length) << ((Integer) coeff.get(URLCOMPS)).intValue();
+ ranking += (8 * Math.max(0, 32 - urlcomps.length)) << ((Integer) coeff.get(URLCOMPS)).intValue();
// prefer long descriptions
- ranking += (255 * page.descr().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue();
- ranking += (255 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue();
+ ranking += (256 * page.descr().length() / 80) << ((Integer) coeff.get(DESCRLENGTH)).intValue();
+ ranking += (256 * (12 - Math.abs(12 - Math.min(12, descrcomps.length))) / 12) << ((Integer) coeff.get(DESCRCOMPS)).intValue();
return ranking;
}
diff --git a/source/de/anomic/plasma/plasmaSearchResult.java b/source/de/anomic/plasma/plasmaSearchResult.java
index a4972c070..e7e43661c 100644
--- a/source/de/anomic/plasma/plasmaSearchResult.java
+++ b/source/de/anomic/plasma/plasmaSearchResult.java
@@ -100,7 +100,7 @@ public final class plasmaSearchResult {
}
public plasmaCrawlLURL.Entry nextElement() {
- Object top = pageAcc.lastKey();
+ Object top = pageAcc.firstKey(); // entries are now keyed by hex(Long.MAX_VALUE - ranking) + page hash, so the first key presumably belongs to the highest ranking (assumes pageAcc is a sorted map; confirm at its declaration)
//System.out.println("postorder-key: " + ((String) top));
return (plasmaCrawlLURL.Entry) pageAcc.remove(top);
}
@@ -154,7 +154,7 @@ public final class plasmaSearchResult {
// insert value
//System.out.println("Ranking " + ranking + ", YBR-" + plasmaSearchPreOrder.ybr(indexEntry.getUrlHash()) + " for URL " + page.url());
- pageAcc.put(serverCodings.encodeHex(ranking, 16) + page.hash(), page);
+ pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - ranking, 16) + page.hash(), page);
}
// flush memory