diff --git a/bin/cr_accumulate b/bin/cr_accumulate
index 0f12779f6..bc0f83e67 100755
--- a/bin/cr_accumulate
+++ b/bin/cr_accumulate
@@ -1,3 +1,3 @@
cd `dirname $0`/..
-java -Xms300m -Xmx900m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -accumulate .
-java -Xms300m -Xmx900m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -recycle . 168
+java -Xms1500m -Xmx2000m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -accumulate .
+java -Xms1500m -Xmx2000m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -recycle . 168
diff --git a/bin/cr_genrci b/bin/cr_genrci
index 6f40f8447..b3d15c48e 100755
--- a/bin/cr_genrci
+++ b/bin/cr_genrci
@@ -1,2 +1,2 @@
cd `dirname $0`/..
-java -server -Xms1400m -Xmx1400m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -genrci .
+java -server -Xms2000m -Xmx2000m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -genrci .
diff --git a/bin/gen_ybr b/bin/gen_ybr
new file mode 100755
index 000000000..88a62717e
--- /dev/null
+++ b/bin/gen_ybr
@@ -0,0 +1,2 @@
+# Generates the YBR ranking tables: changes to the installation root (the parent
+# of this script's directory) and runs plasmaRankingRCIEvaluation in -genybr mode.
+# The path is quoted so that locations containing blanks work, and the script
+# stops if the directory change fails instead of starting java somewhere else.
+cd "$(dirname "$0")/.." || exit 1
+java -server -Xms1000m -Xmx1000m -classpath source:classes de.anomic.plasma.plasmaRankingRCIEvaluation -genybr .
diff --git a/htroot/index.html b/htroot/index.html
index 25fb63f3f..ae173a7e7 100644
--- a/htroot/index.html
+++ b/htroot/index.html
@@ -26,8 +26,12 @@ Max. number of results:
order by:
Resource:
@@ -101,7 +105,7 @@ from 'late' peers.
#[description]#
#(snippet)#::#[text]# #(/snippet)#
#[urlname]#
-#[date]# | Info
+#[date]# | YBR-#[ybr]# | Info
#{/results}#
diff --git a/htroot/index.java b/htroot/index.java
index e3a159093..0aabe1025 100644
--- a/htroot/index.java
+++ b/htroot/index.java
@@ -56,6 +56,7 @@ import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSearchQuery;
+import de.anomic.plasma.plasmaSearchPreOrder;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
import de.anomic.server.serverObjects;
@@ -103,8 +104,12 @@ public class index {
prop.put("count-50", 0);
prop.put("count-100", 0);
prop.put("count-1000", 0);
- prop.put("order-quality", 0);
- prop.put("order-date", 0);
+ prop.put("order-ybr-date-quality", plasmaSearchPreOrder.canUseYBR() ? 1 : 0);
+ prop.put("order-ybr-quality-date", 0);
+ prop.put("order-date-ybr-quality", 0);
+ prop.put("order-quality-ybr-date", 0);
+ prop.put("order-date-quality-ybr", plasmaSearchPreOrder.canUseYBR() ? 0 : 1);
+ prop.put("order-quality-date-ybr", 0);
prop.put("resource-global", ((global) ? 1 : 0));
prop.put("resource-local", ((global) ? 0 : 1));
prop.put("time-1", 0);
@@ -137,8 +142,16 @@ public class index {
(yacyCore.seedDB.mySeed != null) &&
(yacyCore.seedDB.mySeed.getAddress() != null));
- final String order1 = (order.equals("Quality-Date")) ? plasmaSearchQuery.ORDER_QUALITY : plasmaSearchQuery.ORDER_DATE;
- final String order2 = (order.equals("Quality-Date")) ? plasmaSearchQuery.ORDER_DATE : plasmaSearchQuery.ORDER_QUALITY;
+ String order1="", order2="", order3="";
+ if (order.startsWith("YBR")) order1 = plasmaSearchQuery.ORDER_YBR;
+ if (order.startsWith("Date")) order1 = plasmaSearchQuery.ORDER_DATE;
+ if (order.startsWith("Quality")) order1 = plasmaSearchQuery.ORDER_QUALITY;
+ if (order.indexOf("-YBR-") > 0) order2 = plasmaSearchQuery.ORDER_YBR;
+ if (order.indexOf("-Date-") > 0) order2 = plasmaSearchQuery.ORDER_DATE;
+ if (order.indexOf("-Quality-") > 0) order2 = plasmaSearchQuery.ORDER_QUALITY;
+ if (order.endsWith("YBR")) order3 = plasmaSearchQuery.ORDER_YBR;
+ if (order.endsWith("Date")) order3 = plasmaSearchQuery.ORDER_DATE;
+ if (order.endsWith("Quality")) order3 = plasmaSearchQuery.ORDER_QUALITY;
String urlmask = "";
if (post.containsKey("urlmask") && post.get("urlmask").equals("no")) {
urlmask = ".*";
@@ -147,7 +160,7 @@ public class index {
}
// do the search
- plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, new String[]{order1, order2}, count, searchtime, urlmask, referer,
+ plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, new String[]{order1, order2, order3}, count, searchtime, urlmask, referer,
((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL,
"", 20);
final serverObjects prop = sb.searchFromLocal(thisSearch);
@@ -240,8 +253,12 @@ public class index {
prop.put("count-50", ((count == 50)) ? 1 : 0);
prop.put("count-100", ((count == 100)) ? 1 : 0);
prop.put("count-1000", ((count == 1000)) ? 1 : 0);
- prop.put("order-quality", ((order.equals("Quality-Date")) ? 1 : 0));
- prop.put("order-date", ((order.equals("Date-Quality")) ? 1 : 0));
+ prop.put("order-ybr-date-quality", ((order.equals("YBR-Date-Quality")) ? 1 : 0));
+ prop.put("order-ybr-quality-date", ((order.equals("YBR-Quality-Date")) ? 1 : 0));
+ prop.put("order-date-ybr-quality", ((order.equals("Date-YBR-Quality")) ? 1 : 0));
+ prop.put("order-quality-ybr-date", ((order.equals("Quality-YBR-Date")) ? 1 : 0));
+ prop.put("order-date-quality-ybr", ((order.equals("Date-Quality-YBR")) ? 1 : 0));
+ prop.put("order-quality-date-ybr", ((order.equals("Quality-Date-YBR")) ? 1 : 0));
prop.put("resource-global", ((global) ? 1 : 0));
prop.put("resource-local", ((global) ? 0 : 1));
prop.put("time-1", ((searchtime == 1000) ? 1 : 0));
diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java
index 8890453df..7f4c82cf1 100644
--- a/source/de/anomic/http/httpdProxyHandler.java
+++ b/source/de/anomic/http/httpdProxyHandler.java
@@ -231,7 +231,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
// load the yellow-list
f = switchboard.getConfig("proxyYellowList", null);
if (f != null) {
- yellowList = serverFileUtils.loadList(f);
+ yellowList = serverFileUtils.loadList(new File(f));
this.theLogger.logConfig("loaded yellow-list from file " + f + ", " + yellowList.size() + " entries");
} else {
yellowList = new HashSet();
diff --git a/source/de/anomic/kelondro/kelondroAttrSeq.java b/source/de/anomic/kelondro/kelondroAttrSeq.java
index 94b32e4a7..a1c701643 100644
--- a/source/de/anomic/kelondro/kelondroAttrSeq.java
+++ b/source/de/anomic/kelondro/kelondroAttrSeq.java
@@ -206,6 +206,10 @@ public class kelondroAttrSeq {
entries.put(entry.pivot, entry);
}
+ /**
+  * Files the given entry under its pivot key, but keeps only the entry's
+  * string form (the result of {@code entry.toString()}) in the map instead of
+  * the Entry object that putEntry stores.
+  * NOTE(review): presumably done to reduce memory use; confirm that getEntry
+  * turns such String values back into Entry objects.
+  *
+  * @param entry the entry to store; its pivot is used as the map key
+  */
+ public void putEntrySmall(Entry entry) {
+     final String compactForm = entry.toString();
+     entries.put(entry.pivot, compactForm);
+ }
+
public Entry getEntry(String pivot) {
Object e = entries.get(pivot);
if (e == null) return null;
@@ -351,7 +355,7 @@ public class kelondroAttrSeq {
int p = attrseq.indexOf('|') + 1;
long[] seqattrs = new long[structure.seq_names.length - 1];
String seqname;
- while (p < attrseq.length()) {
+ while (p + structure.seq_len[0] <= attrseq.length()) {
seqname = attrseq.substring(p, p + structure.seq_len[0]);
p += structure.seq_len[0];
for (int i = 1; i < structure.seq_names.length; i++) {
diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java
index eb85b2fad..dbfd80d05 100644
--- a/source/de/anomic/plasma/plasmaCrawlLURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlLURL.java
@@ -64,6 +64,7 @@ import java.util.Locale;
import java.util.Properties;
import de.anomic.http.httpc;
import de.anomic.kelondro.kelondroTree;
+import de.anomic.kelondro.kelondroException;
import de.anomic.server.serverCodings;
import de.anomic.server.serverObjects;
import de.anomic.server.logging.serverLog;
@@ -719,17 +720,27 @@ public final class plasmaCrawlLURL extends plasmaURL {
public class kiter implements Iterator {
// enumerates entry elements
kelondroTree.rowIterator i;
+ boolean error = false;
+
public kiter(boolean up, boolean rotating) throws IOException {
i = urlHashCache.rows(up, rotating);
+ error = false;
}
public boolean hasNext() {
+ if (error) return false;
return i.hasNext();
}
public Object next() {
- byte[] e = ((byte[][])i.next())[0];
- if (e == null) return null; else return new Entry(new String(e));
+ try {
+ byte[] e = ((byte[][])i.next())[0];
+ if (e == null) return null; else return new Entry(new String(e));
+ } catch (kelondroException e) {
+ e.printStackTrace();
+ error = true;
+ return null;
+ }
}
public void remove() {
diff --git a/source/de/anomic/plasma/plasmaRankingCRProcess.java b/source/de/anomic/plasma/plasmaRankingCRProcess.java
index 69e52cfd0..5bd81db1c 100644
--- a/source/de/anomic/plasma/plasmaRankingCRProcess.java
+++ b/source/de/anomic/plasma/plasmaRankingCRProcess.java
@@ -53,6 +53,7 @@ import java.util.Map;
import de.anomic.kelondro.kelondroAttrSeq;
import de.anomic.server.serverCodings;
import de.anomic.server.serverFileUtils;
+import de.anomic.server.serverDate;
import de.anomic.tools.bitfield;
public class plasmaRankingCRProcess {
@@ -131,13 +132,13 @@ public class plasmaRankingCRProcess {
acc_entry.setAttr("ACount", (long) ACount);
acc_entry.setAttr("VCount", (long) VCount);
acc_entry.setAttr("Vita", (long) Vita);
- acc.putEntry(acc_entry);
+ acc.putEntrySmall(acc_entry);
}
return true;
}
- public static void accumulate(File from_dir, File tmp_dir, File err_dir, File bkp_dir, File to_file) throws IOException {
+ public static void accumulate(File from_dir, File tmp_dir, File err_dir, File bkp_dir, File to_file, int max_files) throws IOException {
if (!(from_dir.isDirectory())) {
System.out.println("source path " + from_dir + " is not a directory.");
return;
@@ -171,7 +172,8 @@ public class plasmaRankingCRProcess {
kelondroAttrSeq source_cr = null;
File source_file = null;
String[] files = from_dir.list();
- for (int i = 0; i < files.length; i++) {
+ if (files.length < max_files) max_files = files.length;
+ for (int i = 0; i < max_files; i++) {
// open file
source_file = new File(from_dir, files[i]);
if (accumulate_upd(source_file, acc)) {
@@ -206,7 +208,7 @@ public class plasmaRankingCRProcess {
public static int genrci(File cr_in, File rci_out) throws IOException {
if (!(cr_in.exists())) return 0;
final kelondroAttrSeq cr = new kelondroAttrSeq(cr_in, false);
- if (rci_out.exists()) rci_out.delete(); // we want only fresh rci here (during testing)
+ //if (rci_out.exists()) rci_out.delete(); // we want only fresh rci here (during testing)
if (!(rci_out.exists())) {
kelondroAttrSeq rcix = new kelondroAttrSeq("Global Ranking Reverse Citation Index",
",'='," +
@@ -267,7 +269,7 @@ public class plasmaRankingCRProcess {
// java -classpath source de.anomic.plasma.kelondroPropFile -transcode DATA/RANKING/GLOBAL/CRG-test-unsorted-original.cr DATA/RANKING/GLOBAL/CRG-test-generated.cr
try {
- if ((args.length == 5) && (args[0].equals("-accumulate"))) {
- accumulate(new File(args[1]), new File(args[2]), new File(args[3]), new File(args[4]), new File(args[5]));
+ // explicit form: -accumulate <from_dir> <tmp_dir> <err_dir> <bkp_dir> <to_file> <max_files>
+ // args[0] is the switch and args[1]..args[6] are read below, so exactly seven
+ // arguments are required. The former test for five arguments could only end in
+ // an ArrayIndexOutOfBoundsException, because args[5] was already out of range.
+ if ((args.length == 7) && (args[0].equals("-accumulate"))) {
+ accumulate(new File(args[1]), new File(args[2]), new File(args[3]), new File(args[4]), new File(args[5]), Integer.parseInt(args[6]));
}
if ((args.length == 2) && (args[0].equals("-accumulate"))) {
File root_path = new File(args[1]);
@@ -276,7 +278,8 @@ public class plasmaRankingCRProcess {
File tmp_dir = new File(root_path, "DATA/RANKING/GLOBAL/016_tmp");
File err_dir = new File(root_path, "DATA/RANKING/GLOBAL/017_err");
File acc_dir = new File(root_path, "DATA/RANKING/GLOBAL/018_acc");
- File to_file = new File(root_path, "DATA/RANKING/GLOBAL/020_con0/CRG-a-acc.cr.gz");
+ String filename = "CRG-a-" + new serverDate().toShortString(true) + ".cr.gz";
+ File to_file = new File(root_path, "DATA/RANKING/GLOBAL/020_con0/" + filename);
if (!(ready_dir.exists())) ready_dir.mkdirs();
if (!(tmp_dir.exists())) tmp_dir.mkdirs();
if (!(err_dir.exists())) err_dir.mkdirs();
@@ -285,7 +288,7 @@ public class plasmaRankingCRProcess {
serverFileUtils.moveAll(from_dir, ready_dir);
long start = System.currentTimeMillis();
int files = ready_dir.list().length;
- accumulate(ready_dir, tmp_dir, err_dir, acc_dir, to_file);
+ accumulate(ready_dir, tmp_dir, err_dir, acc_dir, to_file, 1000);
long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
System.out.println("Finished accumulate for " + files + " files in " + seconds + " seconds (" + (files / seconds) + " files/second)");
}
@@ -328,13 +331,16 @@ public class plasmaRankingCRProcess {
}
if ((args.length == 2) && (args[0].equals("-genrci"))) {
File root_path = new File(args[1]);
- File cr_file = new File(root_path, "DATA/RANKING/GLOBAL/020_con0/CRG-a-acc.cr.gz");
+ // every file found in this directory is fed into the same RCI file, one after another
+ File cr_filedir = new File(root_path, "DATA/RANKING/GLOBAL/020_con0");
File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz");
rci_file.getParentFile().mkdirs();
- long start = System.currentTimeMillis();
- int count = genrci(cr_file, rci_file);
- long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
- System.out.println("Finished RCI generation: " + count + " citation references in " + seconds + " seconds (" + (count / seconds) + " CR-records/second)");
+ String[] cr_filenames = cr_filedir.list();
+ // File.list() returns null when the directory is missing or cannot be read;
+ // report that and continue with an empty list instead of failing in the loop
+ if (cr_filenames == null) {
+ System.out.println("CR directory " + cr_filedir + " does not exist or cannot be read.");
+ cr_filenames = new String[0];
+ }
+ for (int i = 0; i < cr_filenames.length; i++) {
+ long start = System.currentTimeMillis();
+ int count = genrci(new File(cr_filedir, cr_filenames[i]), rci_file);
+ // at least one second is assumed so that the rate below never divides by zero
+ long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
+ System.out.println("Completed RCI generation for input file " + cr_filenames[i] + ": " + count + " citation references in " + seconds + " seconds (" + (count / seconds) + " CR-records/second)");
+ }
}
} catch (IOException e) {
e.printStackTrace();
diff --git a/source/de/anomic/plasma/plasmaRankingRCIEvaluation.java b/source/de/anomic/plasma/plasmaRankingRCIEvaluation.java
index 33d9735a1..6523bddd9 100644
--- a/source/de/anomic/plasma/plasmaRankingRCIEvaluation.java
+++ b/source/de/anomic/plasma/plasmaRankingRCIEvaluation.java
@@ -47,8 +47,11 @@ package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
+import java.net.URL;
+import java.net.MalformedURLException;
import java.util.Iterator;
import java.util.HashMap;
+import java.util.HashSet;
import de.anomic.kelondro.kelondroAttrSeq;
import de.anomic.server.serverCodings;
@@ -57,11 +60,9 @@ import de.anomic.tools.bitfield;
public class plasmaRankingRCIEvaluation {
- public static int[] rcieval(File rci_file) throws IOException {
+ public static int[] rcieval(kelondroAttrSeq rci) throws IOException {
// collect information about which entry has how many references
// the output is a reference-count:occurrences relation
- if (!(rci_file.exists())) return null;
- final kelondroAttrSeq rci = new kelondroAttrSeq(rci_file, false);
HashMap counts = new HashMap();
Iterator i = rci.keys();
String key;
@@ -102,40 +103,169 @@ public class plasmaRankingRCIEvaluation {
+ /**
+ * Derives the upper boundaries of {@code parts} rank slots from a count table.
+ * Walking i upward from 1 (counts[0] is left out of the running sum), counts[i]
+ * is added to a running sum. Each time that sum exceeds the current limit
+ * (initially half of sum(counts)), i is stored as the boundary of the next free
+ * slot (slots are filled from parts - 1 downward), the limit becomes
+ * (2 * limit - s) / 2 and the running sum restarts at zero. Slot 0 is then set
+ * to counts.length - 1, and finally the leading slots are smoothed.
+ *
+ * @param counts table indexed by reference count (see rcieval)
+ * @param parts number of slots to produce
+ * @return array of length parts; element i is the upper boundary of slot i
+ */
public static int[] interval(int[] counts, int parts) {
long limit = sum(counts) / 2;
- int[] pos = new int[parts];
+ int[] partition = new int[parts];
int s = 0, p = parts - 1;
- for (int i = 0; i < counts.length; i++) {
+ for (int i = 1; i < counts.length; i++) {
s += counts[i];
if ((s > limit) && (p >= 0)) {
- pos[p--] = i - 1;
- limit = (2 * limit - s + counts[i]) / 2;
- s = counts[i];
+ partition[p--] = i;
+ limit = (2 * limit - s) / 2;
+ s = 0;
}
}
- pos[0] = counts.length - 1;
- return pos;
+ // an empty partition has no slot 0 to write to
+ if (parts > 0) partition[0] = counts.length - 1;
+ // blend slots 1..9 with their (already blended) predecessor at a weight of 1:4;
+ // the range is capped at parts so that fewer than ten slots do not overrun the array
+ final int smoothed = Math.min(10, parts);
+ for (int i = 1; i < smoothed; i++) partition[i] = (partition[i - 1] + 4 * partition[i]) / 5;
+ return partition;
+ }
+
+ /*
+ public static int[] generateYBRLimits(int[] counts, int[] partition) {
+ int[] limits = new int[partition.length];
+ int min;
+ int j = 0;
+ for (int i = partition.length - 1; i >= 0; i--) {
+ min = counts[j];
+ while (j <= partition[i]) {
+ if (counts[j] < min) min = counts[j];
+ j++;
+ }
+ limits[i] = min;
+ }
+ return limits;
+ }
+ */
+
+ /**
+  * Prints a consistency report for a partition table: for every slot, from the
+  * highest index down to 0, the sum of counts[] over the index range that
+  * starts where the previous slot stopped and ends at partition[slot]
+  * (inclusive), followed by the sum over all slots.
+  *
+  * @param counts table whose elements are summed
+  * @param partition inclusive upper index of each slot
+  */
+ public static void checkPartitionTable0(int[] counts, int[] partition) {
+     int total = 0;
+     int cursor = 0; // next index of counts[] that has not been summed yet
+     for (int rank = partition.length - 1; rank >= 0; rank--) {
+         int rankSum = 0;
+         for (; cursor <= partition[rank]; cursor++) rankSum += counts[cursor];
+         System.out.println("sum of YBR-" + rank + " entries: " + rankSum);
+         total += rankSum;
+     }
+     System.out.println("complete sum = " + total);
+ }
+
+ /**
+  * Prints the same kind of report as checkPartitionTable0, but assigns every
+  * index of counts[] to its slot through orderIntoYBI, so that the two
+  * reports can be compared against each other.
+  *
+  * @param counts table whose elements are summed per slot
+  * @param partition inclusive upper index of each slot
+  */
+ public static void checkPartitionTable1(int[] counts, int[] partition) {
+     final int[] perRank = new int[partition.length]; // starts out zero-filled
+     for (int refs = 0; refs < counts.length; refs++) {
+         perRank[orderIntoYBI(partition, refs)] += counts[refs];
+     }
+     int total = 0;
+     for (int rank = partition.length - 1; rank >= 0; rank--) {
+         System.out.println("sum of YBR-" + rank + " entries: " + perRank[rank]);
+         total += perRank[rank];
+     }
+     System.out.println("complete sum = " + total);
+ }
+
+ /**
+  * Looks up the slot of a partition table that a value falls into.
+  * Slot i (for every i except the last) covers the inclusive range
+  * partition[i + 1] + 1 .. partition[i]. The first slot whose range contains
+  * the value wins, and a value matched by none of them is put into the last slot.
+  *
+  * @param partition inclusive upper boundary of each slot
+  * @param count the value to classify
+  * @return the slot index, or partition.length - 1 if no earlier slot matches
+  */
+ public static int orderIntoYBI(int[] partition, int count) {
+     final int last = partition.length - 1;
+     int rank = 0;
+     while (rank < last) {
+         final int lower = partition[rank + 1] + 1;
+         final int upper = partition[rank];
+         if ((count >= lower) && (count <= upper)) return rank;
+         rank++;
+     }
+     return last;
+ }
+
+ /**
+  * Distributes all keys of the given index over one set per partition slot.
+  * For each key, the size of its entry's sequence (entry.getSeq().size()) is
+  * classified with orderIntoYBI, and the key is added to the set of that slot.
+  *
+  * @param rci the index whose keys are ranked
+  * @param partition slot boundaries as used by orderIntoYBI
+  * @return an array with one key set per slot, in slot order
+  */
+ public static HashSet[] genRankingTable(kelondroAttrSeq rci, int[] partition) {
+     final HashSet[] ranked = new HashSet[partition.length];
+     for (int rank = 0; rank < ranked.length; rank++) ranked[rank] = new HashSet();
+     for (Iterator keys = rci.keys(); keys.hasNext(); ) {
+         final String key = (String) keys.next();
+         final kelondroAttrSeq.Entry entry = rci.getEntry(key);
+         final int refs = entry.getSeq().size();
+         ranked[orderIntoYBI(partition, refs)].add(key);
+     }
+     return ranked;
+ }
+
+ /**
+  * Builds a lookup table from hash tails back to domain names.
+  * Every line of the list file is taken as a domain name. A leading "www." is
+  * stripped, and both the bare name and its "www." form are registered under
+  * plasmaURL.urlHash("http://" + name).substring(6).
+  *
+  * @param domlist file with one domain name per line
+  * @return map from hash tail to the domain name it was computed from
+  */
+ public static HashMap genReverseDomHash(File domlist) {
+     final HashMap dommap = new HashMap();
+     for (Iterator doms = serverFileUtils.loadList(domlist).iterator(); doms.hasNext(); ) {
+         String dom = (String) doms.next();
+         if (dom.startsWith("www.")) dom = dom.substring(4);
+         final String wwwDom = "www." + dom;
+         try {
+             dommap.put(plasmaURL.urlHash(new URL("http://" + dom)).substring(6), dom);
+             dommap.put(plasmaURL.urlHash(new URL("http://" + wwwDom)).substring(6), wwwDom);
+         } catch (MalformedURLException ignored) {
+             // a list entry that does not form a valid URL is left out of the map
+         }
+     }
+     return dommap;
+ }
+
+ /**
+  * Writes each set of a ranking table to its own file below tablePath.
+  * The file for slot i is named "YBR-4-" + two hex digits of i + ".idx", which
+  * is the name pattern plasmaSearchPreOrder.loadYBR reads. The directory is
+  * created if it does not exist yet.
+  *
+  * @param ranking one key set per slot
+  * @param tablePath directory that receives the table files
+  * @throws IOException declared for failures while saving a set
+  */
+ public static void storeRankingTable(HashSet[] ranking, File tablePath) throws IOException {
+     if (!tablePath.exists()) tablePath.mkdirs();
+     for (int rank = 0; rank < ranking.length; rank++) {
+         final File table = new File(tablePath, "YBR-4-" + serverCodings.encodeHex(rank, 2) + ".idx");
+         serverFileUtils.saveSet(table, ranking[rank], "");
+     }
}
public static void main(String[] args) {
try {
+ if ((args.length == 2) && (args[0].equals("-genybr"))) {
+ File root_path = new File(args[1]);
+ File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz");
+ long start = System.currentTimeMillis();
+ if (!(rci_file.exists())) return;
+ final kelondroAttrSeq rci = new kelondroAttrSeq(rci_file, false);
+ int counts[] = rcieval(rci);
+ int[] partition = interval(counts, 16);
+ HashSet[] ranked = genRankingTable(rci, partition);
+ storeRankingTable(ranked, new File(root_path, "ranking/YBR"));
+ long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
+ System.out.println("Finished YBR generation in " + seconds + " seconds.");
+ }
if ((args.length == 2) && (args[0].equals("-rcieval"))) {
File root_path = new File(args[1]);
File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz");
long start = System.currentTimeMillis();
- int count[] = rcieval(rci_file);
+ if (!(rci_file.exists())) return;
+ final kelondroAttrSeq rci = new kelondroAttrSeq(rci_file, false);
+ int counts[] = rcieval(rci);
long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
- System.out.println("Finished RCI evaluation in " + seconds + " seconds");
+ System.out.println("Finished RCI evaluation in " + seconds + " seconds. " + counts.length + " counts in array.");
/*
System.out.println("count table:");
- for (int i = 0; i < count.length; i++) {
- System.out.println(i + " references: " + count[i] + " times");
+ for (int i = 0; i < counts.length; i++) {
+ System.out.println(i + " references: " + counts[i] + " times");
}
*/
- int[] pos = interval(count, 16);
+ int[] partition = interval(counts, 16);
System.out.println("partition position table:");
- for (int i = 0; i < pos.length; i++) {
- System.out.println("position " + i + ": " + pos[i]);
+ for (int i = 0; i < partition.length - 1; i++) {
+ System.out.println("YBR-" + i + ": " + (partition[i + 1] + 1) + " - " + partition[i] + " references");
+ }
+ System.out.println("YBR-" + (partition.length - 1) + ": 0 - " + partition[partition.length - 1] + " references");
+ checkPartitionTable0(counts, partition);
+ checkPartitionTable1(counts, partition);
+ int sum = 0;
+ for (int i = 0; i < counts.length; i++) sum += counts[i];
+ System.out.println("sum of all references: " + sum);
+
+ // now print out the table
+ HashSet[] ranked = genRankingTable(rci, partition);
+ HashMap dommap = genReverseDomHash(new File(root_path, "domlist.txt"));
+ String hash, dom;
+ for (int i = 0; i < 9; i++) {
+ System.out.print("YBR-" + i + ": ");
+ Iterator k = ranked[i].iterator();
+ while (k.hasNext()) {
+ hash = (String) k.next();
+ dom = (String) dommap.get(hash);
+ if (dom == null) System.out.print("[" + hash + "], "); else System.out.print(dom + ", ");
+ }
+ System.out.println();
}
+
}
} catch (IOException e) {
e.printStackTrace();
diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java
index 7cce5a8b1..63d75051e 100644
--- a/source/de/anomic/plasma/plasmaSearchEvent.java
+++ b/source/de/anomic/plasma/plasmaSearchEvent.java
@@ -249,7 +249,8 @@ public final class plasmaSearchEvent {
// apply filter
profileLocal.startTimer();
- acc.removeRedundant();
+ acc.removeDoubleDom();
+ //acc.removeRedundant();
profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_FILTER);
profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_FILTER, acc.sizeOrdered());
diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java
index b4b1bb5ae..afb566450 100644
--- a/source/de/anomic/plasma/plasmaSearchPreOrder.java
+++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java
@@ -42,16 +42,53 @@
package de.anomic.plasma;
+import java.io.File;
+import java.io.IOException;
+import java.util.Set;
import java.util.TreeMap;
import java.util.Iterator;
import de.anomic.server.serverCodings;
+import de.anomic.server.serverFileUtils;
public final class plasmaSearchPreOrder {
+ private static Set[] ybrTables = null; // block-rank tables
+ private static boolean useYBR = true;
+
private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
private plasmaSearchQuery query;
+ /**
+  * Loads the block-rank tables "YBR-4-00.idx", "YBR-4-01.idx", ... from the
+  * given directory into the static table array.
+  * The tables are collected in a local array and published only after all of
+  * them have been read, so ybr() never sees an array with unfilled slots. If
+  * the directory does not exist, or any table cannot be read, the tables are
+  * cleared and canUseYBR() reports false.
+  *
+  * @param rankingPath directory that holds the table files
+  * @param count number of tables to load, starting with index 0
+  */
+ public static void loadYBR(File rankingPath, int count) {
+     if (!rankingPath.exists()) {
+         ybrTables = null;
+         return;
+     }
+     final Set[] tables = new Set[count];
+     try {
+         for (int i = 0; i < count; i++) {
+             final String ybrName = "YBR-4-" + serverCodings.encodeHex(i, 2) + ".idx";
+             // NOTE(review): the 6 is presumably the length of one stored hash - confirm against loadSet
+             tables[i] = serverFileUtils.loadSet(new File(rankingPath, ybrName), 6, false);
+         }
+         ybrTables = tables;
+     } catch (IOException e) {
+         // an incomplete set of tables is not used at all
+         ybrTables = null;
+     }
+ }
+
+ /**
+  * Tells whether block-rank tables are available.
+  *
+  * @return true if the static table array is set, false if it is null
+  */
+ public static boolean canUseYBR() {
+ return ybrTables != null;
+ }
+
+ /**
+  * Returns the current state of the YBR usage switch. The switch is
+  * initialised to true and is independent of whether tables are loaded.
+  *
+  * @return the value last set with switchYBR
+  */
+ public static boolean isUsingYBR() {
+ return useYBR;
+ }
+
+ /**
+  * Turns the use of block-rank tables on or off. While the switch is off,
+  * ybr() answers 16 for every hash.
+  *
+  * @param usage true to use the tables, false to ignore them
+  */
+ public static void switchYBR(boolean usage) {
+ useYBR = usage;
+ }
+
public plasmaSearchPreOrder(plasmaSearchQuery query) {
this.pageAcc = new TreeMap();
this.query = query;
@@ -64,7 +101,6 @@ public final class plasmaSearchPreOrder {
return theClone;
}
-
public boolean hasNext() {
return pageAcc.size() > 0;
}
@@ -87,12 +123,34 @@ public final class plasmaSearchPreOrder {
+ /**
+ * Adds an index entry to the pre-order map under a key made of its ranking
+ * (16 hex digits) followed by its url hash.
+ * The ranking is the weighted sum of up to three order criteria taken from
+ * query.order: the first is weighted with 1024 * 1024, the second with 1024
+ * and the third with 1. An order name that matches none of the known
+ * constants contributes nothing.
+ *
+ * @param indexEntry the entry to rank and store
+ */
public void addEntry(plasmaWordIndexEntry indexEntry) {
long ranking = 0;
- if (query.order[0].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking = 4096 * indexEntry.getQuality();
- else if (query.order[0].equals(plasmaSearchQuery.ORDER_DATE)) ranking = 4096 * indexEntry.getVirtualAge();
- if (query.order[1].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += indexEntry.getQuality();
- else if (query.order[1].equals(plasmaSearchQuery.ORDER_DATE)) ranking += indexEntry.getVirtualAge();
+ long factor = 1024 * 1024;
+
+ // the contributions are accumulated with +=; a plain assignment would let each
+ // criterion overwrite the ones before it, leaving only the last, lowest-weighted one.
+ // The loop also stops at the end of the order array, because queries built with
+ // only two criteria (the form used before YBR was added) may still arrive here.
+ for (int i = 0; (i < 3) && (i < query.order.length); i++) {
+ if (query.order[i].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += factor * indexEntry.getQuality();
+ else if (query.order[i].equals(plasmaSearchQuery.ORDER_DATE)) ranking += factor * indexEntry.getVirtualAge();
+ else if (query.order[i].equals(plasmaSearchQuery.ORDER_YBR)) ranking += factor * ybr_p(indexEntry.getUrlHash());
+ factor = factor / 1024;
+ }
+
pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), indexEntry);
}
+ /**
+  * Inverts the block rank so that it can be used as a ranking score:
+  * the result is 16 minus ybr(urlHash), which makes 0 the score of a hash
+  * for which ybr answers 16.
+  *
+  * @param urlHash the url hash to rate
+  * @return 16 - ybr(urlHash)
+  */
+ public static int ybr_p(String urlHash) {
+     final int rank = ybr(urlHash);
+     return 16 - rank;
+ }
+
+ /**
+  * Determines the block rank of a url hash.
+  * The part of the hash from position 6 onward is searched in the loaded
+  * tables in index order, and the index of the first table containing it is
+  * the rank.
+  *
+  * @param urlHash the url hash to look up
+  * @return the index of the matching table, or 16 if no tables are loaded,
+  *         the usage switch is off, or no table contains the hash
+  */
+ public static int ybr(String urlHash) {
+     if ((ybrTables == null) || !useYBR) return 16;
+     final String domHash = urlHash.substring(6);
+     int rank = 0;
+     while ((rank < ybrTables.length) && !ybrTables[rank].contains(domHash)) rank++;
+     return (rank < ybrTables.length) ? rank : 16;
+ }
}
diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java
index f7cf1eb7a..71235c989 100644
--- a/source/de/anomic/plasma/plasmaSearchQuery.java
+++ b/source/de/anomic/plasma/plasmaSearchQuery.java
@@ -54,6 +54,7 @@ public final class plasmaSearchQuery {
public static final String ORDER_QUALITY = "quality";
public static final String ORDER_DATE = "date";
+ public static final String ORDER_YBR = "ybr";
public static final int SEARCHDOM_LOCAL = 0;
public static final int SEARCHDOM_GROUPDHT = 1;
diff --git a/source/de/anomic/plasma/plasmaSearchResult.java b/source/de/anomic/plasma/plasmaSearchResult.java
index c777292cc..24cba50c2 100644
--- a/source/de/anomic/plasma/plasmaSearchResult.java
+++ b/source/de/anomic/plasma/plasmaSearchResult.java
@@ -111,8 +111,8 @@ public final class plasmaSearchResult {
URL url = page.url();
String descr = page.descr();
if ((url == null) || (descr == null)) return;
- String[] urlcomps = url.toString().split(splitrex); // word components of the url
- String[] descrcomps = descr.split(splitrex); // words in the description
+ String[] urlcomps = url.toString().toLowerCase().split(splitrex); // word components of the url
+ String[] descrcomps = descr.toLowerCase().split(splitrex); // words in the description
// store everything
Object[] resultVector = new Object[] {indexEntry, page, urlcomps, descrcomps};
@@ -137,7 +137,7 @@ public final class plasmaSearchResult {
plasmaCrawlLURL.Entry page;
String[] urlcomps;
String[] descrcomps;
- long ranking;
+ long ranking, factor;
String queryhash;
for (int i = 0; i < results.size(); i++) {
// take out values from result array
@@ -149,12 +149,18 @@ public final class plasmaSearchResult {
// apply pre-calculated order attributes
ranking = 0;
- if (query.order[0].equals(plasmaSearchQuery.ORDER_DATE)) ranking += 10 * indexEntry.getVirtualAge();
- //if (query.order[0].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += indexEntry.getQuality();
+ factor = 4096L*4096L;
+ for (int j = 0; j < 3; j++) {
+ if (query.order[j].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += factor * indexEntry.getQuality() / 64L;
+ else if (query.order[j].equals(plasmaSearchQuery.ORDER_DATE)) ranking += factor * indexEntry.getVirtualAge() / 64L;
+ else if (query.order[j].equals(plasmaSearchQuery.ORDER_YBR)) ranking += factor * plasmaSearchPreOrder.ybr_p(indexEntry.getUrlHash());
+ factor = factor / 4096L;
+ }
+
// apply 'common-sense' heuristic using references
- for (int j = 0; j < urlcomps.length; j++) if (commonSense.contains(urlcomps[j])) ranking++;
- for (int j = 0; j < descrcomps.length; j++) if (commonSense.contains(descrcomps[j])) ranking++;
+ for (int j = 0; j < urlcomps.length; j++) if (commonSense.contains(urlcomps[j])) ranking += 10L*4096L*4096L / urlcomps.length;
+ for (int j = 0; j < descrcomps.length; j++) if (commonSense.contains(descrcomps[j])) ranking += 10L*4096L*4096L / descrcomps.length;
// apply query-in-result matching
Set urlcomph = plasmaSearchQuery.words2hashes(urlcomps);
@@ -162,12 +168,13 @@ public final class plasmaSearchResult {
Iterator shi = query.queryHashes.iterator();
while (shi.hasNext()) {
queryhash = (String) shi.next();
- if (urlcomph.contains(queryhash)) ranking += 10;
- if (descrcomph.contains(queryhash)) ranking += 100;
+ if (urlcomph.contains(queryhash)) ranking += 90L*4096L*4096L / urlcomps.length / query.queryHashes.size();
+ if (descrcomph.contains(queryhash)) ranking += 40L*4096L*4096L / descrcomps.length / query.queryHashes.size();
}
+
// insert value
- //System.out.println("Ranking " + ranking + " for URL " + url.toString());
+ //System.out.println("Ranking " + ranking + ", YBR-" + plasmaSearchPreOrder.ybr(indexEntry.getUrlHash()) + " for URL " + page.url());
pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), page);
}
@@ -175,6 +182,20 @@ public final class plasmaSearchResult {
results = null;
}
+ /**
+  * Reduces the ordered result map to at most one entry per host.
+  * The map is walked in its own iteration order. The first entry seen for a
+  * host (as given by url().getHost()) is kept and every later entry with the
+  * same host is removed.
+  * NOTE(review): which entry survives depends on the map's order - confirm
+  * that the first one in that order is the best-ranked result for the host.
+  */
+ public void removeDoubleDom() {
+     final HashSet seenHosts = new HashSet();
+     for (Iterator it = pageAcc.entrySet().iterator(); it.hasNext(); ) {
+         final Map.Entry mapping = (Map.Entry) it.next();
+         final String host = ((plasmaCrawlLURL.Entry) mapping.getValue()).url().getHost();
+         // add() answers false when the host has been recorded before
+         if (!seenHosts.add(host)) it.remove();
+     }
+
+ }
+
public void removeRedundant() {
// remove all urls from the pageAcc structure that occur double by specific redundancy rules
// a link is redundant, if a sub-path of the url is cited before. redundant urls are removed
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 4da2e293c..1fa173723 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -281,6 +281,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
ppRamString(stopwordsFile.length()/1024));
}
+ // load ranking tables
+ File rankingPath = new File(rootPath, "ranking/YBR");
+ if (rankingPath.exists()) {
+ plasmaSearchPreOrder.loadYBR(rankingPath, 12);
+ }
+
// read memory amount
int ramLURL = (int) getConfigLong("ramCacheLURL", 1024) / 1024;
int ramNURL = (int) getConfigLong("ramCacheNURL", 1024) / 1024;
@@ -1555,14 +1561,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public serverObjects searchFromLocal(plasmaSearchQuery query) {
// tell all threads to do nothing for a specific time
- //log.logInfo("A");
wordIndex.intermission(2 * query.maximumTime);
- //log.logInfo("B");
intermissionAllThreads(2 * query.maximumTime);
- //log.logInfo("C");
serverObjects prop = new serverObjects();
- //log.logInfo("D");
try {
// filter out words that appear in bluelist
//log.logInfo("E");
@@ -1654,6 +1656,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
prop.put("results_" + i + "_urlhash", urlhash);
prop.put("results_" + i + "_urlname", nxTools.cutUrlText(urlname, 120));
prop.put("results_" + i + "_date", dateString(urlentry.moddate()));
+ prop.put("results_" + i + "_ybr", plasmaSearchPreOrder.ybr(urlentry.hash()));
prop.put("results_" + i + "_size", Long.toString(urlentry.size()));
prop.put("results_" + i + "_words",URLEncoder.encode(query.queryWords.toString(),"UTF-8"));
// adding snippet if available
diff --git a/source/de/anomic/server/serverFileUtils.java b/source/de/anomic/server/serverFileUtils.java
index 67d8f4e4f..8869e7baf 100644
--- a/source/de/anomic/server/serverFileUtils.java
+++ b/source/de/anomic/server/serverFileUtils.java
@@ -176,11 +176,11 @@ public final class serverFileUtils {
copy(new ByteArrayInputStream(source), dest);
}
- public static HashSet loadList(String filename) {
+ public static HashSet loadList(File file) {
HashSet set = new HashSet();
BufferedReader br = null;
try {
- br = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
+ br = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
String line;
while ((line = br.readLine()) != null) {
line = line.trim();
|