From 0e25020f512001b17d6671237000fd75aa1fcab7 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 22 Nov 2005 15:17:05 +0000 Subject: [PATCH] added first generation and usage of YBR index-files. Enhanced overall ranking of search results. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1118 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- bin/cr_accumulate | 4 +- bin/cr_genrci | 2 +- bin/gen_ybr | 2 + htroot/index.html | 10 +- htroot/index.java | 31 +++- source/de/anomic/http/httpdProxyHandler.java | 2 +- .../de/anomic/kelondro/kelondroAttrSeq.java | 6 +- source/de/anomic/plasma/plasmaCrawlLURL.java | 15 +- .../anomic/plasma/plasmaRankingCRProcess.java | 30 ++-- .../plasma/plasmaRankingRCIEvaluation.java | 164 ++++++++++++++++-- .../de/anomic/plasma/plasmaSearchEvent.java | 3 +- .../anomic/plasma/plasmaSearchPreOrder.java | 68 +++++++- .../de/anomic/plasma/plasmaSearchQuery.java | 1 + .../de/anomic/plasma/plasmaSearchResult.java | 41 +++-- .../de/anomic/plasma/plasmaSwitchboard.java | 11 +- source/de/anomic/server/serverFileUtils.java | 4 +- 16 files changed, 326 insertions(+), 68 deletions(-) create mode 100755 bin/gen_ybr diff --git a/bin/cr_accumulate b/bin/cr_accumulate index 0f12779f6..bc0f83e67 100755 --- a/bin/cr_accumulate +++ b/bin/cr_accumulate @@ -1,3 +1,3 @@ cd `dirname $0`/.. -java -Xms300m -Xmx900m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -accumulate . -java -Xms300m -Xmx900m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -recycle . 168 +java -Xms1500m -Xmx2000m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -accumulate . +java -Xms1500m -Xmx2000m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -recycle . 168 diff --git a/bin/cr_genrci b/bin/cr_genrci index 6f40f8447..b3d15c48e 100755 --- a/bin/cr_genrci +++ b/bin/cr_genrci @@ -1,2 +1,2 @@ cd `dirname $0`/.. -java -server -Xms1400m -Xmx1400m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -genrci . +java -server -Xms2000m -Xmx2000m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -genrci . diff --git a/bin/gen_ybr b/bin/gen_ybr new file mode 100755 index 000000000..88a62717e --- /dev/null +++ b/bin/gen_ybr @@ -0,0 +1,2 @@ +cd `dirname $0`/.. +java -server -Xms1000m -Xmx1000m -classpath source:classes de.anomic.plasma.plasmaRankingRCIEvaluation -genybr . diff --git a/htroot/index.html b/htroot/index.html index 25fb63f3f..ae173a7e7 100644 --- a/htroot/index.html +++ b/htroot/index.html @@ -26,8 +26,12 @@ Max. number of results:   order by: Resource: @@ -101,7 +105,7 @@ from 'late' peers.

#[description]#
#(snippet)#::#[text]#
#(/snippet)# #[urlname]#
-#[date]# | Info

+#[date]# | YBR-#[ybr]# | Info

#{/results}# diff --git a/htroot/index.java b/htroot/index.java index e3a159093..0aabe1025 100644 --- a/htroot/index.java +++ b/htroot/index.java @@ -56,6 +56,7 @@ import de.anomic.http.httpHeader; import de.anomic.kelondro.kelondroMSetTools; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSearchQuery; +import de.anomic.plasma.plasmaSearchPreOrder; import de.anomic.server.serverCore; import de.anomic.server.serverDate; import de.anomic.server.serverObjects; @@ -103,8 +104,12 @@ public class index { prop.put("count-50", 0); prop.put("count-100", 0); prop.put("count-1000", 0); - prop.put("order-quality", 0); - prop.put("order-date", 0); + prop.put("order-ybr-date-quality", plasmaSearchPreOrder.canUseYBR() ? 1 : 0); + prop.put("order-ybr-quality-date", 0); + prop.put("order-date-ybr-quality", 0); + prop.put("order-quality-ybr-date", 0); + prop.put("order-date-quality-ybr", plasmaSearchPreOrder.canUseYBR() ? 0 : 1); + prop.put("order-quality-date-ybr", 0); prop.put("resource-global", ((global) ? 1 : 0)); prop.put("resource-local", ((global) ? 0 : 1)); prop.put("time-1", 0); @@ -137,8 +142,16 @@ public class index { (yacyCore.seedDB.mySeed != null) && (yacyCore.seedDB.mySeed.getAddress() != null)); - final String order1 = (order.equals("Quality-Date")) ? plasmaSearchQuery.ORDER_QUALITY : plasmaSearchQuery.ORDER_DATE; - final String order2 = (order.equals("Quality-Date")) ? plasmaSearchQuery.ORDER_DATE : plasmaSearchQuery.ORDER_QUALITY; + String order1="", order2="", order3=""; + if (order.startsWith("YBR")) order1 = plasmaSearchQuery.ORDER_YBR; + if (order.startsWith("Date")) order1 = plasmaSearchQuery.ORDER_DATE; + if (order.startsWith("Quality")) order1 = plasmaSearchQuery.ORDER_QUALITY; + if (order.indexOf("-YBR-") > 0) order2 = plasmaSearchQuery.ORDER_YBR; + if (order.indexOf("-Date-") > 0) order2 = plasmaSearchQuery.ORDER_DATE; + if (order.indexOf("-Quality-") > 0) order2 = plasmaSearchQuery.ORDER_QUALITY; + if (order.endsWith("YBR")) order3 = plasmaSearchQuery.ORDER_YBR; + if (order.endsWith("Date")) order3 = plasmaSearchQuery.ORDER_DATE; + if (order.endsWith("Quality")) order3 = plasmaSearchQuery.ORDER_QUALITY; String urlmask = ""; if (post.containsKey("urlmask") && post.get("urlmask").equals("no")) { urlmask = ".*"; @@ -147,7 +160,7 @@ public class index { } // do the search - plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, new String[]{order1, order2}, count, searchtime, urlmask, referer, + plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, new String[]{order1, order2, order3}, count, searchtime, urlmask, referer, ((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL, "", 20); final serverObjects prop = sb.searchFromLocal(thisSearch); @@ -240,8 +253,12 @@ public class index { prop.put("count-50", ((count == 50)) ? 1 : 0); prop.put("count-100", ((count == 100)) ? 1 : 0); prop.put("count-1000", ((count == 1000)) ? 1 : 0); - prop.put("order-quality", ((order.equals("Quality-Date")) ? 1 : 0)); - prop.put("order-date", ((order.equals("Date-Quality")) ? 1 : 0)); + prop.put("order-ybr-date-quality", ((order.equals("YBR-Date-Quality")) ? 1 : 0)); + prop.put("order-ybr-quality-date", ((order.equals("YBR-Quality-Date")) ? 1 : 0)); + prop.put("order-date-ybr-quality", ((order.equals("Date-YBR-Quality")) ? 1 : 0)); + prop.put("order-quality-ybr-date", ((order.equals("Quality-YBR-Date")) ? 1 : 0)); + prop.put("order-date-quality-ybr", ((order.equals("Date-Quality-YBR")) ? 1 : 0)); + prop.put("order-quality-date-ybr", ((order.equals("Quality-Date-YBR")) ? 1 : 0)); prop.put("resource-global", ((global) ? 1 : 0)); prop.put("resource-local", ((global) ? 0 : 1)); prop.put("time-1", ((searchtime == 1000) ? 1 : 0)); diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index 8890453df..7f4c82cf1 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -231,7 +231,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt // load the yellow-list f = switchboard.getConfig("proxyYellowList", null); if (f != null) { - yellowList = serverFileUtils.loadList(f); + yellowList = serverFileUtils.loadList(new File(f)); this.theLogger.logConfig("loaded yellow-list from file " + f + ", " + yellowList.size() + " entries"); } else { yellowList = new HashSet(); diff --git a/source/de/anomic/kelondro/kelondroAttrSeq.java b/source/de/anomic/kelondro/kelondroAttrSeq.java index 94b32e4a7..a1c701643 100644 --- a/source/de/anomic/kelondro/kelondroAttrSeq.java +++ b/source/de/anomic/kelondro/kelondroAttrSeq.java @@ -206,6 +206,10 @@ public class kelondroAttrSeq { entries.put(entry.pivot, entry); } + public void putEntrySmall(Entry entry) { + entries.put(entry.pivot, entry.toString()); + } + public Entry getEntry(String pivot) { Object e = entries.get(pivot); if (e == null) return null; @@ -351,7 +355,7 @@ public class kelondroAttrSeq { int p = attrseq.indexOf('|') + 1; long[] seqattrs = new long[structure.seq_names.length - 1]; String seqname; - while (p < attrseq.length()) { + while (p + structure.seq_len[0] <= attrseq.length()) { seqname = attrseq.substring(p, p + structure.seq_len[0]); p += structure.seq_len[0]; for (int i = 1; i < structure.seq_names.length; i++) { diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index eb85b2fad..dbfd80d05 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -64,6 +64,7 @@ import java.util.Locale; import java.util.Properties; import de.anomic.http.httpc; import de.anomic.kelondro.kelondroTree; +import de.anomic.kelondro.kelondroException; import de.anomic.server.serverCodings; import de.anomic.server.serverObjects; import de.anomic.server.logging.serverLog; @@ -719,17 +720,27 @@ public final class plasmaCrawlLURL extends plasmaURL { public class kiter implements Iterator { // enumerates entry elements kelondroTree.rowIterator i; + boolean error = false; + public kiter(boolean up, boolean rotating) throws IOException { i = urlHashCache.rows(up, rotating); + error = false; } public boolean hasNext() { + if (error) return false; return i.hasNext(); } public Object next() { - byte[] e = ((byte[][])i.next())[0]; - if (e == null) return null; else return new Entry(new String(e)); + try { + byte[] e = ((byte[][])i.next())[0]; + if (e == null) return null; else return new Entry(new String(e)); + } catch (kelondroException e) { + e.printStackTrace(); + error = true; + return null; + } } public void remove() { diff --git a/source/de/anomic/plasma/plasmaRankingCRProcess.java b/source/de/anomic/plasma/plasmaRankingCRProcess.java index 69e52cfd0..5bd81db1c 100644 --- a/source/de/anomic/plasma/plasmaRankingCRProcess.java +++ b/source/de/anomic/plasma/plasmaRankingCRProcess.java @@ -53,6 +53,7 @@ import java.util.Map; import de.anomic.kelondro.kelondroAttrSeq; import de.anomic.server.serverCodings; import de.anomic.server.serverFileUtils; +import de.anomic.server.serverDate; import de.anomic.tools.bitfield; public class plasmaRankingCRProcess { @@ -131,13 +132,13 @@ public class plasmaRankingCRProcess { acc_entry.setAttr("ACount", (long) ACount); acc_entry.setAttr("VCount", (long) VCount); acc_entry.setAttr("Vita", (long) Vita); - acc.putEntry(acc_entry); + acc.putEntrySmall(acc_entry); } return true; } - public static void accumulate(File from_dir, File tmp_dir, File err_dir, File bkp_dir, File to_file) throws IOException { + public static void accumulate(File from_dir, File tmp_dir, File err_dir, File bkp_dir, File to_file, int max_files) throws IOException { if (!(from_dir.isDirectory())) { System.out.println("source path " + from_dir + " is not a directory."); return; @@ -171,7 +172,8 @@ public class plasmaRankingCRProcess { kelondroAttrSeq source_cr = null; File source_file = null; String[] files = from_dir.list(); - for (int i = 0; i < files.length; i++) { + if (files.length < max_files) max_files = files.length; + for (int i = 0; i < max_files; i++) { // open file source_file = new File(from_dir, files[i]); if (accumulate_upd(source_file, acc)) { @@ -206,7 +208,7 @@ public class plasmaRankingCRProcess { public static int genrci(File cr_in, File rci_out) throws IOException { if (!(cr_in.exists())) return 0; final kelondroAttrSeq cr = new kelondroAttrSeq(cr_in, false); - if (rci_out.exists()) rci_out.delete(); // we want only fresh rci here (during testing) + //if (rci_out.exists()) rci_out.delete(); // we want only fresh rci here (during testing) if (!(rci_out.exists())) { kelondroAttrSeq rcix = new kelondroAttrSeq("Global Ranking Reverse Citation Index", ",'='," + @@ -267,7 +269,7 @@ public class plasmaRankingCRProcess { // java -classpath source de.anomic.plasma.kelondroPropFile -transcode DATA/RANKING/GLOBAL/CRG-test-unsorted-original.cr DATA/RANKING/GLOBAL/CRG-test-generated.cr try { if ((args.length == 5) && (args[0].equals("-accumulate"))) { - accumulate(new File(args[1]), new File(args[2]), new File(args[3]), new File(args[4]), new File(args[5])); + accumulate(new File(args[1]), new File(args[2]), new File(args[3]), new File(args[4]), new File(args[5]), Integer.parseInt(args[6])); } if ((args.length == 2) && (args[0].equals("-accumulate"))) { File root_path = new File(args[1]); @@ -276,7 +278,8 @@ public class plasmaRankingCRProcess { File tmp_dir = new File(root_path, "DATA/RANKING/GLOBAL/016_tmp"); File err_dir = new File(root_path, "DATA/RANKING/GLOBAL/017_err"); File acc_dir = new File(root_path, "DATA/RANKING/GLOBAL/018_acc"); - File to_file = new File(root_path, "DATA/RANKING/GLOBAL/020_con0/CRG-a-acc.cr.gz"); + String filename = "CRG-a-" + new serverDate().toShortString(true) + ".cr.gz"; + File to_file = new File(root_path, "DATA/RANKING/GLOBAL/020_con0/" + filename); if (!(ready_dir.exists())) ready_dir.mkdirs(); if (!(tmp_dir.exists())) tmp_dir.mkdirs(); if (!(err_dir.exists())) err_dir.mkdirs(); @@ -285,7 +288,7 @@ public class plasmaRankingCRProcess { serverFileUtils.moveAll(from_dir, ready_dir); long start = System.currentTimeMillis(); int files = ready_dir.list().length; - accumulate(ready_dir, tmp_dir, err_dir, acc_dir, to_file); + accumulate(ready_dir, tmp_dir, err_dir, acc_dir, to_file, 1000); long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000); System.out.println("Finished accumulate for " + files + " files in " + seconds + " seconds (" + (files / seconds) + " files/second)"); } @@ -328,13 +331,16 @@ public class plasmaRankingCRProcess { } if ((args.length == 2) && (args[0].equals("-genrci"))) { File root_path = new File(args[1]); - File cr_file = new File(root_path, "DATA/RANKING/GLOBAL/020_con0/CRG-a-acc.cr.gz"); + File cr_filedir = new File(root_path, "DATA/RANKING/GLOBAL/020_con0"); File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz"); rci_file.getParentFile().mkdirs(); - long start = System.currentTimeMillis(); - int count = genrci(cr_file, rci_file); - long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000); - System.out.println("Finished RCI generation: " + count + " citation references in " + seconds + " seconds (" + (count / seconds) + " CR-records/second)"); + String[] cr_filenames = cr_filedir.list(); + for (int i = 0; i < cr_filenames.length; i++) { + long start = System.currentTimeMillis(); + int count = genrci(new File(cr_filedir, cr_filenames[i]), rci_file); + long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000); + System.out.println("Completed RCI generation for input file " + cr_filenames[i] + ": " + count + " citation references in " + seconds + " seconds (" + (count / seconds) + " CR-records/second)"); + } } } catch (IOException e) { e.printStackTrace(); diff --git a/source/de/anomic/plasma/plasmaRankingRCIEvaluation.java b/source/de/anomic/plasma/plasmaRankingRCIEvaluation.java index 33d9735a1..6523bddd9 100644 --- a/source/de/anomic/plasma/plasmaRankingRCIEvaluation.java +++ b/source/de/anomic/plasma/plasmaRankingRCIEvaluation.java @@ -47,8 +47,11 @@ package de.anomic.plasma; import java.io.File; import java.io.IOException; +import java.net.URL; +import java.net.MalformedURLException; import java.util.Iterator; import java.util.HashMap; +import java.util.HashSet; import de.anomic.kelondro.kelondroAttrSeq; import de.anomic.server.serverCodings; @@ -57,11 +60,9 @@ import de.anomic.tools.bitfield; public class plasmaRankingRCIEvaluation { - public static int[] rcieval(File rci_file) throws IOException { + public static int[] rcieval(kelondroAttrSeq rci) throws IOException { // collect information about which entry has how many references // the output is a reference-count:occurrences relation - if (!(rci_file.exists())) return null; - final kelondroAttrSeq rci = new kelondroAttrSeq(rci_file, false); HashMap counts = new HashMap(); Iterator i = rci.keys(); String key; @@ -102,40 +103,169 @@ public class plasmaRankingRCIEvaluation { public static int[] interval(int[] counts, int parts) { long limit = sum(counts) / 2; - int[] pos = new int[parts]; + int[] partition = new int[parts]; int s = 0, p = parts - 1; - for (int i = 0; i < counts.length; i++) { + for (int i = 1; i < counts.length; i++) { s += counts[i]; if ((s > limit) && (p >= 0)) { - pos[p--] = i - 1; - limit = (2 * limit - s + counts[i]) / 2; - s = counts[i]; + partition[p--] = i; + limit = (2 * limit - s) / 2; + s = 0; } } - pos[0] = counts.length - 1; - return pos; + partition[0] = counts.length - 1; + for (int i = 1; i < 10; i++) partition[i] = (partition[i - 1] + 4 * partition[i]) / 5; + return partition; + } + + /* + public static int[] generateYBRLimits(int[] counts, int[] partition) { + int[] limits = new int[partition.length]; + int min; + int j = 0; + for (int i = partition.length - 1; i >= 0; i--) { + min = counts[j]; + while (j <= partition[i]) { + if (counts[j] < min) min = counts[j]; + j++; + } + limits[i] = min; + } + return limits; + } + */ + + public static void checkPartitionTable0(int[] counts, int[] partition) { + int sumsum = 0; + int sum; + int j = 0; + for (int i = partition.length - 1; i >= 0; i--) { + sum = 0; + while (j <= partition[i]) { + sum += counts[j++]; + } + System.out.println("sum of YBR-" + i + " entries: " + sum); + sumsum += sum; + } + System.out.println("complete sum = " + sumsum); + } + + public static void checkPartitionTable1(int[] counts, int[] partition) { + int sumsum = 0; + int[] sum = new int[partition.length]; + for (int i = 0; i < partition.length; i++) sum[i] = 0; + for (int i = 0; i < counts.length; i++) sum[orderIntoYBI(partition, i)] += counts[i]; + for (int i = partition.length - 1; i >= 0; i--) { + System.out.println("sum of YBR-" + i + " entries: " + sum[i]); + sumsum += sum[i]; + } + System.out.println("complete sum = " + sumsum); + } + + public static int orderIntoYBI(int[] partition, int count) { + for (int i = 0; i < partition.length - 1; i++) { + if ((count >= (partition[i + 1] + 1)) && (count <= partition[i])) return i; + } + return partition.length - 1; + } + + public static HashSet[] genRankingTable(kelondroAttrSeq rci, int[] partition) { + HashSet[] ranked = new HashSet[partition.length]; + for (int i = 0; i < partition.length; i++) ranked[i] = new HashSet(); + Iterator i = rci.keys(); + String key; + kelondroAttrSeq.Entry entry; + while (i.hasNext()) { + key = (String) i.next(); + entry = rci.getEntry(key); + ranked[orderIntoYBI(partition, entry.getSeq().size())].add(key); + } + return ranked; + } + + public static HashMap genReverseDomHash(File domlist) { + HashSet domset = serverFileUtils.loadList(domlist); + HashMap dommap = new HashMap(); + Iterator i = domset.iterator(); + String dom; + while (i.hasNext()) { + dom = (String) i.next(); + if (dom.startsWith("www.")) dom = dom.substring(4); + try { + dommap.put(plasmaURL.urlHash(new URL("http://" + dom)).substring(6), dom); + dommap.put(plasmaURL.urlHash(new URL("http://www." + dom)).substring(6), "www." + dom); + } catch (MalformedURLException e) {} + } + return dommap; + } + + public static void storeRankingTable(HashSet[] ranking, File tablePath) throws IOException { + String hash; + String filename; + if (!(tablePath.exists())) tablePath.mkdirs(); + for (int i = 0; i < ranking.length; i++) { + filename = "YBR-4-" + serverCodings.encodeHex(i, 2) + ".idx"; + serverFileUtils.saveSet(new File(tablePath, filename), ranking[i], ""); + } } public static void main(String[] args) { try { + if ((args.length == 2) && (args[0].equals("-genybr"))) { + File root_path = new File(args[1]); + File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz"); + long start = System.currentTimeMillis(); + if (!(rci_file.exists())) return; + final kelondroAttrSeq rci = new kelondroAttrSeq(rci_file, false); + int counts[] = rcieval(rci); + int[] partition = interval(counts, 16); + HashSet[] ranked = genRankingTable(rci, partition); + storeRankingTable(ranked, new File(root_path, "ranking/YBR")); + long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000); + System.out.println("Finished YBR generation in " + seconds + " seconds."); + } if ((args.length == 2) && (args[0].equals("-rcieval"))) { File root_path = new File(args[1]); File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz"); long start = System.currentTimeMillis(); - int count[] = rcieval(rci_file); + if (!(rci_file.exists())) return; + final kelondroAttrSeq rci = new kelondroAttrSeq(rci_file, false); + int counts[] = rcieval(rci); long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000); - System.out.println("Finished RCI evaluation in " + seconds + " seconds"); + System.out.println("Finished RCI evaluation in " + seconds + " seconds. " + counts.length + " counts in array."); /* System.out.println("count table:"); - for (int i = 0; i < count.length; i++) { - System.out.println(i + " references: " + count[i] + " times"); + for (int i = 0; i < counts.length; i++) { + System.out.println(i + " references: " + counts[i] + " times"); } */ - int[] pos = interval(count, 16); + int[] partition = interval(counts, 16); System.out.println("partition position table:"); - for (int i = 0; i < pos.length; i++) { - System.out.println("position " + i + ": " + pos[i]); + for (int i = 0; i < partition.length - 1; i++) { + System.out.println("YBR-" + i + ": " + (partition[i + 1] + 1) + " - " + partition[i] + " references"); + } + System.out.println("YBR-" + (partition.length - 1) + ": 0 - " + partition[partition.length - 1] + " references"); + checkPartitionTable0(counts, partition); + checkPartitionTable1(counts, partition); + int sum = 0; + for (int i = 0; i < counts.length; i++) sum += counts[i]; + System.out.println("sum of all references: " + sum); + + // now print out the table + HashSet[] ranked = genRankingTable(rci, partition); + HashMap dommap = genReverseDomHash(new File(root_path, "domlist.txt")); + String hash, dom; + for (int i = 0; i < 9; i++) { + System.out.print("YBR-" + i + ": "); + Iterator k = ranked[i].iterator(); + while (k.hasNext()) { + hash = (String) k.next(); + dom = (String) dommap.get(hash); + if (dom == null) System.out.print("[" + hash + "], "); else System.out.print(dom + ", "); + } + System.out.println(); } + } } catch (IOException e) { e.printStackTrace(); diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 7cce5a8b1..63d75051e 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -249,7 +249,8 @@ public final class plasmaSearchEvent { // apply filter profileLocal.startTimer(); - acc.removeRedundant(); + acc.removeDoubleDom(); + //acc.removeRedundant(); profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_FILTER); profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_FILTER, acc.sizeOrdered()); diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java index b4b1bb5ae..afb566450 100644 --- a/source/de/anomic/plasma/plasmaSearchPreOrder.java +++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java @@ -42,16 +42,53 @@ package de.anomic.plasma; +import java.io.File; +import java.io.IOException; +import java.util.Set; import java.util.TreeMap; import java.util.Iterator; import de.anomic.server.serverCodings; +import de.anomic.server.serverFileUtils; public final class plasmaSearchPreOrder { + private static Set[] ybrTables = null; // block-rank tables + private static boolean useYBR = true; + private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry private plasmaSearchQuery query; + public static void loadYBR(File rankingPath, int count) { + // load ranking tables + if (rankingPath.exists()) { + ybrTables = new Set[count]; + String ybrName; + try { + for (int i = 0; i < count; i++) { + ybrName = "YBR-4-" + serverCodings.encodeHex(i, 2) + ".idx"; + ybrTables[i] = serverFileUtils.loadSet(new File(rankingPath, ybrName), 6, false); + } + } catch (IOException e) { + ybrTables = null; + } + } else { + ybrTables = null; + } + } + + public static boolean canUseYBR() { + return ybrTables != null; + } + + public static boolean isUsingYBR() { + return useYBR; + } + + public static void switchYBR(boolean usage) { + useYBR = usage; + } + public plasmaSearchPreOrder(plasmaSearchQuery query) { this.pageAcc = new TreeMap(); this.query = query; @@ -64,7 +101,6 @@ public final class plasmaSearchPreOrder { return theClone; } - public boolean hasNext() { return pageAcc.size() > 0; } @@ -87,12 +123,34 @@ public final class plasmaSearchPreOrder { public void addEntry(plasmaWordIndexEntry indexEntry) { long ranking = 0; - if (query.order[0].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking = 4096 * indexEntry.getQuality(); - else if (query.order[0].equals(plasmaSearchQuery.ORDER_DATE)) ranking = 4096 * indexEntry.getVirtualAge(); - if (query.order[1].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += indexEntry.getQuality(); - else if (query.order[1].equals(plasmaSearchQuery.ORDER_DATE)) ranking += indexEntry.getVirtualAge(); + long factor = 1024 * 1024; + + for (int i = 0; i < 3; i++) { + if (query.order[i].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking = factor * indexEntry.getQuality(); + else if (query.order[i].equals(plasmaSearchQuery.ORDER_DATE)) ranking = factor * indexEntry.getVirtualAge(); + else if (query.order[i].equals(plasmaSearchQuery.ORDER_YBR)) ranking = factor * ybr_p(indexEntry.getUrlHash()); + factor = factor / 1024; + } + pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), indexEntry); } + public static int ybr_p(String urlHash) { + return 16 - ybr(urlHash); + } + + public static int ybr(String urlHash) { + if (ybrTables == null) return 16; + if (!(useYBR)) return 16; + final String domHash = urlHash.substring(6); + for (int i = 0; i < ybrTables.length; i++) { + if (ybrTables[i].contains(domHash)) { + //System.out.println("YBR FOUND: " + urlHash + " (" + i + ")"); + return i; + } + } + //System.out.println("NOT FOUND: " + urlHash); + return 16; + } } diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index f7cf1eb7a..71235c989 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -54,6 +54,7 @@ public final class plasmaSearchQuery { public static final String ORDER_QUALITY = "quality"; public static final String ORDER_DATE = "date"; + public static final String ORDER_YBR = "ybr"; public static final int SEARCHDOM_LOCAL = 0; public static final int SEARCHDOM_GROUPDHT = 1; diff --git a/source/de/anomic/plasma/plasmaSearchResult.java b/source/de/anomic/plasma/plasmaSearchResult.java index c777292cc..24cba50c2 100644 --- a/source/de/anomic/plasma/plasmaSearchResult.java +++ b/source/de/anomic/plasma/plasmaSearchResult.java @@ -111,8 +111,8 @@ public final class plasmaSearchResult { URL url = page.url(); String descr = page.descr(); if ((url == null) || (descr == null)) return; - String[] urlcomps = url.toString().split(splitrex); // word components of the url - String[] descrcomps = descr.split(splitrex); // words in the description + String[] urlcomps = url.toString().toLowerCase().split(splitrex); // word components of the url + String[] descrcomps = descr.toLowerCase().split(splitrex); // words in the description // store everything Object[] resultVector = new Object[] {indexEntry, page, urlcomps, descrcomps}; @@ -137,7 +137,7 @@ public final class plasmaSearchResult { plasmaCrawlLURL.Entry page; String[] urlcomps; String[] descrcomps; - long ranking; + long ranking, factor; String queryhash; for (int i = 0; i < results.size(); i++) { // take out values from result array @@ -149,12 +149,18 @@ public final class plasmaSearchResult { // apply pre-calculated order attributes ranking = 0; - if (query.order[0].equals(plasmaSearchQuery.ORDER_DATE)) ranking += 10 * indexEntry.getVirtualAge(); - //if (query.order[0].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += indexEntry.getQuality(); + factor = 4096L*4096L; + for (int j = 0; j < 3; j++) { + if (query.order[j].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += factor * indexEntry.getQuality() / 64L; + else if (query.order[j].equals(plasmaSearchQuery.ORDER_DATE)) ranking += factor * indexEntry.getVirtualAge() / 64L; + else if (query.order[j].equals(plasmaSearchQuery.ORDER_YBR)) ranking += factor * plasmaSearchPreOrder.ybr_p(indexEntry.getUrlHash()); + factor = factor / 4096L; + } + // apply 'common-sense' heuristic using references - for (int j = 0; j < urlcomps.length; j++) if (commonSense.contains(urlcomps[j])) ranking++; - for (int j = 0; j < descrcomps.length; j++) if (commonSense.contains(descrcomps[j])) ranking++; + for (int j = 0; j < urlcomps.length; j++) if (commonSense.contains(urlcomps[j])) ranking += 10L*4096L*4096L / urlcomps.length; + for (int j = 0; j < descrcomps.length; j++) if (commonSense.contains(descrcomps[j])) ranking += 10L*4096L*4096L / descrcomps.length; // apply query-in-result matching Set urlcomph = plasmaSearchQuery.words2hashes(urlcomps); @@ -162,12 +168,13 @@ public final class plasmaSearchResult { Iterator shi = query.queryHashes.iterator(); while (shi.hasNext()) { queryhash = (String) shi.next(); - if (urlcomph.contains(queryhash)) ranking += 10; - if (descrcomph.contains(queryhash)) ranking += 100; + if (urlcomph.contains(queryhash)) ranking += 90L*4096L*4096L / urlcomps.length / query.queryHashes.size(); + if (descrcomph.contains(queryhash)) ranking += 40L*4096L*4096L / descrcomps.length / query.queryHashes.size(); } + // insert value - //System.out.println("Ranking " + ranking + " for URL " + url.toString()); + //System.out.println("Ranking " + ranking + ", YBR-" + plasmaSearchPreOrder.ybr(indexEntry.getUrlHash()) + " for URL " + page.url()); pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), page); } @@ -175,6 +182,20 @@ public final class plasmaSearchResult { results = null; } + public void removeDoubleDom() { + Iterator i = pageAcc.entrySet().iterator(); + HashSet doms = new HashSet(); + Map.Entry entry; + String dom; + + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + dom = ((plasmaCrawlLURL.Entry) entry.getValue()).url().getHost(); + if (doms.contains(dom)) i.remove(); else doms.add(dom); + } + + } + public void removeRedundant() { // remove all urls from the pageAcc structure that occur double by specific redundancy rules // a link is redundant, if a sub-path of the url is cited before. redundant urls are removed diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 4da2e293c..1fa173723 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -281,6 +281,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser ppRamString(stopwordsFile.length()/1024)); } + // load ranking tables + File rankingPath = new File(rootPath, "ranking/YBR"); + if (rankingPath.exists()) { + plasmaSearchPreOrder.loadYBR(rankingPath, 12); + } + // read memory amount int ramLURL = (int) getConfigLong("ramCacheLURL", 1024) / 1024; int ramNURL = (int) getConfigLong("ramCacheNURL", 1024) / 1024; @@ -1555,14 +1561,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public serverObjects searchFromLocal(plasmaSearchQuery query) { // tell all threads to do nothing for a specific time - //log.logInfo("A"); wordIndex.intermission(2 * query.maximumTime); - //log.logInfo("B"); intermissionAllThreads(2 * query.maximumTime); - //log.logInfo("C"); serverObjects prop = new serverObjects(); - //log.logInfo("D"); try { // filter out words that appear in bluelist //log.logInfo("E"); @@ -1654,6 +1656,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser prop.put("results_" + i + "_urlhash", urlhash); prop.put("results_" + i + "_urlname", nxTools.cutUrlText(urlname, 120)); prop.put("results_" + i + "_date", dateString(urlentry.moddate())); + prop.put("results_" + i + "_ybr", plasmaSearchPreOrder.ybr(urlentry.hash())); prop.put("results_" + i + "_size", Long.toString(urlentry.size())); prop.put("results_" + i + "_words",URLEncoder.encode(query.queryWords.toString(),"UTF-8")); // adding snippet if available diff --git a/source/de/anomic/server/serverFileUtils.java b/source/de/anomic/server/serverFileUtils.java index 67d8f4e4f..8869e7baf 100644 --- a/source/de/anomic/server/serverFileUtils.java +++ b/source/de/anomic/server/serverFileUtils.java @@ -176,11 +176,11 @@ public final class serverFileUtils { copy(new ByteArrayInputStream(source), dest); } - public static HashSet loadList(String filename) { + public static HashSet loadList(File file) { HashSet set = new HashSet(); BufferedReader br = null; try { - br = new BufferedReader(new InputStreamReader(new FileInputStream(filename))); + br = new BufferedReader(new InputStreamReader(new FileInputStream(file))); String line; while ((line = br.readLine()) != null) { line = line.trim();