Added first generation and usage of YBR index files. Enhanced the overall ranking of search results.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1118 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 75c0d12ac5
commit 0e25020f51

@ -1,3 +1,3 @@
cd `dirname $0`/..
java -Xms300m -Xmx900m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -accumulate .
java -Xms300m -Xmx900m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -recycle . 168
java -Xms1500m -Xmx2000m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -accumulate .
java -Xms1500m -Xmx2000m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -recycle . 168

@ -1,2 +1,2 @@
cd `dirname $0`/..
java -server -Xms1400m -Xmx1400m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -genrci .
java -server -Xms2000m -Xmx2000m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -genrci .

@ -0,0 +1,2 @@
cd `dirname $0`/..
java -server -Xms1000m -Xmx1000m -classpath source:classes de.anomic.plasma.plasmaRankingRCIEvaluation -genybr .

@ -26,8 +26,12 @@ Max. number of results:
</select>
&nbsp;&nbsp;order by:
<select NAME="order">
<option value="Quality-Date" #(order-quality)#::selected#(/order-quality)#>Quality-Date</option>
<option value="Date-Quality" #(order-date)#::selected#(/order-date)#>Date-Quality</option>
<option value="YBR-Date-Quality" #(order-ybr-date-quality)#::selected#(/order-ybr-date-quality)#>YBR-Date-Quality</option>
<option value="YBR-Quality-Date" #(order-ybr-quality-date)#::selected#(/order-ybr-quality-date)#>YBR-Quality-Date</option>
<option value="Date-YBR-Quality" #(order-date-ybr-quality)#::selected#(/order-date-ybr-quality)#>Date-YBR-Quality</option>
<option value="Quality-YBR-Date" #(order-quality-ybr-date)#::selected#(/order-quality-ybr-date)#>Quality-YBR-Date</option>
<option value="Date-Quality-YBR" #(order-date-quality-ybr)#::selected#(/order-date-quality-ybr)#>Date-Quality-YBR</option>
<option value="Quality-Date-YBR" #(order-quality-date-ybr)#::selected#(/order-quality-date-ybr)#>Quality-Date-YBR</option>
</select>
</td></tr><tr><td>
Resource:
@ -101,7 +105,7 @@ from 'late' peers.
<p><b>#[description]#</b><br>
#(snippet)#::<i>#[text]#</i><br>#(/snippet)#
<a href="#[url]#">#[urlname]#</a><br>
#[date]# | <a href="ViewFile.html?urlHash=#[urlhash]#&words=#[words]#">Info</a><br></p>
#[date]# | YBR-#[ybr]# | <a href="ViewFile.html?urlHash=#[urlhash]#&words=#[words]#">Info</a><br></p>
<!-- link end -->
#{/results}#

@ -56,6 +56,7 @@ import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchPreOrder;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
import de.anomic.server.serverObjects;
@ -103,8 +104,12 @@ public class index {
prop.put("count-50", 0);
prop.put("count-100", 0);
prop.put("count-1000", 0);
prop.put("order-quality", 0);
prop.put("order-date", 0);
prop.put("order-ybr-date-quality", plasmaSearchPreOrder.canUseYBR() ? 1 : 0);
prop.put("order-ybr-quality-date", 0);
prop.put("order-date-ybr-quality", 0);
prop.put("order-quality-ybr-date", 0);
prop.put("order-date-quality-ybr", plasmaSearchPreOrder.canUseYBR() ? 0 : 1);
prop.put("order-quality-date-ybr", 0);
prop.put("resource-global", ((global) ? 1 : 0));
prop.put("resource-local", ((global) ? 0 : 1));
prop.put("time-1", 0);
@ -137,8 +142,16 @@ public class index {
(yacyCore.seedDB.mySeed != null) &&
(yacyCore.seedDB.mySeed.getAddress() != null));
final String order1 = (order.equals("Quality-Date")) ? plasmaSearchQuery.ORDER_QUALITY : plasmaSearchQuery.ORDER_DATE;
final String order2 = (order.equals("Quality-Date")) ? plasmaSearchQuery.ORDER_DATE : plasmaSearchQuery.ORDER_QUALITY;
String order1="", order2="", order3="";
if (order.startsWith("YBR")) order1 = plasmaSearchQuery.ORDER_YBR;
if (order.startsWith("Date")) order1 = plasmaSearchQuery.ORDER_DATE;
if (order.startsWith("Quality")) order1 = plasmaSearchQuery.ORDER_QUALITY;
if (order.indexOf("-YBR-") > 0) order2 = plasmaSearchQuery.ORDER_YBR;
if (order.indexOf("-Date-") > 0) order2 = plasmaSearchQuery.ORDER_DATE;
if (order.indexOf("-Quality-") > 0) order2 = plasmaSearchQuery.ORDER_QUALITY;
if (order.endsWith("YBR")) order3 = plasmaSearchQuery.ORDER_YBR;
if (order.endsWith("Date")) order3 = plasmaSearchQuery.ORDER_DATE;
if (order.endsWith("Quality")) order3 = plasmaSearchQuery.ORDER_QUALITY;
String urlmask = "";
if (post.containsKey("urlmask") && post.get("urlmask").equals("no")) {
urlmask = ".*";
@ -147,7 +160,7 @@ public class index {
}
// do the search
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, new String[]{order1, order2}, count, searchtime, urlmask, referer,
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, new String[]{order1, order2, order3}, count, searchtime, urlmask, referer,
((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL,
"", 20);
final serverObjects prop = sb.searchFromLocal(thisSearch);
@ -240,8 +253,12 @@ public class index {
prop.put("count-50", ((count == 50)) ? 1 : 0);
prop.put("count-100", ((count == 100)) ? 1 : 0);
prop.put("count-1000", ((count == 1000)) ? 1 : 0);
prop.put("order-quality", ((order.equals("Quality-Date")) ? 1 : 0));
prop.put("order-date", ((order.equals("Date-Quality")) ? 1 : 0));
prop.put("order-ybr-date-quality", ((order.equals("YBR-Date-Quality")) ? 1 : 0));
prop.put("order-ybr-quality-date", ((order.equals("YBR-Quality-Date")) ? 1 : 0));
prop.put("order-date-ybr-quality", ((order.equals("Date-YBR-Quality")) ? 1 : 0));
prop.put("order-quality-ybr-date", ((order.equals("Quality-YBR-Date")) ? 1 : 0));
prop.put("order-date-quality-ybr", ((order.equals("Date-Quality-YBR")) ? 1 : 0));
prop.put("order-quality-date-ybr", ((order.equals("Quality-Date-YBR")) ? 1 : 0));
prop.put("resource-global", ((global) ? 1 : 0));
prop.put("resource-local", ((global) ? 0 : 1));
prop.put("time-1", ((searchtime == 1000) ? 1 : 0));

@ -231,7 +231,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
// load the yellow-list
f = switchboard.getConfig("proxyYellowList", null);
if (f != null) {
yellowList = serverFileUtils.loadList(f);
yellowList = serverFileUtils.loadList(new File(f));
this.theLogger.logConfig("loaded yellow-list from file " + f + ", " + yellowList.size() + " entries");
} else {
yellowList = new HashSet();

@ -206,6 +206,10 @@ public class kelondroAttrSeq {
entries.put(entry.pivot, entry);
}
/**
 * Registers the given entry under its pivot, but stores the entry's
 * string form (entry.toString()) instead of the Entry object itself.
 * NOTE(review): presumably this is meant to reduce the memory footprint
 * of large sequences - confirm against getEntry(), which must then be able
 * to handle String values.
 */
public void putEntrySmall(Entry entry) {
    final String serialized = entry.toString();
    entries.put(entry.pivot, serialized);
}
public Entry getEntry(String pivot) {
Object e = entries.get(pivot);
if (e == null) return null;
@ -351,7 +355,7 @@ public class kelondroAttrSeq {
int p = attrseq.indexOf('|') + 1;
long[] seqattrs = new long[structure.seq_names.length - 1];
String seqname;
while (p < attrseq.length()) {
while (p + structure.seq_len[0] <= attrseq.length()) {
seqname = attrseq.substring(p, p + structure.seq_len[0]);
p += structure.seq_len[0];
for (int i = 1; i < structure.seq_names.length; i++) {

@ -64,6 +64,7 @@ import java.util.Locale;
import java.util.Properties;
import de.anomic.http.httpc;
import de.anomic.kelondro.kelondroTree;
import de.anomic.kelondro.kelondroException;
import de.anomic.server.serverCodings;
import de.anomic.server.serverObjects;
import de.anomic.server.logging.serverLog;
@ -719,17 +720,27 @@ public final class plasmaCrawlLURL extends plasmaURL {
public class kiter implements Iterator {
// enumerates entry elements
kelondroTree.rowIterator i;
boolean error = false;
public kiter(boolean up, boolean rotating) throws IOException {
i = urlHashCache.rows(up, rotating);
error = false;
}
public boolean hasNext() {
if (error) return false;
return i.hasNext();
}
public Object next() {
byte[] e = ((byte[][])i.next())[0];
if (e == null) return null; else return new Entry(new String(e));
try {
byte[] e = ((byte[][])i.next())[0];
if (e == null) return null; else return new Entry(new String(e));
} catch (kelondroException e) {
e.printStackTrace();
error = true;
return null;
}
}
public void remove() {

@ -53,6 +53,7 @@ import java.util.Map;
import de.anomic.kelondro.kelondroAttrSeq;
import de.anomic.server.serverCodings;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverDate;
import de.anomic.tools.bitfield;
public class plasmaRankingCRProcess {
@ -131,13 +132,13 @@ public class plasmaRankingCRProcess {
acc_entry.setAttr("ACount", (long) ACount);
acc_entry.setAttr("VCount", (long) VCount);
acc_entry.setAttr("Vita", (long) Vita);
acc.putEntry(acc_entry);
acc.putEntrySmall(acc_entry);
}
return true;
}
public static void accumulate(File from_dir, File tmp_dir, File err_dir, File bkp_dir, File to_file) throws IOException {
public static void accumulate(File from_dir, File tmp_dir, File err_dir, File bkp_dir, File to_file, int max_files) throws IOException {
if (!(from_dir.isDirectory())) {
System.out.println("source path " + from_dir + " is not a directory.");
return;
@ -171,7 +172,8 @@ public class plasmaRankingCRProcess {
kelondroAttrSeq source_cr = null;
File source_file = null;
String[] files = from_dir.list();
for (int i = 0; i < files.length; i++) {
if (files.length < max_files) max_files = files.length;
for (int i = 0; i < max_files; i++) {
// open file
source_file = new File(from_dir, files[i]);
if (accumulate_upd(source_file, acc)) {
@ -206,7 +208,7 @@ public class plasmaRankingCRProcess {
public static int genrci(File cr_in, File rci_out) throws IOException {
if (!(cr_in.exists())) return 0;
final kelondroAttrSeq cr = new kelondroAttrSeq(cr_in, false);
if (rci_out.exists()) rci_out.delete(); // we want only fresh rci here (during testing)
//if (rci_out.exists()) rci_out.delete(); // we want only fresh rci here (during testing)
if (!(rci_out.exists())) {
kelondroAttrSeq rcix = new kelondroAttrSeq("Global Ranking Reverse Citation Index",
"<AnchorDom-6>,'='," +
@ -267,7 +269,7 @@ public class plasmaRankingCRProcess {
// java -classpath source de.anomic.plasma.kelondroPropFile -transcode DATA/RANKING/GLOBAL/CRG-test-unsorted-original.cr DATA/RANKING/GLOBAL/CRG-test-generated.cr
try {
if ((args.length == 5) && (args[0].equals("-accumulate"))) {
accumulate(new File(args[1]), new File(args[2]), new File(args[3]), new File(args[4]), new File(args[5]));
accumulate(new File(args[1]), new File(args[2]), new File(args[3]), new File(args[4]), new File(args[5]), Integer.parseInt(args[6]));
}
if ((args.length == 2) && (args[0].equals("-accumulate"))) {
File root_path = new File(args[1]);
@ -276,7 +278,8 @@ public class plasmaRankingCRProcess {
File tmp_dir = new File(root_path, "DATA/RANKING/GLOBAL/016_tmp");
File err_dir = new File(root_path, "DATA/RANKING/GLOBAL/017_err");
File acc_dir = new File(root_path, "DATA/RANKING/GLOBAL/018_acc");
File to_file = new File(root_path, "DATA/RANKING/GLOBAL/020_con0/CRG-a-acc.cr.gz");
String filename = "CRG-a-" + new serverDate().toShortString(true) + ".cr.gz";
File to_file = new File(root_path, "DATA/RANKING/GLOBAL/020_con0/" + filename);
if (!(ready_dir.exists())) ready_dir.mkdirs();
if (!(tmp_dir.exists())) tmp_dir.mkdirs();
if (!(err_dir.exists())) err_dir.mkdirs();
@ -285,7 +288,7 @@ public class plasmaRankingCRProcess {
serverFileUtils.moveAll(from_dir, ready_dir);
long start = System.currentTimeMillis();
int files = ready_dir.list().length;
accumulate(ready_dir, tmp_dir, err_dir, acc_dir, to_file);
accumulate(ready_dir, tmp_dir, err_dir, acc_dir, to_file, 1000);
long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
System.out.println("Finished accumulate for " + files + " files in " + seconds + " seconds (" + (files / seconds) + " files/second)");
}
@ -328,13 +331,16 @@ public class plasmaRankingCRProcess {
}
if ((args.length == 2) && (args[0].equals("-genrci"))) {
File root_path = new File(args[1]);
File cr_file = new File(root_path, "DATA/RANKING/GLOBAL/020_con0/CRG-a-acc.cr.gz");
File cr_filedir = new File(root_path, "DATA/RANKING/GLOBAL/020_con0");
File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz");
rci_file.getParentFile().mkdirs();
long start = System.currentTimeMillis();
int count = genrci(cr_file, rci_file);
long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
System.out.println("Finished RCI generation: " + count + " citation references in " + seconds + " seconds (" + (count / seconds) + " CR-records/second)");
String[] cr_filenames = cr_filedir.list();
for (int i = 0; i < cr_filenames.length; i++) {
long start = System.currentTimeMillis();
int count = genrci(new File(cr_filedir, cr_filenames[i]), rci_file);
long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
System.out.println("Completed RCI generation for input file " + cr_filenames[i] + ": " + count + " citation references in " + seconds + " seconds (" + (count / seconds) + " CR-records/second)");
}
}
} catch (IOException e) {
e.printStackTrace();

@ -47,8 +47,11 @@ package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.net.MalformedURLException;
import java.util.Iterator;
import java.util.HashMap;
import java.util.HashSet;
import de.anomic.kelondro.kelondroAttrSeq;
import de.anomic.server.serverCodings;
@ -57,11 +60,9 @@ import de.anomic.tools.bitfield;
public class plasmaRankingRCIEvaluation {
public static int[] rcieval(File rci_file) throws IOException {
public static int[] rcieval(kelondroAttrSeq rci) throws IOException {
// collect information about which entry has how many references
// the output is a reference-count:occurrences relation
if (!(rci_file.exists())) return null;
final kelondroAttrSeq rci = new kelondroAttrSeq(rci_file, false);
HashMap counts = new HashMap();
Iterator i = rci.keys();
String key;
@ -102,40 +103,169 @@ public class plasmaRankingRCIEvaluation {
public static int[] interval(int[] counts, int parts) {
long limit = sum(counts) / 2;
int[] pos = new int[parts];
int[] partition = new int[parts];
int s = 0, p = parts - 1;
for (int i = 0; i < counts.length; i++) {
for (int i = 1; i < counts.length; i++) {
s += counts[i];
if ((s > limit) && (p >= 0)) {
pos[p--] = i - 1;
limit = (2 * limit - s + counts[i]) / 2;
s = counts[i];
partition[p--] = i;
limit = (2 * limit - s) / 2;
s = 0;
}
}
pos[0] = counts.length - 1;
return pos;
partition[0] = counts.length - 1;
for (int i = 1; i < 10; i++) partition[i] = (partition[i - 1] + 4 * partition[i]) / 5;
return partition;
}
/*
public static int[] generateYBRLimits(int[] counts, int[] partition) {
int[] limits = new int[partition.length];
int min;
int j = 0;
for (int i = partition.length - 1; i >= 0; i--) {
min = counts[j];
while (j <= partition[i]) {
if (counts[j] < min) min = counts[j];
j++;
}
limits[i] = min;
}
return limits;
}
*/
/**
 * Diagnostic output: walks the partition table from its last slot down to
 * slot 0 and prints, for every slot, the sum of all counts entries up to and
 * including that slot's bound that were not consumed by an earlier slot.
 * The grand total is printed at the end.
 *
 * @param counts    table of occurrence counts
 * @param partition upper index bounds, one per YBR class
 */
public static void checkPartitionTable0(int[] counts, int[] partition) {
    int total = 0;
    int cursor = 0; // shared by all slots: each slot continues where the previous one stopped
    for (int ybr = partition.length - 1; ybr >= 0; ybr--) {
        int classSum = 0;
        for (; cursor <= partition[ybr]; cursor++) {
            classSum += counts[cursor];
        }
        System.out.println("sum of YBR-" + ybr + " entries: " + classSum);
        total += classSum;
    }
    System.out.println("complete sum = " + total);
}
/**
 * Diagnostic output: assigns every index of the counts table to a YBR class
 * via orderIntoYBI, adds up the counts per class and prints the per-class
 * sums (highest class index first) followed by the grand total.
 *
 * @param counts    table of occurrence counts
 * @param partition upper index bounds, one per YBR class
 */
public static void checkPartitionTable1(int[] counts, int[] partition) {
    final int[] classSums = new int[partition.length]; // starts out all zero
    for (int refs = 0; refs < counts.length; refs++) {
        classSums[orderIntoYBI(partition, refs)] += counts[refs];
    }
    int total = 0;
    for (int ybr = partition.length - 1; ybr >= 0; ybr--) {
        System.out.println("sum of YBR-" + ybr + " entries: " + classSums[ybr]);
        total += classSums[ybr];
    }
    System.out.println("complete sum = " + total);
}
/**
 * Maps a count to the index of the partition slot it falls into.
 * Slot i (for every slot but the last) covers the closed range
 * [partition[i + 1] + 1, partition[i]]; any count not covered by one of
 * these ranges is assigned to the last slot.
 *
 * @param partition upper bounds per slot
 * @param count     the value to classify
 * @return the matching slot index, or partition.length - 1 if none matches
 */
public static int orderIntoYBI(int[] partition, int count) {
    final int last = partition.length - 1;
    int slot = 0;
    while (slot < last) {
        final int lower = partition[slot + 1] + 1;
        final int upper = partition[slot];
        if ((lower <= count) && (count <= upper)) return slot;
        slot++;
    }
    return last;
}
/**
 * Distributes all keys of the given sequence file over one set per partition
 * slot. The slot of a key is chosen by orderIntoYBI from the size of the
 * key's sequence.
 *
 * @param rci       the sequence structure whose keys are ranked
 * @param partition upper bounds per slot
 * @return an array with one HashSet of keys per slot
 */
public static HashSet[] genRankingTable(kelondroAttrSeq rci, int[] partition) {
    final HashSet[] ranked = new HashSet[partition.length];
    for (int slot = 0; slot < ranked.length; slot++) ranked[slot] = new HashSet();
    for (Iterator keys = rci.keys(); keys.hasNext();) {
        final String key = (String) keys.next();
        final int seqSize = rci.getEntry(key).getSeq().size();
        ranked[orderIntoYBI(partition, seqSize)].add(key);
    }
    return ranked;
}
/**
 * Builds a lookup table from hash suffix to domain name out of a list file.
 * For every line of the list (a leading "www." is stripped first) two
 * mappings are added: one for "http://dom" and one for "http://www.dom".
 * The map key is the url hash with its first six characters cut off.
 *
 * @param domlist file with one domain per line
 * @return map of hash suffix to domain name
 */
public static HashMap genReverseDomHash(File domlist) {
    final HashMap dommap = new HashMap();
    for (Iterator domains = serverFileUtils.loadList(domlist).iterator(); domains.hasNext();) {
        String dom = (String) domains.next();
        if (dom.startsWith("www.")) {
            dom = dom.substring(4);
        }
        final String wwwDom = "www." + dom;
        try {
            dommap.put(plasmaURL.urlHash(new URL("http://" + dom)).substring(6), dom);
            dommap.put(plasmaURL.urlHash(new URL("http://" + wwwDom)).substring(6), wwwDom);
        } catch (MalformedURLException ignored) {
            // best effort: a list line that does not form a valid URL is left out
        }
    }
    return dommap;
}
/**
 * Writes every set of the ranking table into its own file below tablePath.
 * The file for slot i is named "YBR-4-" + two hex digits of i + ".idx".
 * The target directory is created if it does not exist yet.
 *
 * @param ranking   one set per slot
 * @param tablePath directory that receives the index files
 * @throws IOException if writing one of the files fails
 */
public static void storeRankingTable(HashSet[] ranking, File tablePath) throws IOException {
    if (!tablePath.exists()) tablePath.mkdirs();
    int slot = 0;
    while (slot < ranking.length) {
        final File target = new File(tablePath, "YBR-4-" + serverCodings.encodeHex(slot, 2) + ".idx");
        serverFileUtils.saveSet(target, ranking[slot], "");
        slot++;
    }
}
public static void main(String[] args) {
try {
if ((args.length == 2) && (args[0].equals("-genybr"))) {
File root_path = new File(args[1]);
File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz");
long start = System.currentTimeMillis();
if (!(rci_file.exists())) return;
final kelondroAttrSeq rci = new kelondroAttrSeq(rci_file, false);
int counts[] = rcieval(rci);
int[] partition = interval(counts, 16);
HashSet[] ranked = genRankingTable(rci, partition);
storeRankingTable(ranked, new File(root_path, "ranking/YBR"));
long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
System.out.println("Finished YBR generation in " + seconds + " seconds.");
}
if ((args.length == 2) && (args[0].equals("-rcieval"))) {
File root_path = new File(args[1]);
File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz");
long start = System.currentTimeMillis();
int count[] = rcieval(rci_file);
if (!(rci_file.exists())) return;
final kelondroAttrSeq rci = new kelondroAttrSeq(rci_file, false);
int counts[] = rcieval(rci);
long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
System.out.println("Finished RCI evaluation in " + seconds + " seconds");
System.out.println("Finished RCI evaluation in " + seconds + " seconds. " + counts.length + " counts in array.");
/*
System.out.println("count table:");
for (int i = 0; i < count.length; i++) {
System.out.println(i + " references: " + count[i] + " times");
for (int i = 0; i < counts.length; i++) {
System.out.println(i + " references: " + counts[i] + " times");
}
*/
int[] pos = interval(count, 16);
int[] partition = interval(counts, 16);
System.out.println("partition position table:");
for (int i = 0; i < pos.length; i++) {
System.out.println("position " + i + ": " + pos[i]);
for (int i = 0; i < partition.length - 1; i++) {
System.out.println("YBR-" + i + ": " + (partition[i + 1] + 1) + " - " + partition[i] + " references");
}
System.out.println("YBR-" + (partition.length - 1) + ": 0 - " + partition[partition.length - 1] + " references");
checkPartitionTable0(counts, partition);
checkPartitionTable1(counts, partition);
int sum = 0;
for (int i = 0; i < counts.length; i++) sum += counts[i];
System.out.println("sum of all references: " + sum);
// now print out the table
HashSet[] ranked = genRankingTable(rci, partition);
HashMap dommap = genReverseDomHash(new File(root_path, "domlist.txt"));
String hash, dom;
for (int i = 0; i < 9; i++) {
System.out.print("YBR-" + i + ": ");
Iterator k = ranked[i].iterator();
while (k.hasNext()) {
hash = (String) k.next();
dom = (String) dommap.get(hash);
if (dom == null) System.out.print("[" + hash + "], "); else System.out.print(dom + ", ");
}
System.out.println();
}
}
} catch (IOException e) {
e.printStackTrace();

@ -249,7 +249,8 @@ public final class plasmaSearchEvent {
// apply filter
profileLocal.startTimer();
acc.removeRedundant();
acc.removeDoubleDom();
//acc.removeRedundant();
profileLocal.setYieldTime(plasmaSearchProfile.PROCESS_FILTER);
profileLocal.setYieldCount(plasmaSearchProfile.PROCESS_FILTER, acc.sizeOrdered());

@ -42,16 +42,53 @@
package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.util.Set;
import java.util.TreeMap;
import java.util.Iterator;
import de.anomic.server.serverCodings;
import de.anomic.server.serverFileUtils;
public final class plasmaSearchPreOrder {
private static Set[] ybrTables = null; // block-rank tables
private static boolean useYBR = true;
private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
private plasmaSearchQuery query;
/**
 * Loads the block-rank tables "YBR-4-00.idx" ... from the given directory
 * into the static table array.
 *
 * The tables are read into a local array first and published in a single
 * assignment once all of them are loaded. This way canUseYBR() never reports
 * true and ybr() never iterates while some slots are still null.
 * If the directory does not exist, or any table cannot be read, the tables
 * are reset to null, which disables YBR ranking (see canUseYBR()).
 *
 * @param rankingPath directory containing the YBR index files
 * @param count       number of tables to load (files 0 .. count - 1)
 */
public static void loadYBR(File rankingPath, int count) {
    if (!rankingPath.exists()) {
        ybrTables = null;
        return;
    }
    final Set[] loaded = new Set[count];
    try {
        for (int i = 0; i < count; i++) {
            final String ybrName = "YBR-4-" + serverCodings.encodeHex(i, 2) + ".idx";
            // NOTE(review): 6 is presumably the length of the domain-hash keys - confirm against serverFileUtils.loadSet
            loaded[i] = serverFileUtils.loadSet(new File(rankingPath, ybrName), 6, false);
        }
    } catch (IOException e) {
        // an incomplete set of tables is unusable; fall back to "no YBR available"
        ybrTables = null;
        return;
    }
    ybrTables = loaded;
}
// Reports whether block-rank tables are available, i.e. whether the static
// ybrTables array is currently non-null.
public static boolean canUseYBR() {
return ybrTables != null;
}
// Returns the current value of the static useYBR switch.
public static boolean isUsingYBR() {
return useYBR;
}
// Sets the static useYBR switch to the given value.
public static void switchYBR(boolean usage) {
useYBR = usage;
}
public plasmaSearchPreOrder(plasmaSearchQuery query) {
this.pageAcc = new TreeMap();
this.query = query;
@ -64,7 +101,6 @@ public final class plasmaSearchPreOrder {
return theClone;
}
/**
 * @return true as long as the accumulator still holds at least one entry
 */
public boolean hasNext() {
    return !pageAcc.isEmpty();
}
@ -87,12 +123,34 @@ public final class plasmaSearchPreOrder {
/**
 * Adds an index entry to the accumulator under a sortable key.
 *
 * The key is the 16-digit hex form of a ranking value followed by the
 * entry's url hash. The ranking is the weighted sum of up to three order
 * criteria taken from query.order: the first criterion is weighted with
 * 1024*1024, the second with 1024 and the third with 1. Criteria that are
 * none of ORDER_QUALITY, ORDER_DATE or ORDER_YBR contribute nothing.
 *
 * @param indexEntry the word index entry to insert
 */
public void addEntry(plasmaWordIndexEntry indexEntry) {
    final String urlHash = indexEntry.getUrlHash();
    long ranking = 0;
    long factor = 1024 * 1024;
    // tolerate queries that carry fewer than three order criteria
    final int criteria = Math.min(3, query.order.length);
    for (int i = 0; i < criteria; i++) {
        // accumulate: every criterion must contribute with its own weight,
        // a plain assignment would let the last criterion overwrite the others
        if (query.order[i].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += factor * indexEntry.getQuality();
        else if (query.order[i].equals(plasmaSearchQuery.ORDER_DATE)) ranking += factor * indexEntry.getVirtualAge();
        else if (query.order[i].equals(plasmaSearchQuery.ORDER_YBR)) ranking += factor * ybr_p(urlHash);
        factor = factor / 1024;
    }
    pageAcc.put(serverCodings.encodeHex(ranking, 16) + urlHash, indexEntry);
}
/**
 * Inverted block rank: 16 minus the value of ybr(urlHash), so that a lower
 * ybr() result yields a higher number.
 */
public static int ybr_p(String urlHash) {
    final int rank = ybr(urlHash);
    return 16 - rank;
}
/**
 * Looks up the block rank of a url.
 * The part of the url hash after its first six characters is searched in the
 * loaded tables in ascending table order; the index of the first table that
 * contains it is the rank.
 *
 * @param urlHash the url hash
 * @return the table index of the first hit, or 16 if no tables are loaded,
 *         YBR usage is switched off, or no table contains the hash
 */
public static int ybr(String urlHash) {
    if ((ybrTables == null) || !useYBR) return 16;
    final String domHash = urlHash.substring(6);
    int rank = 0;
    while (rank < ybrTables.length) {
        if (ybrTables[rank].contains(domHash)) return rank;
        rank++;
    }
    return 16;
}
}

@ -54,6 +54,7 @@ public final class plasmaSearchQuery {
public static final String ORDER_QUALITY = "quality";
public static final String ORDER_DATE = "date";
public static final String ORDER_YBR = "ybr";
public static final int SEARCHDOM_LOCAL = 0;
public static final int SEARCHDOM_GROUPDHT = 1;

@ -111,8 +111,8 @@ public final class plasmaSearchResult {
URL url = page.url();
String descr = page.descr();
if ((url == null) || (descr == null)) return;
String[] urlcomps = url.toString().split(splitrex); // word components of the url
String[] descrcomps = descr.split(splitrex); // words in the description
String[] urlcomps = url.toString().toLowerCase().split(splitrex); // word components of the url
String[] descrcomps = descr.toLowerCase().split(splitrex); // words in the description
// store everything
Object[] resultVector = new Object[] {indexEntry, page, urlcomps, descrcomps};
@ -137,7 +137,7 @@ public final class plasmaSearchResult {
plasmaCrawlLURL.Entry page;
String[] urlcomps;
String[] descrcomps;
long ranking;
long ranking, factor;
String queryhash;
for (int i = 0; i < results.size(); i++) {
// take out values from result array
@ -149,12 +149,18 @@ public final class plasmaSearchResult {
// apply pre-calculated order attributes
ranking = 0;
if (query.order[0].equals(plasmaSearchQuery.ORDER_DATE)) ranking += 10 * indexEntry.getVirtualAge();
//if (query.order[0].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += indexEntry.getQuality();
factor = 4096L*4096L;
for (int j = 0; j < 3; j++) {
if (query.order[j].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += factor * indexEntry.getQuality() / 64L;
else if (query.order[j].equals(plasmaSearchQuery.ORDER_DATE)) ranking += factor * indexEntry.getVirtualAge() / 64L;
else if (query.order[j].equals(plasmaSearchQuery.ORDER_YBR)) ranking += factor * plasmaSearchPreOrder.ybr_p(indexEntry.getUrlHash());
factor = factor / 4096L;
}
// apply 'common-sense' heuristic using references
for (int j = 0; j < urlcomps.length; j++) if (commonSense.contains(urlcomps[j])) ranking++;
for (int j = 0; j < descrcomps.length; j++) if (commonSense.contains(descrcomps[j])) ranking++;
for (int j = 0; j < urlcomps.length; j++) if (commonSense.contains(urlcomps[j])) ranking += 10L*4096L*4096L / urlcomps.length;
for (int j = 0; j < descrcomps.length; j++) if (commonSense.contains(descrcomps[j])) ranking += 10L*4096L*4096L / descrcomps.length;
// apply query-in-result matching
Set urlcomph = plasmaSearchQuery.words2hashes(urlcomps);
@ -162,12 +168,13 @@ public final class plasmaSearchResult {
Iterator shi = query.queryHashes.iterator();
while (shi.hasNext()) {
queryhash = (String) shi.next();
if (urlcomph.contains(queryhash)) ranking += 10;
if (descrcomph.contains(queryhash)) ranking += 100;
if (urlcomph.contains(queryhash)) ranking += 90L*4096L*4096L / urlcomps.length / query.queryHashes.size();
if (descrcomph.contains(queryhash)) ranking += 40L*4096L*4096L / descrcomps.length / query.queryHashes.size();
}
// insert value
//System.out.println("Ranking " + ranking + " for URL " + url.toString());
//System.out.println("Ranking " + ranking + ", YBR-" + plasmaSearchPreOrder.ybr(indexEntry.getUrlHash()) + " for URL " + page.url());
pageAcc.put(serverCodings.encodeHex(ranking, 16) + indexEntry.getUrlHash(), page);
}
@ -175,6 +182,20 @@ public final class plasmaSearchResult {
results = null;
}
/**
 * Removes entries from pageAcc so that every host occurs only once.
 * Entries are visited in the iteration order of pageAcc; the first entry
 * seen for a host is kept, all later ones with the same host are removed.
 * NOTE(review): if pageAcc iterates in ascending key order and results are
 * consumed from the other end, the entry kept per host would be the one
 * with the smallest key - confirm this is intended.
 */
public void removeDoubleDom() {
    final HashSet seenHosts = new HashSet();
    for (Iterator entries = pageAcc.entrySet().iterator(); entries.hasNext();) {
        final Map.Entry entry = (Map.Entry) entries.next();
        final String host = ((plasmaCrawlLURL.Entry) entry.getValue()).url().getHost();
        // add() answers false when the host was already recorded
        if (!seenHosts.add(host)) entries.remove();
    }
}
public void removeRedundant() {
// remove all urls from the pageAcc structure that occur double by specific redundancy rules
// a link is redundant, if a sub-path of the url is cited before. redundant urls are removed

@ -281,6 +281,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
ppRamString(stopwordsFile.length()/1024));
}
// load ranking tables
File rankingPath = new File(rootPath, "ranking/YBR");
if (rankingPath.exists()) {
plasmaSearchPreOrder.loadYBR(rankingPath, 12);
}
// read memory amount
int ramLURL = (int) getConfigLong("ramCacheLURL", 1024) / 1024;
int ramNURL = (int) getConfigLong("ramCacheNURL", 1024) / 1024;
@ -1555,14 +1561,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public serverObjects searchFromLocal(plasmaSearchQuery query) {
// tell all threads to do nothing for a specific time
//log.logInfo("A");
wordIndex.intermission(2 * query.maximumTime);
//log.logInfo("B");
intermissionAllThreads(2 * query.maximumTime);
//log.logInfo("C");
serverObjects prop = new serverObjects();
//log.logInfo("D");
try {
// filter out words that appear in bluelist
//log.logInfo("E");
@ -1654,6 +1656,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
prop.put("results_" + i + "_urlhash", urlhash);
prop.put("results_" + i + "_urlname", nxTools.cutUrlText(urlname, 120));
prop.put("results_" + i + "_date", dateString(urlentry.moddate()));
prop.put("results_" + i + "_ybr", plasmaSearchPreOrder.ybr(urlentry.hash()));
prop.put("results_" + i + "_size", Long.toString(urlentry.size()));
prop.put("results_" + i + "_words",URLEncoder.encode(query.queryWords.toString(),"UTF-8"));
// adding snippet if available

@ -176,11 +176,11 @@ public final class serverFileUtils {
copy(new ByteArrayInputStream(source), dest);
}
public static HashSet loadList(String filename) {
public static HashSet loadList(File file) {
HashSet set = new HashSet();
BufferedReader br = null;
try {
br = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
br = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
String line;
while ((line = br.readLine()) != null) {
line = line.trim();

Loading…
Cancel
Save