enhancements in ranking preparation and fixed problem with parser/mime recognition

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1132 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 38d915e24c
commit 40621a5663

@ -1,2 +1,2 @@
cd `dirname $0`/..
java -server -Xms2000m -Xmx2000m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -genrci .
java -server -Xms2100m -Xmx2100m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -genrci .

@ -77,7 +77,7 @@ public class kelondroAttrSeq {
public kelondroAttrSeq(File file, boolean tree) throws IOException {
this.file = file;
this.structure = null;
this.created = 0;
this.created = -1;
this.name = "";
this.entries = (tree) ? (Map) new TreeMap() : (Map) new HashMap();
readAttrFile(file);
@ -144,6 +144,9 @@ public class kelondroAttrSeq {
}
}
br.close();
if (structure == null) throw new IOException("file contains no structure tag");
if (name == null) throw new IOException("file contains no name tag");
if (created == -1) throw new IOException("file contains no created tag");
}
public int size() {

@ -77,6 +77,16 @@ public class kelondroBinSearch {
if (c > 0) /* buffer > t */ return contains(t, beginPos, pivot);
return false;
}
/**
 * Reports the current value of the {@code count} field.
 * NOTE(review): presumably the number of chunks held by this table;
 * confirm against the code that maintains {@code count}.
 *
 * @return the value of {@code count}
 */
public int size() {
    return this.count;
}
/**
 * Returns a fresh copy of one chunk from the backing {@code chunks} array.
 * The chunk starts at offset {@code element * chunksize} and is
 * {@code chunksize} bytes long; the returned array is newly allocated, so
 * modifying it does not touch the backing array.
 *
 * @param element zero-based index of the chunk to copy
 * @return a new array of length {@code chunksize} holding the chunk's bytes
 * @throws IndexOutOfBoundsException if the addressed range lies outside the
 *         backing array (raised by {@link System#arraycopy})
 */
public byte[] get(int element) {
    final byte[] copy = new byte[this.chunksize];
    final int offset = element * this.chunksize;
    System.arraycopy(this.chunks, offset, copy, 0, this.chunksize);
    return copy;
}
private void selectBuffer(int element) {
System.arraycopy(this.chunks, element * this.chunksize, this.buffer, 0, chunksize);

@ -260,10 +260,12 @@ public final class plasmaParser {
public static boolean supportedContent(URL url, String mimeType) {
// TODO: we need some exceptions here to index URLs like this
// http://www.musicabona.com/respighi/12668/cd/index.html.fr
if ((mimeType!=null)&&(mimeType.trim().equalsIgnoreCase("text/html"))) {
mimeType = getRealMimeType(mimeType);
if (mimeType.equals("text/html")) {
return supportedMimeTypesContains(mimeType);
} else {
return supportedMimeTypesContains(mimeType) && supportedFileExt(url);
}
return supportedMimeTypesContains(mimeType) && supportedFileExt(url);
}
public static boolean supportedRealTimeContent(URL url, String mimeType) {

@ -119,23 +119,6 @@ public class plasmaRankingRCIEvaluation {
return partition;
}
/*
public static int[] generateYBRLimits(int[] counts, int[] partition) {
int[] limits = new int[partition.length];
int min;
int j = 0;
for (int i = partition.length - 1; i >= 0; i--) {
min = counts[j];
while (j <= partition[i]) {
if (counts[j] < min) min = counts[j];
j++;
}
limits[i] = min;
}
return limits;
}
*/
public static void checkPartitionTable0(int[] counts, int[] partition) {
int sumsum = 0;
int sum;
@ -204,7 +187,7 @@ public class plasmaRankingRCIEvaluation {
String hash;
String filename;
if (!(tablePath.exists())) tablePath.mkdirs();
for (int i = 0; i < ranking.length; i++) {
for (int i = 0; i < ranking.length - 1; i++) {
filename = "YBR-4-" + serverCodings.encodeHex(i, 2) + ".idx";
serverFileUtils.saveSet(new File(tablePath, filename), ranking[i], "");
}
@ -217,30 +200,13 @@ public class plasmaRankingRCIEvaluation {
File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz");
long start = System.currentTimeMillis();
if (!(rci_file.exists())) return;
// create partition table
final kelondroAttrSeq rci = new kelondroAttrSeq(rci_file, false);
int counts[] = rcieval(rci);
int[] partition = interval(counts, 16);
TreeSet[] ranked = genRankingTable(rci, partition);
storeRankingTable(ranked, new File(root_path, "ranking/YBR"));
long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
System.out.println("Finished YBR generation in " + seconds + " seconds.");
}
if ((args.length == 2) && (args[0].equals("-rcieval"))) {
File root_path = new File(args[1]);
File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz");
long start = System.currentTimeMillis();
if (!(rci_file.exists())) return;
final kelondroAttrSeq rci = new kelondroAttrSeq(rci_file, false);
int counts[] = rcieval(rci);
long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
System.out.println("Finished RCI evaluation in " + seconds + " seconds. " + counts.length + " counts in array.");
/*
System.out.println("count table:");
for (int i = 0; i < counts.length; i++) {
System.out.println(i + " references: " + counts[i] + " times");
}
*/
int[] partition = interval(counts, 16);
// check the table
System.out.println("partition position table:");
for (int i = 0; i < partition.length - 1; i++) {
System.out.println("YBR-" + i + ": " + (partition[i + 1] + 1) + " - " + partition[i] + " references");
@ -252,15 +218,27 @@ public class plasmaRankingRCIEvaluation {
for (int i = 0; i < counts.length; i++) sum += counts[i];
System.out.println("sum of all references: " + sum);
// now print out the table
// create ranking
TreeSet[] ranked = genRankingTable(rci, partition);
storeRankingTable(ranked, new File(root_path, "ranking/YBR"));
long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
System.out.println("Finished YBR generation in " + seconds + " seconds.");
}
if ((args.length == 2) && (args[0].equals("-rcieval"))) {
File root_path = new File(args[1]);
// load a partition table
plasmaSearchPreOrder.loadYBR(new File(root_path, "ranking/YBR"), 16);
// load domain list and generate hash index for domains
HashMap dommap = genReverseDomHash(new File(root_path, "domlist.txt"));
// print out the table
String hash, dom;
for (int i = 0; i < 9; i++) {
System.out.print("YBR-" + i + ": ");
Iterator k = ranked[i].iterator();
while (k.hasNext()) {
hash = (String) k.next();
for (int j = 0; j < plasmaSearchPreOrder.ybrTables[i].size(); j++) {
hash = new String(plasmaSearchPreOrder.ybrTables[i].get(j));
dom = (String) dommap.get(hash);
if (dom == null) System.out.print("[" + hash + "], "); else System.out.print(dom + ", ");
}

@ -53,7 +53,7 @@ import de.anomic.kelondro.kelondroBinSearch;
public final class plasmaSearchPreOrder {
private static kelondroBinSearch[] ybrTables = null; // block-rank tables
public static kelondroBinSearch[] ybrTables = null; // block-rank tables
private static boolean useYBR = true;
private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry

Loading…
Cancel
Save