From 40621a5663950504d96fbefa405d0363c5a50a74 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 27 Nov 2005 11:55:24 +0000 Subject: [PATCH] enhancements in ranking preparation and fixed problem with parser/mime recognition git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1132 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- bin/cr_genrci | 2 +- .../de/anomic/kelondro/kelondroAttrSeq.java | 5 +- .../de/anomic/kelondro/kelondroBinSearch.java | 10 +++ source/de/anomic/plasma/plasmaParser.java | 6 +- .../plasma/plasmaRankingRCIEvaluation.java | 64 ++++++------------- .../anomic/plasma/plasmaSearchPreOrder.java | 2 +- 6 files changed, 41 insertions(+), 48 deletions(-) diff --git a/bin/cr_genrci b/bin/cr_genrci index b3d15c48e..63ff2df13 100755 --- a/bin/cr_genrci +++ b/bin/cr_genrci @@ -1,2 +1,2 @@ cd `dirname $0`/.. -java -server -Xms2000m -Xmx2000m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -genrci . +java -server -Xms2100m -Xmx2100m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -genrci . diff --git a/source/de/anomic/kelondro/kelondroAttrSeq.java b/source/de/anomic/kelondro/kelondroAttrSeq.java index a1c701643..dc49a5558 100644 --- a/source/de/anomic/kelondro/kelondroAttrSeq.java +++ b/source/de/anomic/kelondro/kelondroAttrSeq.java @@ -77,7 +77,7 @@ public class kelondroAttrSeq { public kelondroAttrSeq(File file, boolean tree) throws IOException { this.file = file; this.structure = null; - this.created = 0; + this.created = -1; this.name = ""; this.entries = (tree) ? 
(Map) new TreeMap() : (Map) new HashMap(); readAttrFile(file); @@ -144,6 +144,9 @@ public class kelondroAttrSeq { } } br.close(); + if (structure == null) throw new IOException("file contains no structure tag"); + if (name == null) throw new IOException("file contains no name tag"); + if (created == -1) throw new IOException("file contains no created tag"); } public int size() { diff --git a/source/de/anomic/kelondro/kelondroBinSearch.java b/source/de/anomic/kelondro/kelondroBinSearch.java index 2e2457f30..d306bc486 100644 --- a/source/de/anomic/kelondro/kelondroBinSearch.java +++ b/source/de/anomic/kelondro/kelondroBinSearch.java @@ -77,6 +77,16 @@ public class kelondroBinSearch { if (c > 0) /* buffer > t */ return contains(t, beginPos, pivot); return false; } + + public int size() { + return count; + } + + public byte[] get(int element) { + byte[] a = new byte[chunksize]; + System.arraycopy(this.chunks, element * this.chunksize, a, 0, chunksize); + return a; + } private void selectBuffer(int element) { System.arraycopy(this.chunks, element * this.chunksize, this.buffer, 0, chunksize); diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 38ffb529d..e1551aaaf 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -260,10 +260,12 @@ public final class plasmaParser { public static boolean supportedContent(URL url, String mimeType) { // TODO: we need some exceptions here to index URLs like this // http://www.musicabona.com/respighi/12668/cd/index.html.fr - if ((mimeType!=null)&&(mimeType.trim().equalsIgnoreCase("text/html"))) { + mimeType = getRealMimeType(mimeType); + if (mimeType.equals("text/html")) { return supportedMimeTypesContains(mimeType); + } else { + return supportedMimeTypesContains(mimeType) && supportedFileExt(url); } - return supportedMimeTypesContains(mimeType) && supportedFileExt(url); } public static boolean supportedRealTimeContent(URL url, String 
mimeType) { diff --git a/source/de/anomic/plasma/plasmaRankingRCIEvaluation.java b/source/de/anomic/plasma/plasmaRankingRCIEvaluation.java index de3e48b18..a51d2835b 100644 --- a/source/de/anomic/plasma/plasmaRankingRCIEvaluation.java +++ b/source/de/anomic/plasma/plasmaRankingRCIEvaluation.java @@ -119,23 +119,6 @@ public class plasmaRankingRCIEvaluation { return partition; } - /* - public static int[] generateYBRLimits(int[] counts, int[] partition) { - int[] limits = new int[partition.length]; - int min; - int j = 0; - for (int i = partition.length - 1; i >= 0; i--) { - min = counts[j]; - while (j <= partition[i]) { - if (counts[j] < min) min = counts[j]; - j++; - } - limits[i] = min; - } - return limits; - } - */ - public static void checkPartitionTable0(int[] counts, int[] partition) { int sumsum = 0; int sum; @@ -204,7 +187,7 @@ public class plasmaRankingRCIEvaluation { String hash; String filename; if (!(tablePath.exists())) tablePath.mkdirs(); - for (int i = 0; i < ranking.length; i++) { + for (int i = 0; i < ranking.length - 1; i++) { filename = "YBR-4-" + serverCodings.encodeHex(i, 2) + ".idx"; serverFileUtils.saveSet(new File(tablePath, filename), ranking[i], ""); } @@ -217,30 +200,13 @@ public class plasmaRankingRCIEvaluation { File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz"); long start = System.currentTimeMillis(); if (!(rci_file.exists())) return; + + // create partition table final kelondroAttrSeq rci = new kelondroAttrSeq(rci_file, false); int counts[] = rcieval(rci); int[] partition = interval(counts, 16); - TreeSet[] ranked = genRankingTable(rci, partition); - storeRankingTable(ranked, new File(root_path, "ranking/YBR")); - long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000); - System.out.println("Finished YBR generation in " + seconds + " seconds."); - } - if ((args.length == 2) && (args[0].equals("-rcieval"))) { - File root_path = new File(args[1]); - File rci_file = new 
File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz"); - long start = System.currentTimeMillis(); - if (!(rci_file.exists())) return; - final kelondroAttrSeq rci = new kelondroAttrSeq(rci_file, false); - int counts[] = rcieval(rci); - long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000); - System.out.println("Finished RCI evaluation in " + seconds + " seconds. " + counts.length + " counts in array."); - /* - System.out.println("count table:"); - for (int i = 0; i < counts.length; i++) { - System.out.println(i + " references: " + counts[i] + " times"); - } - */ - int[] partition = interval(counts, 16); + + // check the table System.out.println("partition position table:"); for (int i = 0; i < partition.length - 1; i++) { System.out.println("YBR-" + i + ": " + (partition[i + 1] + 1) + " - " + partition[i] + " references"); @@ -252,15 +218,27 @@ public class plasmaRankingRCIEvaluation { for (int i = 0; i < counts.length; i++) sum += counts[i]; System.out.println("sum of all references: " + sum); - // now print out the table + // create ranking TreeSet[] ranked = genRankingTable(rci, partition); + storeRankingTable(ranked, new File(root_path, "ranking/YBR")); + long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000); + System.out.println("Finished YBR generation in " + seconds + " seconds."); + } + if ((args.length == 2) && (args[0].equals("-rcieval"))) { + File root_path = new File(args[1]); + + // load a partition table + plasmaSearchPreOrder.loadYBR(new File(root_path, "ranking/YBR"), 16); + + // load domain list and generate hash index for domains HashMap dommap = genReverseDomHash(new File(root_path, "domlist.txt")); + + // print out the table String hash, dom; for (int i = 0; i < 9; i++) { System.out.print("YBR-" + i + ": "); - Iterator k = ranked[i].iterator(); - while (k.hasNext()) { - hash = (String) k.next(); + for (int j = 0; j < plasmaSearchPreOrder.ybrTables[i].size(); j++) { + hash = new 
String(plasmaSearchPreOrder.ybrTables[i].get(j)); dom = (String) dommap.get(hash); if (dom == null) System.out.print("[" + hash + "], "); else System.out.print(dom + ", "); } diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java index 7ad3baab1..b48fc4720 100644 --- a/source/de/anomic/plasma/plasmaSearchPreOrder.java +++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java @@ -53,7 +53,7 @@ import de.anomic.kelondro.kelondroBinSearch; public final class plasmaSearchPreOrder { - private static kelondroBinSearch[] ybrTables = null; // block-rank tables + public static kelondroBinSearch[] ybrTables = null; // block-rank tables private static boolean useYBR = true; private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry