From 29fe436e365d58858d1c2e464ba8b03f4502d7f3 Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 9 Nov 2009 19:14:51 +0000 Subject: [PATCH] - fixed post-ranking including prefer mask - enhanced a core database access method / less wasted ram git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6473 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexImportOAIPMHList_p.java | 2 +- htroot/yacysearch.java | 4 +- source/de/anomic/search/RankingProcess.java | 46 +--------- source/de/anomic/search/ResultFetcher.java | 58 +++++++++++- .../document/importer/OAIPMHImporter.java | 90 ++++++++++++++++++- .../yacy/document/importer/OAIPMHReader.java | 19 +--- source/net/yacy/kelondro/index/Cache.java | 17 ++-- 7 files changed, 163 insertions(+), 73 deletions(-) diff --git a/htroot/IndexImportOAIPMHList_p.java b/htroot/IndexImportOAIPMHList_p.java index 33454e3b8..9c2dd257b 100644 --- a/htroot/IndexImportOAIPMHList_p.java +++ b/htroot/IndexImportOAIPMHList_p.java @@ -43,7 +43,7 @@ public class IndexImportOAIPMHList_p { prop.put("source", 0); if (post != null && post.containsKey("source")) { - Set oaiRoots = OAIPMHImporter.getOAIServer(sb.loader); + Set oaiRoots = OAIPMHImporter.getAllListedOAIServer(sb.loader); boolean dark = false; int cnt = 0; diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 5d3f2a41c..ea50a37b5 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -180,8 +180,8 @@ public class yacysearch { originalUrlMask = ".*"; } - String prefermask = (post == null ? "" : post.get("prefermaskfilter", "")); - if ((prefermask.length() > 0) && (prefermask.indexOf(".*") < 0)) prefermask = ".*" + prefermask + ".*"; + String prefermask = (post == null) ? "" : post.get("prefermaskfilter", ""); + if (prefermask.length() > 0 && prefermask.indexOf(".*") < 0) prefermask = ".*" + prefermask + ".*"; Bitfield constraint = (post != null && post.containsKey("constraint") && post.get("constraint", "").length() > 0) ? 
new Bitfield(4, post.get("constraint", "______")) : null; if (indexof) { diff --git a/source/de/anomic/search/RankingProcess.java b/source/de/anomic/search/RankingProcess.java index 87674d3f4..0f813ce29 100644 --- a/source/de/anomic/search/RankingProcess.java +++ b/source/de/anomic/search/RankingProcess.java @@ -35,7 +35,6 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; -import java.util.Set; import java.util.TreeSet; import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; @@ -579,6 +578,10 @@ public final class RankingProcess extends Thread { } }; + public Map getTopics() { + return this.ref; + } + @SuppressWarnings("unchecked") public ArrayList getTopicNavigator(final int count) { // create a list of words that had been computed by statistics over all @@ -701,46 +704,5 @@ public final class RankingProcess extends Thread { //System.out.println("NOT FOUND: " + urlHash); return 15; } - - public long postRanking( - final Set topwords, - final ResultEntry rentry, - final int position) { - - long r = (255 - position) << 8; - - // for media search: prefer pages with many links - if (query.contentdom == QueryParams.CONTENTDOM_IMAGE) r += rentry.limage() << query.ranking.coeff_cathasimage; - if (query.contentdom == QueryParams.CONTENTDOM_AUDIO) r += rentry.laudio() << query.ranking.coeff_cathasaudio; - if (query.contentdom == QueryParams.CONTENTDOM_VIDEO) r += rentry.lvideo() << query.ranking.coeff_cathasvideo; - if (query.contentdom == QueryParams.CONTENTDOM_APP ) r += rentry.lapp() << query.ranking.coeff_cathasapp; - - // prefer hit with 'prefer' pattern - if (rentry.url().toNormalform(true, true).matches(query.prefer)) r += 256 << query.ranking.coeff_prefer; - if (rentry.title().matches(query.prefer)) r += 256 << query.ranking.coeff_prefer; - - // apply 'common-sense' heuristic using references - final String urlstring = rentry.url().toNormalform(true, true); - final String[] 
urlcomps = DigestURI.urlComps(urlstring); - final String[] descrcomps = rentry.title().toLowerCase().split(DigestURI.splitrex); - for (int j = 0; j < urlcomps.length; j++) { - if (topwords.contains(urlcomps[j])) r += Math.max(1, 256 - urlstring.length()) << query.ranking.coeff_urlcompintoplist; - } - for (int j = 0; j < descrcomps.length; j++) { - if (topwords.contains(descrcomps[j])) r += Math.max(1, 256 - rentry.title().length()) << query.ranking.coeff_descrcompintoplist; - } - // apply query-in-result matching - final Set urlcomph = Word.words2hashSet(urlcomps); - final Set descrcomph = Word.words2hashSet(descrcomps); - final Iterator shi = query.queryHashes.iterator(); - byte[] queryhash; - while (shi.hasNext()) { - queryhash = shi.next(); - if (urlcomph.contains(queryhash)) r += 256 << query.ranking.coeff_appurl; - if (descrcomph.contains(queryhash)) r += 256 << query.ranking.coeff_app_dc_title; - } - - return r; - } } diff --git a/source/de/anomic/search/ResultFetcher.java b/source/de/anomic/search/ResultFetcher.java index 292654cad..3a24cd462 100644 --- a/source/de/anomic/search/ResultFetcher.java +++ b/source/de/anomic/search/ResultFetcher.java @@ -29,10 +29,15 @@ package de.anomic.search; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; import java.util.TreeSet; import net.yacy.document.Condenser; +import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.MemoryTracker; import net.yacy.kelondro.util.SetTools; @@ -173,7 +178,12 @@ public class ResultFetcher { // place the result to the result vector if (!result.exists(resultEntry)) { - result.push(resultEntry, Long.valueOf(rankedCache.getOrder().cardinal(resultEntry.word()))); + + // apply post-ranking + long ranking = 
Long.valueOf(rankedCache.getOrder().cardinal(resultEntry.word())); + ranking += postRanking(resultEntry, rankedCache.getTopics()); + + result.push(resultEntry, ranking); if (nav_topics) rankedCache.addTopics(resultEntry); } //System.out.println("DEBUG SNIPPET_LOADING: thread " + id + " got " + resultEntry.url()); @@ -355,5 +365,49 @@ public class ResultFetcher { } return this.result.list(this.result.size()); } - + + public long postRanking( + final ResultEntry rentry, + final Map topwords) { + + long r = 0; + + // for media search: prefer pages with many links + if (query.contentdom == QueryParams.CONTENTDOM_IMAGE) r += rentry.limage() << query.ranking.coeff_cathasimage; + if (query.contentdom == QueryParams.CONTENTDOM_AUDIO) r += rentry.laudio() << query.ranking.coeff_cathasaudio; + if (query.contentdom == QueryParams.CONTENTDOM_VIDEO) r += rentry.lvideo() << query.ranking.coeff_cathasvideo; + if (query.contentdom == QueryParams.CONTENTDOM_APP ) r += rentry.lapp() << query.ranking.coeff_cathasapp; + + // prefer hit with 'prefer' pattern + if (rentry.url().toNormalform(true, true).matches(query.prefer)) r += 256 << query.ranking.coeff_prefer; + if (rentry.title().matches(query.prefer)) r += 256 << query.ranking.coeff_prefer; + + // apply 'common-sense' heuristic using references + final String urlstring = rentry.url().toNormalform(true, true); + final String[] urlcomps = DigestURI.urlComps(urlstring); + final String[] descrcomps = rentry.title().toLowerCase().split(DigestURI.splitrex); + Integer tc; + for (int j = 0; j < urlcomps.length; j++) { + tc = topwords.get(urlcomps[j]); + if (tc != null) r += Math.max(1, tc.intValue()) << query.ranking.coeff_urlcompintoplist; + } + for (int j = 0; j < descrcomps.length; j++) { + tc = topwords.get(descrcomps[j]); + if (tc != null) r += Math.max(1, tc) << query.ranking.coeff_descrcompintoplist; + } + + // apply query-in-result matching + final Set urlcomph = Word.words2hashSet(urlcomps); + final Set descrcomph = 
Word.words2hashSet(descrcomps); + final Iterator shi = query.queryHashes.iterator(); + byte[] queryhash; + while (shi.hasNext()) { + queryhash = shi.next(); + if (urlcomph.contains(queryhash)) r += 256 << query.ranking.coeff_appurl; + if (descrcomph.contains(queryhash)) r += 256 << query.ranking.coeff_app_dc_title; + } + + return r; + } + } diff --git a/source/net/yacy/document/importer/OAIPMHImporter.java b/source/net/yacy/document/importer/OAIPMHImporter.java index b88f49ef5..fa50db6dc 100644 --- a/source/net/yacy/document/importer/OAIPMHImporter.java +++ b/source/net/yacy/document/importer/OAIPMHImporter.java @@ -31,12 +31,17 @@ import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.net.MalformedURLException; +import java.text.ParseException; +import java.util.Date; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.TreeSet; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.util.DateFormatter; import net.yacy.repository.LoaderDispatcher; import net.yacy.document.parser.csvParser; @@ -133,7 +138,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable getOAIServer(LoaderDispatcher loader) { + public static Set getUnloadedOAIServer( + LoaderDispatcher loader, + File surrogatesIn, + File surrogatesOut, + long staleLimit) { + Set plainList = getAllListedOAIServer(loader); + Map loaded = getLoadedOAIServer(surrogatesIn, surrogatesOut); + long limit = System.currentTimeMillis() - staleLimit; + for (Map.Entry a: loaded.entrySet()) { + if (a.getValue().getTime() > limit) plainList.remove(a.getKey()); + } + return plainList; + } + + /** + * use the list server at http://roar.eprints.org/index.php?action=csv + * to produce a list of OAI-PMH sources + * @param loader + * @return the list of oai-pmh sources + */ + public static Set getAllListedOAIServer(LoaderDispatcher loader) 
{ TreeSet list = new TreeSet(); // read roar @@ -204,5 +229,66 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable getLoadedOAIServer(File surrogatesIn, File surrogatesOut) { + Map map = getLoadedOAIServer(surrogatesOut); + map.putAll(getLoadedOAIServer(surrogatesIn)); // merge the second map itself; its entrySet() is a Set and cannot be cast to Map + return map; + } + + private static Map getLoadedOAIServer(File surrogates) { + HashMap map = new HashMap(); + // parsed names look like oaipmh.opus_bsz-bw_de.20091102113118728.xml (prefix, host id, 17-digit time stamp); names too short for that layout are skipped + for (String s: surrogates.list()) { + if (s.length() >= filenamePrefix.length() + 23 && s.startsWith(filenamePrefix) && s.endsWith(".xml") && s.charAt(s.length() - 22) == filenameSeparationChar) { + try { + Date fd = DateFormatter.parseShortMilliSecond(s.substring(s.length() - 21, s.length() - 4)); + String hostID = s.substring(filenamePrefix.length() + 1, s.length() - 22); + Date md = map.get(hostID); + if (md == null || fd.after(md)) map.put(hostID, fd); + } catch (ParseException e) { + Log.logException(e); + } + } + } + return map; + } + + public static final char hostReplacementChar = '_'; + public static final char filenameSeparationChar = '.'; + public static final String filenamePrefix = "oaipmh"; + + /** + * compute a host id that is also used in the getLoadedOAIServer method for the map key + * @param source + * @return a string that is a key for the given host + */ + public static final String hostID(DigestURI source) { + String s = ResumptionToken.truncatedURL(source); + if (s.endsWith("?")) s = s.substring(0, s.length() - 1); + if (s.endsWith("/")) s = s.substring(0, s.length() - 1); + if (s.startsWith("https://")) s = s.substring(8); + if (s.startsWith("http://")) s = s.substring(7); + return s.replace('.', hostReplacementChar).replace('/', hostReplacementChar).replace(':', hostReplacementChar); + } + /** + * get a file name for a source. the file name contains a prefix that is used to identify + * that source as part of the OAI-PMH import process and a host key to identify the source. 
+ * also included is a date stamp within the file name + * @param source + * @return a file name for the given source. It will be different for each call for same hosts because it contains a date stamp + */ + public static final String filename4Source(DigestURI source) { + return filenamePrefix + OAIPMHImporter.filenameSeparationChar + + OAIPMHImporter.hostID(source) + OAIPMHImporter.filenameSeparationChar + + DateFormatter.formatShortMilliSecond(new Date()) + ".xml"; + } } \ No newline at end of file diff --git a/source/net/yacy/document/importer/OAIPMHReader.java b/source/net/yacy/document/importer/OAIPMHReader.java index 0b3b85678..2023b066d 100644 --- a/source/net/yacy/document/importer/OAIPMHReader.java +++ b/source/net/yacy/document/importer/OAIPMHReader.java @@ -29,10 +29,8 @@ package net.yacy.document.importer; import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; -import java.util.Date; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.FileUtils; import net.yacy.repository.LoaderDispatcher; @@ -55,13 +53,11 @@ public class OAIPMHReader { this.source = source; // load the file from the net - Response response; - response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE); + Response response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE); byte[] b = response.getContent(); this.resumptionToken = new ResumptionToken(new ByteArrayInputStream(b)); - String file = filePrefix + "." + filename4source(source) + "." 
+ DateFormatter.formatShortMilliSecond(new Date()); - File f0 = new File(targetDir, file + ".tmp"); - File f1 = new File(targetDir, file + ".xml"); + File f1 = new File(targetDir, OAIPMHImporter.filename4Source(source)); + File f0 = new File(targetDir, f1.getName() + ".tmp"); // transaction-safe writing FileUtils.copy(b, f0); @@ -81,15 +77,6 @@ public class OAIPMHReader { */ } - public static final String filename4source(DigestURI source) { - String s = ResumptionToken.truncatedURL(source); - if (s.endsWith("?")) s = s.substring(0, s.length() - 1); - if (s.endsWith("/")) s = s.substring(0, s.length() - 1); - if (s.startsWith("https://")) s = s.substring(8); - if (s.startsWith("http://")) s = s.substring(7); - return s.replace('.', '_').replace('/', '_').replace(':', '_'); - } - public ResumptionToken getResumptionToken() { return this.resumptionToken; } diff --git a/source/net/yacy/kelondro/index/Cache.java b/source/net/yacy/kelondro/index/Cache.java index 490aae327..88e783eb4 100644 --- a/source/net/yacy/kelondro/index/Cache.java +++ b/source/net/yacy/kelondro/index/Cache.java @@ -216,35 +216,36 @@ public final class Cache implements ObjectIndex, Iterable { public final synchronized boolean has(final byte[] key) { // first look into the miss cache if (readMissCache != null) { - if (readMissCache.get(key) == null) { - this.hasnotMiss++; - } else { + if (readMissCache.has(key)) { this.hasnotHit++; return false; + } else { + this.hasnotMiss++; } } // then try the hit cache and the buffers if (readHitCache != null) { - if (readHitCache.get(key) != null) { + if (readHitCache.has(key)) { this.readHit++; return true; + } else { + this.readMiss++; } } // finally ask the back-end index - this.readMiss++; return index.has(key); } public final synchronized Row.Entry get(final byte[] key) throws IOException { // first look into the miss cache if (readMissCache != null) { - if (readMissCache.get(key) == null) { - this.hasnotMiss++; - } else { + if (readMissCache.has(key)) { 
this.hasnotHit++; return null; + } else { + this.hasnotMiss++; } }