From 29fe436e365d58858d1c2e464ba8b03f4502d7f3 Mon Sep 17 00:00:00 2001
From: orbiter <orbiter@6c8d7289-2bf4-0310-a012-ef5d649a1542>
Date: Mon, 9 Nov 2009 19:14:51 +0000
Subject: [PATCH] - fixed post-ranking including prefer mask - enhanced a core
 database access method / less wasted ram

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6473 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/IndexImportOAIPMHList_p.java           |  2 +-
 htroot/yacysearch.java                        |  4 +-
 source/de/anomic/search/RankingProcess.java   | 46 +---------
 source/de/anomic/search/ResultFetcher.java    | 58 +++++++++++-
 .../document/importer/OAIPMHImporter.java     | 90 ++++++++++++++++++-
 .../yacy/document/importer/OAIPMHReader.java  | 19 +---
 source/net/yacy/kelondro/index/Cache.java     | 17 ++--
 7 files changed, 163 insertions(+), 73 deletions(-)
diff --git a/htroot/IndexImportOAIPMHList_p.java b/htroot/IndexImportOAIPMHList_p.java
index 33454e3b8..9c2dd257b 100644
--- a/htroot/IndexImportOAIPMHList_p.java
+++ b/htroot/IndexImportOAIPMHList_p.java
@@ -43,7 +43,7 @@ public class IndexImportOAIPMHList_p {
         prop.put("source", 0);
         
         if (post != null && post.containsKey("source")) {
-            Set<String> oaiRoots = OAIPMHImporter.getOAIServer(sb.loader);
+            Set<String> oaiRoots = OAIPMHImporter.getAllListedOAIServer(sb.loader);
             
             boolean dark = false;
             int cnt = 0;
diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java
index 5d3f2a41c..ea50a37b5 100644
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@@ -180,8 +180,8 @@ public class yacysearch {
             originalUrlMask = ".*";
         }
 
-        String prefermask = (post == null ? "" : post.get("prefermaskfilter", ""));
-        if ((prefermask.length() > 0) && (prefermask.indexOf(".*") < 0)) prefermask = ".*" + prefermask + ".*";
+        String prefermask = (post == null) ? "" : post.get("prefermaskfilter", "");
+        if (prefermask.length() > 0 && prefermask.indexOf(".*") < 0) prefermask = ".*" + prefermask + ".*";
 
         Bitfield constraint = (post != null && post.containsKey("constraint") && post.get("constraint", "").length() > 0) ? new Bitfield(4, post.get("constraint", "______")) : null;
         if (indexof) {
diff --git a/source/de/anomic/search/RankingProcess.java b/source/de/anomic/search/RankingProcess.java
index 87674d3f4..0f813ce29 100644
--- a/source/de/anomic/search/RankingProcess.java
+++ b/source/de/anomic/search/RankingProcess.java
@@ -35,7 +35,6 @@ import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
-import java.util.Set;
 import java.util.TreeSet;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.ConcurrentHashMap;
@@ -579,6 +578,10 @@ public final class RankingProcess extends Thread {
         }
     };
     
+    public Map<String, Integer> getTopics() {
+        return this.ref; // hands out the ref field itself, not a copy
+    }
+    
     @SuppressWarnings("unchecked")
     public ArrayList<NavigatorEntry> getTopicNavigator(final int count) {
         // create a list of words that had been computed by statistics over all
@@ -701,46 +704,5 @@ public final class RankingProcess extends Thread {
         //System.out.println("NOT FOUND: " + urlHash);
         return 15;
     }
-    
-    public long postRanking(
-                    final Set<String> topwords,
-                    final ResultEntry rentry,
-                    final int position) {
-
-        long r = (255 - position) << 8;
-        
-        // for media search: prefer pages with many links
-        if (query.contentdom == QueryParams.CONTENTDOM_IMAGE) r += rentry.limage() << query.ranking.coeff_cathasimage;
-        if (query.contentdom == QueryParams.CONTENTDOM_AUDIO) r += rentry.laudio() << query.ranking.coeff_cathasaudio;
-        if (query.contentdom == QueryParams.CONTENTDOM_VIDEO) r += rentry.lvideo() << query.ranking.coeff_cathasvideo;
-        if (query.contentdom == QueryParams.CONTENTDOM_APP  ) r += rentry.lapp()   << query.ranking.coeff_cathasapp;
-        
-        // prefer hit with 'prefer' pattern
-        if (rentry.url().toNormalform(true, true).matches(query.prefer)) r += 256 << query.ranking.coeff_prefer;
-        if (rentry.title().matches(query.prefer)) r += 256 << query.ranking.coeff_prefer;
-        
-        // apply 'common-sense' heuristic using references
-        final String urlstring = rentry.url().toNormalform(true, true);
-        final String[] urlcomps = DigestURI.urlComps(urlstring);
-        final String[] descrcomps = rentry.title().toLowerCase().split(DigestURI.splitrex);
-        for (int j = 0; j < urlcomps.length; j++) {
-            if (topwords.contains(urlcomps[j])) r += Math.max(1, 256 - urlstring.length()) << query.ranking.coeff_urlcompintoplist;
-        }
-        for (int j = 0; j < descrcomps.length; j++) {
-            if (topwords.contains(descrcomps[j])) r += Math.max(1, 256 - rentry.title().length()) << query.ranking.coeff_descrcompintoplist;
-        }
 
-        // apply query-in-result matching
-        final Set<byte[]> urlcomph = Word.words2hashSet(urlcomps);
-        final Set<byte[]> descrcomph = Word.words2hashSet(descrcomps);
-        final Iterator<byte[]> shi = query.queryHashes.iterator();
-        byte[] queryhash;
-        while (shi.hasNext()) {
-            queryhash = shi.next();
-            if (urlcomph.contains(queryhash)) r += 256 << query.ranking.coeff_appurl;
-            if (descrcomph.contains(queryhash)) r += 256 << query.ranking.coeff_app_dc_title;
-        }
-
-        return r;
-    }
 }
diff --git a/source/de/anomic/search/ResultFetcher.java b/source/de/anomic/search/ResultFetcher.java
index 292654cad..3a24cd462 100644
--- a/source/de/anomic/search/ResultFetcher.java
+++ b/source/de/anomic/search/ResultFetcher.java
@@ -29,10 +29,15 @@ package de.anomic.search;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
 import java.util.TreeSet;
 
 import net.yacy.document.Condenser;
+import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.MemoryTracker;
 import net.yacy.kelondro.util.SetTools;
@@ -173,7 +178,12 @@ public class ResultFetcher {
                     
                     // place the result to the result vector
                     if (!result.exists(resultEntry)) {
-                        result.push(resultEntry, Long.valueOf(rankedCache.getOrder().cardinal(resultEntry.word())));
+                        
+                        // apply post-ranking
+                        long ranking = Long.valueOf(rankedCache.getOrder().cardinal(resultEntry.word()));
+                        ranking += postRanking(resultEntry, rankedCache.getTopics());
+                        
+                        result.push(resultEntry, ranking);
                         if (nav_topics) rankedCache.addTopics(resultEntry);
                     }
                     //System.out.println("DEBUG SNIPPET_LOADING: thread " + id + " got " + resultEntry.url());
@@ -355,5 +365,49 @@ public class ResultFetcher {
         }
         return this.result.list(this.result.size());
     }
-    
+
+    /**
+     * Computes the post-ranking bonus that is added to the ranking of a result entry.
+     * @param rentry the result entry; its url, title and media values are evaluated
+     * @param topwords map whose Integer value weights a url or title component found as key
+     * @return the sum of all bonus values; 0 if no rule matched
+     */
+    public long postRanking(final ResultEntry rentry, final Map<String, Integer> topwords) {
+        long bonus = 0;
+        // media search: add the entry's value for the searched content domain, shifted by its coefficient
+        if (query.contentdom == QueryParams.CONTENTDOM_IMAGE) bonus += rentry.limage() << query.ranking.coeff_cathasimage;
+        if (query.contentdom == QueryParams.CONTENTDOM_AUDIO) bonus += rentry.laudio() << query.ranking.coeff_cathasaudio;
+        if (query.contentdom == QueryParams.CONTENTDOM_VIDEO) bonus += rentry.lvideo() << query.ranking.coeff_cathasvideo;
+        if (query.contentdom == QueryParams.CONTENTDOM_APP  ) bonus += rentry.lapp()   << query.ranking.coeff_cathasapp;
+
+        // reward a url and a title that match the 'prefer' pattern of the query
+        final boolean urlPreferred = rentry.url().toNormalform(true, true).matches(query.prefer);
+        if (urlPreferred) bonus += 256 << query.ranking.coeff_prefer;
+        final boolean titlePreferred = rentry.title().matches(query.prefer);
+        if (titlePreferred) bonus += 256 << query.ranking.coeff_prefer;
+
+        // 'common-sense' heuristic: each url or title component that is a key of topwords adds its weight
+        final String normalizedUrl = rentry.url().toNormalform(true, true);
+        final String[] urlParts = DigestURI.urlComps(normalizedUrl);
+        final String[] titleParts = rentry.title().toLowerCase().split(DigestURI.splitrex);
+        for (final String part : urlParts) {
+            final Integer weight = topwords.get(part);
+            if (weight != null) bonus += Math.max(1, weight.intValue()) << query.ranking.coeff_urlcompintoplist;
+        }
+        for (final String part : titleParts) {
+            final Integer weight = topwords.get(part);
+            if (weight != null) bonus += Math.max(1, weight.intValue()) << query.ranking.coeff_descrcompintoplist;
+        }
+
+        // query-in-result matching: reward query hashes that occur among the url or title word hashes
+        final Set<byte[]> urlHashes = Word.words2hashSet(urlParts);
+        final Set<byte[]> titleHashes = Word.words2hashSet(titleParts);
+        for (final Iterator<byte[]> hashes = query.queryHashes.iterator(); hashes.hasNext();) {
+            final byte[] queryHash = hashes.next();
+            if (urlHashes.contains(queryHash)) bonus += 256 << query.ranking.coeff_appurl;
+            if (titleHashes.contains(queryHash)) bonus += 256 << query.ranking.coeff_app_dc_title;
+        }
+        return bonus;
+    }
+
 }
diff --git a/source/net/yacy/document/importer/OAIPMHImporter.java b/source/net/yacy/document/importer/OAIPMHImporter.java
index b88f49ef5..fa50db6dc 100644
--- a/source/net/yacy/document/importer/OAIPMHImporter.java
+++ b/source/net/yacy/document/importer/OAIPMHImporter.java
@@ -31,12 +31,17 @@ import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.net.MalformedURLException;
+import java.text.ParseException;
+import java.util.Date;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 import java.util.TreeSet;
 
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
+import net.yacy.kelondro.util.DateFormatter;
 import net.yacy.repository.LoaderDispatcher;
 import net.yacy.document.parser.csvParser;
 
@@ -133,7 +138,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
         this.message = "loading first part of records";
         while (true) {
             try {
-                OAIPMHReader reader = new OAIPMHReader(this.loader, this.source, Switchboard.getSwitchboard().surrogatesInPath, "oaipmh");
+                OAIPMHReader reader = new OAIPMHReader(this.loader, this.source, Switchboard.getSwitchboard().surrogatesInPath, filenamePrefix);
                 this.chunkCount++;
                 this.recordsCount += reader.getResumptionToken().getRecordCounter();
                 this.source = reader.getResumptionToken().resumptionURL(this.source);
@@ -170,7 +175,27 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
         return 0;
     }
     
-    public static Set<String> getOAIServer(LoaderDispatcher loader) {
+    /**
+     * Returns the set produced by getAllListedOAIServer after removing every key that getLoadedOAIServer reports with a date newer than (now - staleLimit).
+     * @param staleLimit time span in milliseconds; it is subtracted from System.currentTimeMillis() to get the cut-off date
+     */
+    public static Set<String> getUnloadedOAIServer(LoaderDispatcher loader, File surrogatesIn, File surrogatesOut, long staleLimit) {
+        final Set<String> candidates = getAllListedOAIServer(loader);
+        final Map<String, Date> lastLoad = getLoadedOAIServer(surrogatesIn, surrogatesOut);
+        final long threshold = System.currentTimeMillis() - staleLimit;
+        for (final Map.Entry<String, Date> entry : lastLoad.entrySet()) {
+            if (entry.getValue().getTime() > threshold) candidates.remove(entry.getKey());
+        }
+        return candidates;
+    }
+    
+    /**
+     * use the list server at http://roar.eprints.org/index.php?action=csv
+     * to produce a list of OAI-PMH sources
+     * @param loader
+     * @return the list of oai-pmh sources
+     */
+    public static Set<String> getAllListedOAIServer(LoaderDispatcher loader) {
         TreeSet<String> list = new TreeSet<String>();
 
         // read roar
@@ -204,5 +229,66 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
         
         return list;
     }
+
+    /**
+     * get a map for already loaded oai-pmh servers and their latest access date, collected from two
+     * directories; for a host that is found in both, the date taken from surrogatesIn is used
+     * @param surrogatesIn directory whose entries are merged last and therefore take precedence
+     * @param surrogatesOut directory whose entries form the initial content of the map
+     * @return a map where the key is the hostID of the servers and the value is the last access date
+     */
+    public static Map<String, Date> getLoadedOAIServer(File surrogatesIn, File surrogatesOut) {
+        Map<String, Date> map = getLoadedOAIServer(surrogatesOut);
+        map.putAll(getLoadedOAIServer(surrogatesIn)); // merge the maps directly; an entrySet() is a Set and cannot be cast to a Map
+        return map;
+    }
+    
+    /** maps each hostID found in the oai-pmh file names of one directory to the latest date among those names */
+    private static Map<String, Date> getLoadedOAIServer(File surrogates) {
+        HashMap<String, Date> map = new HashMap<String, Date>();
+        String[] names = surrogates.list(); // null if surrogates is not a directory or cannot be read
+        for (String s: (names == null) ? new String[0] : names) { // expected name: <filenamePrefix><sep><hostID><sep><17-char date>.xml
+            if (s.length() >= filenamePrefix.length() + 23 && s.startsWith(filenamePrefix) && s.endsWith(".xml") && s.charAt(s.length() - 22) == filenameSeparationChar) {
+                try {
+                    Date fd = DateFormatter.parseShortMilliSecond(s.substring(s.length() - 21, s.length() - 4));
+                    String hostID = s.substring(filenamePrefix.length() + 1, s.length() - 22);
+                    if (!map.containsKey(hostID) || fd.after(map.get(hostID))) map.put(hostID, fd);
+                } catch (ParseException e) {
+                    Log.logException(e);
+                }
+            }
+        }
+        return map;
+    }
+
+    public static final char hostReplacementChar = '_';
+    public static final char filenameSeparationChar = '.';
+    public static final String filenamePrefix = "oaipmh";
+
+    /**
+     * Derives a host id from the truncated URL of a source: one trailing '?', one trailing '/' and a leading "https://" or "http://" are cut off.
+     * @param source the source whose truncated URL is converted
+     * @return the remaining string with every '.', '/' and ':' replaced by hostReplacementChar
+     */
+    public static final String hostID(DigestURI source) {
+        String id = ResumptionToken.truncatedURL(source);
+        if (id.endsWith("?")) id = id.substring(0, id.length() - 1);
+        if (id.endsWith("/")) id = id.substring(0, id.length() - 1);
+        if (id.startsWith("https://")) id = id.substring("https://".length());
+        if (id.startsWith("http://")) id = id.substring("http://".length());
+        return id.replace('.', hostReplacementChar).replace('/', hostReplacementChar).replace(':', hostReplacementChar);
+    }
     
+    /**
+     * Builds a file name for a source in the form
+     * filenamePrefix + filenameSeparationChar + hostID(source) + filenameSeparationChar + date stamp + ".xml".
+     * @param source the source that the file name is built for
+     * @return the file name; the stamp is new Date() formatted by DateFormatter.formatShortMilliSecond, so names differ between calls
+     */
+    public static final String filename4Source(DigestURI source) {
+        final StringBuilder name = new StringBuilder(filenamePrefix);
+        name.append(filenameSeparationChar).append(hostID(source));
+        name.append(filenameSeparationChar).append(DateFormatter.formatShortMilliSecond(new Date()));
+        return name.append(".xml").toString();
+    }
 }
\ No newline at end of file
diff --git a/source/net/yacy/document/importer/OAIPMHReader.java b/source/net/yacy/document/importer/OAIPMHReader.java
index 0b3b85678..2023b066d 100644
--- a/source/net/yacy/document/importer/OAIPMHReader.java
+++ b/source/net/yacy/document/importer/OAIPMHReader.java
@@ -29,10 +29,8 @@ package net.yacy.document.importer;
 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
-import java.util.Date;
 
 import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.util.DateFormatter;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.repository.LoaderDispatcher;
 
@@ -55,13 +53,11 @@ public class OAIPMHReader {
         this.source = source;
         
         // load the file from the net
-        Response response;
-        response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
+        Response response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
         byte[] b = response.getContent();
         this.resumptionToken = new ResumptionToken(new ByteArrayInputStream(b));
-        String file = filePrefix + "." + filename4source(source) + "." + DateFormatter.formatShortMilliSecond(new Date());
-        File f0 = new File(targetDir, file + ".tmp");
-        File f1 = new File(targetDir, file + ".xml");
+        File f1 = new File(targetDir, OAIPMHImporter.filename4Source(source));
+        File f0 = new File(targetDir, f1.getName() + ".tmp");
         
         // transaction-safe writing
         FileUtils.copy(b, f0);
@@ -81,15 +77,6 @@ public class OAIPMHReader {
         */
     }
     
-    public static final String filename4source(DigestURI source) {
-        String s = ResumptionToken.truncatedURL(source);
-        if (s.endsWith("?")) s = s.substring(0, s.length() - 1);
-        if (s.endsWith("/")) s = s.substring(0, s.length() - 1);
-        if (s.startsWith("https://")) s = s.substring(8);
-        if (s.startsWith("http://")) s = s.substring(7);
-        return s.replace('.', '_').replace('/', '_').replace(':', '_');
-    }
-    
     public ResumptionToken getResumptionToken() {
         return this.resumptionToken;
     }
diff --git a/source/net/yacy/kelondro/index/Cache.java b/source/net/yacy/kelondro/index/Cache.java
index 490aae327..88e783eb4 100644
--- a/source/net/yacy/kelondro/index/Cache.java
+++ b/source/net/yacy/kelondro/index/Cache.java
@@ -216,35 +216,36 @@ public final class Cache implements ObjectIndex, Iterable<Row.Entry> {
     public final synchronized boolean has(final byte[] key) {
         // first look into the miss cache
         if (readMissCache != null) {
-            if (readMissCache.get(key) == null) {
-                this.hasnotMiss++;
-            } else {
+            if (readMissCache.has(key)) {
                 this.hasnotHit++;
                 return false;
+            } else {
+                this.hasnotMiss++;
             }
         }
 
         // then try the hit cache and the buffers
         if (readHitCache != null) {
-            if (readHitCache.get(key) != null) {
+            if (readHitCache.has(key)) {
                 this.readHit++;
                 return true;
+            } else {
+                this.readMiss++;
             }
         }
         
         // finally ask the back-end index
-        this.readMiss++;
         return index.has(key);
     }
     
     public final synchronized Row.Entry get(final byte[] key) throws IOException {
         // first look into the miss cache
         if (readMissCache != null) {
-            if (readMissCache.get(key) == null) {
-                this.hasnotMiss++;
-            } else {
+            if (readMissCache.has(key)) {
                 this.hasnotHit++;
                 return null;
+            } else {
+                this.hasnotMiss++;
             }
         }