- fixed post-ranking including prefer mask

- enhanced a core database access method / less wasted ram

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6473 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent e9ab130ad7
commit 29fe436e36

@ -43,7 +43,7 @@ public class IndexImportOAIPMHList_p {
prop.put("source", 0);
if (post != null && post.containsKey("source")) {
Set<String> oaiRoots = OAIPMHImporter.getOAIServer(sb.loader);
Set<String> oaiRoots = OAIPMHImporter.getAllListedOAIServer(sb.loader);
boolean dark = false;
int cnt = 0;

@ -180,8 +180,8 @@ public class yacysearch {
originalUrlMask = ".*";
}
String prefermask = (post == null ? "" : post.get("prefermaskfilter", ""));
if ((prefermask.length() > 0) && (prefermask.indexOf(".*") < 0)) prefermask = ".*" + prefermask + ".*";
String prefermask = (post == null) ? "" : post.get("prefermaskfilter", "");
if (prefermask.length() > 0 && prefermask.indexOf(".*") < 0) prefermask = ".*" + prefermask + ".*";
Bitfield constraint = (post != null && post.containsKey("constraint") && post.get("constraint", "").length() > 0) ? new Bitfield(4, post.get("constraint", "______")) : null;
if (indexof) {

@ -35,7 +35,6 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
@ -579,6 +578,10 @@ public final class RankingProcess extends Thread {
}
};
/**
 * Hands out the map stored in {@code this.ref}.
 * The map itself is returned, not a copy, so changes made by the caller are
 * visible to this object.
 * NOTE(review): the Integer values look like per-word counters - confirm where ref is filled.
 *
 * @return the {@code ref} map (word to Integer)
 */
public Map<String, Integer> getTopics() {
return this.ref;
}
@SuppressWarnings("unchecked")
public ArrayList<NavigatorEntry> getTopicNavigator(final int count) {
// create a list of words that had been computed by statistics over all
@ -701,46 +704,5 @@ public final class RankingProcess extends Thread {
//System.out.println("NOT FOUND: " + urlHash);
return 15;
}
/**
 * Computes a ranking value for one result entry.
 * The value starts at {@code (255 - position) << 8} and is raised by bonuses for
 * media content, for matching the {@code query.prefer} pattern, for URL and title
 * components found in {@code topwords}, and for query word hashes found among the
 * URL and title components. Every bonus is scaled by a left shift with the
 * corresponding {@code query.ranking} coefficient.
 *
 * @param topwords words that earn a bonus when they occur as URL or title component
 * @param rentry   the result entry to evaluate
 * @param position position of the entry; larger positions start with a lower base value
 * @return the computed ranking value
 */
public long postRanking(
        final Set<String> topwords,
        final ResultEntry rentry,
        final int position) {

    // base value derived from the position
    long score = (255 - position) << 8;

    // for media search: prefer pages with many links
    if (query.contentdom == QueryParams.CONTENTDOM_IMAGE) {
        score += rentry.limage() << query.ranking.coeff_cathasimage;
    }
    if (query.contentdom == QueryParams.CONTENTDOM_AUDIO) {
        score += rentry.laudio() << query.ranking.coeff_cathasaudio;
    }
    if (query.contentdom == QueryParams.CONTENTDOM_VIDEO) {
        score += rentry.lvideo() << query.ranking.coeff_cathasvideo;
    }
    if (query.contentdom == QueryParams.CONTENTDOM_APP) {
        score += rentry.lapp() << query.ranking.coeff_cathasapp;
    }

    // prefer hit with 'prefer' pattern: URL and title are checked independently
    if (rentry.url().toNormalform(true, true).matches(query.prefer)) {
        score += 256 << query.ranking.coeff_prefer;
    }
    if (rentry.title().matches(query.prefer)) {
        score += 256 << query.ranking.coeff_prefer;
    }

    // apply 'common-sense' heuristic using references:
    // shorter URLs / titles earn a larger bonus per matching component
    final String urlstring = rentry.url().toNormalform(true, true);
    final String[] urlcomps = DigestURI.urlComps(urlstring);
    final String[] descrcomps = rentry.title().toLowerCase().split(DigestURI.splitrex);
    for (final String urlWord : urlcomps) {
        if (topwords.contains(urlWord)) {
            score += Math.max(1, 256 - urlstring.length()) << query.ranking.coeff_urlcompintoplist;
        }
    }
    for (final String titleWord : descrcomps) {
        if (topwords.contains(titleWord)) {
            score += Math.max(1, 256 - rentry.title().length()) << query.ranking.coeff_descrcompintoplist;
        }
    }

    // apply query-in-result matching
    final Set<byte[]> urlcomph = Word.words2hashSet(urlcomps);
    final Set<byte[]> descrcomph = Word.words2hashSet(descrcomps);
    for (final Iterator<byte[]> hashes = query.queryHashes.iterator(); hashes.hasNext();) {
        final byte[] queryhash = hashes.next();
        if (urlcomph.contains(queryhash)) {
            score += 256 << query.ranking.coeff_appurl;
        }
        if (descrcomph.contains(queryhash)) {
            score += 256 << query.ranking.coeff_app_dc_title;
        }
    }

    return score;
}
}

@ -29,10 +29,15 @@ package de.anomic.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.MemoryTracker;
import net.yacy.kelondro.util.SetTools;
@ -173,7 +178,12 @@ public class ResultFetcher {
// place the result to the result vector
if (!result.exists(resultEntry)) {
result.push(resultEntry, Long.valueOf(rankedCache.getOrder().cardinal(resultEntry.word())));
// apply post-ranking
long ranking = Long.valueOf(rankedCache.getOrder().cardinal(resultEntry.word()));
ranking += postRanking(resultEntry, rankedCache.getTopics());
result.push(resultEntry, ranking);
if (nav_topics) rankedCache.addTopics(resultEntry);
}
//System.out.println("DEBUG SNIPPET_LOADING: thread " + id + " got " + resultEntry.url());
@ -355,5 +365,49 @@ public class ResultFetcher {
}
return this.result.list(this.result.size());
}
/**
 * Computes an additive ranking bonus for a single result entry.
 * <p>
 * The bonus is the sum of:
 * <ul>
 * <li>the entry's {@code limage()}/{@code laudio()}/{@code lvideo()}/{@code lapp()} value
 *     when the query targets the corresponding content domain,</li>
 * <li>a fixed bonus for the normalized URL and another for the title when they
 *     fully match the regular expression in {@code query.prefer},</li>
 * <li>a bonus for every URL or title component that is a key of {@code topwords},
 *     sized by the mapped value (at least 1),</li>
 * <li>a fixed bonus for every query word hash found among the URL or title components.</li>
 * </ul>
 * Every part is scaled by a left shift with the matching {@code query.ranking} coefficient.
 *
 * @param rentry   the result entry to evaluate
 * @param topwords topic words mapped to the value used as bonus base
 * @return the bonus for this entry
 * @throws java.util.regex.PatternSyntaxException if {@code query.prefer} is a
 *         non-empty string that is not a valid regular expression
 */
public long postRanking(
        final ResultEntry rentry,
        final Map<String, Integer> topwords) {

    long r = 0;

    // for media search: prefer pages with many links
    if (query.contentdom == QueryParams.CONTENTDOM_IMAGE) r += rentry.limage() << query.ranking.coeff_cathasimage;
    if (query.contentdom == QueryParams.CONTENTDOM_AUDIO) r += rentry.laudio() << query.ranking.coeff_cathasaudio;
    if (query.contentdom == QueryParams.CONTENTDOM_VIDEO) r += rentry.lvideo() << query.ranking.coeff_cathasvideo;
    if (query.contentdom == QueryParams.CONTENTDOM_APP  ) r += rentry.lapp()   << query.ranking.coeff_cathasapp;

    // normalize the URL and fetch the title once; both are used by several steps below
    final String urlstring = rentry.url().toNormalform(true, true);
    final String title = rentry.title();

    // prefer hit with 'prefer' pattern.
    // An empty pattern means "no preference": without this guard an empty title
    // would fully match the empty pattern and receive the bonus by accident.
    final String prefer = query.prefer;
    if (prefer != null && prefer.length() > 0) {
        // compile once instead of letting String.matches compile the same regex twice
        final java.util.regex.Pattern preferPattern = java.util.regex.Pattern.compile(prefer);
        if (preferPattern.matcher(urlstring).matches()) r += 256 << query.ranking.coeff_prefer;
        if (preferPattern.matcher(title).matches()) r += 256 << query.ranking.coeff_prefer;
    }

    // apply 'common-sense' heuristic using references
    final String[] urlcomps = DigestURI.urlComps(urlstring);
    final String[] descrcomps = title.toLowerCase().split(DigestURI.splitrex);
    for (final String urlcomp : urlcomps) {
        final Integer tc = topwords.get(urlcomp);
        if (tc != null) r += Math.max(1, tc.intValue()) << query.ranking.coeff_urlcompintoplist;
    }
    for (final String descrcomp : descrcomps) {
        final Integer tc = topwords.get(descrcomp);
        if (tc != null) r += Math.max(1, tc.intValue()) << query.ranking.coeff_descrcompintoplist;
    }

    // apply query-in-result matching
    final Set<byte[]> urlcomph = Word.words2hashSet(urlcomps);
    final Set<byte[]> descrcomph = Word.words2hashSet(descrcomps);
    final Iterator<byte[]> shi = query.queryHashes.iterator();
    while (shi.hasNext()) {
        final byte[] queryhash = shi.next();
        if (urlcomph.contains(queryhash)) r += 256 << query.ranking.coeff_appurl;
        if (descrcomph.contains(queryhash)) r += 256 << query.ranking.coeff_app_dc_title;
    }

    return r;
}
}

@ -31,12 +31,17 @@ import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.document.parser.csvParser;
@ -133,7 +138,7 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
this.message = "loading first part of records";
while (true) {
try {
OAIPMHReader reader = new OAIPMHReader(this.loader, this.source, Switchboard.getSwitchboard().surrogatesInPath, "oaipmh");
OAIPMHReader reader = new OAIPMHReader(this.loader, this.source, Switchboard.getSwitchboard().surrogatesInPath, filenamePrefix);
this.chunkCount++;
this.recordsCount += reader.getResumptionToken().getRecordCounter();
this.source = reader.getResumptionToken().resumptionURL(this.source);
@ -170,7 +175,27 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
return 0;
}
public static Set<String> getOAIServer(LoaderDispatcher loader) {
/**
 * Produces the set of listed OAI-PMH servers minus those that were loaded recently.
 * A server counts as recently loaded when its latest load date is younger than
 * {@code staleLimit} milliseconds, measured from the current time.
 * NOTE(review): the keys of the loaded-server map are host ids; verify that
 * getAllListedOAIServer delivers entries in the same form, otherwise remove() never matches.
 *
 * @param loader        the loader used to retrieve the server list
 * @param surrogatesIn  first directory searched for already loaded surrogate files
 * @param surrogatesOut second directory searched for already loaded surrogate files
 * @param staleLimit    age in milliseconds after which a loaded server is offered again
 * @return the set of listed servers without the recently loaded ones
 */
public static Set<String> getUnloadedOAIServer(
        LoaderDispatcher loader,
        File surrogatesIn,
        File surrogatesOut,
        long staleLimit) {
    final Set<String> candidates = getAllListedOAIServer(loader);
    final Map<String, Date> lastLoad = getLoadedOAIServer(surrogatesIn, surrogatesOut);
    final long threshold = System.currentTimeMillis() - staleLimit;
    for (final String server : lastLoad.keySet()) {
        final boolean stale = lastLoad.get(server).getTime() <= threshold;
        if (stale) {
            continue; // loaded too long ago: keep it in the candidate set
        }
        candidates.remove(server);
    }
    return candidates;
}
/**
* use the list server at http://roar.eprints.org/index.php?action=csv
* to produce a list of OAI-PMH sources
* @param loader
* @return the list of oai-pmh sources
*/
public static Set<String> getAllListedOAIServer(LoaderDispatcher loader) {
TreeSet<String> list = new TreeSet<String>();
// read roar
@ -204,5 +229,66 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
return list;
}
/**
 * get a map for already loaded oai-pmh servers and their latest access date.
 * Both directories are scanned; if a host appears in both, the later date wins.
 * @param surrogatesIn
 * @param surrogatesOut
 * @return a map where the key is the hostID of the servers and the value is the last access date
 */
public static Map<String, Date> getLoadedOAIServer(File surrogatesIn, File surrogatesOut) {
    final Map<String, Date> map = getLoadedOAIServer(surrogatesOut);
    // merge entry by entry: casting the entry set to a Map (as done before) fails with a
    // ClassCastException, and a plain putAll could replace a newer date with an older one
    for (final Map.Entry<String, Date> entry : getLoadedOAIServer(surrogatesIn).entrySet()) {
        final Date known = map.get(entry.getKey());
        if (known == null || entry.getValue().after(known)) {
            map.put(entry.getKey(), entry.getValue());
        }
    }
    return map;
}
/**
 * Scans one directory for surrogate files written by the OAI-PMH import and
 * collects, per host id, the most recent date stamp found in the file names.
 * File names are expected to look like
 * {@code <filenamePrefix><sep><hostID><sep><17-digit date stamp>.xml},
 * for example {@code oaipmh.opus_bsz-bw_de.20091102113118728.xml}.
 * Names that do not fit this layout are ignored.
 *
 * @param surrogates the directory to scan
 * @return a map from host id to the latest date stamp; empty if the directory cannot be listed
 */
private static Map<String, Date> getLoadedOAIServer(File surrogates) {
    final HashMap<String, Date> map = new HashMap<String, Date>();
    // File.list() returns null when the path is not a directory or cannot be read
    final String[] names = (surrogates == null) ? null : surrogates.list();
    if (names == null) return map;

    final int headLength = filenamePrefix.length() + 1; // prefix plus separator
    final int tailLength = 22;                          // separator + 17-digit date stamp + ".xml"
    for (final String s : names) {
        // too short to hold prefix, separators, date stamp and extension: the index
        // arithmetic below would otherwise run out of bounds (e.g. for "oaipmh.xml")
        if (s.length() < headLength + tailLength) continue;
        if (!s.startsWith(filenamePrefix) || !s.endsWith(".xml")) continue;
        if (s.charAt(s.length() - tailLength) != filenameSeparationChar) continue;
        try {
            final Date fd = DateFormatter.parseShortMilliSecond(s.substring(s.length() - 21, s.length() - 4));
            final String hostID = s.substring(headLength, s.length() - tailLength);
            final Date md = map.get(hostID);
            if (md == null || fd.after(md)) map.put(hostID, fd);
        } catch (ParseException e) {
            // the date part is not a valid stamp: report and continue with the next file
            Log.logException(e);
        }
    }
    return map;
}
// replaces '.', '/' and ':' when hostID turns a source URL into a host id
public static final char hostReplacementChar = '_';
// separates prefix, host id and date stamp in the names built by filename4Source
public static final char filenameSeparationChar = '.';
// leading part of every file name built by filename4Source
public static final String filenamePrefix = "oaipmh";
/**
 * Derives the host id of a source; the same id serves as map key in getLoadedOAIServer.
 * The truncated URL of the source loses one trailing '?', then one trailing '/',
 * then a leading "https://" and a leading "http://"; finally every '.', '/' and ':'
 * is replaced by {@code hostReplacementChar}.
 * @param source the source url
 * @return a string that is a key for the given host
 */
public static final String hostID(DigestURI source) {
    String id = ResumptionToken.truncatedURL(source);

    // cut off the trailing query marker and path separator, in that order
    if (id.endsWith("?")) {
        id = id.substring(0, id.length() - 1);
    }
    if (id.endsWith("/")) {
        id = id.substring(0, id.length() - 1);
    }

    // cut off the protocol part
    if (id.startsWith("https://")) {
        id = id.substring("https://".length());
    }
    if (id.startsWith("http://")) {
        id = id.substring("http://".length());
    }

    // make the remainder free of characters that have a meaning in URLs and file names
    id = id.replace('.', hostReplacementChar);
    id = id.replace('/', hostReplacementChar);
    id = id.replace(':', hostReplacementChar);
    return id;
}
/**
 * Builds the surrogate file name for a source. The name consists of the
 * OAI-PMH file name prefix, the host id of the source and a date stamp of the
 * current time, joined by the file name separation character, followed by ".xml".
 * @param source the source url
 * @return a file name for the given source; every call yields a different name
 *         for the same host because the current date stamp is part of it
 */
public static final String filename4Source(DigestURI source) {
    final StringBuilder name = new StringBuilder();
    name.append(filenamePrefix);
    name.append(OAIPMHImporter.filenameSeparationChar);
    name.append(OAIPMHImporter.hostID(source));
    name.append(OAIPMHImporter.filenameSeparationChar);
    name.append(DateFormatter.formatShortMilliSecond(new Date()));
    name.append(".xml");
    return name.toString();
}
}

@ -29,10 +29,8 @@ package net.yacy.document.importer;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.LoaderDispatcher;
@ -55,13 +53,11 @@ public class OAIPMHReader {
this.source = source;
// load the file from the net
Response response;
response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
Response response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
byte[] b = response.getContent();
this.resumptionToken = new ResumptionToken(new ByteArrayInputStream(b));
String file = filePrefix + "." + filename4source(source) + "." + DateFormatter.formatShortMilliSecond(new Date());
File f0 = new File(targetDir, file + ".tmp");
File f1 = new File(targetDir, file + ".xml");
File f1 = new File(targetDir, OAIPMHImporter.filename4Source(source));
File f0 = new File(targetDir, f1.getName() + ".tmp");
// transaction-safe writing
FileUtils.copy(b, f0);
@ -81,15 +77,6 @@ public class OAIPMHReader {
*/
}
/**
 * Turns the truncated URL of a source into a string usable inside a file name:
 * one trailing '?' and then one trailing '/' are removed, a leading "https://"
 * and a leading "http://" are removed, and every '.', '/' and ':' becomes '_'.
 * @param source the source url
 * @return the file name part for the given source
 */
public static final String filename4source(DigestURI source) {
    String name = ResumptionToken.truncatedURL(source);

    // strip the tail first ...
    if (name.endsWith("?")) {
        name = name.substring(0, name.length() - 1);
    }
    if (name.endsWith("/")) {
        name = name.substring(0, name.length() - 1);
    }

    // ... then the protocol
    if (name.startsWith("https://")) {
        name = name.substring(8);
    }
    if (name.startsWith("http://")) {
        name = name.substring(7);
    }

    final String noDots = name.replace('.', '_');
    final String noSlashes = noDots.replace('/', '_');
    return noSlashes.replace(':', '_');
}
/**
 * Returns the resumption token stored in {@code this.resumptionToken}.
 *
 * @return the value of the {@code resumptionToken} field
 */
public ResumptionToken getResumptionToken() {
return this.resumptionToken;
}

@ -216,35 +216,36 @@ public final class Cache implements ObjectIndex, Iterable<Row.Entry> {
/**
 * Tells whether an entry for the given key exists.
 * The lookup goes through three stages: the read-miss cache (a key found there
 * is reported as absent), the read-hit cache (a key found there is reported as
 * present) and finally the back-end index. The hit/miss statistics counters are
 * updated along the way.
 *
 * @param key the key to look for
 * @return false if the miss cache knows the key, true if the hit cache knows it,
 *         otherwise the answer of {@code index.has(key)}
 */
public final synchronized boolean has(final byte[] key) {
// first look into the miss cache
if (readMissCache != null) {
if (readMissCache.get(key) == null) {
this.hasnotMiss++;
} else {
if (readMissCache.has(key)) {
this.hasnotHit++;
return false;
} else {
this.hasnotMiss++;
}
}
// then try the hit cache and the buffers
if (readHitCache != null) {
if (readHitCache.get(key) != null) {
if (readHitCache.has(key)) {
this.readHit++;
return true;
} else {
this.readMiss++;
}
}
// finally ask the back-end index
this.readMiss++;
return index.has(key);
}
public final synchronized Row.Entry get(final byte[] key) throws IOException {
// first look into the miss cache
if (readMissCache != null) {
if (readMissCache.get(key) == null) {
this.hasnotMiss++;
} else {
if (readMissCache.has(key)) {
this.hasnotHit++;
return null;
} else {
this.hasnotMiss++;
}
}

Loading…
Cancel
Save