- added NEAR operator (must be written in UPPERCASE in search query)

- more generics
- removed unused commons classes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4310 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent 3e3d2e39a4
commit ecd7f8ba4e

@ -359,8 +359,8 @@ public class IndexControlRWIs_p {
}
private static plasmaSearchRankingProcess genSearchresult(serverObjects prop, plasmaSwitchboard sb, String keyhash, kelondroBitfield filter, int sortorder, boolean fetchURLs) {
plasmaSearchQuery query = new plasmaSearchQuery(keyhash, -1, filter);
plasmaSearchRankingProcess ranked = new plasmaSearchRankingProcess(sb.wordIndex, query, sb.getRanking(), sortorder, Integer.MAX_VALUE);
plasmaSearchQuery query = new plasmaSearchQuery(keyhash, -1, sb.getRanking(), filter);
plasmaSearchRankingProcess ranked = new plasmaSearchRankingProcess(sb.wordIndex, query, sortorder, Integer.MAX_VALUE);
ranked.execQuery(fetchURLs);
if (ranked.filteredCount() == 0) {

@ -101,9 +101,9 @@ public class Ranking_p {
putRanking(prop, rankingProfile.postToExternalMap(prefix), prefix, "Post");
}
private static void putRanking(serverObjects prop, Map map, String prefix, String attrExtension) {
private static void putRanking(serverObjects prop, Map<String, String> map, String prefix, String attrExtension) {
prop.put("attr" + attrExtension, map.size());
Iterator it = map.keySet().iterator();
Iterator<String> it = map.keySet().iterator();
String key;
int i, j = 0;
while (it.hasNext()) {

@ -36,9 +36,9 @@ import java.util.Set;
import java.util.TreeSet;
import de.anomic.http.httpHeader;
import de.anomic.index.indexContainer;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.index.indexContainer;
import de.anomic.net.natLib;
import de.anomic.plasma.plasmaProfiling;
import de.anomic.plasma.plasmaSearchEvent;
@ -49,10 +49,10 @@ import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverProfiling;
import de.anomic.server.serverSwitch;
import de.anomic.tools.crypt;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyNetwork;
import de.anomic.yacy.yacySeed;
import de.anomic.tools.crypt;
public final class search {
@ -134,6 +134,9 @@ public final class search {
final TreeSet excludehashes = (exclude.length() == 0) ? new TreeSet(kelondroBase64Order.enhancedCoder) : plasmaSearchQuery.hashes2Set(exclude);
final long timestamp = System.currentTimeMillis();
// prepare a search profile
plasmaSearchRankingProfile rankingProfile = (profile.length() == 0) ? new plasmaSearchRankingProfile(plasmaSearchQuery.contentdomParser(contentdom)) : new plasmaSearchRankingProfile("", profile);
// prepare an abstract result
StringBuffer indexabstract = new StringBuffer();
int indexabstractContainercount = 0;
@ -143,7 +146,7 @@ public final class search {
long urlRetrievalAllTime = 0, snippetComputationAllTime = 0;
if ((query.length() == 0) && (abstractSet != null)) {
// this is _not_ a normal search, only a request for index abstracts
theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, null, false);
theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), rankingProfile, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, null, false);
theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");
@ -168,14 +171,12 @@ public final class search {
prop.put("references", "");
} else {
// retrieve index containers from search request
theQuery = new plasmaSearchQuery(null, queryhashes, excludehashes, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, constraint, false);
theQuery = new plasmaSearchQuery(null, queryhashes, excludehashes, rankingProfile, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, constraint, false);
theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");
// prepare a search profile
plasmaSearchRankingProfile rankingProfile = (profile.length() == 0) ? new plasmaSearchRankingProfile(plasmaSearchQuery.contentdomParser(contentdom)) : new plasmaSearchRankingProfile("", profile);
// make event
plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(theQuery, rankingProfile, sb.wordIndex, null, true, abstractSet);
urlRetrievalAllTime = theSearch.getURLRetrievalTime();
snippetComputationAllTime = theSearch.getSnippetComputationTime();

@ -57,6 +57,7 @@ import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
@ -85,10 +86,10 @@ public class yacysearch {
if (env.getConfigBool("promoteSearchPageGreeting.useNetworkName", false)) promoteSearchPageGreeting = env.getConfig("network.unit.description", "");
if (promoteSearchPageGreeting.length() == 0) promoteSearchPageGreeting = "P2P WEB SEARCH";
// case if no values are requested
// get query
String querystring = (post == null) ? "" : post.get("search", "").trim();
boolean rss = (post == null) ? false : post.get("rss", "false").equals("true");
boolean rss = (post == null) ? false : post.get("rss", "false").equals("true");
if ((post == null) || (env == null) || (querystring.length() == 0) || (!searchAllowed)) {
/*
// save referrer
@ -188,7 +189,16 @@ public class yacysearch {
serverObjects prop = new serverObjects();
if (post.get("cat", "href").equals("href")) {
final TreeSet[] query = plasmaSearchQuery.cleanQuery(querystring); // converts also umlaute
final TreeSet<String>[] query = plasmaSearchQuery.cleanQuery(querystring); // converts also umlaute
boolean near = (query[0].contains("near")) && (querystring.indexOf("NEAR") >= 0);
if (near) {
query[0].remove("near");
}
plasmaSearchRankingProfile ranking = sb.getRanking();
if (near) {
ranking.coeff_worddistance = plasmaSearchRankingProfile.COEFF_MAX;
}
// filter out stopwords
final TreeSet filtered = kelondroMSetTools.joinConstructive(query[0], plasmaSwitchboard.stopwords);
if (filtered.size() > 0) {
@ -250,6 +260,7 @@ public class yacysearch {
querystring,
queryHashes,
plasmaCondenser.words2hashes(query[1]),
ranking,
maxDistance,
prefermask,
contentdomCode,
@ -282,7 +293,7 @@ public class yacysearch {
theQuery.setOffset(0); // in case that this is a new search, always start without a offset
offset = 0;
}
plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(theQuery, sb.getRanking(), sb.wordIndex, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, null);
plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(theQuery, ranking, sb.wordIndex, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, null);
// generate result object
serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER ORDERING OF SEARCH RESULTS: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds");

@ -36,9 +36,8 @@ import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchRankingProcess;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSearchRankingProcess;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
@ -82,7 +81,6 @@ public class yacysearchitem {
return prop;
}
plasmaSearchQuery theQuery = theSearch.getQuery();
plasmaSearchRankingProfile ranking = theSearch.getRanking();
// dynamically update count values
if (!rss) {
@ -189,8 +187,8 @@ public class yacysearchitem {
prop.put("content", theQuery.contentdom + 1); // switch on specific content
prop.put("content_authorized", authenticated ? "1" : "0");
prop.put("content_authorized_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_ADD, "url", result.urlstring()) == null) ? "1" : "0");
prop.put("content_authorized_recommend_deletelink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + result.hash() + "&urlmaskfilter=.*");
prop.put("content_authorized_recommend_recommendlink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + result.hash() + "&urlmaskfilter=.*");
prop.put("content_authorized_recommend_deletelink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + result.hash() + "&urlmaskfilter=.*");
prop.put("content_authorized_recommend_recommendlink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + result.hash() + "&urlmaskfilter=.*");
prop.put("content_authorized_urlhash", result.hash());
prop.putHTML("content_description", result.title());
prop.put("content_url", result.urlstring());

Binary file not shown.

Binary file not shown.

@ -53,8 +53,6 @@ import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedList;
import org.apache.commons.codec.net.QuotedPrintableCodec;
import de.anomic.http.httpc;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.plasma.plasmaParserDocument;
@ -86,14 +84,14 @@ public class vcfParser extends AbstractParser implements Parser {
* a list of library names that are needed by this parser
* @see Parser#getLibxDependences()
*/
private static final String[] LIBX_DEPENDENCIES = new String[] {"commons-codec-1.3.jar"};
private static final String[] LIBX_DEPENDENCIES = new String[] {};
public vcfParser() {
super(LIBX_DEPENDENCIES);
this.parserName = "vCard Parser";
}
public Hashtable getSupportedMimeTypes() {
public Hashtable<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
@ -157,7 +155,7 @@ public class vcfParser extends AbstractParser implements Parser {
value += line;
} while (line.endsWith("="));
}
value = (new QuotedPrintableCodec()).decode(value);
value = decodeQuotedPrintable(value);
} else if (encoding.equalsIgnoreCase("base64")) {
do {
line = inputReader.readLine();
@ -183,7 +181,7 @@ public class vcfParser extends AbstractParser implements Parser {
if (key.equalsIgnoreCase("END")) {
String name = null, title = null;
// using the name of the current persion as section headline
// using the name of the current version as section headline
if (parsedData.containsKey("FN")) {
parsedNames.add(name = (String)parsedData.get("FN"));
} else if (parsedData.containsKey("N")) {
@ -203,7 +201,7 @@ public class vcfParser extends AbstractParser implements Parser {
// looping through the properties and add their values to
// the text representation of the vCard
Iterator iter = parsedData.values().iterator();
Iterator<String> iter = parsedData.values().iterator();
while (iter.hasNext()) {
value = (String) iter.next();
parsedDataText.append(value).append("\r\n");
@ -266,6 +264,28 @@ public class vcfParser extends AbstractParser implements Parser {
super.reset();
}
/**
 * Decodes a quoted-printable encoded string (RFC 2045, section 6.7).
 * Replaces the former dependency on commons-codec's QuotedPrintableCodec.
 *
 * Fixes over the previous hand-rolled version:
 * - operates on chars instead of platform-charset bytes, so non-ASCII
 *   input is no longer corrupted by signed-byte sign extension
 * - an "=" directly before a line break (or at the very end of the input)
 *   is treated as a soft line break and dropped, as the QP spec requires
 *
 * @param s the quoted-printable encoded string; may be null
 * @return the decoded string, or null if s was null
 * @throws RuntimeException if an "=" escape is not followed by two hex digits
 */
public static final String decodeQuotedPrintable(String s) {
    if (s == null) return null;
    StringBuffer sb = new StringBuffer(s.length());
    for (int i = 0; i < s.length(); i++) {
        char c = s.charAt(i);
        if (c != '=') {
            sb.append(c);
            continue;
        }
        // soft line break: "=" at end of input or directly before CR/LF
        if (i + 1 >= s.length()) break;
        char next = s.charAt(i + 1);
        if (next == '\r' || next == '\n') {
            i++; // consume the CR or LF
            if (next == '\r' && i + 1 < s.length() && s.charAt(i + 1) == '\n') i++; // consume LF of CRLF
            continue;
        }
        // "=XY" hex escape: decode the two hex digits into one character
        if (i + 2 >= s.length()) throw new RuntimeException("bad quoted-printable encoding");
        int hi = Character.digit(s.charAt(i + 1), 16);
        int lo = Character.digit(s.charAt(i + 2), 16);
        if (hi == -1 || lo == -1) throw new RuntimeException("bad quoted-printable encoding");
        sb.append((char) ((hi << 4) | lo));
        i += 2;
    }
    return sb.toString();
}
public static void main(String[] args) {
try {
yacyURL contentUrl = new yacyURL(args[0], null);

@ -886,14 +886,14 @@ public final class plasmaCondenser {
return s;
}
public static Map getWords(byte[] text, String charset) throws UnsupportedEncodingException {
public static Map<String, wordStatProp> getWords(byte[] text, String charset) throws UnsupportedEncodingException {
// returns a word/wordStatProp relation map
if (text == null) return null;
ByteArrayInputStream buffer = new ByteArrayInputStream(text);
return new plasmaCondenser(buffer, charset, 2, 1).words();
}
public static Map getWords(String text) {
public static Map<String, wordStatProp> getWords(String text) {
// returns a word/wordStatProp relation map
if (text == null) return null;
ByteArrayInputStream buffer = new ByteArrayInputStream(text.getBytes());
@ -905,7 +905,7 @@ public final class plasmaCondenser {
}
public static void main(String[] args) {
// read a property file and converty them into configuration lines
// read a property file and convert them into configuration lines
try {
File f = new File(args[0]);
Properties p = new Properties();

@ -48,6 +48,7 @@ import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.htmlFilter.htmlFilterInputStream;
@ -812,8 +813,8 @@ public final class plasmaParser {
}
public static void main(String[] args) {
//javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java
//java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out
//javac -sourcepath source source/de/anomic/plasma/plasmaParser.java
//java -cp source de.anomic.plasma.plasmaParser bug.html bug.out
httpc remote = null;
try {
Object content = null;

@ -59,13 +59,12 @@ public final class plasmaSearchEvent {
public static int workerThreadCount = 8;
public static String lastEventID = "";
private static HashMap lastEvents = new HashMap(); // a cache for objects from this class: re-use old search requests
private static HashMap<String, plasmaSearchEvent> lastEvents = new HashMap<String, plasmaSearchEvent>(); // a cache for objects from this class: re-use old search requests
public static final long eventLifetime = 600000; // the time an event will stay in the cache, 10 Minutes
private static final int max_results_preparation = 200;
private long eventTime;
private plasmaSearchQuery query;
private plasmaSearchRankingProfile ranking;
private plasmaWordIndex wordIndex;
private plasmaSearchRankingProcess rankedCache; // ordered search results, grows dynamically as all the query threads enrich this container
private Map rcAbstracts; // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation
@ -85,7 +84,6 @@ public final class plasmaSearchEvent {
private long snippetComputationAllTime;
private plasmaSearchEvent(plasmaSearchQuery query,
plasmaSearchRankingProfile ranking,
plasmaWordIndex wordIndex,
TreeMap preselectedPeerHashes,
boolean generateAbstracts,
@ -93,7 +91,6 @@ public final class plasmaSearchEvent {
this.eventTime = System.currentTimeMillis(); // for lifetime check
this.wordIndex = wordIndex;
this.query = query;
this.ranking = ranking;
this.rcAbstracts = (query.queryHashes.size() > 1) ? new TreeMap() : null; // generate abstracts only for combined searches
this.primarySearchThreads = null;
this.secondarySearchThreads = null;
@ -122,7 +119,7 @@ public final class plasmaSearchEvent {
if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ||
(query.domType == plasmaSearchQuery.SEARCHDOM_CLUSTERALL)) {
// do a global search
this.rankedCache = new plasmaSearchRankingProcess(wordIndex, query, ranking, 2, max_results_preparation);
this.rankedCache = new plasmaSearchRankingProcess(wordIndex, query, 2, max_results_preparation);
int fetchpeers = (int) (query.maximumTime / 500L); // number of target peers; means 10 peers in 10 seconds
if (fetchpeers > 50) fetchpeers = 50;
@ -144,7 +141,7 @@ public final class plasmaSearchEvent {
rcAbstracts,
fetchpeers,
plasmaSwitchboard.urlBlacklist,
ranking,
query.ranking,
query.constraint,
(query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ? null : preselectedPeerHashes);
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), "remote search thread start", this.primarySearchThreads.length, System.currentTimeMillis() - timer));
@ -157,7 +154,7 @@ public final class plasmaSearchEvent {
serverLog.logFine("SEARCH_EVENT", "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + primarySearchThreads.length + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
} else {
// do a local search
this.rankedCache = new plasmaSearchRankingProcess(wordIndex, query, ranking, 2, max_results_preparation);
this.rankedCache = new plasmaSearchRankingProcess(wordIndex, query, 2, max_results_preparation);
this.rankedCache.execQuery(true);
this.localcount = this.rankedCache.filteredCount();
//plasmaWordIndex.Finding finding = wordIndex.retrieveURLs(query, false, 2, ranking, process);
@ -416,10 +413,6 @@ public final class plasmaSearchEvent {
return query;
}
public plasmaSearchRankingProfile getRanking() {
return ranking;
}
public yacySearch[] getPrimarySearchThreads() {
return primarySearchThreads;
}
@ -459,7 +452,7 @@ public final class plasmaSearchEvent {
synchronized (lastEvents) {
plasmaSearchEvent event = (plasmaSearchEvent) lastEvents.get(query.id(false));
if (event == null) {
event = new plasmaSearchEvent(query, ranking, wordIndex, preselectedPeerHashes, generateAbstracts, abstractSet);
event = new plasmaSearchEvent(query, wordIndex, preselectedPeerHashes, generateAbstracts, abstractSet);
} else {
//re-new the event time for this event, so it is not deleted next time too early
event.eventTime = System.currentTimeMillis();
@ -685,7 +678,7 @@ public final class plasmaSearchEvent {
//System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " from words: " + words);
secondarySearchThreads[c++] = yacySearch.secondaryRemoteSearch(
words, "", urls, wordIndex, this.rankedCache, peer, plasmaSwitchboard.urlBlacklist,
ranking, query.constraint, preselectedPeerHashes);
query.ranking, query.constraint, preselectedPeerHashes);
}
}

@ -87,8 +87,12 @@ public final class plasmaSearchQuery {
public kelondroBitfield constraint;
public boolean allofconstraint;
public boolean onlineSnippetFetch;
public plasmaSearchRankingProfile ranking;
public plasmaSearchQuery(String queryString, int lines, kelondroBitfield constraint) {
public plasmaSearchQuery(String queryString,
int lines,
plasmaSearchRankingProfile ranking,
kelondroBitfield constraint) {
if ((queryString.length() == 12) && (kelondroBase64Order.enhancedCoder.wellformed(queryString.getBytes()))) {
this.queryString = null;
this.queryHashes = new TreeSet<String>();
@ -96,10 +100,11 @@ public final class plasmaSearchQuery {
this.queryHashes.add(queryString);
} else {
this.queryString = queryString;
TreeSet[] cq = cleanQuery(queryString);
TreeSet<String>[] cq = cleanQuery(queryString);
this.queryHashes = plasmaCondenser.words2hashes(cq[0]);
this.excludeHashes = plasmaCondenser.words2hashes(cq[1]);
}
this.ranking = ranking;
this.maxDistance = Integer.MAX_VALUE;
this.prefer = "";
this.contentdom = CONTENTDOM_ALL;
@ -115,7 +120,10 @@ public final class plasmaSearchQuery {
this.onlineSnippetFetch = false;
}
public plasmaSearchQuery(String queryString, TreeSet queryHashes, TreeSet excludeHashes, int maxDistance, String prefer, int contentdom,
public plasmaSearchQuery(
String queryString, TreeSet<String> queryHashes, TreeSet<String> excludeHashes,
plasmaSearchRankingProfile ranking,
int maxDistance, String prefer, int contentdom,
boolean onlineSnippetFetch,
int lines, int offset, long maximumTime, String urlMask,
int domType, String domGroupName, int domMaxTargets,
@ -123,6 +131,7 @@ public plasmaSearchQuery(String queryString, TreeSet queryHashes, TreeSet exclud
this.queryString = queryString;
this.queryHashes = queryHashes;
this.excludeHashes = excludeHashes;
this.ranking = ranking;
this.maxDistance = maxDistance;
this.prefer = prefer;
this.contentdom = contentdom;
@ -175,33 +184,33 @@ public plasmaSearchQuery(String queryString, TreeSet queryHashes, TreeSet exclud
}
/**
 * Splits a concatenated string of word hashes into a sorted hash set.
 * The input is read as a sequence of fixed-length hashes
 * (yacySeedDB.commonHashLength characters each); a trailing remainder
 * shorter than one hash is ignored.
 *
 * NOTE(review): the pre-change raw-typed lines that the diff render left
 * interleaved here were removed; only the generic version remains.
 *
 * @param query concatenated hash string; may be null
 * @return set of the individual hashes, ordered by the Base64 coder;
 *         empty if query was null
 */
public static TreeSet<String> hashes2Set(String query) {
    if (query == null) return new TreeSet<String>(kelondroBase64Order.enhancedCoder);
    final TreeSet<String> keyhashes = new TreeSet<String>(kelondroBase64Order.enhancedCoder);
    for (int i = 0; i < (query.length() / yacySeedDB.commonHashLength); i++) {
        keyhashes.add(query.substring(i * yacySeedDB.commonHashLength, (i + 1) * yacySeedDB.commonHashLength));
    }
    return keyhashes;
}
public static String hashSet2hashString(Set hashes) {
Iterator i = hashes.iterator();
public static String hashSet2hashString(Set<String> hashes) {
Iterator<String> i = hashes.iterator();
StringBuffer sb = new StringBuffer(hashes.size() * yacySeedDB.commonHashLength);
while (i.hasNext()) sb.append((String) i.next());
while (i.hasNext()) sb.append(i.next());
return new String(sb);
}
public static String anonymizedQueryHashes(Set hashes) {
public static String anonymizedQueryHashes(Set<String> hashes) {
// create a more anonymized representation of query hashes for logging
Iterator i = hashes.iterator();
Iterator<String> i = hashes.iterator();
StringBuffer sb = new StringBuffer(hashes.size() * (yacySeedDB.commonHashLength + 2) + 2);
sb.append("[");
String hash;
if (i.hasNext()) {
hash = (String) i.next();
hash = i.next();
sb.append(hash.substring(0, 3)).append(".........");
}
while (i.hasNext()) {
hash = (String) i.next();
hash = i.next();
sb.append(", ").append(hash.substring(0, 3)).append(".........");
}
sb.append("]");
@ -252,29 +261,29 @@ public plasmaSearchQuery(String queryString, TreeSet queryHashes, TreeSet exclud
return this.queryString;
}
public TreeSet[] queryWords() {
public TreeSet<String>[] queryWords() {
return cleanQuery(this.queryString);
}
public void filterOut(Set blueList) {
public void filterOut(Set<String> blueList) {
// filter out words that appear in this set
// this is applied to the queryHashes
TreeSet blues = plasmaCondenser.words2hashes(blueList);
TreeSet<String> blues = plasmaCondenser.words2hashes(blueList);
kelondroMSetTools.excludeDestructive(queryHashes, blues);
}
/**
 * Generates a string that identifies this search so results can be
 * re-used from a cache. The ranking profile is part of the id because
 * a different ranking produces a different result order.
 *
 * NOTE(review): the anonymized form separates contentdom from the
 * ranking string with "*" while the plain form does not — presumably
 * intentional (the two forms are never compared), but worth confirming.
 *
 * @param anonymized if true, hashes are shortened for logging
 * @return cache identifier for this query
 */
public String id(boolean anonymized) {
    if (anonymized) {
        return anonymizedQueryHashes(this.queryHashes) + "-" + anonymizedQueryHashes(this.excludeHashes) + ":" + this.contentdom + "*" + this.ranking.toExternalString();
    } else {
        return hashSet2hashString(this.queryHashes) + "-" + hashSet2hashString(this.excludeHashes) + ":" + this.contentdom + this.ranking.toExternalString();
    }
}
public HashMap resultProfile(int searchcount, long searchtime, long urlretrieval, long snippetcomputation) {
public HashMap<String, Object> resultProfile(int searchcount, long searchtime, long urlretrieval, long snippetcomputation) {
// generate statistics about search: query, time, etc
HashMap r = new HashMap();
HashMap<String, Object> r = new HashMap<String, Object>();
r.put("queryhashes", queryHashes);
r.put("querystring", queryString);
r.put("querycount", new Integer(linesPerPage));

@ -56,7 +56,6 @@ public final class plasmaSearchRankingProcess {
private HashMap<String, TreeMap<Object, indexRWIEntry>> doubleDomCache; // key = domhash (6 bytes); value = TreeMap like sortedRWIEntries
private HashMap<String, String> handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to search process
private plasmaSearchQuery query;
private plasmaSearchRankingProfile ranking;
private int sortorder;
private int filteredCount;
private int maxentries;
@ -69,7 +68,7 @@ public final class plasmaSearchRankingProcess {
private plasmaWordIndex wordIndex;
private Map<String, indexContainer>[] localSearchContainerMaps;
public plasmaSearchRankingProcess(plasmaWordIndex wordIndex, plasmaSearchQuery query, plasmaSearchRankingProfile ranking, int sortorder, int maxentries) {
public plasmaSearchRankingProcess(plasmaWordIndex wordIndex, plasmaSearchQuery query, int sortorder, int maxentries) {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
// sortorder: 0 = hash, 1 = url, 2 = ranking
@ -80,7 +79,6 @@ public final class plasmaSearchRankingProcess {
this.filteredCount = 0;
this.order = null;
this.query = query;
this.ranking = ranking;
this.maxentries = maxentries;
this.globalcount = 0;
this.urlhashes = new HashMap<String, Object>();
@ -170,7 +168,7 @@ public final class plasmaSearchRankingProcess {
long timer = System.currentTimeMillis();
if (this.order == null) {
this.order = new indexRWIEntryOrder(ranking);
this.order = new indexRWIEntryOrder(query.ranking);
}
this.order.extend(container);
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.NORMALIZING, container.size(), System.currentTimeMillis() - timer));
@ -463,42 +461,42 @@ public final class plasmaSearchRankingProcess {
}
public long postRanking(
Set topwords,
Set<String> topwords,
plasmaSearchEvent.ResultEntry rentry,
int position) {
long r = (255 - position) << 8;
// for media search: prefer pages with many links
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) r += rentry.limage() << ranking.coeff_cathasimage;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) r += rentry.laudio() << ranking.coeff_cathasaudio;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) r += rentry.lvideo() << ranking.coeff_cathasvideo;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) r += rentry.lapp() << ranking.coeff_cathasapp;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) r += rentry.limage() << query.ranking.coeff_cathasimage;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) r += rentry.laudio() << query.ranking.coeff_cathasaudio;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) r += rentry.lvideo() << query.ranking.coeff_cathasvideo;
if (query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) r += rentry.lapp() << query.ranking.coeff_cathasapp;
// prefer hit with 'prefer' pattern
if (rentry.url().toNormalform(true, true).matches(query.prefer)) r += 256 << ranking.coeff_prefer;
if (rentry.title().matches(query.prefer)) r += 256 << ranking.coeff_prefer;
if (rentry.url().toNormalform(true, true).matches(query.prefer)) r += 256 << query.ranking.coeff_prefer;
if (rentry.title().matches(query.prefer)) r += 256 << query.ranking.coeff_prefer;
// apply 'common-sense' heuristic using references
String urlstring = rentry.url().toNormalform(true, true);
String[] urlcomps = htmlFilterContentScraper.urlComps(urlstring);
String[] descrcomps = rentry.title().toLowerCase().split(htmlFilterContentScraper.splitrex);
for (int j = 0; j < urlcomps.length; j++) {
if (topwords.contains(urlcomps[j])) r += Math.max(1, 256 - urlstring.length()) << ranking.coeff_urlcompintoplist;
if (topwords.contains(urlcomps[j])) r += Math.max(1, 256 - urlstring.length()) << query.ranking.coeff_urlcompintoplist;
}
for (int j = 0; j < descrcomps.length; j++) {
if (topwords.contains(descrcomps[j])) r += Math.max(1, 256 - rentry.title().length()) << ranking.coeff_descrcompintoplist;
if (topwords.contains(descrcomps[j])) r += Math.max(1, 256 - rentry.title().length()) << query.ranking.coeff_descrcompintoplist;
}
// apply query-in-result matching
Set urlcomph = plasmaCondenser.words2hashSet(urlcomps);
Set descrcomph = plasmaCondenser.words2hashSet(descrcomps);
Iterator shi = query.queryHashes.iterator();
Set<String> urlcomph = plasmaCondenser.words2hashSet(urlcomps);
Set<String> descrcomph = plasmaCondenser.words2hashSet(descrcomps);
Iterator<String> shi = query.queryHashes.iterator();
String queryhash;
while (shi.hasNext()) {
queryhash = (String) shi.next();
if (urlcomph.contains(queryhash)) r += 256 << ranking.coeff_appurl;
if (descrcomph.contains(queryhash)) r += 256 << ranking.coeff_appdescr;
queryhash = shi.next();
if (urlcomph.contains(queryhash)) r += 256 << query.ranking.coeff_appurl;
if (descrcomph.contains(queryhash)) r += 256 << query.ranking.coeff_appdescr;
}
return r;

@ -81,6 +81,10 @@ public class plasmaSearchRankingProfile {
public static final String DESCRCOMPINTOPLIST = "descrcompintoplist";
public static final String PREFER = "prefer";
// coefficient max/min values
public static final int COEFF_MIN = 0;
public static final int COEFF_MAX = 15;
public int
coeff_domlength, coeff_ybr, coeff_date, coeff_wordsintitle, coeff_wordsintext, coeff_phrasesintext,
coeff_llocal, coeff_lother, coeff_urllength, coeff_urlcomps, coeff_hitcount,
@ -127,7 +131,7 @@ public class plasmaSearchRankingProfile {
this(plasmaSearchQuery.CONTENTDOM_TEXT); // set defaults
if ((profile != null) && (profile.length() > 0)) {
//parse external form
HashMap coeff = new HashMap();
HashMap<String, Integer> coeff = new HashMap<String, Integer>();
String[] elts = ((profile.startsWith("{") && (profile.endsWith("}"))) ? profile.substring(1, profile.length() - 1) : profile).split(",");
int p;
int s = (prefix == null) ? 0 : prefix.length();
@ -174,7 +178,7 @@ public class plasmaSearchRankingProfile {
}
}
private static int parseMap(HashMap coeff, String attr, int dflt) {
private static int parseMap(HashMap<String, Integer> coeff, String attr, int dflt) {
if (coeff.containsKey(attr)) try {
return ((Integer) coeff.get(attr)).intValue();
} catch (NumberFormatException e) {
@ -188,14 +192,14 @@ public class plasmaSearchRankingProfile {
return toExternalMap("").toString();
}
public Map toExternalMap(String prefix) {
Map ext = preToExternalMap(prefix);
public Map<String, String> toExternalMap(String prefix) {
Map<String, String> ext = preToExternalMap(prefix);
ext.putAll(postToExternalMap(prefix));
return ext;
}
public Map preToExternalMap(String prefix) {
Map ext = new HashMap();
public Map<String, String> preToExternalMap(String prefix) {
Map<String, String> ext = new HashMap<String, String>();
ext.put(prefix + DOMLENGTH, Integer.toString(coeff_domlength));
ext.put(prefix + YBR, Integer.toString(coeff_ybr));
ext.put(prefix + DATE, Integer.toString(coeff_date));
@ -226,8 +230,8 @@ public class plasmaSearchRankingProfile {
return ext;
}
public Map postToExternalMap(String prefix) {
Map ext = new HashMap();
public Map<String, String> postToExternalMap(String prefix) {
Map<String, String> ext = new HashMap<String, String>();
ext.put(prefix + URLCOMPINTOPLIST, Integer.toString(coeff_urlcompintoplist));
ext.put(prefix + DESCRCOMPINTOPLIST, Integer.toString(coeff_descrcompintoplist));
ext.put(prefix + PREFER, Integer.toString(coeff_prefer));
@ -235,14 +239,14 @@ public class plasmaSearchRankingProfile {
}
public String toExternalURLGet(String prefix) {
Iterator i = toExternalMap("").entrySet().iterator();
Map.Entry entry;
Iterator<Map.Entry<String, String>> i = toExternalMap("").entrySet().iterator();
Map.Entry<String, String> entry;
StringBuffer ext = new StringBuffer();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
entry = i.next();
ext.append("&");
ext.append(prefix);
ext.append((String) entry.getKey());
ext.append(entry.getKey());
ext.append("=");
ext.append(entry.getValue());
}

Loading…
Cancel
Save