diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index e526c2dc8..88c518e09 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -359,8 +359,8 @@ public class IndexControlRWIs_p { } private static plasmaSearchRankingProcess genSearchresult(serverObjects prop, plasmaSwitchboard sb, String keyhash, kelondroBitfield filter, int sortorder, boolean fetchURLs) { - plasmaSearchQuery query = new plasmaSearchQuery(keyhash, -1, filter); - plasmaSearchRankingProcess ranked = new plasmaSearchRankingProcess(sb.wordIndex, query, sb.getRanking(), sortorder, Integer.MAX_VALUE); + plasmaSearchQuery query = new plasmaSearchQuery(keyhash, -1, sb.getRanking(), filter); + plasmaSearchRankingProcess ranked = new plasmaSearchRankingProcess(sb.wordIndex, query, sortorder, Integer.MAX_VALUE); ranked.execQuery(fetchURLs); if (ranked.filteredCount() == 0) { diff --git a/htroot/Ranking_p.java b/htroot/Ranking_p.java index e95b4e947..419b0eb1f 100644 --- a/htroot/Ranking_p.java +++ b/htroot/Ranking_p.java @@ -101,9 +101,9 @@ public class Ranking_p { putRanking(prop, rankingProfile.postToExternalMap(prefix), prefix, "Post"); } - private static void putRanking(serverObjects prop, Map map, String prefix, String attrExtension) { + private static void putRanking(serverObjects prop, Map map, String prefix, String attrExtension) { prop.put("attr" + attrExtension, map.size()); - Iterator it = map.keySet().iterator(); + Iterator it = map.keySet().iterator(); String key; int i, j = 0; while (it.hasNext()) { diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 3cec2fc8d..19bb9b00d 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -36,9 +36,9 @@ import java.util.Set; import java.util.TreeSet; import de.anomic.http.httpHeader; +import de.anomic.index.indexContainer; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBitfield; -import de.anomic.index.indexContainer; import de.anomic.net.natLib; import de.anomic.plasma.plasmaProfiling; import de.anomic.plasma.plasmaSearchEvent; @@ -49,10 +49,10 @@ import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverProfiling; import de.anomic.server.serverSwitch; +import de.anomic.tools.crypt; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacyNetwork; import de.anomic.yacy.yacySeed; -import de.anomic.tools.crypt; public final class search { @@ -134,6 +134,9 @@ public final class search { final TreeSet excludehashes = (exclude.length() == 0) ? new TreeSet(kelondroBase64Order.enhancedCoder) : plasmaSearchQuery.hashes2Set(exclude); final long timestamp = System.currentTimeMillis(); + // prepare a search profile + plasmaSearchRankingProfile rankingProfile = (profile.length() == 0) ? new plasmaSearchRankingProfile(plasmaSearchQuery.contentdomParser(contentdom)) : new plasmaSearchRankingProfile("", profile); + // prepare an abstract result StringBuffer indexabstract = new StringBuffer(); int indexabstractContainercount = 0; @@ -143,7 +146,7 @@ public final class search { long urlRetrievalAllTime = 0, snippetComputationAllTime = 0; if ((query.length() == 0) && (abstractSet != null)) { // this is _not_ a normal search, only a request for index abstracts - theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, null, false); + theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), rankingProfile, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, null, false); theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links"); @@ -168,14 +171,12 @@ public final class search { prop.put("references", ""); } else { - // retrieve index containers from search request - theQuery = new plasmaSearchQuery(null, queryhashes, excludehashes, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, constraint, false); + theQuery = new plasmaSearchQuery(null, queryhashes, excludehashes, rankingProfile, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, constraint, false); theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links"); - // prepare a search profile - plasmaSearchRankingProfile rankingProfile = (profile.length() == 0) ? new plasmaSearchRankingProfile(plasmaSearchQuery.contentdomParser(contentdom)) : new plasmaSearchRankingProfile("", profile); + // make event plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(theQuery, rankingProfile, sb.wordIndex, null, true, abstractSet); urlRetrievalAllTime = theSearch.getURLRetrievalTime(); snippetComputationAllTime = theSearch.getSnippetComputationTime(); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index b2a238e2a..b529d5bb8 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -57,6 +57,7 @@ import de.anomic.plasma.plasmaCondenser; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaSearchEvent; import de.anomic.plasma.plasmaSearchQuery; +import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverCore; @@ -85,10 +86,10 @@ public class yacysearch { if (env.getConfigBool("promoteSearchPageGreeting.useNetworkName", false)) promoteSearchPageGreeting = env.getConfig("network.unit.description", ""); if (promoteSearchPageGreeting.length() == 0) promoteSearchPageGreeting = "P2P WEB SEARCH"; - // case if no values are requested + // get query String querystring = (post == null) ? "" : post.get("search", "").trim(); - boolean rss = (post == null) ? false : post.get("rss", "false").equals("true"); + boolean rss = (post == null) ? false : post.get("rss", "false").equals("true"); if ((post == null) || (env == null) || (querystring.length() == 0) || (!searchAllowed)) { /* // save referrer @@ -188,7 +189,16 @@ public class yacysearch { serverObjects prop = new serverObjects(); if (post.get("cat", "href").equals("href")) { - final TreeSet[] query = plasmaSearchQuery.cleanQuery(querystring); // converts also umlaute + final TreeSet[] query = plasmaSearchQuery.cleanQuery(querystring); // converts also umlaute + boolean near = (query[0].contains("near")) && (querystring.indexOf("NEAR") >= 0); + if (near) { + query[0].remove("near"); + } + plasmaSearchRankingProfile ranking = sb.getRanking(); + if (near) { + ranking.coeff_worddistance = plasmaSearchRankingProfile.COEFF_MAX; + } + // filter out stopwords final TreeSet filtered = kelondroMSetTools.joinConstructive(query[0], plasmaSwitchboard.stopwords); if (filtered.size() > 0) { @@ -250,6 +260,7 @@ public class yacysearch { querystring, queryHashes, plasmaCondenser.words2hashes(query[1]), + ranking, maxDistance, prefermask, contentdomCode, @@ -282,7 +293,7 @@ public class yacysearch { theQuery.setOffset(0); // in case that this is a new search, always start without a offset offset = 0; } - plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(theQuery, sb.getRanking(), sb.wordIndex, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, null); + plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(theQuery, ranking, sb.wordIndex, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, null); // generate result object serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER ORDERING OF SEARCH RESULTS: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds"); diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 9a44ba4ee..9446bf460 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -36,9 +36,8 @@ import de.anomic.http.httpHeader; import de.anomic.kelondro.kelondroMSetTools; import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.plasma.plasmaSearchEvent; -import de.anomic.plasma.plasmaSearchRankingProcess; import de.anomic.plasma.plasmaSearchQuery; -import de.anomic.plasma.plasmaSearchRankingProfile; +import de.anomic.plasma.plasmaSearchRankingProcess; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; @@ -82,7 +81,6 @@ public class yacysearchitem { return prop; } plasmaSearchQuery theQuery = theSearch.getQuery(); - plasmaSearchRankingProfile ranking = theSearch.getRanking(); // dynamically update count values if (!rss) { @@ -189,8 +187,8 @@ public class yacysearchitem { prop.put("content", theQuery.contentdom + 1); // switch on specific content prop.put("content_authorized", authenticated ? "1" : "0"); prop.put("content_authorized_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_ADD, "url", result.urlstring()) == null) ? "1" : "0"); - prop.put("content_authorized_recommend_deletelink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + result.hash() + "&urlmaskfilter=.*"); - prop.put("content_authorized_recommend_recommendlink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + result.hash() + "&urlmaskfilter=.*"); + prop.put("content_authorized_recommend_deletelink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + result.hash() + "&urlmaskfilter=.*"); + prop.put("content_authorized_recommend_recommendlink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + result.hash() + "&urlmaskfilter=.*"); prop.put("content_authorized_urlhash", result.hash()); prop.putHTML("content_description", result.title()); prop.put("content_url", result.urlstring()); diff --git a/lib/commons-collections.jar b/lib/commons-collections.jar deleted file mode 100644 index 5cc4f9062..000000000 Binary files a/lib/commons-collections.jar and /dev/null differ diff --git a/lib/commons-pool.jar b/lib/commons-pool.jar deleted file mode 100644 index 4ba534c90..000000000 Binary files a/lib/commons-pool.jar and /dev/null differ diff --git a/source/de/anomic/plasma/parser/vcf/vcfParser.java b/source/de/anomic/plasma/parser/vcf/vcfParser.java index e14004c52..2130faa9c 100644 --- a/source/de/anomic/plasma/parser/vcf/vcfParser.java +++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java @@ -53,8 +53,6 @@ import java.util.Hashtable; import java.util.Iterator; import java.util.LinkedList; -import org.apache.commons.codec.net.QuotedPrintableCodec; - import de.anomic.http.httpc; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.plasma.plasmaParserDocument; @@ -86,14 +84,14 @@ public class vcfParser extends AbstractParser implements Parser { * a list of library names that are needed by this parser * @see Parser#getLibxDependences() */ - private static final String[] LIBX_DEPENDENCIES = new String[] {"commons-codec-1.3.jar"}; + private static final String[] LIBX_DEPENDENCIES = new String[] {}; public vcfParser() { super(LIBX_DEPENDENCIES); this.parserName = "vCard Parser"; } - public Hashtable getSupportedMimeTypes() { + public Hashtable getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; } @@ -157,7 +155,7 @@ public class vcfParser extends AbstractParser implements Parser { value += line; } while (line.endsWith("=")); } - value = (new QuotedPrintableCodec()).decode(value); + value = decodeQuotedPrintable(value); } else if (encoding.equalsIgnoreCase("base64")) { do { line = inputReader.readLine(); @@ -183,7 +181,7 @@ public class vcfParser extends AbstractParser implements Parser { if (key.equalsIgnoreCase("END")) { String name = null, title = null; - // using the name of the current persion as section headline + // using the name of the current version as section headline if (parsedData.containsKey("FN")) { parsedNames.add(name = (String)parsedData.get("FN")); } else if (parsedData.containsKey("N")) { @@ -203,7 +201,7 @@ public class vcfParser extends AbstractParser implements Parser { // looping through the properties and add there values to // the text representation of the vCard - Iterator iter = parsedData.values().iterator(); + Iterator iter = parsedData.values().iterator(); while (iter.hasNext()) { value = (String) iter.next(); parsedDataText.append(value).append("\r\n"); @@ -266,6 +264,28 @@ public class vcfParser extends AbstractParser implements Parser { super.reset(); } + public static final String decodeQuotedPrintable(String s) { + if (s == null) return null; + byte[] b = s.getBytes(); + StringBuffer sb = new StringBuffer(); + for (int i = 0; i < b.length; i++) { + int c = b[i]; + if (c == '=') { + try { + int u = Character.digit((char) b[++i], 16); + int l = Character.digit((char) b[++i], 16); + if (u == -1 || l == -1) throw new RuntimeException("bad quoted-printable encoding"); + sb.append((char) ((u << 4) + l)); + } catch (ArrayIndexOutOfBoundsException e) { + throw new RuntimeException("bad quoted-printable encoding"); + } + } else { + sb.append((char) c); + } + } + return sb.toString(); + } + public static void main(String[] args) { try { yacyURL contentUrl = new yacyURL(args[0], null); diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index e9c464476..274de3d5d 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -886,14 +886,14 @@ public final class plasmaCondenser { return s; } - public static Map getWords(byte[] text, String charset) throws UnsupportedEncodingException { + public static Map getWords(byte[] text, String charset) throws UnsupportedEncodingException { // returns a word/wordStatProp relation map if (text == null) return null; ByteArrayInputStream buffer = new ByteArrayInputStream(text); return new plasmaCondenser(buffer, charset, 2, 1).words(); } - public static Map getWords(String text) { + public static Map getWords(String text) { // returns a word/wordStatProp relation map if (text == null) return null; ByteArrayInputStream buffer = new ByteArrayInputStream(text.getBytes()); @@ -905,7 +905,7 @@ public final class plasmaCondenser { } public static void main(String[] args) { - // read a property file and converty them into configuration lines + // read a property file and convert them into configuration lines try { File f = new File(args[0]); Properties p = new Properties(); diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 05e58d872..3418195f6 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -48,6 +48,7 @@ import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Set; + import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.htmlFilter.htmlFilterInputStream; @@ -812,8 +813,8 @@ public final class plasmaParser { } public static void main(String[] args) { - //javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java - //java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out + //javac -sourcepath source source/de/anomic/plasma/plasmaParser.java + //java -cp source de.anomic.plasma.plasmaParser bug.html bug.out httpc remote = null; try { Object content = null; diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index bb924cdea..1d0f94165 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -59,13 +59,12 @@ public final class plasmaSearchEvent { public static int workerThreadCount = 8; public static String lastEventID = ""; - private static HashMap lastEvents = new HashMap(); // a cache for objects from this class: re-use old search requests + private static HashMap lastEvents = new HashMap(); // a cache for objects from this class: re-use old search requests public static final long eventLifetime = 600000; // the time an event will stay in the cache, 10 Minutes private static final int max_results_preparation = 200; private long eventTime; private plasmaSearchQuery query; - private plasmaSearchRankingProfile ranking; private plasmaWordIndex wordIndex; private plasmaSearchRankingProcess rankedCache; // ordered search results, grows dynamically as all the query threads enrich this container private Map rcAbstracts; // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation @@ -85,7 +84,6 @@ public final class plasmaSearchEvent { private long snippetComputationAllTime; private plasmaSearchEvent(plasmaSearchQuery query, - plasmaSearchRankingProfile ranking, plasmaWordIndex wordIndex, TreeMap preselectedPeerHashes, boolean generateAbstracts, @@ -93,7 +91,6 @@ public final class plasmaSearchEvent { this.eventTime = System.currentTimeMillis(); // for lifetime check this.wordIndex = wordIndex; this.query = query; - this.ranking = ranking; this.rcAbstracts = (query.queryHashes.size() > 1) ? new TreeMap() : null; // generate abstracts only for combined searches this.primarySearchThreads = null; this.secondarySearchThreads = null; @@ -122,7 +119,7 @@ public final class plasmaSearchEvent { if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) || (query.domType == plasmaSearchQuery.SEARCHDOM_CLUSTERALL)) { // do a global search - this.rankedCache = new plasmaSearchRankingProcess(wordIndex, query, ranking, 2, max_results_preparation); + this.rankedCache = new plasmaSearchRankingProcess(wordIndex, query, 2, max_results_preparation); int fetchpeers = (int) (query.maximumTime / 500L); // number of target peers; means 10 peers in 10 seconds if (fetchpeers > 50) fetchpeers = 50; @@ -144,7 +141,7 @@ public final class plasmaSearchEvent { rcAbstracts, fetchpeers, plasmaSwitchboard.urlBlacklist, - ranking, + query.ranking, query.constraint, (query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ? null : preselectedPeerHashes); serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), "remote search thread start", this.primarySearchThreads.length, System.currentTimeMillis() - timer)); @@ -157,7 +154,7 @@ public final class plasmaSearchEvent { serverLog.logFine("SEARCH_EVENT", "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + primarySearchThreads.length + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds"); } else { // do a local search - this.rankedCache = new plasmaSearchRankingProcess(wordIndex, query, ranking, 2, max_results_preparation); + this.rankedCache = new plasmaSearchRankingProcess(wordIndex, query, 2, max_results_preparation); this.rankedCache.execQuery(true); this.localcount = this.rankedCache.filteredCount(); //plasmaWordIndex.Finding finding = wordIndex.retrieveURLs(query, false, 2, ranking, process); @@ -416,10 +413,6 @@ public final class plasmaSearchEvent { return query; } - public plasmaSearchRankingProfile getRanking() { - return ranking; - } - public yacySearch[] getPrimarySearchThreads() { return primarySearchThreads; } @@ -459,7 +452,7 @@ public final class plasmaSearchEvent { synchronized (lastEvents) { plasmaSearchEvent event = (plasmaSearchEvent) lastEvents.get(query.id(false)); if (event == null) { - event = new plasmaSearchEvent(query, ranking, wordIndex, preselectedPeerHashes, generateAbstracts, abstractSet); + event = new plasmaSearchEvent(query, wordIndex, preselectedPeerHashes, generateAbstracts, abstractSet); } else { //re-new the event time for this event, so it is not deleted next time too early event.eventTime = System.currentTimeMillis(); @@ -685,7 +678,7 @@ public final class plasmaSearchEvent { //System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " from words: " + words); secondarySearchThreads[c++] = yacySearch.secondaryRemoteSearch( words, "", urls, wordIndex, this.rankedCache, peer, plasmaSwitchboard.urlBlacklist, - ranking, query.constraint, preselectedPeerHashes); + query.ranking, query.constraint, preselectedPeerHashes); } } diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index 1e3809cd7..c15dbf46b 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -87,8 +87,12 @@ public final class plasmaSearchQuery { public kelondroBitfield constraint; public boolean allofconstraint; public boolean onlineSnippetFetch; + public plasmaSearchRankingProfile ranking; - public plasmaSearchQuery(String queryString, int lines, kelondroBitfield constraint) { + public plasmaSearchQuery(String queryString, + int lines, + plasmaSearchRankingProfile ranking, + kelondroBitfield constraint) { if ((queryString.length() == 12) && (kelondroBase64Order.enhancedCoder.wellformed(queryString.getBytes()))) { this.queryString = null; this.queryHashes = new TreeSet(); @@ -96,10 +100,11 @@ public final class plasmaSearchQuery { this.queryHashes.add(queryString); } else { this.queryString = queryString; - TreeSet[] cq = cleanQuery(queryString); + TreeSet[] cq = cleanQuery(queryString); this.queryHashes = plasmaCondenser.words2hashes(cq[0]); this.excludeHashes = plasmaCondenser.words2hashes(cq[1]); } + this.ranking = ranking; this.maxDistance = Integer.MAX_VALUE; this.prefer = ""; this.contentdom = CONTENTDOM_ALL; @@ -115,7 +120,10 @@ public final class plasmaSearchQuery { this.onlineSnippetFetch = false; } -public plasmaSearchQuery(String queryString, TreeSet queryHashes, TreeSet excludeHashes, int maxDistance, String prefer, int contentdom, +public plasmaSearchQuery( + String queryString, TreeSet queryHashes, TreeSet excludeHashes, + plasmaSearchRankingProfile ranking, + int maxDistance, String prefer, int contentdom, boolean onlineSnippetFetch, int lines, int offset, long maximumTime, String urlMask, int domType, String domGroupName, int domMaxTargets, @@ -123,6 +131,7 @@ public plasmaSearchQuery(String queryString, TreeSet queryHashes, TreeSet exclud this.queryString = queryString; this.queryHashes = queryHashes; this.excludeHashes = excludeHashes; + this.ranking = ranking; this.maxDistance = maxDistance; this.prefer = prefer; this.contentdom = contentdom; @@ -175,33 +184,33 @@ public plasmaSearchQuery(String queryString, TreeSet queryHashes, TreeSet exclud } public static TreeSet hashes2Set(String query) { - if (query == null) return new TreeSet(kelondroBase64Order.enhancedCoder); - final TreeSet keyhashes = new TreeSet(kelondroBase64Order.enhancedCoder); + if (query == null) return new TreeSet(kelondroBase64Order.enhancedCoder); + final TreeSet keyhashes = new TreeSet(kelondroBase64Order.enhancedCoder); for (int i = 0; i < (query.length() / yacySeedDB.commonHashLength); i++) { keyhashes.add(query.substring(i * yacySeedDB.commonHashLength, (i + 1) * yacySeedDB.commonHashLength)); } return keyhashes; } - public static String hashSet2hashString(Set hashes) { - Iterator i = hashes.iterator(); + public static String hashSet2hashString(Set hashes) { + Iterator i = hashes.iterator(); StringBuffer sb = new StringBuffer(hashes.size() * yacySeedDB.commonHashLength); - while (i.hasNext()) sb.append((String) i.next()); + while (i.hasNext()) sb.append(i.next()); return new String(sb); } - public static String anonymizedQueryHashes(Set hashes) { + public static String anonymizedQueryHashes(Set hashes) { // create a more anonymized representation of euqery hashes for logging - Iterator i = hashes.iterator(); + Iterator i = hashes.iterator(); StringBuffer sb = new StringBuffer(hashes.size() * (yacySeedDB.commonHashLength + 2) + 2); sb.append("["); String hash; if (i.hasNext()) { - hash = (String) i.next(); + hash = i.next(); sb.append(hash.substring(0, 3)).append("........."); } while (i.hasNext()) { - hash = (String) i.next(); + hash = i.next(); sb.append(", ").append(hash.substring(0, 3)).append("........."); } sb.append("]"); @@ -252,29 +261,29 @@ public plasmaSearchQuery(String queryString, TreeSet queryHashes, TreeSet exclud return this.queryString; } - public TreeSet[] queryWords() { + public TreeSet[] queryWords() { return cleanQuery(this.queryString); } - public void filterOut(Set blueList) { + public void filterOut(Set blueList) { // filter out words that appear in this set // this is applied to the queryHashes - TreeSet blues = plasmaCondenser.words2hashes(blueList); + TreeSet blues = plasmaCondenser.words2hashes(blueList); kelondroMSetTools.excludeDestructive(queryHashes, blues); } public String id(boolean anonymized) { // generate a string that identifies a search so results can be re-used in a cache if (anonymized) { - return anonymizedQueryHashes(this.queryHashes) + "-" + anonymizedQueryHashes(this.excludeHashes) + ":" + this.contentdom; + return anonymizedQueryHashes(this.queryHashes) + "-" + anonymizedQueryHashes(this.excludeHashes) + ":" + this.contentdom + "*" + this.ranking.toExternalString(); } else { - return hashSet2hashString(this.queryHashes) + "-" + hashSet2hashString(this.excludeHashes) + ":" + this.contentdom; + return hashSet2hashString(this.queryHashes) + "-" + hashSet2hashString(this.excludeHashes) + ":" + this.contentdom + this.ranking.toExternalString(); } } - public HashMap resultProfile(int searchcount, long searchtime, long urlretrieval, long snippetcomputation) { + public HashMap resultProfile(int searchcount, long searchtime, long urlretrieval, long snippetcomputation) { // generate statistics about search: query, time, etc - HashMap r = new HashMap(); + HashMap r = new HashMap(); r.put("queryhashes", queryHashes); r.put("querystring", queryString); r.put("querycount", new Integer(linesPerPage)); diff --git a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java index 73fe6bb21..b56f80ff3 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java @@ -56,7 +56,6 @@ public final class plasmaSearchRankingProcess { private HashMap> doubleDomCache; // key = domhash (6 bytes); value = TreeMap like sortedRWIEntries private HashMap handover; // key = urlhash, value = urlstring; used for double-check of urls that had been handed over to search process private plasmaSearchQuery query; - private plasmaSearchRankingProfile ranking; private int sortorder; private int filteredCount; private int maxentries; @@ -69,7 +68,7 @@ public final class plasmaSearchRankingProcess { private plasmaWordIndex wordIndex; private Map[] localSearchContainerMaps; - public plasmaSearchRankingProcess(plasmaWordIndex wordIndex, plasmaSearchQuery query, plasmaSearchRankingProfile ranking, int sortorder, int maxentries) { + public plasmaSearchRankingProcess(plasmaWordIndex wordIndex, plasmaSearchQuery query, int sortorder, int maxentries) { // we collect the urlhashes and construct a list with urlEntry objects // attention: if minEntries is too high, this method will not terminate within the maxTime // sortorder: 0 = hash, 1 = url, 2 = ranking @@ -80,7 +79,6 @@ public final class plasmaSearchRankingProcess { this.filteredCount = 0; this.order = null; this.query = query; - this.ranking = ranking; this.maxentries = maxentries; this.globalcount = 0; this.urlhashes = new HashMap(); @@ -170,7 +168,7 @@ public final class plasmaSearchRankingProcess { long timer = System.currentTimeMillis(); if (this.order == null) { - this.order = new indexRWIEntryOrder(ranking); + this.order = new indexRWIEntryOrder(query.ranking); } this.order.extend(container); serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.NORMALIZING, container.size(), System.currentTimeMillis() - timer)); @@ -463,42 +461,42 @@ public final class plasmaSearchRankingProcess { } public long postRanking( - Set topwords, + Set topwords, plasmaSearchEvent.ResultEntry rentry, int position) { long r = (255 - position) << 8; // for media search: prefer pages with many links - if (query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) r += rentry.limage() << ranking.coeff_cathasimage; - if (query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) r += rentry.laudio() << ranking.coeff_cathasaudio; - if (query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) r += rentry.lvideo() << ranking.coeff_cathasvideo; - if (query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) r += rentry.lapp() << ranking.coeff_cathasapp; + if (query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) r += rentry.limage() << query.ranking.coeff_cathasimage; + if (query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) r += rentry.laudio() << query.ranking.coeff_cathasaudio; + if (query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) r += rentry.lvideo() << query.ranking.coeff_cathasvideo; + if (query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) r += rentry.lapp() << query.ranking.coeff_cathasapp; // prefer hit with 'prefer' pattern - if (rentry.url().toNormalform(true, true).matches(query.prefer)) r += 256 << ranking.coeff_prefer; - if (rentry.title().matches(query.prefer)) r += 256 << ranking.coeff_prefer; + if (rentry.url().toNormalform(true, true).matches(query.prefer)) r += 256 << query.ranking.coeff_prefer; + if (rentry.title().matches(query.prefer)) r += 256 << query.ranking.coeff_prefer; // apply 'common-sense' heuristic using references String urlstring = rentry.url().toNormalform(true, true); String[] urlcomps = htmlFilterContentScraper.urlComps(urlstring); String[] descrcomps = rentry.title().toLowerCase().split(htmlFilterContentScraper.splitrex); for (int j = 0; j < urlcomps.length; j++) { - if (topwords.contains(urlcomps[j])) r += Math.max(1, 256 - urlstring.length()) << ranking.coeff_urlcompintoplist; + if (topwords.contains(urlcomps[j])) r += Math.max(1, 256 - urlstring.length()) << query.ranking.coeff_urlcompintoplist; } for (int j = 0; j < descrcomps.length; j++) { - if (topwords.contains(descrcomps[j])) r += Math.max(1, 256 - rentry.title().length()) << ranking.coeff_descrcompintoplist; + if (topwords.contains(descrcomps[j])) r += Math.max(1, 256 - rentry.title().length()) << query.ranking.coeff_descrcompintoplist; } // apply query-in-result matching - Set urlcomph = plasmaCondenser.words2hashSet(urlcomps); - Set descrcomph = plasmaCondenser.words2hashSet(descrcomps); - Iterator shi = query.queryHashes.iterator(); + Set urlcomph = plasmaCondenser.words2hashSet(urlcomps); + Set descrcomph = plasmaCondenser.words2hashSet(descrcomps); + Iterator shi = query.queryHashes.iterator(); String queryhash; while (shi.hasNext()) { - queryhash = (String) shi.next(); - if (urlcomph.contains(queryhash)) r += 256 << ranking.coeff_appurl; - if (descrcomph.contains(queryhash)) r += 256 << ranking.coeff_appdescr; + queryhash = shi.next(); + if (urlcomph.contains(queryhash)) r += 256 << query.ranking.coeff_appurl; + if (descrcomph.contains(queryhash)) r += 256 << query.ranking.coeff_appdescr; } return r; diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java index 75f808999..694b951ef 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java @@ -81,6 +81,10 @@ public class plasmaSearchRankingProfile { public static final String DESCRCOMPINTOPLIST = "descrcompintoplist"; public static final String PREFER = "prefer"; + // coefficient max/min values + public static final int COEFF_MIN = 0; + public static final int COEFF_MAX = 15; + public int coeff_domlength, coeff_ybr, coeff_date, coeff_wordsintitle, coeff_wordsintext, coeff_phrasesintext, coeff_llocal, coeff_lother, coeff_urllength, coeff_urlcomps, coeff_hitcount, @@ -127,7 +131,7 @@ public class plasmaSearchRankingProfile { this(plasmaSearchQuery.CONTENTDOM_TEXT); // set defaults if ((profile != null) && (profile.length() > 0)) { //parse external form - HashMap coeff = new HashMap(); + HashMap coeff = new HashMap(); String[] elts = ((profile.startsWith("{") && (profile.endsWith("}"))) ? profile.substring(1, profile.length() - 1) : profile).split(","); int p; int s = (prefix == null) ? 0 : prefix.length(); @@ -174,7 +178,7 @@ public class plasmaSearchRankingProfile { } } - private static int parseMap(HashMap coeff, String attr, int dflt) { + private static int parseMap(HashMap coeff, String attr, int dflt) { if (coeff.containsKey(attr)) try { return ((Integer) coeff.get(attr)).intValue(); } catch (NumberFormatException e) { @@ -188,14 +192,14 @@ public class plasmaSearchRankingProfile { return toExternalMap("").toString(); } - public Map toExternalMap(String prefix) { - Map ext = preToExternalMap(prefix); + public Map toExternalMap(String prefix) { + Map ext = preToExternalMap(prefix); ext.putAll(postToExternalMap(prefix)); return ext; } - public Map preToExternalMap(String prefix) { - Map ext = new HashMap(); + public Map preToExternalMap(String prefix) { + Map ext = new HashMap(); ext.put(prefix + DOMLENGTH, Integer.toString(coeff_domlength)); ext.put(prefix + YBR, Integer.toString(coeff_ybr)); ext.put(prefix + DATE, Integer.toString(coeff_date)); @@ -226,8 +230,8 @@ public class plasmaSearchRankingProfile { return ext; } - public Map postToExternalMap(String prefix) { - Map ext = new HashMap(); + public Map postToExternalMap(String prefix) { + Map ext = new HashMap(); ext.put(prefix + URLCOMPINTOPLIST, Integer.toString(coeff_urlcompintoplist)); ext.put(prefix + DESCRCOMPINTOPLIST, Integer.toString(coeff_descrcompintoplist)); ext.put(prefix + PREFER, Integer.toString(coeff_prefer)); @@ -235,14 +239,14 @@ public class plasmaSearchRankingProfile { } public String toExternalURLGet(String prefix) { - Iterator i = toExternalMap("").entrySet().iterator(); - Map.Entry entry; + Iterator> i = toExternalMap("").entrySet().iterator(); + Map.Entry entry; StringBuffer ext = new StringBuffer(); while (i.hasNext()) { - entry = (Map.Entry) i.next(); + entry = i.next(); ext.append("&"); ext.append(prefix); - ext.append((String) entry.getKey()); + ext.append(entry.getKey()); ext.append("="); ext.append(entry.getValue()); }