From 83009d86f75c1502b1f4f1d6b7710d177480e33b Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 17 Jan 2012 01:53:08 +0100 Subject: [PATCH] added the vocabulary navigator. It can be very simply tested by switching on the locale dictionaries. --- htroot/yacy/search.java | 4 + htroot/yacysearch.html | 3 + htroot/yacysearch.java | 59 +++++++++++++-- htroot/yacysearchtrailer.html | 9 +++ htroot/yacysearchtrailer.java | 73 +++++++++++++++++++ htroot/yacysearchtrailer.json | 15 +++- htroot/yacysearchtrailer.xml | 17 +++-- source/net/yacy/document/Autotagging.java | 62 +++++++++++++--- source/net/yacy/search/query/QueryParams.java | 9 +++ source/net/yacy/search/query/RWIProcess.java | 63 ++++++++++++---- source/net/yacy/search/query/SearchEvent.java | 4 + .../net/yacy/search/query/SnippetProcess.java | 2 +- 12 files changed, 280 insertions(+), 40 deletions(-) diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 7dd686a1b..bf89f94ae 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -30,6 +30,7 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.TreeMap; @@ -44,6 +45,7 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.sorting.ScoreMap; import net.yacy.cora.sorting.WeakPriorityBlockingQueue; +import net.yacy.document.Autotagging.Metatag; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceFactory; @@ -234,6 +236,7 @@ public final class search { prefer, ContentDomain.contentdomParser(contentdom), language, + new HashSet(), "", // no navigation CacheStrategy.CACHEONLY, count, @@ -296,6 +299,7 @@ public final class search { prefer, ContentDomain.contentdomParser(contentdom), language, + new HashSet(), "", // no navigation CacheStrategy.CACHEONLY, count, diff 
--git a/htroot/yacysearch.html b/htroot/yacysearch.html index d61ac016b..d4a7685ec 100644 --- a/htroot/yacysearch.html +++ b/htroot/yacysearch.html @@ -88,6 +88,9 @@ $(function() { collapsible: true, header: "h3" }); + #{sidebarVocabulary}# + $("#sidebar#[vocabulary]#").accordion({}); + #{/sidebarVocabulary}# $("#sidebarDomains").accordion({}); $("#sidebarProtocols").accordion({}); $("#sidebarProtocols").accordion('activate', false); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 046706203..e8fb55c95 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -28,6 +28,8 @@ // if the shell's current path is HTROOT import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.Map; @@ -44,6 +46,8 @@ import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.services.federated.yacy.CacheStrategy; +import net.yacy.document.Autotagging.Metatag; +import net.yacy.document.Autotagging.Vocabulary; import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.document.LibraryProvider; @@ -81,8 +85,7 @@ import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.server.servletProperties; -public class yacysearch -{ +public class yacysearch { public static serverObjects respond( final RequestHeader header, @@ -115,6 +118,15 @@ public class yacysearch final servletProperties prop = new servletProperties(); prop.put("topmenu", sb.getConfigBool("publicTopmenu", true) ? 
1 : 0); + // produce vocabulary navigation sidebars + Collection vocabularies = LibraryProvider.autotagging.getVocabularies(); + int j = 0; + for (Vocabulary v: vocabularies) { + prop.put("sidebarVocabulary_" + j + "_vocabulary", v.getName()); + j++; + } + prop.put("sidebarVocabulary", j); + // get segment Segment indexSegment = null; if ( post != null && post.containsKey("segment") ) { @@ -386,11 +398,13 @@ public class yacysearch urlmask = "smb://.*"; modifier.append("/smb "); } + if ( querystring.indexOf("/file", 0) >= 0 ) { querystring = querystring.replace("/file", ""); urlmask = "file://.*"; modifier.append("/file "); } + if ( querystring.indexOf("/location", 0) >= 0 ) { querystring = querystring.replace("/location", ""); if ( constraint == null ) { @@ -399,6 +413,7 @@ public class yacysearch constraint.set(Condenser.flag_cat_haslocation, true); modifier.append("/location "); } + final int lrp = querystring.indexOf("/language/", 0); String language = ""; if ( lrp >= 0 ) { @@ -407,8 +422,9 @@ public class yacysearch } querystring = querystring.replace("/language/" + language, ""); language = language.toLowerCase(); - modifier.append("/language/").append(language).append(" "); + modifier.append("/language/").append(language).append(' '); } + final int inurl = querystring.indexOf("inurl:", 0); if ( inurl >= 0 ) { int ftb = querystring.indexOf(' ', inurl); @@ -420,8 +436,9 @@ public class yacysearch if ( !urlstr.isEmpty() ) { urlmask = urlmask == null ? ".*" + urlstr + ".*" : urlmask + urlstr + ".*"; } - modifier.append("inurl:").append(urlstr).append(" "); + modifier.append("inurl:").append(urlstr).append(' '); } + final int filetype = querystring.indexOf("filetype:", 0); if ( filetype >= 0 ) { int ftb = querystring.indexOf(' ', filetype); @@ -440,8 +457,31 @@ public class yacysearch urlmask = urlmask + ".*\\." 
+ ft; } } - modifier.append("filetype:").append(ft).append(" "); + modifier.append("filetype:").append(ft).append(' '); } + + int voc = 0; + Collection metatags = new ArrayList(1); + while ((voc = querystring.indexOf("/vocabulary/", 0)) >= 0) { + String vocabulary = ""; + int ve = querystring.indexOf(' ', voc + 12); + if (ve < 0) { + vocabulary = querystring.substring(voc); + querystring = querystring.substring(0, voc).trim(); + } else { + vocabulary = querystring.substring(voc, ve); /* keep the leading '/' so that substring(12) below strips exactly "/vocabulary/" */ + querystring = querystring.substring(0, voc) + querystring.substring(ve); + } + modifier.append(vocabulary).append(' '); + vocabulary = vocabulary.substring(12); + int p = vocabulary.indexOf('/'); + if (p > 0) { + String k = vocabulary.substring(0, p); + String v = vocabulary.substring(p + 1); + metatags.add(LibraryProvider.autotagging.metatag(LibraryProvider.autotagging.prefixChar + k + ":" + v)); + } + } + String tenant = null; if ( post.containsKey("tenant") ) { tenant = post.get("tenant"); @@ -456,6 +496,7 @@ public class yacysearch } } } + final int site = querystring.indexOf("site:", 0); String sitehash = null; String sitehost = null; @@ -473,7 +514,7 @@ public class yacysearch sitehost = sitehost.substring(0, sitehost.length() - 1); } sitehash = DigestURI.hosthash(sitehost); - modifier.append("site:").append(sitehost).append(" "); + modifier.append("site:").append(sitehost).append(' '); } final int heuristicScroogle = querystring.indexOf("/heuristic/scroogle", 0); @@ -509,10 +550,11 @@ public class yacysearch } author = querystring.substring(authori + 7, ftb); querystring = querystring.replace("author:" + author, ""); - modifier.append("author:").append(author).append(" "); + modifier.append("author:").append(author).append(' '); } authorhash = ASCII.String(Word.word2hash(author)); } + final int tld = querystring.indexOf("tld:", 0); if ( tld >= 0 ) { int ftb = querystring.indexOf(' ', tld); @@ -521,7 +563,7 @@ public class yacysearch } String domain = 
querystring.substring(tld + 4, ftb); querystring = querystring.replace("tld:" + domain, ""); - modifier.append("tld:").append(domain).append(" "); + modifier.append("tld:").append(domain).append(' '); while ( domain.length() > 0 && domain.charAt(0) == '.' ) { domain = domain.substring(1); } @@ -695,6 +737,7 @@ public class yacysearch prefermask, contentdom, language, + metatags, navigation, snippetFetchStrategy, maximumRecords, diff --git a/htroot/yacysearchtrailer.html b/htroot/yacysearchtrailer.html index 9e37529b4..ab05703cb 100644 --- a/htroot/yacysearchtrailer.html +++ b/htroot/yacysearchtrailer.html @@ -61,6 +61,15 @@ #(/nav-authors)# +#{nav-vocabulary}# +
+

#[navname]# Navigator

+
    #{element}# +
  • #[url]#
  • +#{/element}#
+
+#{/nav-vocabulary}# + #(nav-about)#::

#[headline]#

diff --git a/htroot/yacysearchtrailer.java b/htroot/yacysearchtrailer.java index d7522b6fb..5bc71d92c 100644 --- a/htroot/yacysearchtrailer.java +++ b/htroot/yacysearchtrailer.java @@ -25,9 +25,11 @@ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA import java.util.Iterator; +import java.util.Map; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.sorting.ScoreMap; +import net.yacy.document.Autotagging; import net.yacy.document.LibraryProvider; import net.yacy.kelondro.util.EventTracker; import net.yacy.kelondro.util.Formatter; @@ -219,6 +221,77 @@ public class yacysearchtrailer { prop.put("nav-filetypes_element_" + i + "_nl", 0); } + // vocabulary navigators + final Map> vocabularyNavigators = theSearch.getVocabularyNavigators(); + if (vocabularyNavigators != null && vocabularyNavigators.size() > 0) { + int navvoccount = 0; + vocnav: for (Map.Entry> ve: vocabularyNavigators.entrySet()) { + String navname = ve.getKey(); + if (ve.getValue() == null || ve.getValue().isEmpty()) { + continue vocnav; + } + prop.put(fileType, "nav-vocabulary_" + navvoccount + "_navname", navname); + navigatorIterator = ve.getValue().keys(false); + int i = 0; + String anav; + while (i < 20 && navigatorIterator.hasNext()) { + name = navigatorIterator.next(); + count = ve.getValue().get(name); + anav = "/vocabulary/" + navname + "/" + Autotagging.encodePrintname(name); + prop.put(fileType, "nav-vocabulary_" + navvoccount + "_element_" + i + "_name", name); + prop.put("nav-vocabulary_" + navvoccount + "_element_" + i + "_url", "" + name + " (" + count + ")"); + prop.putJSON("nav-vocabulary_" + navvoccount + "_element_" + i + "_url-json", QueryParams.navurl("json", 0, theQuery, theQuery.queryStringForUrl() + "+" + anav, theQuery.urlMask.toString(), theQuery.navigators).toString()); + prop.put("nav-vocabulary_" + navvoccount + "_element_" + i + "_count", count); + prop.put(fileType, "nav-vocabulary_" + navvoccount + "_element_" + i + "_modifier", anav); 
+ prop.put("nav-vocabulary_" + navvoccount + "_element_" + i + "_nl", 1); + i++; + } + prop.put("nav-vocabulary_" + navvoccount + "_element", i); + i--; + prop.put("nav-vocabulary_" + navvoccount + "_element_" + i + "_nl", 0); + navvoccount++; + } + prop.put("nav-vocabulary", navvoccount); + } else { + prop.put("nav-vocabulary", 0); + } +/* +html +#{nav-vocabulary}# +
+

#[navname]# Navigator

+
    #{element}# +
  • #[url]#
  • +#{/element}#
+
+#{/nav-vocabulary}# + +xml +#{nav-vocabulary}# + +#{element}# + +#{/element}# + +#{/nav-vocabulary}# + +json +#{nav-vocabulary}# + { + "facetname": "#[navname]#", + "displayname": "#[navname]#", + "type": "String", + "min": "0", + "max": "0", + "mean": "0", + "elements": [ +#{element}# + {"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)# +#{/element}# + ] + },#{/nav-vocabulary}# + */ + // about box final String aboutBody = env.getConfig("about.body", ""); final String aboutHeadline = env.getConfig("about.headline", ""); diff --git a/htroot/yacysearchtrailer.json b/htroot/yacysearchtrailer.json index 379bdaef2..46ce7f2c9 100644 --- a/htroot/yacysearchtrailer.json +++ b/htroot/yacysearchtrailer.json @@ -63,7 +63,20 @@ {"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)# #{/element}# ] - },#(/nav-authors)##(nav-topics)#:: + },#(/nav-authors)##{nav-vocabulary}# + { + "facetname": "#[navname]#", + "displayname": "#[navname]#", + "type": "String", + "min": "0", + "max": "0", + "mean": "0", + "elements": [ +#{element}# + {"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)# +#{/element}# + ] + },#{/nav-vocabulary}##(nav-topics)#:: { "facetname": "topics", "displayname": "Topics", diff --git a/htroot/yacysearchtrailer.xml b/htroot/yacysearchtrailer.xml index 608338281..dab1438d6 100644 --- a/htroot/yacysearchtrailer.xml +++ b/htroot/yacysearchtrailer.xml @@ -7,40 +7,47 @@ #(/nav-domains)# #(nav-namespace)#:: - + #{element}# #{/element}# #(/nav-namespace)# #(nav-authors)#:: - + #{element}# #{/element}# #(/nav-authors)# #(nav-filetype)#:: - + #{element}# #{/element}# #(/nav-filetype)# #(nav-protocol)#:: - + #{element}# #{/element}# #(/nav-protocol)# #(nav-topics)#:: - + #{element}# #{/element}# #(/nav-topics)# +#{nav-vocabulary}# + +#{element}# + +#{/element}# + +#{/nav-vocabulary}# 
#[num-results_totalcount]# diff --git a/source/net/yacy/document/Autotagging.java b/source/net/yacy/document/Autotagging.java index 08b0dfeeb..e772e0c0c 100644 --- a/source/net/yacy/document/Autotagging.java +++ b/source/net/yacy/document/Autotagging.java @@ -24,6 +24,7 @@ import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.util.ArrayList; +import java.util.Collection; import java.util.HashSet; import java.util.Map; import java.util.Set; @@ -43,12 +44,12 @@ import net.yacy.kelondro.util.FileUtils; */ public class Autotagging { - final static Object PRESENT = new Object(); + private final static Object PRESENT = new Object(); - final char prefixChar; - final File autotaggingPath; - final Map vocabularies; - final Map allTags; + public final char prefixChar; + private final File autotaggingPath; + private final Map vocabularies; + private final Map allTags; public Autotagging(final File autotaggingPath, char prefixChar) { this.vocabularies = new ConcurrentHashMap(); @@ -92,6 +93,10 @@ public class Autotagging { } } + public Collection getVocabularies() { + return this.vocabularies.values(); + } + public Set allTags() { return this.allTags.keySet(); } @@ -138,7 +143,7 @@ public class Autotagging { word = normalizeWord(word); for (Map.Entry v: this.vocabularies.entrySet()) { tag = v.getValue().getMetatag(word); - if (tag != null) return tag.getMetatag(); + if (tag != null) return tag.toString(); } return null; } @@ -178,6 +183,11 @@ public class Autotagging { continue vocloop; } k = line.substring(0, p).trim(); + k = k.replaceAll(" \\+", ", "); // remove symbols that are bad in a query attribute + k = k.replaceAll(" /", ", "); + k = k.replaceAll("\\+", ","); + k = k.replaceAll("/", ","); + k = k.replaceAll(" ", " "); v = line.substring(p + 1); tags = v.split(","); tagloop: for (String t: tags) { @@ -238,6 +248,8 @@ public class Autotagging { private final static Pattern PATTERN_OE = Pattern.compile("\u00F6"); private final 
static Pattern PATTERN_UE = Pattern.compile("\u00FC"); private final static Pattern PATTERN_SZ = Pattern.compile("\u00DF"); + private final static Pattern PATTERN_UL = Pattern.compile("_"); + private final static Pattern PATTERN_SP = Pattern.compile(" "); private static final String normalizeWord(String word) { word = word.trim().toLowerCase(); @@ -255,12 +267,12 @@ public class Autotagging { this.vocName = vocName; this.print = print; } - public Metatag(String metatag) { + public Metatag(String metatag) throws RuntimeException { assert metatag.charAt(0) == Autotagging.this.prefixChar; int p = metatag.indexOf(':'); - assert p > 0; + if (p < 1) throw new RuntimeException("bad metatag: metatag = " + metatag); /* p == 0 would make substring(1, p) fail with an unhelpful index error */ this.vocName = metatag.substring(1, p); - this.print = metatag.substring(p + 1); + this.print = decodeMaskname(metatag.substring(p + 1)); } public String getVocabularyName() { return this.vocName; @@ -268,19 +280,45 @@ public class Autotagging { public String getPrintName() { return this.print; } - public String getMetatag() { - return Autotagging.this.prefixChar + this.vocName + ":" + this.print.replaceAll(" ", "_"); + @Override + public String toString() { + return Autotagging.this.prefixChar + this.vocName + ":" + encodePrintname(this.print); + } + @Override + public boolean equals(Object m) { + if (!(m instanceof Metatag)) return false; /* equals() must return false for null and foreign types instead of throwing */ + return this.vocName.equals(((Metatag) m).vocName) && this.print.equals(((Metatag) m).print); + } + @Override + public int hashCode() { + return this.vocName.hashCode() + this.print.hashCode(); } } + public static final String encodePrintname(String printname) { + return PATTERN_SP.matcher(printname).replaceAll("_"); + } + + public static final String decodeMaskname(String maskname) { + return PATTERN_UL.matcher(maskname).replaceAll(" "); + } + public Metatag metatag(String vocName, String print) { return new Metatag(vocName, print); } - public Metatag metatag(String metatag) { + public Metatag metatag(String metatag) throws RuntimeException { return new 
Metatag(metatag); } + public static boolean metatagAppearIn(final Metatag metatag, final String[] tags) { + String tag = metatag.toString(); + for (String s: tags) { + if (tag.equals(s)) return true; + } + return false; + } + public static void main(String[] args) { Autotagging a = new Autotagging(new File("DATA/DICTIONARIES/" + LibraryProvider.path_to_autotagging_dictionaries), '$'); for (Map.Entry entry: a.vocabularies.entrySet()) { diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index ec344512d..3a2502a43 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -28,6 +28,8 @@ package net.yacy.search.query; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; +import java.util.ArrayList; +import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.Map; @@ -42,6 +44,7 @@ import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.services.federated.yacy.CacheStrategy; +import net.yacy.document.Autotagging; import net.yacy.document.Condenser; import net.yacy.document.parser.html.AbstractScraper; import net.yacy.document.parser.html.CharacterCoding; @@ -113,6 +116,7 @@ public final class QueryParams { public final boolean urlMask_isCatchall, prefer_isMatchnothing; public final ContentDomain contentdom; public final String targetlang; + public final Collection metatags; public final String navigators; public final Searchdom domType; public final int zonecode; @@ -176,6 +180,7 @@ public final class QueryParams { this.itemsPerPage = itemsPerPage; this.offset = 0; this.targetlang = "en"; + this.metatags = new ArrayList(0); this.domType = Searchdom.LOCAL; this.zonecode = DigestURI.TLD_any_zone_filter; this.domMaxTargets = 0; @@ -205,6 +210,7 @@ public final class QueryParams { final String modifier, final int 
maxDistance, final String prefer, final ContentDomain contentdom, final String language, + final Collection metatags, final String navigators, final CacheStrategy snippetCacheStrategy, final int itemsPerPage, final int offset, final String urlMask, @@ -247,6 +253,7 @@ public final class QueryParams { this.prefer_isMatchnothing = this.prefer.toString().equals(matchnothing_pattern.toString()); assert language != null; this.targetlang = language; + this.metatags = metatags; this.navigators = navigators; this.domType = domType; this.zonecode = domainzone; @@ -506,6 +513,8 @@ public final class QueryParams { context.append(asterisk); context.append(this.maxDistance); context.append(asterisk); + context.append(this.modifier.s); + context.append(asterisk); context.append(this.snippetCacheStrategy == null ? "null" : this.snippetCacheStrategy.name()); if (anonymized) { this.idCacheAnon = context.toString(); diff --git a/source/net/yacy/search/query/RWIProcess.java b/source/net/yacy/search/query/RWIProcess.java index 6a0bda80e..54852962b 100644 --- a/source/net/yacy/search/query/RWIProcess.java +++ b/source/net/yacy/search/query/RWIProcess.java @@ -46,7 +46,10 @@ import net.yacy.cora.sorting.ConcurrentScoreMap; import net.yacy.cora.sorting.ScoreMap; import net.yacy.cora.sorting.WeakPriorityBlockingQueue; import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement; +import net.yacy.document.Autotagging; +import net.yacy.document.Autotagging.Metatag; import net.yacy.document.Condenser; +import net.yacy.document.LibraryProvider; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; @@ -101,6 +104,7 @@ public final class RWIProcess extends Thread private final ScoreMap namespaceNavigator; // a counter for name spaces private final ScoreMap protocolNavigator; // a counter for protocol types private final ScoreMap filetypeNavigator; // a counter for file types + private final Map> 
vocabularyNavigator; // counters for Vocabularies public RWIProcess(final QueryParams query, final ReferenceOrder order, final int maxentries, final boolean remote) { // we collect the urlhashes and construct a list with urlEntry objects @@ -132,6 +136,7 @@ public final class RWIProcess extends Thread this.namespaceNavigator = new ConcurrentScoreMap(); this.protocolNavigator = new ConcurrentScoreMap(); this.filetypeNavigator = new ConcurrentScoreMap(); + this.vocabularyNavigator = new ConcurrentHashMap>(); this.ref = new ConcurrentScoreMap(); this.feedersAlive = new AtomicInteger(0); this.feedersTerminated = new AtomicInteger(0); @@ -349,8 +354,7 @@ public final class RWIProcess extends Thread this.urlhashes.putUnique(iEntry.urlhash()); rankingtryloop: while ( true ) { try { - this.stack.put(new ReverseElement(iEntry, this.order - .cardinal(iEntry))); // inserts the element and removes the worst (which is smallest) + this.stack.put(new ReverseElement(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest) break rankingtryloop; } catch ( final ArithmeticException e ) { // this may happen if the concurrent normalizer changes values during cardinal computation @@ -482,8 +486,7 @@ public final class RWIProcess extends Thread m = this.doubleDomCache.get(hosthash); if ( m == null ) { // first appearance of dom. we create an entry to signal that one of that domain was already returned - m = - new WeakPriorityBlockingQueue((this.query.specialRights) + m = new WeakPriorityBlockingQueue((this.query.specialRights) ? 
maxDoubleDomSpecial : maxDoubleDomAll); this.doubleDomCache.put(hosthash, m); @@ -504,8 +507,7 @@ public final class RWIProcess extends Thread WeakPriorityBlockingQueue.Element bestEntry = null; WeakPriorityBlockingQueue.Element o; synchronized ( this.doubleDomCache ) { - final Iterator> i = - this.doubleDomCache.values().iterator(); + final Iterator> i = this.doubleDomCache.values().iterator(); while ( i.hasNext() ) { try { m = i.next(); @@ -557,10 +559,9 @@ public final class RWIProcess extends Thread final long timeout = System.currentTimeMillis() + Math.max(10, waitingtime); int p = -1; long timeleft; - while ( (timeleft = timeout - System.currentTimeMillis()) > 0 ) { + takeloop: while ( (timeleft = timeout - System.currentTimeMillis()) > 0 ) { //System.out.println("timeleft = " + timeleft); - final WeakPriorityBlockingQueue.Element obrwi = - takeRWI(skipDoubleDom, timeleft); + final WeakPriorityBlockingQueue.Element obrwi = takeRWI(skipDoubleDom, timeleft); if ( obrwi == null ) { return null; // all time was already wasted in takeRWI to get another element } @@ -635,6 +636,20 @@ public final class RWIProcess extends Thread continue; } + // check vocabulary constraint + final String tags = page.dc_subject(); + final String[] taglist = tags == null || tags.length() == 0 ? 
new String[0] : SPACE_PATTERN.split(page.dc_subject()); + if (this.query.metatags != null && this.query.metatags.size() > 0) { + // all metatags must appear in the tags list + for (Metatag metatag: this.query.metatags) { + if (!Autotagging.metatagAppearIn(metatag, taglist)) { + this.sortout++; + Log.logInfo("RWIProcess", "sorted out " + page.url()); + continue takeloop; + } + } + } + // evaluate information of metadata for navigation // author navigation: if ( pageauthor != null && pageauthor.length() > 0 ) { @@ -654,6 +669,12 @@ public final class RWIProcess extends Thread continue; } + // check Scanner + if ( !Scanner.acceptURL(page.url()) ) { + this.sortout++; + continue; + } + // namespace navigation String pagepath = page.url().getPath(); if ( (p = pagepath.indexOf(':')) >= 0 ) { @@ -675,10 +696,20 @@ public final class RWIProcess extends Thread this.filetypeNavigator.inc(fileext); } - // check Scanner - if ( !Scanner.acceptURL(page.url()) ) { - this.sortout++; - continue; + // vocabulary navigation + tagharvest: for (String tag: taglist) { + if (tag.length() < 1 || tag.charAt(0) != LibraryProvider.tagPrefix) continue tagharvest; + try { + Metatag metatag = LibraryProvider.autotagging.metatag(tag); + ScoreMap voc = this.vocabularyNavigator.get(metatag.getVocabularyName()); + if (voc == null) { + voc = new ConcurrentScoreMap(); + this.vocabularyNavigator.put(metatag.getVocabularyName(), voc); + } + voc.inc(metatag.getPrintName()); + } catch (RuntimeException e) { + // tag may not be well-formed + } } // accept url @@ -687,6 +718,8 @@ public final class RWIProcess extends Thread return null; } + final static Pattern SPACE_PATTERN = Pattern.compile(" "); + public int sizeQueue() { int c = this.stack.sizeQueue(); for ( final WeakPriorityBlockingQueue s : this.doubleDomCache.values() ) { @@ -818,6 +851,10 @@ public final class RWIProcess extends Thread return this.filetypeNavigator; } + public Map> getVocabularyNavigators() { + return this.vocabularyNavigator; + } 
+ public static final Comparator> mecomp = new Comparator>() { @Override diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 195c7287b..5e2887812 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -472,6 +472,10 @@ public final class SearchEvent return this.rankingProcess.getFiletypeNavigator(); } + public Map> getVocabularyNavigators() { + return this.rankingProcess.getVocabularyNavigators(); + } + public void addHeuristic(final byte[] urlhash, final String heuristicName, final boolean redundant) { synchronized ( this.heuristics ) { this.heuristics.put(urlhash, new HeuristicResult(urlhash, heuristicName, redundant)); diff --git a/source/net/yacy/search/query/SnippetProcess.java b/source/net/yacy/search/query/SnippetProcess.java index e0af0f363..0633405e4 100644 --- a/source/net/yacy/search/query/SnippetProcess.java +++ b/source/net/yacy/search/query/SnippetProcess.java @@ -481,7 +481,7 @@ public class SnippetProcess { } // get next entry - page = SnippetProcess.this.rankingProcess.takeURL(true, Math.min(100, this.timeout - System.currentTimeMillis())); + page = SnippetProcess.this.rankingProcess.takeURL(true, Math.min(500, Math.max(100, this.timeout - System.currentTimeMillis()))); //if (page != null) Log.logInfo("ResultFetcher", "got one page: " + page.metadata().url().toNormalform(true, false)); //if (page == null) page = rankedCache.takeURL(false, this.timeout - System.currentTimeMillis()); if (page == null) {