Added the vocabulary navigator. It can be tested very simply by
switching on the locale dictionaries.
pull/1/head
Michael Peter Christen 13 years ago
parent 37d43e5589
commit 83009d86f7

@ -30,6 +30,7 @@
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.TreeMap; import java.util.TreeMap;
@ -44,6 +45,7 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.cora.sorting.ScoreMap; import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue; import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
import net.yacy.document.Autotagging.Metatag;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceFactory; import net.yacy.kelondro.data.word.WordReferenceFactory;
@ -234,6 +236,7 @@ public final class search {
prefer, prefer,
ContentDomain.contentdomParser(contentdom), ContentDomain.contentdomParser(contentdom),
language, language,
new HashSet<Metatag>(),
"", // no navigation "", // no navigation
CacheStrategy.CACHEONLY, CacheStrategy.CACHEONLY,
count, count,
@ -296,6 +299,7 @@ public final class search {
prefer, prefer,
ContentDomain.contentdomParser(contentdom), ContentDomain.contentdomParser(contentdom),
language, language,
new HashSet<Metatag>(),
"", // no navigation "", // no navigation
CacheStrategy.CACHEONLY, CacheStrategy.CACHEONLY,
count, count,

@ -88,6 +88,9 @@ $(function() {
collapsible: true, collapsible: true,
header: "h3" header: "h3"
}); });
#{sidebarVocabulary}#
$("#sidebar#[vocabulary]#").accordion({});
#{/sidebarVocabulary}#
$("#sidebarDomains").accordion({}); $("#sidebarDomains").accordion({});
$("#sidebarProtocols").accordion({}); $("#sidebarProtocols").accordion({});
$("#sidebarProtocols").accordion('activate', false); $("#sidebarProtocols").accordion('activate', false);

@ -28,6 +28,8 @@
// if the shell's current path is HTROOT // if the shell's current path is HTROOT
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
@ -44,6 +46,8 @@ import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Autotagging.Metatag;
import net.yacy.document.Autotagging.Vocabulary;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.LibraryProvider; import net.yacy.document.LibraryProvider;
@ -81,8 +85,7 @@ import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
import de.anomic.server.servletProperties; import de.anomic.server.servletProperties;
public class yacysearch public class yacysearch {
{
public static serverObjects respond( public static serverObjects respond(
final RequestHeader header, final RequestHeader header,
@ -115,6 +118,15 @@ public class yacysearch
final servletProperties prop = new servletProperties(); final servletProperties prop = new servletProperties();
prop.put("topmenu", sb.getConfigBool("publicTopmenu", true) ? 1 : 0); prop.put("topmenu", sb.getConfigBool("publicTopmenu", true) ? 1 : 0);
// produce vocabulary navigation sidebars
// Writes one "sidebarVocabulary_<index>_vocabulary" property per vocabulary returned by
// LibraryProvider.autotagging.getVocabularies() and finally stores the number of written
// entries under "sidebarVocabulary".
Collection<Vocabulary> vocabularies = LibraryProvider.autotagging.getVocabularies();
// j is the running index of the next entry and, after the loop, the total entry count
int j = 0;
for (Vocabulary v: vocabularies) {
prop.put("sidebarVocabulary_" + j + "_vocabulary", v.getName());
j++;
}
prop.put("sidebarVocabulary", j);
// get segment // get segment
Segment indexSegment = null; Segment indexSegment = null;
if ( post != null && post.containsKey("segment") ) { if ( post != null && post.containsKey("segment") ) {
@ -386,11 +398,13 @@ public class yacysearch
urlmask = "smb://.*"; urlmask = "smb://.*";
modifier.append("/smb "); modifier.append("/smb ");
} }
if ( querystring.indexOf("/file", 0) >= 0 ) { if ( querystring.indexOf("/file", 0) >= 0 ) {
querystring = querystring.replace("/file", ""); querystring = querystring.replace("/file", "");
urlmask = "file://.*"; urlmask = "file://.*";
modifier.append("/file "); modifier.append("/file ");
} }
if ( querystring.indexOf("/location", 0) >= 0 ) { if ( querystring.indexOf("/location", 0) >= 0 ) {
querystring = querystring.replace("/location", ""); querystring = querystring.replace("/location", "");
if ( constraint == null ) { if ( constraint == null ) {
@ -399,6 +413,7 @@ public class yacysearch
constraint.set(Condenser.flag_cat_haslocation, true); constraint.set(Condenser.flag_cat_haslocation, true);
modifier.append("/location "); modifier.append("/location ");
} }
final int lrp = querystring.indexOf("/language/", 0); final int lrp = querystring.indexOf("/language/", 0);
String language = ""; String language = "";
if ( lrp >= 0 ) { if ( lrp >= 0 ) {
@ -407,8 +422,9 @@ public class yacysearch
} }
querystring = querystring.replace("/language/" + language, ""); querystring = querystring.replace("/language/" + language, "");
language = language.toLowerCase(); language = language.toLowerCase();
modifier.append("/language/").append(language).append(" "); modifier.append("/language/").append(language).append(' ');
} }
final int inurl = querystring.indexOf("inurl:", 0); final int inurl = querystring.indexOf("inurl:", 0);
if ( inurl >= 0 ) { if ( inurl >= 0 ) {
int ftb = querystring.indexOf(' ', inurl); int ftb = querystring.indexOf(' ', inurl);
@ -420,8 +436,9 @@ public class yacysearch
if ( !urlstr.isEmpty() ) { if ( !urlstr.isEmpty() ) {
urlmask = urlmask == null ? ".*" + urlstr + ".*" : urlmask + urlstr + ".*"; urlmask = urlmask == null ? ".*" + urlstr + ".*" : urlmask + urlstr + ".*";
} }
modifier.append("inurl:").append(urlstr).append(" "); modifier.append("inurl:").append(urlstr).append(' ');
} }
final int filetype = querystring.indexOf("filetype:", 0); final int filetype = querystring.indexOf("filetype:", 0);
if ( filetype >= 0 ) { if ( filetype >= 0 ) {
int ftb = querystring.indexOf(' ', filetype); int ftb = querystring.indexOf(' ', filetype);
@ -440,8 +457,31 @@ public class yacysearch
urlmask = urlmask + ".*\\." + ft; urlmask = urlmask + ".*\\." + ft;
} }
} }
modifier.append("filetype:").append(ft).append(" "); modifier.append("filetype:").append(ft).append(' ');
} }
// Extract every "/vocabulary/<name>/<term>" modifier from the query string.
// Each occurrence is removed from querystring, appended to the modifier string and,
// if it has the form <name>/<term>, converted into a Metatag search constraint.
int voc = 0;
Collection<Metatag> metatags = new ArrayList<Metatag>(1);
while ((voc = querystring.indexOf("/vocabulary/", 0)) >= 0) {
    String vocabulary;
    // 12 == "/vocabulary/".length(): look for the end of the token behind the prefix
    int ve = querystring.indexOf(' ', voc + 12);
    if (ve < 0) {
        // the modifier is the last token of the query
        vocabulary = querystring.substring(voc);
        querystring = querystring.substring(0, voc).trim();
    } else {
        // Cut the token INCLUDING its leading '/', exactly like the branch above.
        // Starting at voc + 1 made the substring(12) below swallow the first
        // character of the vocabulary name and dropped the '/' from the modifier.
        vocabulary = querystring.substring(voc, ve);
        querystring = querystring.substring(0, voc) + querystring.substring(ve);
    }
    modifier.append(vocabulary).append(' ');
    // strip the "/vocabulary/" prefix, leaving "<name>/<term>"
    vocabulary = vocabulary.substring(12);
    int p = vocabulary.indexOf('/');
    if (p > 0) {
        String k = vocabulary.substring(0, p);
        String v = vocabulary.substring(p + 1);
        metatags.add(LibraryProvider.autotagging.metatag(LibraryProvider.autotagging.prefixChar + k + ":" + v));
    }
}
String tenant = null; String tenant = null;
if ( post.containsKey("tenant") ) { if ( post.containsKey("tenant") ) {
tenant = post.get("tenant"); tenant = post.get("tenant");
@ -456,6 +496,7 @@ public class yacysearch
} }
} }
} }
final int site = querystring.indexOf("site:", 0); final int site = querystring.indexOf("site:", 0);
String sitehash = null; String sitehash = null;
String sitehost = null; String sitehost = null;
@ -473,7 +514,7 @@ public class yacysearch
sitehost = sitehost.substring(0, sitehost.length() - 1); sitehost = sitehost.substring(0, sitehost.length() - 1);
} }
sitehash = DigestURI.hosthash(sitehost); sitehash = DigestURI.hosthash(sitehost);
modifier.append("site:").append(sitehost).append(" "); modifier.append("site:").append(sitehost).append(' ');
} }
final int heuristicScroogle = querystring.indexOf("/heuristic/scroogle", 0); final int heuristicScroogle = querystring.indexOf("/heuristic/scroogle", 0);
@ -509,10 +550,11 @@ public class yacysearch
} }
author = querystring.substring(authori + 7, ftb); author = querystring.substring(authori + 7, ftb);
querystring = querystring.replace("author:" + author, ""); querystring = querystring.replace("author:" + author, "");
modifier.append("author:").append(author).append(" "); modifier.append("author:").append(author).append(' ');
} }
authorhash = ASCII.String(Word.word2hash(author)); authorhash = ASCII.String(Word.word2hash(author));
} }
final int tld = querystring.indexOf("tld:", 0); final int tld = querystring.indexOf("tld:", 0);
if ( tld >= 0 ) { if ( tld >= 0 ) {
int ftb = querystring.indexOf(' ', tld); int ftb = querystring.indexOf(' ', tld);
@ -521,7 +563,7 @@ public class yacysearch
} }
String domain = querystring.substring(tld + 4, ftb); String domain = querystring.substring(tld + 4, ftb);
querystring = querystring.replace("tld:" + domain, ""); querystring = querystring.replace("tld:" + domain, "");
modifier.append("tld:").append(domain).append(" "); modifier.append("tld:").append(domain).append(' ');
while ( domain.length() > 0 && domain.charAt(0) == '.' ) { while ( domain.length() > 0 && domain.charAt(0) == '.' ) {
domain = domain.substring(1); domain = domain.substring(1);
} }
@ -695,6 +737,7 @@ public class yacysearch
prefermask, prefermask,
contentdom, contentdom,
language, language,
metatags,
navigation, navigation,
snippetFetchStrategy, snippetFetchStrategy,
maximumRecords, maximumRecords,

@ -61,6 +61,15 @@
</div> </div>
#(/nav-authors)# #(/nav-authors)#
#{nav-vocabulary}#
<div id="sidebar#[navname]#" style="float: right; margin-top:5px; width: 220px;">
<h3 style="padding-left:25px;">#[navname]# Navigator</h3>
<div><ul style="padding-left: 0px;">#{element}#
<li>#[url]#</li>
#{/element}#</ul></div>
</div>
#{/nav-vocabulary}#
#(nav-about)#:: #(nav-about)#::
<div id="sidebarAbout" style="float: right; margin-top:5px; width: 220px;"> <div id="sidebarAbout" style="float: right; margin-top:5px; width: 220px;">
<h3 style="padding-left:25px;">#[headline]#</h3> <h3 style="padding-left:25px;">#[headline]#</h3>

@ -25,9 +25,11 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.util.Iterator; import java.util.Iterator;
import java.util.Map;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ScoreMap; import net.yacy.cora.sorting.ScoreMap;
import net.yacy.document.Autotagging;
import net.yacy.document.LibraryProvider; import net.yacy.document.LibraryProvider;
import net.yacy.kelondro.util.EventTracker; import net.yacy.kelondro.util.EventTracker;
import net.yacy.kelondro.util.Formatter; import net.yacy.kelondro.util.Formatter;
@ -219,6 +221,77 @@ public class yacysearchtrailer {
prop.put("nav-filetypes_element_" + i + "_nl", 0); prop.put("nav-filetypes_element_" + i + "_nl", 0);
} }
// vocabulary navigators
// For every vocabulary that has a non-empty score map, one "nav-vocabulary_<n>_..." entry is
// written to prop, holding at most 20 elements. If no navigator map is available or it is
// empty, "nav-vocabulary" is set to 0.
final Map<String, ScoreMap<String>> vocabularyNavigators = theSearch.getVocabularyNavigators();
if (vocabularyNavigators != null && vocabularyNavigators.size() > 0) {
// counts only the vocabularies that were actually written (empty ones are skipped)
int navvoccount = 0;
vocnav: for (Map.Entry<String, ScoreMap<String>> ve: vocabularyNavigators.entrySet()) {
String navname = ve.getKey();
if (ve.getValue() == null || ve.getValue().isEmpty()) {
continue vocnav;
}
prop.put(fileType, "nav-vocabulary_" + navvoccount + "_navname", navname);
// NOTE(review): keys(false) presumably iterates in descending score order - confirm against ScoreMap
navigatorIterator = ve.getValue().keys(false);
int i = 0;
String anav;
// emit at most 20 elements per vocabulary
while (i < 20 && navigatorIterator.hasNext()) {
name = navigatorIterator.next();
count = ve.getValue().get(name);
// query modifier that selects this term; spaces in the term are encoded by encodePrintname
anav = "/vocabulary/" + navname + "/" + Autotagging.encodePrintname(name);
prop.put(fileType, "nav-vocabulary_" + navvoccount + "_element_" + i + "_name", name);
// NOTE(review): name is concatenated into the anchor markup as-is and this put() has no fileType argument - confirm escaping is not needed
prop.put("nav-vocabulary_" + navvoccount + "_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, theQuery, theQuery.queryStringForUrl() + "+" + anav, theQuery.urlMask.toString(), theQuery.navigators).toString() + "\">" + name + " (" + count + ")</a>");
prop.putJSON("nav-vocabulary_" + navvoccount + "_element_" + i + "_url-json", QueryParams.navurl("json", 0, theQuery, theQuery.queryStringForUrl() + "+" + anav, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put("nav-vocabulary_" + navvoccount + "_element_" + i + "_count", count);
prop.put(fileType, "nav-vocabulary_" + navvoccount + "_element_" + i + "_modifier", anav);
prop.put("nav-vocabulary_" + navvoccount + "_element_" + i + "_nl", 1);
i++;
}
prop.put("nav-vocabulary_" + navvoccount + "_element", i);
// step back to the last written element and reset its "nl" flag to 0
// (presumably so that templates do not emit a separator after the final element)
i--;
prop.put("nav-vocabulary_" + navvoccount + "_element_" + i + "_nl", 0);
navvoccount++;
}
prop.put("nav-vocabulary", navvoccount);
} else {
prop.put("nav-vocabulary", 0);
}
/*
html
#{nav-vocabulary}#
<div id="sidebar#[navname]#" style="float: right; margin-top:5px; width: 220px;">
<h3 style="padding-left:25px;">#[navname]# Navigator</h3>
<div><ul style="padding-left: 0px;">#{element}#
<li>#[url]#</li>
#{/element}#</ul></div>
</div>
#{/nav-vocabulary}#
xml
#{nav-vocabulary}#
<yacy:facet name="#[navname]#" displayname="#[navname]#" type="String" min="0" max="0" mean="0">
#{element}#
<yacy:element name="#[name]#" count="#[count]#" modifier="#[modifier]#" />
#{/element}#
</yacy:facet>
#{/nav-vocabulary}#
json
#{nav-vocabulary}#
{
"facetname": "#[navname]#",
"displayname": "#[navname]#",
"type": "String",
"min": "0",
"max": "0",
"mean": "0",
"elements": [
#{element}#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)#
#{/element}#
]
},#{/nav-vocabulary}#
*/
// about box // about box
final String aboutBody = env.getConfig("about.body", ""); final String aboutBody = env.getConfig("about.body", "");
final String aboutHeadline = env.getConfig("about.headline", ""); final String aboutHeadline = env.getConfig("about.headline", "");

@ -63,7 +63,20 @@
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)# {"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)#
#{/element}# #{/element}#
] ]
},#(/nav-authors)##(nav-topics)#:: },#(/nav-authors)##{nav-vocabulary}#
{
"facetname": "#[navname]#",
"displayname": "#[navname]#",
"type": "String",
"min": "0",
"max": "0",
"mean": "0",
"elements": [
#{element}#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)#
#{/element}#
]
},#{/nav-vocabulary}##(nav-topics)#::
{ {
"facetname": "topics", "facetname": "topics",
"displayname": "Topics", "displayname": "Topics",

@ -7,40 +7,47 @@
</yacy:facet> </yacy:facet>
#(/nav-domains)# #(/nav-domains)#
#(nav-namespace)#:: #(nav-namespace)#::
<yacy:facet name="domains" displayname="Namespace" type="String" min="0" max="0" mean="0"> <yacy:facet name="namespace" displayname="Namespace" type="String" min="0" max="0" mean="0">
#{element}# #{element}#
<yacy:element name="#[name]#" count="#[count]#" modifier="#[modifier]#" /> <yacy:element name="#[name]#" count="#[count]#" modifier="#[modifier]#" />
#{/element}# #{/element}#
</yacy:facet> </yacy:facet>
#(/nav-namespace)# #(/nav-namespace)#
#(nav-authors)#:: #(nav-authors)#::
<yacy:facet name="domains" displayname="Authors" type="String" min="0" max="0" mean="0"> <yacy:facet name="authors" displayname="Authors" type="String" min="0" max="0" mean="0">
#{element}# #{element}#
<yacy:element name="#[name]#" count="#[count]#" modifier="#[modifier]#" /> <yacy:element name="#[name]#" count="#[count]#" modifier="#[modifier]#" />
#{/element}# #{/element}#
</yacy:facet> </yacy:facet>
#(/nav-authors)# #(/nav-authors)#
#(nav-filetype)#:: #(nav-filetype)#::
<yacy:facet name="domains" displayname="Filetypes" type="String" min="0" max="0" mean="0"> <yacy:facet name="filetypes" displayname="Filetypes" type="String" min="0" max="0" mean="0">
#{element}# #{element}#
<yacy:element name="#[name]#" count="#[count]#" modifier="#[modifier]#" /> <yacy:element name="#[name]#" count="#[count]#" modifier="#[modifier]#" />
#{/element}# #{/element}#
</yacy:facet> </yacy:facet>
#(/nav-filetype)# #(/nav-filetype)#
#(nav-protocol)#:: #(nav-protocol)#::
<yacy:facet name="domains" displayname="Protocols" type="String" min="0" max="0" mean="0"> <yacy:facet name="protocols" displayname="Protocols" type="String" min="0" max="0" mean="0">
#{element}# #{element}#
<yacy:element name="#[name]#" count="#[count]#" modifier="#[modifier]#" /> <yacy:element name="#[name]#" count="#[count]#" modifier="#[modifier]#" />
#{/element}# #{/element}#
</yacy:facet> </yacy:facet>
#(/nav-protocol)# #(/nav-protocol)#
#(nav-topics)#:: #(nav-topics)#::
<yacy:facet name="topwords" displayname="Topics" type="String" min="0" max="0" mean="0"> <yacy:facet name="topics" displayname="Topics" type="String" min="0" max="0" mean="0">
#{element}# #{element}#
<yacy:element name="#[name]#" count="#[count]#" modifier="#[modifier]#" /> <yacy:element name="#[name]#" count="#[count]#" modifier="#[modifier]#" />
#{/element}# #{/element}#
</yacy:facet> </yacy:facet>
#(/nav-topics)# #(/nav-topics)#
#{nav-vocabulary}#
<yacy:facet name="#[navname]#" displayname="#[navname]#" type="String" min="0" max="0" mean="0">
#{element}#
<yacy:element name="#[name]#" count="#[count]#" modifier="#[modifier]#" />
#{/element}#
</yacy:facet>
#{/nav-vocabulary}#
</yacy:navigation> </yacy:navigation>
<opensearch:totalResults>#[num-results_totalcount]#</opensearch:totalResults> <opensearch:totalResults>#[num-results_totalcount]#</opensearch:totalResults>

@ -24,6 +24,7 @@ import java.io.ByteArrayInputStream;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet; import java.util.HashSet;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
@ -43,12 +44,12 @@ import net.yacy.kelondro.util.FileUtils;
*/ */
public class Autotagging { public class Autotagging {
final static Object PRESENT = new Object(); private final static Object PRESENT = new Object();
final char prefixChar; public final char prefixChar;
final File autotaggingPath; private final File autotaggingPath;
final Map<String, Vocabulary> vocabularies; private final Map<String, Vocabulary> vocabularies;
final Map<String, Object> allTags; private final Map<String, Object> allTags;
public Autotagging(final File autotaggingPath, char prefixChar) { public Autotagging(final File autotaggingPath, char prefixChar) {
this.vocabularies = new ConcurrentHashMap<String, Vocabulary>(); this.vocabularies = new ConcurrentHashMap<String, Vocabulary>();
@ -92,6 +93,10 @@ public class Autotagging {
} }
} }
/**
 * Provides access to all vocabularies held by this instance.
 *
 * @return the value collection of the internal vocabulary map; it is handed out
 *         directly, not as a copy
 */
public Collection<Vocabulary> getVocabularies() {
    final Collection<Vocabulary> allVocabularies = this.vocabularies.values();
    return allVocabularies;
}
public Set<String> allTags() { public Set<String> allTags() {
return this.allTags.keySet(); return this.allTags.keySet();
} }
@ -138,7 +143,7 @@ public class Autotagging {
word = normalizeWord(word); word = normalizeWord(word);
for (Map.Entry<String, Vocabulary> v: this.vocabularies.entrySet()) { for (Map.Entry<String, Vocabulary> v: this.vocabularies.entrySet()) {
tag = v.getValue().getMetatag(word); tag = v.getValue().getMetatag(word);
if (tag != null) return tag.getMetatag(); if (tag != null) return tag.toString();
} }
return null; return null;
} }
@ -178,6 +183,11 @@ public class Autotagging {
continue vocloop; continue vocloop;
} }
k = line.substring(0, p).trim(); k = line.substring(0, p).trim();
k = k.replaceAll(" \\+", ", "); // remove symbols that are bad in a query attribute
k = k.replaceAll(" /", ", ");
k = k.replaceAll("\\+", ",");
k = k.replaceAll("/", ",");
k = k.replaceAll(" ", " ");
v = line.substring(p + 1); v = line.substring(p + 1);
tags = v.split(","); tags = v.split(",");
tagloop: for (String t: tags) { tagloop: for (String t: tags) {
@ -238,6 +248,8 @@ public class Autotagging {
private final static Pattern PATTERN_OE = Pattern.compile("\u00F6"); private final static Pattern PATTERN_OE = Pattern.compile("\u00F6");
private final static Pattern PATTERN_UE = Pattern.compile("\u00FC"); private final static Pattern PATTERN_UE = Pattern.compile("\u00FC");
private final static Pattern PATTERN_SZ = Pattern.compile("\u00DF"); private final static Pattern PATTERN_SZ = Pattern.compile("\u00DF");
private final static Pattern PATTERN_UL = Pattern.compile("_");
private final static Pattern PATTERN_SP = Pattern.compile(" ");
private static final String normalizeWord(String word) { private static final String normalizeWord(String word) {
word = word.trim().toLowerCase(); word = word.trim().toLowerCase();
@ -255,12 +267,12 @@ public class Autotagging {
this.vocName = vocName; this.vocName = vocName;
this.print = print; this.print = print;
} }
public Metatag(String metatag) { public Metatag(String metatag) throws RuntimeException {
assert metatag.charAt(0) == Autotagging.this.prefixChar; assert metatag.charAt(0) == Autotagging.this.prefixChar;
int p = metatag.indexOf(':'); int p = metatag.indexOf(':');
assert p > 0; if (p < 0) throw new RuntimeException("bad metatag: metatag = " + metatag);
this.vocName = metatag.substring(1, p); this.vocName = metatag.substring(1, p);
this.print = metatag.substring(p + 1); this.print = decodeMaskname(metatag.substring(p + 1));
} }
public String getVocabularyName() { public String getVocabularyName() {
return this.vocName; return this.vocName;
@ -268,19 +280,45 @@ public class Autotagging {
public String getPrintName() { public String getPrintName() {
return this.print; return this.print;
} }
public String getMetatag() { @Override
return Autotagging.this.prefixChar + this.vocName + ":" + this.print.replaceAll(" ", "_"); public String toString() {
return Autotagging.this.prefixChar + this.vocName + ":" + encodePrintname(this.print);
}
/**
 * Two metatags are equal if both their vocabulary name and their print name are equal.
 *
 * @param m the object to compare with; may be null or of any type
 * @return true if m is a Metatag with the same vocabulary name and print name,
 *         false otherwise (including null and foreign types)
 */
@Override
public boolean equals(Object m) {
    if (this == m) return true;
    // instanceof also rejects null, so neither a ClassCastException nor a
    // NullPointerException can escape as the equals contract demands
    if (!(m instanceof Metatag)) return false;
    final Metatag m0 = (Metatag) m;
    return this.vocName.equals(m0.vocName) && this.print.equals(m0.print);
}
@Override
public int hashCode() {
return this.vocName.hashCode() + this.print.hashCode();
} }
} }
/**
 * Converts a print name into its masked form by replacing every match of
 * PATTERN_SP (a single space) with an underscore.
 * Note that decodeMaskname turns every underscore back into a space, so a print
 * name that itself contains an underscore does not survive a round trip.
 *
 * @param printname the print name to mask
 * @return the print name with spaces replaced by '_'
 */
public static final String encodePrintname(String printname) {
    final String masked = PATTERN_SP.matcher(printname).replaceAll("_");
    return masked;
}
/**
 * Converts a masked name back into a print name by replacing every match of
 * PATTERN_UL (an underscore) with a single space.
 *
 * @param maskname the masked name, as produced by encodePrintname
 * @return the name with underscores replaced by spaces
 */
public static final String decodeMaskname(String maskname) {
    final String printable = PATTERN_UL.matcher(maskname).replaceAll(" ");
    return printable;
}
public Metatag metatag(String vocName, String print) { public Metatag metatag(String vocName, String print) {
return new Metatag(vocName, print); return new Metatag(vocName, print);
} }
public Metatag metatag(String metatag) { public Metatag metatag(String metatag) throws RuntimeException {
return new Metatag(metatag); return new Metatag(metatag);
} }
/**
 * Tests whether the string form of a metatag occurs in a list of tags.
 *
 * @param metatag the metatag to look for; compared through its toString() value
 * @param tags the tag strings to search
 * @return true if at least one entry of tags equals metatag.toString()
 */
public static boolean metatagAppearIn(final Metatag metatag, final String[] tags) {
    final String wanted = metatag.toString();
    for (int idx = 0; idx < tags.length; idx++) {
        if (wanted.equals(tags[idx])) {
            return true;
        }
    }
    return false;
}
public static void main(String[] args) { public static void main(String[] args) {
Autotagging a = new Autotagging(new File("DATA/DICTIONARIES/" + LibraryProvider.path_to_autotagging_dictionaries), '$'); Autotagging a = new Autotagging(new File("DATA/DICTIONARIES/" + LibraryProvider.path_to_autotagging_dictionaries), '$');
for (Map.Entry<String, Vocabulary> entry: a.vocabularies.entrySet()) { for (Map.Entry<String, Vocabulary> entry: a.vocabularies.entrySet()) {

@ -28,6 +28,8 @@ package net.yacy.search.query;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
@ -42,6 +44,7 @@ import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8; import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Autotagging;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
import net.yacy.document.parser.html.AbstractScraper; import net.yacy.document.parser.html.AbstractScraper;
import net.yacy.document.parser.html.CharacterCoding; import net.yacy.document.parser.html.CharacterCoding;
@ -113,6 +116,7 @@ public final class QueryParams {
public final boolean urlMask_isCatchall, prefer_isMatchnothing; public final boolean urlMask_isCatchall, prefer_isMatchnothing;
public final ContentDomain contentdom; public final ContentDomain contentdom;
public final String targetlang; public final String targetlang;
public final Collection<Autotagging.Metatag> metatags;
public final String navigators; public final String navigators;
public final Searchdom domType; public final Searchdom domType;
public final int zonecode; public final int zonecode;
@ -176,6 +180,7 @@ public final class QueryParams {
this.itemsPerPage = itemsPerPage; this.itemsPerPage = itemsPerPage;
this.offset = 0; this.offset = 0;
this.targetlang = "en"; this.targetlang = "en";
this.metatags = new ArrayList<Autotagging.Metatag>(0);
this.domType = Searchdom.LOCAL; this.domType = Searchdom.LOCAL;
this.zonecode = DigestURI.TLD_any_zone_filter; this.zonecode = DigestURI.TLD_any_zone_filter;
this.domMaxTargets = 0; this.domMaxTargets = 0;
@ -205,6 +210,7 @@ public final class QueryParams {
final String modifier, final String modifier,
final int maxDistance, final String prefer, final ContentDomain contentdom, final int maxDistance, final String prefer, final ContentDomain contentdom,
final String language, final String language,
final Collection<Autotagging.Metatag> metatags,
final String navigators, final String navigators,
final CacheStrategy snippetCacheStrategy, final CacheStrategy snippetCacheStrategy,
final int itemsPerPage, final int offset, final String urlMask, final int itemsPerPage, final int offset, final String urlMask,
@ -247,6 +253,7 @@ public final class QueryParams {
this.prefer_isMatchnothing = this.prefer.toString().equals(matchnothing_pattern.toString()); this.prefer_isMatchnothing = this.prefer.toString().equals(matchnothing_pattern.toString());
assert language != null; assert language != null;
this.targetlang = language; this.targetlang = language;
this.metatags = metatags;
this.navigators = navigators; this.navigators = navigators;
this.domType = domType; this.domType = domType;
this.zonecode = domainzone; this.zonecode = domainzone;
@ -506,6 +513,8 @@ public final class QueryParams {
context.append(asterisk); context.append(asterisk);
context.append(this.maxDistance); context.append(this.maxDistance);
context.append(asterisk); context.append(asterisk);
context.append(this.modifier.s);
context.append(asterisk);
context.append(this.snippetCacheStrategy == null ? "null" : this.snippetCacheStrategy.name()); context.append(this.snippetCacheStrategy == null ? "null" : this.snippetCacheStrategy.name());
if (anonymized) { if (anonymized) {
this.idCacheAnon = context.toString(); this.idCacheAnon = context.toString();

@ -46,7 +46,10 @@ import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ScoreMap; import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue; import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement; import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.document.Autotagging;
import net.yacy.document.Autotagging.Metatag;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
import net.yacy.document.LibraryProvider;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
@ -101,6 +104,7 @@ public final class RWIProcess extends Thread
private final ScoreMap<String> namespaceNavigator; // a counter for name spaces private final ScoreMap<String> namespaceNavigator; // a counter for name spaces
private final ScoreMap<String> protocolNavigator; // a counter for protocol types private final ScoreMap<String> protocolNavigator; // a counter for protocol types
private final ScoreMap<String> filetypeNavigator; // a counter for file types private final ScoreMap<String> filetypeNavigator; // a counter for file types
private final Map<String, ScoreMap<String>> vocabularyNavigator; // counters for Vocabularies
public RWIProcess(final QueryParams query, final ReferenceOrder order, final int maxentries, final boolean remote) { public RWIProcess(final QueryParams query, final ReferenceOrder order, final int maxentries, final boolean remote) {
// we collect the urlhashes and construct a list with urlEntry objects // we collect the urlhashes and construct a list with urlEntry objects
@ -132,6 +136,7 @@ public final class RWIProcess extends Thread
this.namespaceNavigator = new ConcurrentScoreMap<String>(); this.namespaceNavigator = new ConcurrentScoreMap<String>();
this.protocolNavigator = new ConcurrentScoreMap<String>(); this.protocolNavigator = new ConcurrentScoreMap<String>();
this.filetypeNavigator = new ConcurrentScoreMap<String>(); this.filetypeNavigator = new ConcurrentScoreMap<String>();
this.vocabularyNavigator = new ConcurrentHashMap<String, ScoreMap<String>>();
this.ref = new ConcurrentScoreMap<String>(); this.ref = new ConcurrentScoreMap<String>();
this.feedersAlive = new AtomicInteger(0); this.feedersAlive = new AtomicInteger(0);
this.feedersTerminated = new AtomicInteger(0); this.feedersTerminated = new AtomicInteger(0);
@ -349,8 +354,7 @@ public final class RWIProcess extends Thread
this.urlhashes.putUnique(iEntry.urlhash()); this.urlhashes.putUnique(iEntry.urlhash());
rankingtryloop: while ( true ) { rankingtryloop: while ( true ) {
try { try {
this.stack.put(new ReverseElement<WordReferenceVars>(iEntry, this.order this.stack.put(new ReverseElement<WordReferenceVars>(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
break rankingtryloop; break rankingtryloop;
} catch ( final ArithmeticException e ) { } catch ( final ArithmeticException e ) {
// this may happen if the concurrent normalizer changes values during cardinal computation // this may happen if the concurrent normalizer changes values during cardinal computation
@ -482,8 +486,7 @@ public final class RWIProcess extends Thread
m = this.doubleDomCache.get(hosthash); m = this.doubleDomCache.get(hosthash);
if ( m == null ) { if ( m == null ) {
// first appearance of dom. we create an entry to signal that one of that domain was already returned // first appearance of dom. we create an entry to signal that one of that domain was already returned
m = m = new WeakPriorityBlockingQueue<WordReferenceVars>((this.query.specialRights)
new WeakPriorityBlockingQueue<WordReferenceVars>((this.query.specialRights)
? maxDoubleDomSpecial ? maxDoubleDomSpecial
: maxDoubleDomAll); : maxDoubleDomAll);
this.doubleDomCache.put(hosthash, m); this.doubleDomCache.put(hosthash, m);
@ -504,8 +507,7 @@ public final class RWIProcess extends Thread
WeakPriorityBlockingQueue.Element<WordReferenceVars> bestEntry = null; WeakPriorityBlockingQueue.Element<WordReferenceVars> bestEntry = null;
WeakPriorityBlockingQueue.Element<WordReferenceVars> o; WeakPriorityBlockingQueue.Element<WordReferenceVars> o;
synchronized ( this.doubleDomCache ) { synchronized ( this.doubleDomCache ) {
final Iterator<WeakPriorityBlockingQueue<WordReferenceVars>> i = final Iterator<WeakPriorityBlockingQueue<WordReferenceVars>> i = this.doubleDomCache.values().iterator();
this.doubleDomCache.values().iterator();
while ( i.hasNext() ) { while ( i.hasNext() ) {
try { try {
m = i.next(); m = i.next();
@ -557,10 +559,9 @@ public final class RWIProcess extends Thread
final long timeout = System.currentTimeMillis() + Math.max(10, waitingtime); final long timeout = System.currentTimeMillis() + Math.max(10, waitingtime);
int p = -1; int p = -1;
long timeleft; long timeleft;
while ( (timeleft = timeout - System.currentTimeMillis()) > 0 ) { takeloop: while ( (timeleft = timeout - System.currentTimeMillis()) > 0 ) {
//System.out.println("timeleft = " + timeleft); //System.out.println("timeleft = " + timeleft);
final WeakPriorityBlockingQueue.Element<WordReferenceVars> obrwi = final WeakPriorityBlockingQueue.Element<WordReferenceVars> obrwi = takeRWI(skipDoubleDom, timeleft);
takeRWI(skipDoubleDom, timeleft);
if ( obrwi == null ) { if ( obrwi == null ) {
return null; // all time was already wasted in takeRWI to get another element return null; // all time was already wasted in takeRWI to get another element
} }
@ -635,6 +636,20 @@ public final class RWIProcess extends Thread
continue; continue;
} }
// check vocabulary constraint
final String tags = page.dc_subject();
final String[] taglist = tags == null || tags.length() == 0 ? new String[0] : SPACE_PATTERN.split(page.dc_subject());
if (this.query.metatags != null && this.query.metatags.size() > 0) {
// all metatags must appear in the tags list
for (Metatag metatag: this.query.metatags) {
if (!Autotagging.metatagAppearIn(metatag, taglist)) {
this.sortout++;
Log.logInfo("RWIProcess", "sorted out " + page.url());
continue takeloop;
}
}
}
// evaluate information of metadata for navigation // evaluate information of metadata for navigation
// author navigation: // author navigation:
if ( pageauthor != null && pageauthor.length() > 0 ) { if ( pageauthor != null && pageauthor.length() > 0 ) {
@ -654,6 +669,12 @@ public final class RWIProcess extends Thread
continue; continue;
} }
// check Scanner
if ( !Scanner.acceptURL(page.url()) ) {
this.sortout++;
continue;
}
// namespace navigation // namespace navigation
String pagepath = page.url().getPath(); String pagepath = page.url().getPath();
if ( (p = pagepath.indexOf(':')) >= 0 ) { if ( (p = pagepath.indexOf(':')) >= 0 ) {
@ -675,10 +696,20 @@ public final class RWIProcess extends Thread
this.filetypeNavigator.inc(fileext); this.filetypeNavigator.inc(fileext);
} }
// check Scanner // vocabulary navigation
if ( !Scanner.acceptURL(page.url()) ) { tagharvest: for (String tag: taglist) {
this.sortout++; if (tag.length() < 1 || tag.charAt(0) != LibraryProvider.tagPrefix) continue tagharvest;
continue; try {
Metatag metatag = LibraryProvider.autotagging.metatag(tag);
ScoreMap<String> voc = this.vocabularyNavigator.get(metatag.getVocabularyName());
if (voc == null) {
voc = new ConcurrentScoreMap<String>();
this.vocabularyNavigator.put(metatag.getVocabularyName(), voc);
}
voc.inc(metatag.getPrintName());
} catch (RuntimeException e) {
// tag may not be well-formed
}
} }
// accept url // accept url
@ -687,6 +718,8 @@ public final class RWIProcess extends Thread
return null; return null;
} }
/** Pre-compiled single-space delimiter, used to split a page's dc_subject string into individual tags. */
static final Pattern SPACE_PATTERN = Pattern.compile(" ");
public int sizeQueue() { public int sizeQueue() {
int c = this.stack.sizeQueue(); int c = this.stack.sizeQueue();
for ( final WeakPriorityBlockingQueue<WordReferenceVars> s : this.doubleDomCache.values() ) { for ( final WeakPriorityBlockingQueue<WordReferenceVars> s : this.doubleDomCache.values() ) {
@ -818,6 +851,10 @@ public final class RWIProcess extends Thread
return this.filetypeNavigator; return this.filetypeNavigator;
} }
/**
 * Gives access to the vocabulary navigators collected by this process.
 *
 * @return the backing map itself (no copy is made), keyed by vocabulary name; each
 *         value counts the print names of the tags seen for that vocabulary
 */
public Map<String, ScoreMap<String>> getVocabularyNavigators() {
    final Map<String, ScoreMap<String>> navigators = this.vocabularyNavigator;
    return navigators;
}
public static final Comparator<Map.Entry<String, Integer>> mecomp = public static final Comparator<Map.Entry<String, Integer>> mecomp =
new Comparator<Map.Entry<String, Integer>>() { new Comparator<Map.Entry<String, Integer>>() {
@Override @Override

@ -472,6 +472,10 @@ public final class SearchEvent
return this.rankingProcess.getFiletypeNavigator(); return this.rankingProcess.getFiletypeNavigator();
} }
/**
 * Exposes the vocabulary navigators of the underlying ranking process.
 *
 * @return whatever {@code rankingProcess.getVocabularyNavigators()} returns, passed through unchanged
 */
public Map<String, ScoreMap<String>> getVocabularyNavigators() {
    final Map<String, ScoreMap<String>> navigators = this.rankingProcess.getVocabularyNavigators();
    return navigators;
}
public void addHeuristic(final byte[] urlhash, final String heuristicName, final boolean redundant) { public void addHeuristic(final byte[] urlhash, final String heuristicName, final boolean redundant) {
synchronized ( this.heuristics ) { synchronized ( this.heuristics ) {
this.heuristics.put(urlhash, new HeuristicResult(urlhash, heuristicName, redundant)); this.heuristics.put(urlhash, new HeuristicResult(urlhash, heuristicName, redundant));

@ -481,7 +481,7 @@ public class SnippetProcess {
} }
// get next entry // get next entry
page = SnippetProcess.this.rankingProcess.takeURL(true, Math.min(100, this.timeout - System.currentTimeMillis())); page = SnippetProcess.this.rankingProcess.takeURL(true, Math.min(500, Math.max(100, this.timeout - System.currentTimeMillis())));
//if (page != null) Log.logInfo("ResultFetcher", "got one page: " + page.metadata().url().toNormalform(true, false)); //if (page != null) Log.logInfo("ResultFetcher", "got one page: " + page.metadata().url().toNormalform(true, false));
//if (page == null) page = rankedCache.takeURL(false, this.timeout - System.currentTimeMillis()); //if (page == null) page = rankedCache.takeURL(false, this.timeout - System.currentTimeMillis());
if (page == null) { if (page == null) {

Loading…
Cancel
Save