From f106345eefb8157e19f4d5fbfbfc0573a80ebb82 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 1 Sep 2013 14:35:36 +0200 Subject: [PATCH] link strings should not be tokenized --- defaults/solr.collection.schema | 4 ++-- htroot/HostBrowser.java | 4 ++-- source/net/yacy/kelondro/data/meta/URIMetadataNode.java | 2 +- source/net/yacy/search/Switchboard.java | 4 ++-- source/net/yacy/search/query/QueryParams.java | 2 +- source/net/yacy/search/schema/CollectionConfiguration.java | 4 ++-- source/net/yacy/search/schema/CollectionSchema.java | 4 ++-- 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index e531ac8b5..b1ce9665f 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -231,13 +231,13 @@ h6_txt inboundlinks_protocol_sxt ## internal links, the url only without the protocol -inboundlinks_urlstub_txt +inboundlinks_urlstub_sxt ## external links, only the protocol outboundlinks_protocol_sxt ## external links, the url only without the protocol -outboundlinks_urlstub_txt +outboundlinks_urlstub_sxt ## all text/words appearing in image alt texts or the tokenized url images_text_t diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index 7cfdc8a52..871181ad4 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -270,9 +270,9 @@ public class HostBrowser { CollectionSchema.failreason_s.getSolrFieldName(), CollectionSchema.failtype_s.getSolrFieldName(), CollectionSchema.inboundlinks_protocol_sxt.getSolrFieldName(), - CollectionSchema.inboundlinks_urlstub_txt.getSolrFieldName(), + CollectionSchema.inboundlinks_urlstub_sxt.getSolrFieldName(), CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(), - CollectionSchema.outboundlinks_urlstub_txt.getSolrFieldName(), + CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName(), CollectionSchema.clickdepth_i.getSolrFieldName(), CollectionSchema.references_i.getSolrFieldName(), CollectionSchema.references_internal_i.getSolrFieldName(), diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index c3125360e..f613679bc 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -284,7 +284,7 @@ public class URIMetadataNode { } public static Iterator getLinks(SolrDocument doc, boolean inbound) { - Collection urlstub = doc.getFieldValues((inbound ? CollectionSchema.inboundlinks_urlstub_txt : CollectionSchema.outboundlinks_urlstub_txt).getSolrFieldName()); + Collection urlstub = doc.getFieldValues((inbound ? CollectionSchema.inboundlinks_urlstub_sxt : CollectionSchema.outboundlinks_urlstub_sxt).getSolrFieldName()); Collection urlprot = urlstub == null ? null : indexedList2protocolList(doc.getFieldValues((inbound ? CollectionSchema.inboundlinks_protocol_sxt : CollectionSchema.outboundlinks_protocol_sxt).getSolrFieldName()), urlstub.size()); String u; LinkedHashSet list = new LinkedHashSet(); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 583205489..dc0fc71ed 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -437,8 +437,8 @@ public final class Switchboard extends serverSwitch { CollectionSchema.host_s, CollectionSchema.load_date_dt, CollectionSchema.url_file_ext_s, CollectionSchema.last_modified, // needed for media search and /date operator /*YaCySchema.url_paths_sxt,*/ CollectionSchema.host_organization_s, // needed to search in the url - /*YaCySchema.inboundlinks_protocol_sxt,*/ CollectionSchema.inboundlinks_urlstub_txt, // needed for HostBrowser - /*YaCySchema.outboundlinks_protocol_sxt,*/ CollectionSchema.outboundlinks_urlstub_txt,// needed to enhance the crawler + /*YaCySchema.inboundlinks_protocol_sxt,*/ CollectionSchema.inboundlinks_urlstub_sxt, // needed for HostBrowser + /*YaCySchema.outboundlinks_protocol_sxt,*/ CollectionSchema.outboundlinks_urlstub_sxt,// needed to enhance the crawler CollectionSchema.httpstatus_i // used in all search queries to filter out error documents }) { SchemaConfiguration.Entry entry = solrCollectionConfigurationWork.get(field.name()); entry.setEnable(true); solrCollectionConfigurationWork.put(field.name(), entry); diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 4bacd8697..609e79f76 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -470,7 +470,7 @@ public final class QueryParams { } if (this.inlink != null) { - fq.append(" AND ").append(CollectionSchema.outboundlinks_urlstub_txt.getSolrFieldName()).append(":\"").append(this.inlink).append('\"'); + fq.append(" AND ").append(CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName()).append(":\"").append(this.inlink).append('\"'); } if (!this.urlMask_isCatchall) { diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index dfb2cafa1..8a7b843a5 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -805,9 +805,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // list all links doc.webgraphDocuments.addAll(subgraph.edges); if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0])); - if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_txt)) add(doc, CollectionSchema.inboundlinks_urlstub_txt, subgraph.urlStubs[0]); + if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_sxt)) add(doc, CollectionSchema.inboundlinks_urlstub_sxt, subgraph.urlStubs[0]); if (allAttr || contains(CollectionSchema.outboundlinks_protocol_sxt)) add(doc, CollectionSchema.outboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[1])); - if (allAttr || contains(CollectionSchema.outboundlinks_urlstub_txt)) add(doc, CollectionSchema.outboundlinks_urlstub_txt, subgraph.urlStubs[1]); + if (allAttr || contains(CollectionSchema.outboundlinks_urlstub_sxt)) add(doc, CollectionSchema.outboundlinks_urlstub_sxt, subgraph.urlStubs[1]); // charset if (allAttr || contains(CollectionSchema.charset_s)) add(doc, CollectionSchema.charset_s, document.getCharset()); diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index 70721bed4..dd0c666cb 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -116,9 +116,9 @@ public enum CollectionSchema implements SchemaDeclaration { robots_i(SolrType.num_integer, true, true, false, false, false, "content of tag and the \"X-Robots-Tag\" HTTP property"), metagenerator_t(SolrType.text_general, true, true, false, false, false, "content of tag"), inboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "internal links, only the protocol"), - inboundlinks_urlstub_txt(SolrType.text_general, true, true, true, false, false, "internal links, the url only without the protocol"), + inboundlinks_urlstub_sxt(SolrType.string, true, true, true, false, false, "internal links, the url only without the protocol"), outboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "external links, only the protocol"), - outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, false, false, "external links, the url only without the protocol"), + outboundlinks_urlstub_sxt(SolrType.string, true, true, true, false, false, "external links, the url only without the protocol"), images_text_t(SolrType.text_general, true, true, false, false, true, "all text/words appearing in image alt texts or the tokenized url"), images_urlstub_sxt(SolrType.string, true, true, true, false, true, "all image links without the protocol and '://'"),