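Link strings should not be tokenized: the inbound/outbound urlstub link fields change from tokenized text (`_txt`, SolrType.text_general) to plain strings (`_sxt`, SolrType.string)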

pull/1/head
orbiter 12 years ago
parent deadeb406e
commit f106345eef

@@ -231,13 +231,13 @@ h6_txt
inboundlinks_protocol_sxt inboundlinks_protocol_sxt
## internal links, the url only without the protocol ## internal links, the url only without the protocol
inboundlinks_urlstub_txt inboundlinks_urlstub_sxt
## external links, only the protocol ## external links, only the protocol
outboundlinks_protocol_sxt outboundlinks_protocol_sxt
## external links, the url only without the protocol ## external links, the url only without the protocol
outboundlinks_urlstub_txt outboundlinks_urlstub_sxt
## all text/words appearing in image alt texts or the tokenized url ## all text/words appearing in image alt texts or the tokenized url
images_text_t images_text_t

@@ -270,9 +270,9 @@ public class HostBrowser {
CollectionSchema.failreason_s.getSolrFieldName(), CollectionSchema.failreason_s.getSolrFieldName(),
CollectionSchema.failtype_s.getSolrFieldName(), CollectionSchema.failtype_s.getSolrFieldName(),
CollectionSchema.inboundlinks_protocol_sxt.getSolrFieldName(), CollectionSchema.inboundlinks_protocol_sxt.getSolrFieldName(),
CollectionSchema.inboundlinks_urlstub_txt.getSolrFieldName(), CollectionSchema.inboundlinks_urlstub_sxt.getSolrFieldName(),
CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(), CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(),
CollectionSchema.outboundlinks_urlstub_txt.getSolrFieldName(), CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName(),
CollectionSchema.clickdepth_i.getSolrFieldName(), CollectionSchema.clickdepth_i.getSolrFieldName(),
CollectionSchema.references_i.getSolrFieldName(), CollectionSchema.references_i.getSolrFieldName(),
CollectionSchema.references_internal_i.getSolrFieldName(), CollectionSchema.references_internal_i.getSolrFieldName(),

@@ -284,7 +284,7 @@ public class URIMetadataNode {
} }
public static Iterator<String> getLinks(SolrDocument doc, boolean inbound) { public static Iterator<String> getLinks(SolrDocument doc, boolean inbound) {
Collection<Object> urlstub = doc.getFieldValues((inbound ? CollectionSchema.inboundlinks_urlstub_txt : CollectionSchema.outboundlinks_urlstub_txt).getSolrFieldName()); Collection<Object> urlstub = doc.getFieldValues((inbound ? CollectionSchema.inboundlinks_urlstub_sxt : CollectionSchema.outboundlinks_urlstub_sxt).getSolrFieldName());
Collection<String> urlprot = urlstub == null ? null : indexedList2protocolList(doc.getFieldValues((inbound ? CollectionSchema.inboundlinks_protocol_sxt : CollectionSchema.outboundlinks_protocol_sxt).getSolrFieldName()), urlstub.size()); Collection<String> urlprot = urlstub == null ? null : indexedList2protocolList(doc.getFieldValues((inbound ? CollectionSchema.inboundlinks_protocol_sxt : CollectionSchema.outboundlinks_protocol_sxt).getSolrFieldName()), urlstub.size());
String u; String u;
LinkedHashSet<String> list = new LinkedHashSet<String>(); LinkedHashSet<String> list = new LinkedHashSet<String>();

@@ -437,8 +437,8 @@ public final class Switchboard extends serverSwitch {
CollectionSchema.host_s, CollectionSchema.load_date_dt, CollectionSchema.host_s, CollectionSchema.load_date_dt,
CollectionSchema.url_file_ext_s, CollectionSchema.last_modified, // needed for media search and /date operator CollectionSchema.url_file_ext_s, CollectionSchema.last_modified, // needed for media search and /date operator
/*YaCySchema.url_paths_sxt,*/ CollectionSchema.host_organization_s, // needed to search in the url /*YaCySchema.url_paths_sxt,*/ CollectionSchema.host_organization_s, // needed to search in the url
/*YaCySchema.inboundlinks_protocol_sxt,*/ CollectionSchema.inboundlinks_urlstub_txt, // needed for HostBrowser /*YaCySchema.inboundlinks_protocol_sxt,*/ CollectionSchema.inboundlinks_urlstub_sxt, // needed for HostBrowser
/*YaCySchema.outboundlinks_protocol_sxt,*/ CollectionSchema.outboundlinks_urlstub_txt,// needed to enhance the crawler /*YaCySchema.outboundlinks_protocol_sxt,*/ CollectionSchema.outboundlinks_urlstub_sxt,// needed to enhance the crawler
CollectionSchema.httpstatus_i // used in all search queries to filter out error documents CollectionSchema.httpstatus_i // used in all search queries to filter out error documents
}) { }) {
SchemaConfiguration.Entry entry = solrCollectionConfigurationWork.get(field.name()); entry.setEnable(true); solrCollectionConfigurationWork.put(field.name(), entry); SchemaConfiguration.Entry entry = solrCollectionConfigurationWork.get(field.name()); entry.setEnable(true); solrCollectionConfigurationWork.put(field.name(), entry);

@@ -470,7 +470,7 @@ public final class QueryParams {
} }
if (this.inlink != null) { if (this.inlink != null) {
fq.append(" AND ").append(CollectionSchema.outboundlinks_urlstub_txt.getSolrFieldName()).append(":\"").append(this.inlink).append('\"'); fq.append(" AND ").append(CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName()).append(":\"").append(this.inlink).append('\"');
} }
if (!this.urlMask_isCatchall) { if (!this.urlMask_isCatchall) {

@@ -805,9 +805,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// list all links // list all links
doc.webgraphDocuments.addAll(subgraph.edges); doc.webgraphDocuments.addAll(subgraph.edges);
if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0])); if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0]));
if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_txt)) add(doc, CollectionSchema.inboundlinks_urlstub_txt, subgraph.urlStubs[0]); if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_sxt)) add(doc, CollectionSchema.inboundlinks_urlstub_sxt, subgraph.urlStubs[0]);
if (allAttr || contains(CollectionSchema.outboundlinks_protocol_sxt)) add(doc, CollectionSchema.outboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[1])); if (allAttr || contains(CollectionSchema.outboundlinks_protocol_sxt)) add(doc, CollectionSchema.outboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[1]));
if (allAttr || contains(CollectionSchema.outboundlinks_urlstub_txt)) add(doc, CollectionSchema.outboundlinks_urlstub_txt, subgraph.urlStubs[1]); if (allAttr || contains(CollectionSchema.outboundlinks_urlstub_sxt)) add(doc, CollectionSchema.outboundlinks_urlstub_sxt, subgraph.urlStubs[1]);
// charset // charset
if (allAttr || contains(CollectionSchema.charset_s)) add(doc, CollectionSchema.charset_s, document.getCharset()); if (allAttr || contains(CollectionSchema.charset_s)) add(doc, CollectionSchema.charset_s, document.getCharset());

@@ -116,9 +116,9 @@ public enum CollectionSchema implements SchemaDeclaration {
robots_i(SolrType.num_integer, true, true, false, false, false, "content of <meta name=\"robots\" content=#content#> tag and the \"X-Robots-Tag\" HTTP property"), robots_i(SolrType.num_integer, true, true, false, false, false, "content of <meta name=\"robots\" content=#content#> tag and the \"X-Robots-Tag\" HTTP property"),
metagenerator_t(SolrType.text_general, true, true, false, false, false, "content of <meta name=\"generator\" content=#content#> tag"), metagenerator_t(SolrType.text_general, true, true, false, false, false, "content of <meta name=\"generator\" content=#content#> tag"),
inboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "internal links, only the protocol"), inboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "internal links, only the protocol"),
inboundlinks_urlstub_txt(SolrType.text_general, true, true, true, false, false, "internal links, the url only without the protocol"), inboundlinks_urlstub_sxt(SolrType.string, true, true, true, false, false, "internal links, the url only without the protocol"),
outboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "external links, only the protocol"), outboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "external links, only the protocol"),
outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, false, false, "external links, the url only without the protocol"), outboundlinks_urlstub_sxt(SolrType.string, true, true, true, false, false, "external links, the url only without the protocol"),
images_text_t(SolrType.text_general, true, true, false, false, true, "all text/words appearing in image alt texts or the tokenized url"), images_text_t(SolrType.text_general, true, true, false, false, true, "all text/words appearing in image alt texts or the tokenized url"),
images_urlstub_sxt(SolrType.string, true, true, true, false, true, "all image links without the protocol and '://'"), images_urlstub_sxt(SolrType.string, true, true, true, false, true, "all image links without the protocol and '://'"),

Loading…
Cancel
Save