From 5f5a97bafc346f8890ac06ee6e2b637845020410 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 8 Oct 2013 18:41:07 +0200 Subject: [PATCH] added the anchor text within web pages to the searchable entities of a web page. This can be of benefit for the ranking if these fields are used for boosts. --- defaults/solr.collection.schema | 6 ++++++ source/net/yacy/search/schema/CollectionConfiguration.java | 2 ++ source/net/yacy/search/schema/CollectionSchema.java | 6 ++++-- source/net/yacy/search/schema/WebgraphConfiguration.java | 6 ++++-- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index 8382892ed..328b1f77a 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -242,12 +242,18 @@ inboundlinks_protocol_sxt ## internal links, the url only without the protocol inboundlinks_urlstub_sxt +## internal links, the visible anchor text +inboundlinks_anchortext_txt + ## external links, only the protocol outboundlinks_protocol_sxt ## external links, the url only without the protocol outboundlinks_urlstub_sxt +## external links, the visible anchor text +outboundlinks_anchortext_txt + ## all text/words appearing in image alt texts or the tokenized url images_text_t diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index df60c02da..46417b3c7 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -824,8 +824,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri doc.webgraphDocuments.addAll(subgraph.edges); if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0])); if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_sxt)) add(doc, 
CollectionSchema.inboundlinks_urlstub_sxt, subgraph.urlStubs[0]); + if (allAttr || contains(CollectionSchema.inboundlinks_anchortext_txt)) add(doc, CollectionSchema.inboundlinks_anchortext_txt, subgraph.urlAnchorTexts[0]); if (allAttr || contains(CollectionSchema.outboundlinks_protocol_sxt)) add(doc, CollectionSchema.outboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[1])); if (allAttr || contains(CollectionSchema.outboundlinks_urlstub_sxt)) add(doc, CollectionSchema.outboundlinks_urlstub_sxt, subgraph.urlStubs[1]); + if (allAttr || contains(CollectionSchema.outboundlinks_anchortext_txt)) add(doc, CollectionSchema.outboundlinks_anchortext_txt, subgraph.urlAnchorTexts[1]); // charset if (allAttr || contains(CollectionSchema.charset_s)) add(doc, CollectionSchema.charset_s, document.getCharset()); diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index 3b177201b..2d802481b 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -119,9 +119,11 @@ public enum CollectionSchema implements SchemaDeclaration { robots_i(SolrType.num_integer, true, true, false, false, false, "content of tag and the \"X-Robots-Tag\" HTTP property"), metagenerator_t(SolrType.text_general, true, true, false, false, false, "content of tag"), inboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "internal links, only the protocol"), - inboundlinks_urlstub_sxt(SolrType.string, true, true, true, false, false, "internal links, the url only without the protocol"), + inboundlinks_urlstub_sxt(SolrType.string, true, true, true, false, true, "internal links, the url only without the protocol"), + inboundlinks_anchortext_txt(SolrType.text_general, true, true, true, false, true, "internal links, the visible anchor text"), outboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "external links, only the 
protocol"), - outboundlinks_urlstub_sxt(SolrType.string, true, true, true, false, false, "external links, the url only without the protocol"), + outboundlinks_urlstub_sxt(SolrType.string, true, true, true, false, true, "external links, the url only without the protocol"), + outboundlinks_anchortext_txt(SolrType.text_general, true, true, true, false, true, "external links, the visible anchor text"), images_text_t(SolrType.text_general, true, true, false, false, true, "all text/words appearing in image alt texts or the tokenized url"), images_urlstub_sxt(SolrType.string, true, true, true, false, true, "all image links without the protocol and '://'"), diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java index d0b6d3a18..934c6ec41 100644 --- a/source/net/yacy/search/schema/WebgraphConfiguration.java +++ b/source/net/yacy/search/schema/WebgraphConfiguration.java @@ -101,12 +101,13 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial } public static class Subgraph { - public final ArrayList[] urlProtocols, urlStubs; + public final ArrayList[] urlProtocols, urlStubs, urlAnchorTexts; public final ArrayList edges; @SuppressWarnings("unchecked") public Subgraph(int inboundSize, int outboundSize) { this.urlProtocols = new ArrayList[]{new ArrayList(inboundSize), new ArrayList(outboundSize)}; this.urlStubs = new ArrayList[]{new ArrayList(inboundSize), new ArrayList(outboundSize)}; + this.urlAnchorTexts = new ArrayList[]{new ArrayList(inboundSize), new ArrayList(outboundSize)}; this.edges = new ArrayList(inboundSize + outboundSize); } } @@ -226,8 +227,9 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial final String target_url_string = target_url.toNormalform(false); int pr_target = target_url_string.indexOf("://",0); subgraph.urlProtocols[ioidx].add(target_url_string.substring(0, pr_target)); - if (allAttr || 
contains(WebgraphSchema.target_protocol_s)) add(edge, WebgraphSchema.target_protocol_s, target_url_string.substring(0, pr_target)); subgraph.urlStubs[ioidx].add(target_url_string.substring(pr_target + 3)); + subgraph.urlAnchorTexts[ioidx].add(text); + if (allAttr || contains(WebgraphSchema.target_protocol_s)) add(edge, WebgraphSchema.target_protocol_s, target_url_string.substring(0, pr_target)); if (allAttr || contains(WebgraphSchema.target_urlstub_s)) add(edge, WebgraphSchema.target_urlstub_s, target_url_string.substring(pr_target + 3)); Map target_searchpart = target_url.getSearchpartMap(); if (target_searchpart == null) {