diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index 03e07e400..02c01e369 100644 --- a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -46,16 +46,16 @@ keywords charset_s ## tags of css entries, normalized with absolute URL, textgen -attr_css_tag +css_tag_txt ## urls of css entries, normalized with absolute URL, textgen -attr_css_url +css_url_txt ## number of css entries, int csscount_i ## urls of script entries, normalized with absolute URL, textgen -attr_scripts +scripts_txt ## number of script entries, int scriptscount_i @@ -86,25 +86,25 @@ text_t wordcount_i ## internal links, normalized (absolute URLs), as - tag with anchor text and nofollow, textgen -attr_inboundlinks_tag +inboundlinks_tag_txt ## internal links, only the protocol -#attr_inboundlinks_protocol +#inboundlinks_protocol_txt ## internal links, the url only without the protocol -#attr_inboundlinks_urlstub +#inboundlinks_urlstub_txt ## internal links, the name property of the a-tag -#attr_inboundlinks_name +#inboundlinks_name_txt ## internal links, the rel property of the a-tag -#attr_inboundlinks_rel +#inboundlinks_rel_txt ## internal links, the rel property of the a-tag, coded binary -#attr_inboundlinks_relflags +#inboundlinks_relflags_txt ## internal links, the text content of the a-tag -#attr_inboundlinks_text +#inboundlinks_text_txt ## total number of inbound links, int inboundlinkscount_i @@ -113,70 +113,70 @@ inboundlinkscount_i inboundlinksnoindexcount_i ## external links, normalized (absolute URLs), as - tag with anchor text and nofollow, textgen -attr_outboundlinks_tag +outboundlinks_tag_txt ## external links, only the protocol -#attr_outboundlinks_protocol +#outboundlinks_protocol_txt ## external links, the url only without the protocol -#attr_outboundlinks_urlstub +#outboundlinks_urlstub_txt ## external links, the name property of the a-tag -#attr_outboundlinks_name +#outboundlinks_name_txt ## external links, the rel property of the a-tag -#attr_outboundlinks_rel 
+#outboundlinks_rel_txt ## external links, the rel property of the a-tag, coded binary -#attr_outboundlinks_relflags +#outboundlinks_relflags_txt ## external links, the text content of the a-tag -#attr_outboundlinks_text +#outboundlinks_text_txt ## external number of inbound links, int -outboundlinks_i +outboundlinkscount_i ## number of external links with noindex tag, int outboundlinksnoindexcount_i ## all image tags, encoded as tag inclusive alt- and title property, textgen -attr_images_tag +images_tag_txt ## all image links without the protocol and '://' -#attr_images_urlstub +#images_urlstub_txt ## all image link protocols -#attr_images_protocol +#images_protocol_txt ## all image link alt tag -#attr_images_alt +#images_alt_txt ## number of images, int imagescount_i ## h1 header, textgen -attr_h1 +h1_txt ## h2 header, textgen -attr_h2 +h2_txt ## h3 header, textgen -attr_h3 +h3_txt ## h4 header, textgen -attr_h4 +h4_txt ## h5 header, textgen -attr_h5 +h5_txt ## h6 header, textgen -attr_h6 +h6_txt ## binary pattern for the existance of h1..h6 headlines, int htags_i ## all path elements in the url, textgen -attr_paths +paths_txt ## host of the url, string host_s @@ -185,79 +185,80 @@ host_s canonical_s ## all texts in
  • tags, textgen -attr_li +li_txt ## number of
  • tags, int licount_i ## all texts inside of or tags. no doubles. listed in the order of number of occurrences in decreasing order, textgen -attr_bold +bold_txt -## number of occurrences of texts in attr_bold, textgen -attr_boldcount +## number of occurrences of texts in bold_txt, textgen +#bold_val ## total number of occurrences of or , int -bold_i +boldcount_i ## all texts inside of tags. no doubles. listed in the order of number of occurrences in decreasing order, textgen -attr_italic +italic_txt -## number of occurrences of texts in attr_italic, textgen -attr_italiccount +## number of occurrences of texts in italic_txt, textgen +#italic_val ## total number of occurrences of , int -italic_i +italiccount_i ## flag that shows if a swf file is linked, boolean flash_b ## list of all links to frames, textgen -attr_frames +frames_txt ## number of attr_frames, int framesscount_i ## list of all links to iframes, textgen -attr_iframes +iframes_txt ## number of attr_iframes, int iframesscount_i ## names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias, textgen -attr_cms +#ext_cms_txt ##number of attributes that count for a specific cms in attr_cms, textgen -attr_cmscount +#ext_cms_val ## names of ad-servers/ad-services, textgen -attr_ads +#ext_ads_txt ## number of attributes counts in attr_ads, textgen -attr_adscount +#ext_ads_val ## names of recognized community functions, textgen -attr_community +#ext_community_txt ## number of attribute counts in attr_community, textgen -attr_communitycount +#ext_community_val ## names of map services, textgen -attr_maps +#ext_maps_txt ## number of attribute counts in attr_maps, textgen -attr_mapscount +#ext_maps_val ## names of tracker server, textgen -attr_tracker +#ext_tracker_txt ## number of attribute counts in attr_tracker, textgen -attr_trackercount +#ext_tracker_val ## names matching title expressions, textgen -attr_title +#ext_title_txt ## number of 
matching title expressions, textgen -attr_titlecount +#ext_title_val + ## fail reason if a page was not loaded. if the page was loaded then this field is empty, text failreason_t diff --git a/source/net/yacy/cora/services/federated/solr/SolrScheme.java b/source/net/yacy/cora/services/federated/solr/SolrScheme.java index 1496966ad..564deb598 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrScheme.java +++ b/source/net/yacy/cora/services/federated/solr/SolrScheme.java @@ -44,6 +44,7 @@ import net.yacy.document.Document; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.logging.Log; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrInputDocument; @@ -65,71 +66,238 @@ public class SolrScheme extends ConfigurationSet { */ public SolrScheme(final File configurationFile) { super(configurationFile); + // check consistency: compare with Field enum + for (String name: this) { + try { + Field.valueOf(name); + } catch (IllegalArgumentException e) { + Log.logWarning("SolrScheme", "solr scheme file " + configurationFile.getAbsolutePath() + " defines unknown attribute '" + name + "'"); + } + } + /* + for (Field field: Field.values()) { + if (!this.contains(field.name())) { + Log.logWarning("SolrScheme", "solr scheme file " + configurationFile.getAbsolutePath() + " omits known attribute '" + field.name() + "'"); + } + } + */ } - private void addSolr(final SolrInputDocument solrdoc, final String key, final String value) { - if (isEmpty() || contains(key)) solrdoc.setField(key, value); + private void addSolr(final SolrInputDocument solrdoc, final Field key, final String value) { + if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value); } - private void addSolr(final SolrInputDocument solrdoc, final String key, final Date value) { - if (isEmpty() || contains(key)) solrdoc.setField(key, value); + 
private void addSolr(final SolrInputDocument solrdoc, final Field key, final Date value) { + if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value); } - private void addSolr(final SolrInputDocument solrdoc, final String key, final int value) { - if (isEmpty() || contains(key)) solrdoc.setField(key, value); + private void addSolr(final SolrInputDocument solrdoc, final Field key, final int value) { + if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value); } - private void addSolr(final SolrInputDocument solrdoc, final String key, final String[] value) { - if (isEmpty() || contains(key)) solrdoc.setField(key, value); + private void addSolr(final SolrInputDocument solrdoc, final Field key, final String[] value) { + if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value); } - private void addSolr(final SolrInputDocument solrdoc, final String key, final float value) { - if (isEmpty() || contains(key)) solrdoc.setField(key, value); + private void addSolr(final SolrInputDocument solrdoc, final Field key, final float value) { + if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value); } - private void addSolr(final SolrInputDocument solrdoc, final String key, final boolean value) { - if (isEmpty() || contains(key)) solrdoc.setField(key, value); + private void addSolr(final SolrInputDocument solrdoc, final Field key, final boolean value) { + if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value); } - private void addSolr(final SolrInputDocument solrdoc, final String key, final String value, final float boost) { - if (isEmpty() || contains(key)) solrdoc.setField(key, value, boost); + private void addSolr(final SolrInputDocument solrdoc, final Field key, final String value, final float boost) { + if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value, boost); + } + + public static enum Types { + string, + text_general, + text_en_splitting_tight, + date, + 
integer("int"), + tdouble, + bool("boolean"); + + private String printName; + private Types() { + this.printName = this.name(); + } + private Types(String printName) { + this.printName = printName; + } + public String printName() { + return this.printName; + } + } + + public static enum Field { + + id(Types.string, true, true), + sku(Types.text_en_splitting_tight, true, true, false, true), + ip_s(Types.string, true, true), + host_s(Types.string, true, true), + title(Types.text_general, true, true, true), + author(Types.text_general, true, true), + description(Types.text_general, true, true), + content_type(Types.string, true, true, true), + last_modified(Types.date, true, true), + keywords(Types.text_general, true, true), + text_t(Types.text_general, true, true), + wordcount_i(Types.integer, true, true), + paths_txt(Types.text_general, true, true, true), + inboundlinkscount_i(Types.integer, true, true), + inboundlinksnoindexcount_i(Types.integer, true, true), + inboundlinks_tag_txt(Types.text_general, true, true, true), + inboundlinks_protocol_txt(Types.text_general, true, true, true), + inboundlinks_urlstub_txt(Types.text_general, true, true, true), + inboundlinks_name_txt(Types.text_general, true, true, true), + inboundlinks_rel_txt(Types.text_general, true, true, true), + inboundlinks_relflags_txt(Types.text_general, true, true, true), + inboundlinks_text_txt(Types.text_general, true, true, true), + outboundlinkscount_i(Types.integer, true, true), + outboundlinksnoindexcount_i(Types.integer, true, true), + outboundlinks_tag_txt(Types.text_general, true, true, true), + outboundlinks_protocol_txt(Types.text_general, true, true, true), + outboundlinks_urlstub_txt(Types.text_general, true, true, true), + outboundlinks_name_txt(Types.text_general, true, true, true), + outboundlinks_rel_txt(Types.text_general, true, true, true), + outboundlinks_relflags_txt(Types.text_general, true, true, true), + outboundlinks_text_txt(Types.text_general, true, true, true), + 
charset_s(Types.string, true, true), + lon_coordinate(Types.tdouble, true, false), + lat_coordinate(Types.tdouble, true, false), + httpstatus_i(Types.integer, true, true), + h1_txt(Types.text_general, true, true, true), + h2_txt(Types.text_general, true, true, true), + h3_txt(Types.text_general, true, true, true), + h4_txt(Types.text_general, true, true, true), + h5_txt(Types.text_general, true, true, true), + h6_txt(Types.text_general, true, true, true), + htags_i(Types.integer, true, true), + canonical_s(Types.string, true, true), + robots_i(Types.integer, true, true), + metagenerator_t(Types.text_general, true, true), + boldcount_i(Types.integer, true, true), + bold_txt(Types.text_general, true, true, true), + bold_val(Types.integer, true, true, true), + italiccount_i(Types.integer, true, true), + italic_txt(Types.text_general, true, true, true), + italic_val(Types.integer, true, true, true), + licount_i(Types.integer, true, true), + li_txt(Types.text_general, true, true, true), + imagescount_i(Types.integer, true, true), + images_tag_txt(Types.text_general, true, true, true), + images_protocol_txt(Types.text_general, true, true, true), + images_urlstub_txt(Types.text_general, true, true, true), + images_alt_txt(Types.text_general, true, true, true), + csscount_i(Types.integer, true, true), + css_tag_txt(Types.text_general, true, true, true), + css_url_txt(Types.text_general, true, true, true), + scripts_txt(Types.text_general, true, true, true), + scriptscount_i(Types.integer, true, true), + frames_txt(Types.text_general, true, true, true), + framesscount_i(Types.integer, true, true), + iframes_txt(Types.text_general, true, true, true), + iframesscount_i(Types.integer, true, true), + flash_b(Types.bool, true, true), + responsetime_i(Types.integer, true, true), + + ext_cms_txt(Types.text_general, true, true, true), + ext_cms_val(Types.integer, true, true, true), + ext_ads_txt(Types.text_general, true, true, true), + ext_ads_val(Types.integer, true, true, true), 
+ ext_community_txt(Types.text_general, true, true, true), + ext_community_val(Types.integer, true, true, true), + ext_maps_txt(Types.text_general, true, true, true), + ext_maps_val(Types.integer, true, true, true), + ext_tracker_txt(Types.text_general, true, true, true), + ext_tracker_val(Types.integer, true, true, true), + ext_title_txt(Types.text_general, true, true, true), + ext_title_val(Types.integer, true, true, true), + + failreason_t(Types.text_general, true, true); + + final Types type; + final boolean indexed, stored; + boolean multiValued, omitNorms; + + private Field(final Types type, final boolean indexed, final boolean stored) { + this.type = type; + this.indexed = indexed; + this.stored = stored; + this.multiValued = false; + this.omitNorms = false; + } + + private Field(final Types type, final boolean indexed, final boolean stored, final boolean multiValued) { + this(type, indexed, stored); + this.multiValued = multiValued; + } + + private Field(final Types type, final boolean indexed, final boolean stored, final boolean multiValued, final boolean omitNorms) { + this(type, indexed, stored, multiValued); + this.omitNorms = omitNorms; + } + + public final Types getType() { + return this.type; + } + + public final boolean isIndexed() { + return this.indexed; + } + + public final boolean isStored() { + return this.stored; + } + + public final boolean isMultiValued() { + return this.multiValued; + } + + public final boolean isOmitNorms() { + return this.omitNorms; + } + } public SolrInputDocument yacy2solr(final String id, final ResponseHeader header, final Document yacydoc) { // we user the SolrCell design as index scheme final SolrInputDocument solrdoc = new SolrInputDocument(); final DigestURI digestURI = new DigestURI(yacydoc.dc_source()); - addSolr(solrdoc, "failreason_t", ""); // overwrite a possible fail reason (in case that there was a fail reason before) - addSolr(solrdoc, "id", id); - addSolr(solrdoc, "sku", digestURI.toNormalform(true, 
false), 3.0f); + addSolr(solrdoc, Field.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before) + addSolr(solrdoc, Field.id, id); + addSolr(solrdoc, Field.sku, digestURI.toNormalform(true, false), 3.0f); final InetAddress address = digestURI.getInetAddress(); - if (address != null) addSolr(solrdoc, "ip_s", address.getHostAddress()); - if (digestURI.getHost() != null) addSolr(solrdoc, "host_s", digestURI.getHost()); - addSolr(solrdoc, "title", yacydoc.dc_title()); - addSolr(solrdoc, "author", yacydoc.dc_creator()); - addSolr(solrdoc, "description", yacydoc.dc_description()); - addSolr(solrdoc, "content_type", yacydoc.dc_format()); - addSolr(solrdoc, "last_modified", header.lastModified()); - addSolr(solrdoc, "keywords", yacydoc.dc_subject(' ')); + if (address != null) addSolr(solrdoc, Field.ip_s, address.getHostAddress()); + if (digestURI.getHost() != null) addSolr(solrdoc, Field.host_s, digestURI.getHost()); + addSolr(solrdoc, Field.title, yacydoc.dc_title()); + addSolr(solrdoc, Field.author, yacydoc.dc_creator()); + addSolr(solrdoc, Field.description, yacydoc.dc_description()); + addSolr(solrdoc, Field.content_type, yacydoc.dc_format()); + addSolr(solrdoc, Field.last_modified, header.lastModified()); + addSolr(solrdoc, Field.keywords, yacydoc.dc_subject(' ')); final String content = UTF8.String(yacydoc.getTextBytes()); - addSolr(solrdoc, "text_t", content); - if (isEmpty() || contains("wordcount_i")) { + addSolr(solrdoc, Field.text_t, content); + if (isEmpty() || contains(Field.wordcount_i.name())) { final int contentwc = content.split(" ").length; - addSolr(solrdoc, "wordcount_i", contentwc); + addSolr(solrdoc, Field.wordcount_i, contentwc); } // path elements of link final String path = digestURI.getPath(); - if (path != null && (isEmpty() || contains("attr_paths"))) { + if (path != null && (isEmpty() || contains(Field.paths_txt.name()))) { final String[] paths = path.split("/"); - if (paths.length > 0) 
addSolr(solrdoc, "attr_paths", paths); + if (paths.length > 0) addSolr(solrdoc, Field.paths_txt, paths); } // list all links final Map alllinks = yacydoc.getAnchors(); int c = 0; - if (isEmpty() || contains("inboundlinkscount_i")) addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount()); - if (isEmpty() || contains("inboundlinksnoindexcount_i")) addSolr(solrdoc, "inboundlinksnoindexcount_i", yacydoc.inboundLinkNoindexCount()); + if (isEmpty() || contains(Field.inboundlinkscount_i.name())) addSolr(solrdoc, Field.inboundlinkscount_i, yacydoc.inboundLinkCount()); + if (isEmpty() || contains(Field.inboundlinksnoindexcount_i.name())) addSolr(solrdoc, Field.inboundlinksnoindexcount_i, yacydoc.inboundLinkNoindexCount()); final String[] inboundlinksTag = new String[yacydoc.inboundLinkCount()]; final String[] inboundlinksURLProtocol = new String[yacydoc.inboundLinkCount()]; final String[] inboundlinksURLStub = new String[yacydoc.inboundLinkCount()]; @@ -156,17 +324,17 @@ public class SolrScheme extends ConfigurationSet { ((text.length() > 0) ? 
text : "") + ""; c++; } - if (isEmpty() || contains("attr_inboundlinks_tag")) addSolr(solrdoc, "attr_inboundlinks_tag", inboundlinksTag); - if (isEmpty() || contains("attr_inboundlinks_protocol")) addSolr(solrdoc, "attr_inboundlinks_protocol", inboundlinksURLProtocol); - if (isEmpty() || contains("attr_inboundlinks_urlstub")) addSolr(solrdoc, "attr_inboundlinks_urlstub", inboundlinksURLStub); - if (isEmpty() || contains("attr_inboundlinks_name")) addSolr(solrdoc, "attr_inboundlinks_name", inboundlinksName); - if (isEmpty() || contains("attr_inboundlinks_rel")) addSolr(solrdoc, "attr_inboundlinks_rel", inboundlinksRel); - if (isEmpty() || contains("attr_inboundlinks_relflags")) addSolr(solrdoc, "attr_inboundlinks_relflags", relEval(inboundlinksRel)); - if (isEmpty() || contains("attr_inboundlinks_text")) addSolr(solrdoc, "attr_inboundlinks_text", inboundlinksText); + if (isEmpty() || contains(Field.inboundlinks_tag_txt.name())) addSolr(solrdoc, Field.inboundlinks_tag_txt, inboundlinksTag); + if (isEmpty() || contains(Field.inboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.inboundlinks_protocol_txt, inboundlinksURLProtocol); + if (isEmpty() || contains(Field.inboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.inboundlinks_urlstub_txt, inboundlinksURLStub); + if (isEmpty() || contains(Field.inboundlinks_name_txt.name())) addSolr(solrdoc, Field.inboundlinks_name_txt, inboundlinksName); + if (isEmpty() || contains(Field.inboundlinks_rel_txt.name())) addSolr(solrdoc, Field.inboundlinks_rel_txt, inboundlinksRel); + if (isEmpty() || contains(Field.inboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.inboundlinks_relflags_txt, relEval(inboundlinksRel)); + if (isEmpty() || contains(Field.inboundlinks_text_txt.name())) addSolr(solrdoc, Field.inboundlinks_text_txt, inboundlinksText); c = 0; - if (isEmpty() || contains("outboundlinkscount_i")) addSolr(solrdoc, "outboundlinkscount_i", yacydoc.outboundLinkCount()); - if (isEmpty() || 
contains("outboundlinksnoindexcount_i")) addSolr(solrdoc, "outboundlinksnoindexcount_i", yacydoc.outboundLinkNoindexCount()); + if (isEmpty() || contains(Field.outboundlinkscount_i.name())) addSolr(solrdoc, Field.outboundlinkscount_i, yacydoc.outboundLinkCount()); + if (isEmpty() || contains(Field.outboundlinksnoindexcount_i.name())) addSolr(solrdoc, Field.outboundlinksnoindexcount_i, yacydoc.outboundLinkNoindexCount()); final String[] outboundlinksTag = new String[yacydoc.outboundLinkCount()]; final String[] outboundlinksURLProtocol = new String[yacydoc.outboundLinkCount()]; final String[] outboundlinksURLStub = new String[yacydoc.outboundLinkCount()]; @@ -193,24 +361,24 @@ public class SolrScheme extends ConfigurationSet { ((text.length() > 0) ? text : "") + ""; c++; } - if (isEmpty() || contains("attr_outboundlinks_tag")) addSolr(solrdoc, "attr_outboundlinks_tag", outboundlinksTag); - if (isEmpty() || contains("attr_outboundlinks_protocol")) addSolr(solrdoc, "attr_outboundlinks_protocol", outboundlinksURLProtocol); - if (isEmpty() || contains("attr_outboundlinks_urlstub")) addSolr(solrdoc, "attr_outboundlinks_urlstub", outboundlinksURLStub); - if (isEmpty() || contains("attr_outboundlinks_name")) addSolr(solrdoc, "attr_outboundlinks_name", outboundlinksName); - if (isEmpty() || contains("attr_outboundlinks_rel")) addSolr(solrdoc, "attr_outboundlinks_rel", outboundlinksRel); - if (isEmpty() || contains("attr_outboundlinks_relflags")) addSolr(solrdoc, "attr_outboundlinks_relflags", relEval(inboundlinksRel)); - if (isEmpty() || contains("attr_outboundlinks_text")) addSolr(solrdoc, "attr_outboundlinks_text", outboundlinksText); + if (isEmpty() || contains(Field.outboundlinks_tag_txt.name())) addSolr(solrdoc, Field.outboundlinks_tag_txt, outboundlinksTag); + if (isEmpty() || contains(Field.outboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.outboundlinks_protocol_txt, outboundlinksURLProtocol); + if (isEmpty() || 
contains(Field.outboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.outboundlinks_urlstub_txt, outboundlinksURLStub); + if (isEmpty() || contains(Field.outboundlinks_name_txt.name())) addSolr(solrdoc, Field.outboundlinks_name_txt, outboundlinksName); + if (isEmpty() || contains(Field.outboundlinks_rel_txt.name())) addSolr(solrdoc, Field.outboundlinks_rel_txt, outboundlinksRel); + if (isEmpty() || contains(Field.outboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.outboundlinks_relflags_txt, relEval(outboundlinksRel)); + if (isEmpty() || contains(Field.outboundlinks_text_txt.name())) addSolr(solrdoc, Field.outboundlinks_text_txt, outboundlinksText); // charset - addSolr(solrdoc, "charset_s", yacydoc.getCharset()); + addSolr(solrdoc, Field.charset_s, yacydoc.getCharset()); // coordinates if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) { - addSolr(solrdoc, "lon_coordinate", yacydoc.lon()); - addSolr(solrdoc, "lat_coordinate", yacydoc.lat()); + addSolr(solrdoc, Field.lon_coordinate, yacydoc.lon()); + addSolr(solrdoc, Field.lat_coordinate, yacydoc.lat()); } - addSolr(solrdoc, "httpstatus_i", 200); + addSolr(solrdoc, Field.httpstatus_i, 200); final Object parser = yacydoc.getParserObject(); if (parser instanceof ContentScraper) { final ContentScraper html = (ContentScraper) parser; @@ -218,16 +386,19 @@ public class SolrScheme extends ConfigurationSet { // header tags int h = 0; int f = 1; - for (int i = 1; i <= 6; i++) { - final String[] hs = html.getHeadlines(i); - h = h | (hs.length > 0 ? f : 0); - f = f * 2; - addSolr(solrdoc, "attr_h" + i, hs); - } - addSolr(solrdoc, "htags_i", h); + String[] hs; + + hs = html.getHeadlines(1); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h1_txt, hs); + hs = html.getHeadlines(2); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h2_txt, hs); + hs = html.getHeadlines(3); h = h | (hs.length > 0 ? 
f : 0); f = f * 2; addSolr(solrdoc, Field.h3_txt, hs); + hs = html.getHeadlines(4); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h4_txt, hs); + hs = html.getHeadlines(5); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h5_txt, hs); + hs = html.getHeadlines(6); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h6_txt, hs); + + addSolr(solrdoc, Field.htags_i, h); // canonical tag - if (html.getCanonical() != null) addSolr(solrdoc, "canonical_s", html.getCanonical().toNormalform(false, false)); + if (html.getCanonical() != null) addSolr(solrdoc, Field.canonical_s, html.getCanonical().toNormalform(false, false)); // noindex and nofollow attributes // from HTML (meta-tag in HTML header: robots) @@ -261,32 +432,32 @@ public class SolrScheme extends ConfigurationSet { if (x_robots_tag.indexOf("nofollow",0) >= 0) b += 2048; // set bit 11 if (x_robots_tag.indexOf("unavailable_after",0) >=0) b += 4096; // set bit 12 } - addSolr(solrdoc, "robots_i", b); + addSolr(solrdoc, Field.robots_i, b); // meta tags: generator final String generator = html.getMetas().get("generator"); - if (generator != null) addSolr(solrdoc, "metagenerator_t", generator); + if (generator != null) addSolr(solrdoc, Field.metagenerator_t, generator); // bold, italic final String[] bold = html.getBold(); - addSolr(solrdoc, "boldcount_i", bold.length); + addSolr(solrdoc, Field.boldcount_i, bold.length); if (bold.length > 0) { - addSolr(solrdoc, "attr_bold", bold); - if (isEmpty() || contains("attr_boldcount")) { - addSolr(solrdoc, "attr_boldcount", html.getBoldCount(bold)); + addSolr(solrdoc, Field.bold_txt, bold); + if (isEmpty() || contains(Field.bold_val.name())) { + addSolr(solrdoc, Field.bold_val, html.getBoldCount(bold)); } } final String[] italic = html.getItalic(); - addSolr(solrdoc, "italiccount_i", italic.length); + addSolr(solrdoc, Field.italiccount_i, italic.length); if (italic.length > 0) { - addSolr(solrdoc, "attr_italic", italic); - if 
(isEmpty() || contains("attr_italiccount")) { - addSolr(solrdoc, "attr_italiccount", html.getItalicCount(italic)); + addSolr(solrdoc, Field.italic_txt, italic); + if (isEmpty() || contains(Field.italic_val.name())) { + addSolr(solrdoc, Field.italic_val, html.getItalicCount(italic)); } } final String[] li = html.getLi(); - addSolr(solrdoc, "licount_i", li.length); - if (li.length > 0) addSolr(solrdoc, "attr_li", li); + addSolr(solrdoc, Field.licount_i, li.length); + if (li.length > 0) addSolr(solrdoc, Field.li_txt, li); // images final Collection imagesc = html.getImages().values(); @@ -303,14 +474,14 @@ public class SolrScheme extends ConfigurationSet { imgalts[c] = ie.alt(); c++; } - addSolr(solrdoc, "imagescount_i", imgtags.length); - if (isEmpty() || contains("attr_images_tag")) addSolr(solrdoc, "attr_images_tag", imgtags); - if (isEmpty() || contains("attr_images_protocol")) addSolr(solrdoc, "attr_images_protocol", imgprots); - if (isEmpty() || contains("attr_images_urlstub")) addSolr(solrdoc, "attr_images_urlstub", imgstubs); - if (isEmpty() || contains("attr_images_alt")) addSolr(solrdoc, "attr_images_alt", imgalts); + addSolr(solrdoc, Field.imagescount_i, imgtags.length); + if (isEmpty() || contains(Field.images_tag_txt.name())) addSolr(solrdoc, Field.images_tag_txt, imgtags); + if (isEmpty() || contains(Field.images_protocol_txt.name())) addSolr(solrdoc, Field.images_protocol_txt, imgprots); + if (isEmpty() || contains(Field.images_urlstub_txt.name())) addSolr(solrdoc, Field.images_urlstub_txt, imgstubs); + if (isEmpty() || contains(Field.images_alt_txt.name())) addSolr(solrdoc, Field.images_alt_txt, imgalts); // style sheets - if (isEmpty() || contains("attr_css")) { + if (isEmpty() || contains(Field.csscount_i.name()) || contains(Field.css_tag_txt.name()) || contains(Field.css_url_txt.name())) { final Map csss = html.getCSS(); final String[] css_tag = new String[csss.size()]; final String[] css_url = new String[csss.size()]; @@ -323,63 +494,64 @@ public class SolrScheme extends ConfigurationSet { css_url[c] = url; c++; } - addSolr(solrdoc, 
"csscount_i", css_tag.length); - if (css_tag.length > 0) addSolr(solrdoc, "attr_css_tag", css_tag); - if (css_url.length > 0) addSolr(solrdoc, "attr_css_url", css_url); + addSolr(solrdoc, Field.csscount_i, css_tag.length); + if (css_tag.length > 0) addSolr(solrdoc, Field.css_tag_txt, css_tag); + if (css_url.length > 0) addSolr(solrdoc, Field.css_url_txt, css_url); } // Scripts - if (isEmpty() || contains("attr_scripts")) { + if (isEmpty() || contains(Field.scripts_txt.name())) { final Set scriptss = html.getScript(); final String[] scripts = new String[scriptss.size()]; c = 0; for (final MultiProtocolURI url: scriptss) { scripts[c++] = url.toNormalform(false, false, false, false); } - addSolr(solrdoc, "scriptscount_i", scripts.length); - if (scripts.length > 0) addSolr(solrdoc, "attr_scripts", scripts); + addSolr(solrdoc, Field.scriptscount_i, scripts.length); + if (scripts.length > 0) addSolr(solrdoc, Field.scripts_txt, scripts); } // Frames - if (isEmpty() || contains("attr_frames")) { + if (isEmpty() || contains(Field.frames_txt.name())) { final Set framess = html.getFrames(); final String[] frames = new String[framess.size()]; c = 0; for (final MultiProtocolURI entry: framess) { frames[c++] = entry.toNormalform(false, false, false, false); } - addSolr(solrdoc, "framesscount_i", frames.length); - if (frames.length > 0) addSolr(solrdoc, "attr_frames", frames); + addSolr(solrdoc, Field.framesscount_i, frames.length); + if (frames.length > 0) addSolr(solrdoc, Field.frames_txt, frames); } // IFrames - if (isEmpty() || contains("attr_iframes")) { + if (isEmpty() || contains(Field.iframes_txt.name() + )) { final Set iframess = html.getIFrames(); final String[] iframes = new String[iframess.size()]; c = 0; for (final MultiProtocolURI entry: iframess) { iframes[c++] = entry.toNormalform(false, false, false, false); } - addSolr(solrdoc, "iframesscount_i", iframes.length); - if (iframes.length > 0) addSolr(solrdoc, "attr_iframes", iframes); + addSolr(solrdoc, 
Field.iframesscount_i, iframes.length); + if (iframes.length > 0) addSolr(solrdoc, Field.iframes_txt, iframes); } // flash embedded - addSolr(solrdoc, "flash_b", html.containsFlash()); + addSolr(solrdoc, Field.flash_b, html.containsFlash()); // generic evaluation pattern for (final String model: html.getEvaluationModelNames()) { - if (isEmpty() || contains("attr_" + model)) { + if (isEmpty() || contains("ext_" + model + "_txt")) { final String[] scorenames = html.getEvaluationModelScoreNames(model); if (scorenames.length > 0) { - addSolr(solrdoc, "attr_" + model, scorenames); - addSolr(solrdoc, "attr_" + model + "count", html.getEvaluationModelScoreCounts(model, scorenames)); + addSolr(solrdoc, Field.valueOf("ext_" + model + "_txt"), scorenames); + addSolr(solrdoc, Field.valueOf("ext_" + model + "_val"), html.getEvaluationModelScoreCounts(model, scorenames)); } } } // response time - addSolr(solrdoc, "responsetime_i", header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0")); + addSolr(solrdoc, Field.responsetime_i, header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0")); } return solrdoc; } diff --git a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java b/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java index 3e497f222..f23f939e6 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java +++ b/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java @@ -143,6 +143,7 @@ public class SolrSingleConnector implements SolrConnector { public void pleaseStop() { this.shallRun = false; } + @Override public void run() { while (this.shallRun) { if (SolrSingleConnector.this.transmissionQueue[this.idx].size() > 0) { @@ -165,6 +166,7 @@ public class SolrSingleConnector implements SolrConnector { } } + @Override public void close() { for (int i = 0; i < transmissionQueueCount; i++) { if (this.transmissionWorker[i].isAlive()) { @@ -204,6 +206,7 @@ public class SolrSingleConnector implements SolrConnector { * 
delete everything in the solr index * @throws IOException */ + @Override public void clear() throws IOException { try { this.server.deleteByQuery("*:*"); @@ -213,6 +216,7 @@ public class SolrSingleConnector implements SolrConnector { } } + @Override public void delete(final String id) throws IOException { try { this.server.deleteById(id); @@ -221,6 +225,7 @@ public class SolrSingleConnector implements SolrConnector { } } + @Override public void delete(final List ids) throws IOException { try { this.server.deleteById(ids); @@ -229,6 +234,7 @@ public class SolrSingleConnector implements SolrConnector { } } + @Override public boolean exists(final String id) throws IOException { try { final SolrDocumentList list = get("id:" + id, 0, 1); @@ -254,10 +260,12 @@ public class SolrSingleConnector implements SolrConnector { } } + @Override public void add(final String id, final ResponseHeader header, final Document doc) throws IOException, SolrException { add(this.scheme.yacy2solr(id, header, doc)); } + @Override public void add(final SolrInputDocument solrdoc) throws IOException, SolrException { int thisrrc = this.transmissionRoundRobinCounter; int nextrrc = thisrrc++; @@ -284,11 +292,15 @@ public class SolrSingleConnector implements SolrConnector { req.add( docs ); UpdateResponse rsp = req.process( server ); */ + } catch (final SolrException e) { + // the field is probably not known + Log.logWarning("SolrConnector", e.getMessage()); } catch (final Throwable e) { throw new IOException(e); } } + @Override public void err(final DigestURI digestURI, final String failReason, final int httpstatus) throws IOException { final SolrInputDocument solrdoc = new SolrInputDocument(); @@ -330,6 +342,7 @@ public class SolrSingleConnector implements SolrConnector { * @param querystring * @throws IOException */ + @Override public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException { // construct query final SolrQuery query = new SolrQuery();