You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
yacy_search_server/source/net/yacy/search/index/YaCySchema.java

218 lines
13 KiB

/**
* SolrField
* Copyright 2011 by Michael Peter Christen
* First released 14.04.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7654 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.search.index;
import java.util.Locale;

import net.yacy.cora.services.federated.solr.Schema;
import net.yacy.cora.services.federated.solr.SolrType;
/**
 * Enumeration of the Solr index fields known to this schema.
 *
 * <p>Every constant carries its {@link SolrType}, the {@code indexed}, {@code stored},
 * {@code multiValued} and {@code omitNorms} flags, and a human-readable description of
 * the field content. These attributes are fixed at construction time.
 *
 * <p>The only mutable state is the optional custom Solr field name, see
 * {@link #setSolrFieldName(String)}. Because enum constants are singletons, a custom
 * name set on a constant is visible to every user of that constant; the setter performs
 * no synchronization.
 */
public enum YaCySchema implements Schema {

    id(SolrType.string, true, true, "primary key of document, the URL hash **mandatory field**"),
    sku(SolrType.text_en_splitting_tight, true, true, false, true, "url of document"),
    ip_s(SolrType.string, true, true, "ip of host of url (after DNS lookup)"),
    host_s(SolrType.string, true, true, "host of the url"),
    title(SolrType.text_general, true, true, true, "content of title tag"),
    author(SolrType.text_general, true, true, "content of author-tag"),
    description(SolrType.text_general, true, true, "content of description-tag"),
    content_type(SolrType.string, true, true, true, "mime-type of document"),
    last_modified(SolrType.date, true, true, "last-modified from http header"),
    keywords(SolrType.text_general, true, true, "content of keywords tag; words are separated by space"),
    text_t(SolrType.text_general, true, true, "all visible text"),
    wordcount_i(SolrType.integer, true, true, "number of words in visible area"),
    paths_txt(SolrType.text_general, true, true, true, "all path elements in the url"),

    // encoded as binary value into an integer:
    // bit  0: "all" contained in html header meta
    // bit  1: "index" contained in html header meta
    // bit  2: "noindex" contained in html header meta
    // bit  3: "nofollow" contained in html header meta
    // bit  8: "noarchive" contained in http header properties
    // bit  9: "nosnippet" contained in http header properties
    // bit 10: "noindex" contained in http header properties
    // bit 11: "nofollow" contained in http header properties
    // bit 12: "unavailable_after" contained in http header properties
    robots_i(SolrType.integer, true, true, "content of <meta name=\"robots\" content=#content#> tag and the \"X-Robots-Tag\" HTTP property"),

    inboundlinkscount_i(SolrType.integer, true, true, "total number of inbound links"),
    inboundlinksnofollowcount_i(SolrType.integer, true, true, "number of inbound links with nofollow tag"),
    inboundlinks_tag_txt(SolrType.text_general, true, true, true, "internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
    inboundlinks_protocol_txt(SolrType.text_general, true, true, true, "internal links, only the protocol"),
    inboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "internal links, the url only without the protocol"),
    inboundlinks_name_txt(SolrType.text_general, true, true, true, "internal links, the name property of the a-tag"),
    inboundlinks_rel_txt(SolrType.text_general, true, true, true, "internal links, the rel property of the a-tag"),
    inboundlinks_relflags_txt(SolrType.text_general, true, true, true, "internal links, the rel property of the a-tag, coded binary"),
    inboundlinks_text_txt(SolrType.text_general, true, true, true, "internal links, the text content of the a-tag"),

    // description corrected: it previously read "external number of inbound links"
    outboundlinkscount_i(SolrType.integer, true, true, "total number of outbound links"),
    outboundlinksnofollowcount_i(SolrType.integer, true, true, "number of external links with nofollow tag"),
    outboundlinks_tag_txt(SolrType.text_general, true, true, true, "external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
    outboundlinks_protocol_txt(SolrType.text_general, true, true, true, "external links, only the protocol"),
    outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "external links, the url only without the protocol"),
    outboundlinks_name_txt(SolrType.text_general, true, true, true, "external links, the name property of the a-tag"),
    outboundlinks_rel_txt(SolrType.text_general, true, true, true, "external links, the rel property of the a-tag"),
    outboundlinks_relflags_txt(SolrType.text_general, true, true, true, "external links, the rel property of the a-tag, coded binary"),
    outboundlinks_text_txt(SolrType.text_general, true, true, true, "external links, the text content of the a-tag"),

    charset_s(SolrType.string, true, true, "character encoding"),
    lon_coordinate(SolrType.tdouble, true, true, "longitude of location as declared in WSG84"),
    lat_coordinate(SolrType.tdouble, true, true, "latitude of location as declared in WSG84"),
    httpstatus_i(SolrType.integer, true, true, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
    h1_txt(SolrType.text_general, true, true, true, "h1 header"),
    h2_txt(SolrType.text_general, true, true, true, "h2 header"),
    h3_txt(SolrType.text_general, true, true, true, "h3 header"),
    h4_txt(SolrType.text_general, true, true, true, "h4 header"),
    h5_txt(SolrType.text_general, true, true, true, "h5 header"),
    h6_txt(SolrType.text_general, true, true, true, "h6 header"),
    htags_i(SolrType.integer, true, true, "binary pattern for the existance of h1..h6 headlines"),
    canonical_s(SolrType.string, true, true, "url inside the canonical link element"),
    refresh_s(SolrType.string, true, true, "link from the url property inside the refresh link element"),
    metagenerator_t(SolrType.text_general, true, true, "content of <meta name=\"generator\" content=#content#> tag"),
    boldcount_i(SolrType.integer, true, true, "total number of occurrences of <b> or <strong>"),
    bold_txt(SolrType.text_general, true, true, true, "all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
    bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"),
    italiccount_i(SolrType.integer, true, true, "total number of occurrences of <i>"),
    italic_txt(SolrType.text_general, true, true, true, "all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
    italic_val(SolrType.integer, true, true, true, "number of occurrences of texts in italic_txt"),
    licount_i(SolrType.integer, true, true, "number of <li> tags"),
    li_txt(SolrType.text_general, true, true, true, "all texts in <li> tags"),
    imagescount_i(SolrType.integer, true, true, "number of images"),
    images_tag_txt(SolrType.text_general, true, true, true, " all image tags, encoded as <img> tag inclusive alt- and title property"),
    images_protocol_txt(SolrType.text_general, true, true, true, "all image link protocols"),
    images_urlstub_txt(SolrType.text_general, true, true, true, "all image links without the protocol and '://'"),
    images_alt_txt(SolrType.text_general, true, true, true, "all image link alt tag"),
    csscount_i(SolrType.integer, true, true, "number of entries in css_tag_txt and css_url_txt"),
    css_tag_txt(SolrType.text_general, true, true, true, "full css tag with normalized url"),
    css_url_txt(SolrType.text_general, true, true, true, "normalized urls within a css tag"),
    scripts_txt(SolrType.text_general, true, true, true, "normalized urls within a scripts tag"),
    scriptscount_i(SolrType.integer, true, true, "number of entries in scripts_txt"),
    frames_txt(SolrType.text_general, true, true, true, "list of all links to frames"),
    framesscount_i(SolrType.integer, true, true, "number of frames_txt"),
    iframes_txt(SolrType.text_general, true, true, true, "list of all links to iframes"),
    iframesscount_i(SolrType.integer, true, true, "number of iframes_txt"),
    flash_b(SolrType.bool, true, true, "flag that shows if a swf file is linked"),
    responsetime_i(SolrType.integer, true, true, "response time of target server in milliseconds"),
    ext_cms_txt(SolrType.text_general, true, true, true, "names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias"),
    ext_cms_val(SolrType.integer, true, true, true, "number of attributes that count for a specific cms in ext_cms_txt"),
    ext_ads_txt(SolrType.text_general, true, true, true, "names of ad-servers/ad-services"),
    ext_ads_val(SolrType.integer, true, true, true, "number of attributes counts in ext_ads_txt"),
    ext_community_txt(SolrType.text_general, true, true, true, "names of recognized community functions"),
    // description corrected: it previously referred to a non-existent field "attr_community"
    ext_community_val(SolrType.integer, true, true, true, "number of attribute counts in ext_community_txt"),
    ext_maps_txt(SolrType.text_general, true, true, true, "names of map services"),
    ext_maps_val(SolrType.integer, true, true, true, "number of attribute counts in ext_maps_txt"),
    ext_tracker_txt(SolrType.text_general, true, true, true, "names of tracker server"),
    ext_tracker_val(SolrType.integer, true, true, true, "number of attribute counts in ext_tracker_txt"),
    ext_title_txt(SolrType.text_general, true, true, true, "names matching title expressions"),
    ext_title_val(SolrType.integer, true, true, true, "number of matching title expressions"),
    failreason_t(SolrType.text_general, true, true, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),

    // values used additionally by URIMetadataRow
    load_date_dt(SolrType.date, true, true, "time when resource was loaded"),
    fresh_date_dt(SolrType.date, true, true, "date until resource shall be considered as fresh"),
    host_id_s(SolrType.string, true, true, "id of the host, a 6-byte hash that is part of the document id"), // String hosthash();
    referrer_id_txt(SolrType.string, true, true, true, "ids of referrer to this document"), // byte[] referrerHash();
    md5_s(SolrType.string, true, true, "the md5 of the raw source"), // String md5();
    publisher_t(SolrType.text_general, true, true, "the name of the publisher of the document"), // String dc_publisher();
    language_txt(SolrType.string, true, true, "the language used in the document; starts with primary language"), // byte[] language();
    size_i(SolrType.integer, true, true, "the size of the raw source"), // int size();
    audiolinkscount_i(SolrType.integer, true, true, "number of links to audio resources"), // int laudio();
    videolinkscount_i(SolrType.integer, true, true, "number of links to video resources"), // int lvideo();
    applinkscount_i(SolrType.integer, true, true, "number of links to application resources"); // int lapp();

    /**
     * Custom Solr field name; {@code null} means "use the constant's own name"
     * (see {@link #getSolrFieldName()}).
     */
    private String solrFieldName = null;

    /** Solr type of the field. */
    private final SolrType type;

    /** Value of the {@code indexed} and {@code stored} field attributes. */
    private final boolean indexed, stored;

    /** Value of the {@code multiValued} and {@code omitNorms} field attributes. */
    private final boolean multiValued, omitNorms;

    /** Human-readable description of the field content. */
    private final String comment;

    /**
     * Creates a field that is neither multi-valued nor omits norms.
     *
     * @param type    Solr type of the field
     * @param indexed value of the {@code indexed} attribute
     * @param stored  value of the {@code stored} attribute
     * @param comment description of the field content
     */
    YaCySchema(final SolrType type, final boolean indexed, final boolean stored, final String comment) {
        this(type, indexed, stored, false, false, comment);
    }

    /**
     * Creates a field with an explicit {@code multiValued} attribute that does not omit norms.
     *
     * @param type        Solr type of the field
     * @param indexed     value of the {@code indexed} attribute
     * @param stored      value of the {@code stored} attribute
     * @param multiValued value of the {@code multiValued} attribute
     * @param comment     description of the field content
     */
    YaCySchema(final SolrType type, final boolean indexed, final boolean stored, final boolean multiValued, final String comment) {
        this(type, indexed, stored, multiValued, false, comment);
    }

    /**
     * Canonical constructor; the shorter constructors delegate here so that every
     * attribute is assigned exactly once and can be {@code final}.
     *
     * @param type        Solr type of the field
     * @param indexed     value of the {@code indexed} attribute
     * @param stored      value of the {@code stored} attribute
     * @param multiValued value of the {@code multiValued} attribute
     * @param omitNorms   value of the {@code omitNorms} attribute
     * @param comment     description of the field content
     */
    YaCySchema(final SolrType type, final boolean indexed, final boolean stored, final boolean multiValued, final boolean omitNorms, final String comment) {
        this.type = type;
        this.indexed = indexed;
        this.stored = stored;
        this.multiValued = multiValued;
        this.omitNorms = omitNorms;
        this.comment = comment;
    }

    /**
     * Returns the YaCy default or (if available) custom field name for Solr.
     *
     * @return the custom field name if one has been set, otherwise {@link #name()}
     */
    @Override
    public final String getSolrFieldName() {
        return (this.solrFieldName == null ? this.name() : this.solrFieldName);
    }

    /**
     * Sets a custom Solr field name, converted to lower case.
     *
     * <p>The conversion uses {@link Locale#ROOT} so that the resulting name does not
     * depend on the default locale of the JVM (for example, "I" must not become a
     * dotless "i" under a Turkish locale).
     *
     * @param theValue the field name; {@code null} or an empty string removes the
     *                 custom name so that {@link #getSolrFieldName()} falls back to
     *                 {@link #name()}
     */
    public final void setSolrFieldName(String theValue) {
        // make sure no empty string is assigned
        if (theValue == null || theValue.isEmpty()) {
            this.solrFieldName = null;
        } else {
            this.solrFieldName = theValue.toLowerCase(Locale.ROOT);
        }
    }

    /** @return the Solr type of this field */
    @Override
    public final SolrType getType() {
        return this.type;
    }

    /** @return the value of the {@code indexed} attribute of this field */
    @Override
    public final boolean isIndexed() {
        return this.indexed;
    }

    /** @return the value of the {@code stored} attribute of this field */
    @Override
    public final boolean isStored() {
        return this.stored;
    }

    /** @return the value of the {@code multiValued} attribute of this field */
    @Override
    public final boolean isMultiValued() {
        return this.multiValued;
    }

    /** @return the value of the {@code omitNorms} attribute of this field */
    @Override
    public final boolean isOmitNorms() {
        return this.omitNorms;
    }

    /** @return the human-readable description of the field content */
    @Override
    public final String getComment() {
        return this.comment;
    }
}