From cca19d94d4f1ac2516b9992c4e09efa4e6bac454 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 6 May 2013 16:45:54 +0200 Subject: [PATCH] re-declared some fields to be of type string rather than text which makes them more efficient and less large --- defaults/solr.collection.schema | 16 +++++++-------- htroot/Crawler_p.java | 2 +- htroot/HostBrowser.java | 2 +- .../net/yacy/cora/federate/solr/SolrType.java | 17 +++++++++++++--- .../solr/connector/AbstractSolrConnector.java | 4 ++-- source/net/yacy/search/Switchboard.java | 6 ++++++ source/net/yacy/search/index/Fulltext.java | 2 +- source/net/yacy/search/query/QueryGoal.java | 2 +- .../schema/CollectionConfiguration.java | 20 +++++++++---------- .../yacy/search/schema/CollectionSchema.java | 18 ++++++++--------- .../yacy/search/schema/WebgraphSchema.java | 2 +- 11 files changed, 54 insertions(+), 37 deletions(-) diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index 674a42be3..7b7a5f030 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -54,8 +54,8 @@ fuzzy_signature_unique_b ## the size of the raw source (mandatory field) size_i -## fail reason if a page was not loaded. if the page was loaded then this field is empty, text (mandatory field) -failreason_t +## fail reason if a page was not loaded. if the page was loaded then this field is empty, string (mandatory field) +failreason_s ## fail type if a page was not loaded. This field is either empty, 'excl' or 'fail' failtype_s @@ -190,18 +190,18 @@ h6_txt #ip_s ## tags of css entries, normalized with absolute URL -#css_tag_txt +#css_tag_sxt ## urls of css entries, normalized with absolute URL -#css_url_txt +#css_url_sxt ## number of css entries, int #csscount_i ## urls of script entries, normalized with absolute URL -#scripts_txt +#scripts_sxt -## number of script entries, int +## number of entries in scripts_sxt, int #scriptscount_i ## encoded as binary value into an integer: @@ -233,10 +233,10 @@ outboundlinks_protocol_sxt outboundlinks_urlstub_txt ## all image tags, encoded as tag inclusive alt- and title property -#images_tag_txt +#images_tag_sxt ## all image links without the protocol and '://' -#images_urlstub_txt +#images_urlstub_sxt ## all image link protocols #images_protocol_sxt diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 0d8679610..dcb21c0d2 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -382,7 +382,7 @@ public class Crawler_p { sb.crawlQueues.errorURL.removeHost(hosthashes, true); for (byte[] hosthash: hosthashes) { try { - sb.index.fulltext().getDefaultConnector().deleteByQuery(CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + CollectionSchema.failreason_t.getSolrFieldName() + ":[* TO *]"); + sb.index.fulltext().getDefaultConnector().deleteByQuery(CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]"); } catch (IOException e) {Log.logException(e);} } sb.index.fulltext().commit(true); diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index 7c30edae4..68b4b6706 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -261,7 +261,7 @@ public class HostBrowser { BlockingQueue docs = fulltext.getDefaultConnector().concurrentDocumentsByQuery(q.toString(), 0, 100000, TIMEOUT, 100, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), - CollectionSchema.failreason_t.getSolrFieldName(), + CollectionSchema.failreason_s.getSolrFieldName(), CollectionSchema.failtype_s.getSolrFieldName(), CollectionSchema.inboundlinks_protocol_sxt.getSolrFieldName(), CollectionSchema.inboundlinks_urlstub_txt.getSolrFieldName(), diff --git a/source/net/yacy/cora/federate/solr/SolrType.java b/source/net/yacy/cora/federate/solr/SolrType.java index 855e9e87e..95dad9e03 100644 --- a/source/net/yacy/cora/federate/solr/SolrType.java +++ b/source/net/yacy/cora/federate/solr/SolrType.java @@ -48,12 +48,23 @@ public enum SolrType { public String printName() { return this.printName; } - public boolean appropriateName(final String field, final boolean multivalue) { + public boolean appropriateName(final SchemaDeclaration collectionSchema) { + String field = collectionSchema.name(); int p = field.indexOf('_'); if (p < 0 || field.length() - p > 4) return true; // special names may have no type extension String ext = field.substring(p + 1); - boolean ok = multivalue ? this.multivalExt.equals(ext) : this.singlevalExt.equals(ext); - assert ok : "SolrType = " + this.name() + ", field = " + field + ", ext = " + ext + ", multivalue = " + new Boolean(multivalue).toString() + ", singlevalExt = " + this.singlevalExt + ", multivalExt = " + this.multivalExt; + boolean ok = collectionSchema.isMultiValued() ? this.multivalExt.equals(ext) : this.singlevalExt.equals(ext); + assert ok : "SolrType = " + this.name() + ", field = " + field + ", ext = " + ext + ", multivalue = " + new Boolean(collectionSchema.isMultiValued()).toString() + ", singlevalExt = " + this.singlevalExt + ", multivalExt = " + this.multivalExt; + if (!ok) return ok; + ok = !"s".equals(this.singlevalExt) || collectionSchema.isMultiValued() || field.endsWith("s"); + assert ok : "SolrType = " + this.name() + ", field = " + field + ", ext = " + ext + ", multivalue = " + new Boolean(collectionSchema.isMultiValued()).toString() + ", singlevalExt = " + this.singlevalExt + ", multivalExt = " + this.multivalExt; + if (!ok) return ok; + ok = !"sxt".equals(this.singlevalExt) || !collectionSchema.isMultiValued() || field.endsWith("sxt"); + assert ok : "SolrType = " + this.name() + ", field = " + field + ", ext = " + ext + ", multivalue = " + new Boolean(collectionSchema.isMultiValued()).toString() + ", singlevalExt = " + this.singlevalExt + ", multivalExt = " + this.multivalExt; + if (!ok) return ok; + ok = !"t".equals(this.singlevalExt) || collectionSchema.isMultiValued() || field.endsWith("t"); + assert ok : "SolrType = " + this.name() + ", field = " + field + ", ext = " + ext + ", multivalue = " + new Boolean(collectionSchema.isMultiValued()).toString() + ", singlevalExt = " + this.singlevalExt + ", multivalExt = " + this.multivalExt; + if (!ok) return ok; return ok; } } \ No newline at end of file diff --git a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java index 807aad050..b4cc0ed11 100644 --- a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java @@ -59,8 +59,8 @@ public abstract class AbstractSolrConnector implements SolrConnector { } public final static SolrQuery catchSuccessQuery = new SolrQuery(); static { - //catchSuccessQuery.setQuery("-" + CollectionSchema.failreason_t.getSolrFieldName() + ":[* TO *]"); - catchSuccessQuery.setQuery(CATCHALL_TERM); // failreason_t is only available for core collection1 + //catchSuccessQuery.setQuery("-" + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]"); + catchSuccessQuery.setQuery(CATCHALL_TERM); // failreason_s is only available for core collection1 catchSuccessQuery.setFields(CollectionSchema.id.getSolrFieldName()); catchSuccessQuery.clearSorts(); catchSuccessQuery.setIncludeScore(false); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 6f3439557..4c30bf9c8 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -502,6 +502,12 @@ public final class Switchboard extends serverSwitch { } } + // for index migration in case of obsolete entries, delete entries now + try { + this.index.fulltext().getDefaultConnector().deleteByQuery("failreason_t:[* TO *]"); // field was renamed to failreason_s + } catch (IOException e1) { + } + // initialize network database final File mySeedFile = new File(this.networkRoot, SeedDB.DBFILE_OWN_SEED); this.peers = diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index cf69e74ac..4aa31644f 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -743,7 +743,7 @@ public final class Fulltext { public String failReason(final String urlHash) throws IOException { if (urlHash == null) return null; - String reason = (String) this.getDefaultConnector().getFieldById(urlHash, CollectionSchema.failreason_t.getSolrFieldName()); + String reason = (String) this.getDefaultConnector().getFieldById(urlHash, CollectionSchema.failreason_s.getSolrFieldName()); if (reason == null) return null; return reason == null ? null : reason.length() == 0 ? null : reason; } diff --git a/source/net/yacy/search/query/QueryGoal.java b/source/net/yacy/search/query/QueryGoal.java index 3c8e5dcbc..f12ae54dd 100644 --- a/source/net/yacy/search/query/QueryGoal.java +++ b/source/net/yacy/search/query/QueryGoal.java @@ -242,7 +242,7 @@ public class QueryGoal { // add filter to prevent that results come from failed urls q.append(" AND ").append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200"); - //q.append(" AND -").append(YaCySchema.failreason_t.getSolrFieldName()).append(":[* TO *]"); + //q.append(" AND -").append(YaCySchema.failreason_s.getSolrFieldName()).append(":[* TO *]"); return q; } diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 2df6d66ed..b92f7eb08 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -190,7 +190,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri final DigestURI digestURI = md.url(); boolean allAttr = this.isEmpty(); - if (allAttr || contains(CollectionSchema.failreason_t)) add(doc, CollectionSchema.failreason_t, ""); + if (allAttr || contains(CollectionSchema.failreason_s)) add(doc, CollectionSchema.failreason_s, ""); add(doc, CollectionSchema.id, ASCII.String(md.hash())); String us = digestURI.toNormalform(true); add(doc, CollectionSchema.sku, us); @@ -340,7 +340,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri Set processTypes = new LinkedHashSet(); add(doc, CollectionSchema.id, id); - if (allAttr || contains(CollectionSchema.failreason_t)) add(doc, CollectionSchema.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before) + if (allAttr || contains(CollectionSchema.failreason_s)) add(doc, CollectionSchema.failreason_s, ""); // overwrite a possible fail reason (in case that there was a fail reason before) String docurl = digestURI.toNormalform(true); add(doc, CollectionSchema.sku, docurl); @@ -587,14 +587,14 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (ie.alt() != null && ie.alt().length() > 0) withalt++; } if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, imgtags.size()); - if (allAttr || contains(CollectionSchema.images_tag_txt)) add(doc, CollectionSchema.images_tag_txt, imgtags); + if (allAttr || contains(CollectionSchema.images_tag_sxt)) add(doc, CollectionSchema.images_tag_sxt, imgtags); if (allAttr || contains(CollectionSchema.images_protocol_sxt)) add(doc, CollectionSchema.images_protocol_sxt, protocolList2indexedList(imgprots)); - if (allAttr || contains(CollectionSchema.images_urlstub_txt)) add(doc, CollectionSchema.images_urlstub_txt, imgstubs); + if (allAttr || contains(CollectionSchema.images_urlstub_sxt)) add(doc, CollectionSchema.images_urlstub_sxt, imgstubs); if (allAttr || contains(CollectionSchema.images_alt_txt)) add(doc, CollectionSchema.images_alt_txt, imgalts); if (allAttr || contains(CollectionSchema.images_withalt_i)) add(doc, CollectionSchema.images_withalt_i, withalt); // style sheets - if (allAttr || contains(CollectionSchema.css_tag_txt)) { + if (allAttr || contains(CollectionSchema.css_tag_sxt)) { final Map csss = html.getCSS(); final String[] css_tag = new String[csss.size()]; final String[] css_url = new String[csss.size()]; @@ -610,12 +610,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri c++; } add(doc, CollectionSchema.csscount_i, css_tag.length); - if (css_tag.length > 0) add(doc, CollectionSchema.css_tag_txt, css_tag); - if (css_url.length > 0) add(doc, CollectionSchema.css_url_txt, css_url); + if (css_tag.length > 0) add(doc, CollectionSchema.css_tag_sxt, css_tag); + if (css_url.length > 0) add(doc, CollectionSchema.css_url_sxt, css_url); } // Scripts - if (allAttr || contains(CollectionSchema.scripts_txt)) { + if (allAttr || contains(CollectionSchema.scripts_sxt)) { final Set scriptss = html.getScript(); final String[] scripts = new String[scriptss.size()]; c = 0; @@ -625,7 +625,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri scripts[c++] = u.toNormalform(false); } add(doc, CollectionSchema.scriptscount_i, scripts.length); - if (scripts.length > 0) add(doc, CollectionSchema.scripts_txt, scripts); + if (scripts.length > 0) add(doc, CollectionSchema.scripts_sxt, scripts); } // Frames @@ -933,7 +933,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (contains(CollectionSchema.url_file_ext_s)) add(solrdoc, CollectionSchema.url_file_ext_s, digestURI.getFileExtension()); // fail reason and status - if (contains(CollectionSchema.failreason_t)) add(solrdoc, CollectionSchema.failreason_t, failReason); + if (contains(CollectionSchema.failreason_s)) add(solrdoc, CollectionSchema.failreason_s, failReason); if (contains(CollectionSchema.failtype_s)) add(solrdoc, CollectionSchema.failtype_s, failType.name()); if (contains(CollectionSchema.httpstatus_i)) add(solrdoc, CollectionSchema.httpstatus_i, httpstatus); return solrdoc; diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index af34e59ba..129f862cf 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -46,7 +46,7 @@ public enum CollectionSchema implements SchemaDeclaration { fuzzy_signature_text_t(SolrType.text_general, true, true, false, false, false, "intermediate data produced in EnhancedTextProfileSignature: a list of word frequencies"), fuzzy_signature_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if fuzzy_signature_l is unique at the time of document creation, used for double-check during search"), size_i(SolrType.num_integer, true, true, false, false, false, "the size of the raw source"),// int size(); - failreason_t(SolrType.text_general, true, true, false, false, false, "fail reason if a page was not loaded. if the page was loaded then this field is empty"), + failreason_s(SolrType.string, true, true, false, false, false, "fail reason if a page was not loaded. if the page was loaded then this field is empty"), failtype_s(SolrType.string, true, true, false, false, false, "fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'"), httpstatus_i(SolrType.num_integer, true, true, false, false, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"), httpstatus_redirect_s(SolrType.num_integer, true, true, false, false, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"), @@ -98,10 +98,10 @@ public enum CollectionSchema implements SchemaDeclaration { // optional values, not part of standard YaCy handling (but useful for external applications) collection_sxt(SolrType.string, true, true, true, false, false, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"), csscount_i(SolrType.num_integer, true, true, false, false, false, "number of entries in css_tag_txt and css_url_txt"), - css_tag_txt(SolrType.text_general, true, true, true, false, false, "full css tag with normalized url"), - css_url_txt(SolrType.text_general, true, true, true, false, false, "normalized urls within a css tag"), - scripts_txt(SolrType.text_general, true, true, true, false, false, "normalized urls within a scripts tag"), - scriptscount_i(SolrType.num_integer, true, true, false, false, false, "number of entries in scripts_txt"), + css_tag_sxt(SolrType.string, true, true, true, false, false, "full css tag with normalized url"), + css_url_sxt(SolrType.string, true, true, true, false, false, "normalized urls within a css tag"), + scripts_sxt(SolrType.string, true, true, true, false, false, "normalized urls within a scripts tag"), + scriptscount_i(SolrType.num_integer, true, true, false, false, false, "number of entries in scripts_sxt"), // encoded as binary value into an integer: // bit 0: "all" contained in html header meta // bit 1: "index" contained in html header meta @@ -119,9 +119,9 @@ public enum CollectionSchema implements SchemaDeclaration { outboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "external links, only the protocol"), outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, false, false, "external links, the url only without the protocol"), - images_tag_txt(SolrType.text_general, true, true, true, false, true, " all image tags, encoded as tag inclusive alt- and title property"), - images_urlstub_txt(SolrType.text_general, true, true, true, false, true, "all image links without the protocol and '://'"), - images_protocol_sxt(SolrType.text_general, true, true, true, false, false, "all image link protocols"), + images_tag_sxt(SolrType.string, true, true, true, false, true, " all image tags, encoded as tag inclusive alt- and title property"), + images_urlstub_sxt(SolrType.string, true, true, true, false, true, "all image links without the protocol and '://'"), + images_protocol_sxt(SolrType.string, true, true, true, false, false, "all image link protocols"), images_alt_txt(SolrType.text_general, true, true, true, false, true, "all image link alt tag"), images_withalt_i(SolrType.num_integer, true, true, false, false, false, "number of image links with alt tag"), htags_i(SolrType.num_integer, true, true, false, false, false, "binary pattern for the existance of h1..h6 headlines"), @@ -219,7 +219,7 @@ public enum CollectionSchema implements SchemaDeclaration { this.omitNorms = omitNorms; this.searchable = searchable; this.comment = comment; - assert type.appropriateName(this.name(), this.multiValued) : "bad configuration: " + this.name(); + assert type.appropriateName(this) : "bad configuration: " + this.name(); } /** diff --git a/source/net/yacy/search/schema/WebgraphSchema.java b/source/net/yacy/search/schema/WebgraphSchema.java index 4ff02e6a1..15d257263 100644 --- a/source/net/yacy/search/schema/WebgraphSchema.java +++ b/source/net/yacy/search/schema/WebgraphSchema.java @@ -111,7 +111,7 @@ public enum WebgraphSchema implements SchemaDeclaration { this.omitNorms = omitNorms; this.searchable = searchable; this.comment = comment; - assert type.appropriateName(this.name(), this.multiValued) : "bad configuration: " + this.name(); + assert type.appropriateName(this) : "bad configuration: " + this.name(); } /**