re-declared some fields to be of type string rather than text, which

makes them more efficient and smaller
pull/1/head
Michael Peter Christen 12 years ago
parent cc90f82dbb
commit cca19d94d4

@ -54,8 +54,8 @@ fuzzy_signature_unique_b
## the size of the raw source (mandatory field)
size_i
## fail reason if a page was not loaded. if the page was loaded then this field is empty, text (mandatory field)
failreason_t
## fail reason if a page was not loaded. if the page was loaded then this field is empty, string (mandatory field)
failreason_s
## fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'
failtype_s
@ -190,18 +190,18 @@ h6_txt
#ip_s
## tags of css entries, normalized with absolute URL
#css_tag_txt
#css_tag_sxt
## urls of css entries, normalized with absolute URL
#css_url_txt
#css_url_sxt
## number of css entries, int
#csscount_i
## urls of script entries, normalized with absolute URL
#scripts_txt
#scripts_sxt
## number of script entries, int
## number of entries in scripts_sxt, int
#scriptscount_i
## encoded as binary value into an integer:
@ -233,10 +233,10 @@ outboundlinks_protocol_sxt
outboundlinks_urlstub_txt
## all image tags, encoded as <img> tag inclusive alt- and title property
#images_tag_txt
#images_tag_sxt
## all image links without the protocol and '://'
#images_urlstub_txt
#images_urlstub_sxt
## all image link protocols
#images_protocol_sxt

@ -382,7 +382,7 @@ public class Crawler_p {
sb.crawlQueues.errorURL.removeHost(hosthashes, true);
for (byte[] hosthash: hosthashes) {
try {
sb.index.fulltext().getDefaultConnector().deleteByQuery(CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + CollectionSchema.failreason_t.getSolrFieldName() + ":[* TO *]");
sb.index.fulltext().getDefaultConnector().deleteByQuery(CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
} catch (IOException e) {Log.logException(e);}
}
sb.index.fulltext().commit(true);

@ -261,7 +261,7 @@ public class HostBrowser {
BlockingQueue<SolrDocument> docs = fulltext.getDefaultConnector().concurrentDocumentsByQuery(q.toString(), 0, 100000, TIMEOUT, 100,
CollectionSchema.id.getSolrFieldName(),
CollectionSchema.sku.getSolrFieldName(),
CollectionSchema.failreason_t.getSolrFieldName(),
CollectionSchema.failreason_s.getSolrFieldName(),
CollectionSchema.failtype_s.getSolrFieldName(),
CollectionSchema.inboundlinks_protocol_sxt.getSolrFieldName(),
CollectionSchema.inboundlinks_urlstub_txt.getSolrFieldName(),

@ -48,12 +48,23 @@ public enum SolrType {
/**
 * Returns the print name held by this type.
 *
 * @return the value of the {@code printName} field
 */
public String printName() {
    return printName;
}
/**
 * Checks whether the name of a schema field ends with the type extension that this
 * SolrType expects (singlevalExt for single-valued fields, multivalExt for multi-valued ones).
 *
 * NOTE(review): this span appears to be a rendered diff hunk: the two-argument signature and
 * the first "boolean ok"/assert pair are the removed variant, the SchemaDeclaration signature
 * and the second pair are the added variant.
 *
 * @param collectionSchema the field declaration; its name() and isMultiValued() are inspected
 * @return true if the name is accepted (or exempt from the check), false otherwise;
 *         with assertions enabled a mismatch raises an AssertionError before returning
 */
public boolean appropriateName(final String field, final boolean multivalue) {
public boolean appropriateName(final SchemaDeclaration collectionSchema) {
String field = collectionSchema.name();
// position of the FIRST underscore; everything behind it is treated as the extension
int p = field.indexOf('_');
// names without an underscore, or with more than three characters after the first
// underscore, are accepted without any check.
// NOTE(review): because the first underscore is used, names with several underscores
// (e.g. css_tag_sxt) also take this early return - confirm whether lastIndexOf was intended
if (p < 0 || field.length() - p > 4) return true; // special names may have no type extension
String ext = field.substring(p + 1);
boolean ok = multivalue ? this.multivalExt.equals(ext) : this.singlevalExt.equals(ext);
assert ok : "SolrType = " + this.name() + ", field = " + field + ", ext = " + ext + ", multivalue = " + new Boolean(multivalue).toString() + ", singlevalExt = " + this.singlevalExt + ", multivalExt = " + this.multivalExt;
// the extension must equal the multi-value or single-value extension of this type,
// depending on whether the field is declared multi-valued
boolean ok = collectionSchema.isMultiValued() ? this.multivalExt.equals(ext) : this.singlevalExt.equals(ext);
assert ok : "SolrType = " + this.name() + ", field = " + field + ", ext = " + ext + ", multivalue = " + new Boolean(collectionSchema.isMultiValued()).toString() + ", singlevalExt = " + this.singlevalExt + ", multivalExt = " + this.multivalExt;
if (!ok) return ok;
// types whose single-value extension is "s": a single-valued field must end with "s".
// NOTE(review): looks already implied by the extension check above - confirm
ok = !"s".equals(this.singlevalExt) || collectionSchema.isMultiValued() || field.endsWith("s");
assert ok : "SolrType = " + this.name() + ", field = " + field + ", ext = " + ext + ", multivalue = " + new Boolean(collectionSchema.isMultiValued()).toString() + ", singlevalExt = " + this.singlevalExt + ", multivalExt = " + this.multivalExt;
if (!ok) return ok;
// a multi-valued field must end with "sxt" when singlevalExt is "sxt".
// NOTE(review): this compares "sxt" against singlevalExt, not multivalExt; the extension
// values of the enum constants are not visible here - verify which one was meant
ok = !"sxt".equals(this.singlevalExt) || !collectionSchema.isMultiValued() || field.endsWith("sxt");
assert ok : "SolrType = " + this.name() + ", field = " + field + ", ext = " + ext + ", multivalue = " + new Boolean(collectionSchema.isMultiValued()).toString() + ", singlevalExt = " + this.singlevalExt + ", multivalExt = " + this.multivalExt;
if (!ok) return ok;
// types whose single-value extension is "t": a single-valued field must end with "t"
ok = !"t".equals(this.singlevalExt) || collectionSchema.isMultiValued() || field.endsWith("t");
assert ok : "SolrType = " + this.name() + ", field = " + field + ", ext = " + ext + ", multivalue = " + new Boolean(collectionSchema.isMultiValued()).toString() + ", singlevalExt = " + this.singlevalExt + ", multivalExt = " + this.multivalExt;
if (!ok) return ok;
return ok;
}
}

@ -59,8 +59,8 @@ public abstract class AbstractSolrConnector implements SolrConnector {
}
public final static SolrQuery catchSuccessQuery = new SolrQuery();
static {
//catchSuccessQuery.setQuery("-" + CollectionSchema.failreason_t.getSolrFieldName() + ":[* TO *]");
catchSuccessQuery.setQuery(CATCHALL_TERM); // failreason_t is only available for core collection1
//catchSuccessQuery.setQuery("-" + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
catchSuccessQuery.setQuery(CATCHALL_TERM); // failreason_s is only available for core collection1
catchSuccessQuery.setFields(CollectionSchema.id.getSolrFieldName());
catchSuccessQuery.clearSorts();
catchSuccessQuery.setIncludeScore(false);

@ -502,6 +502,12 @@ public final class Switchboard extends serverSwitch {
}
}
// for index migration in case of obsolete entries, delete entries now
try {
this.index.fulltext().getDefaultConnector().deleteByQuery("failreason_t:[* TO *]"); // field was renamed to failreason_s
} catch (IOException e1) {
}
// initialize network database
final File mySeedFile = new File(this.networkRoot, SeedDB.DBFILE_OWN_SEED);
this.peers =

@ -743,7 +743,7 @@ public final class Fulltext {
/**
 * Reads the fail reason field of the entry addressed by the given url hash from the
 * default connector.
 *
 * NOTE(review): this span appears to be a rendered diff hunk: the first declaration of
 * "reason" (failreason_t) is the removed variant, the second (failreason_s) the added one.
 *
 * @param urlHash the id passed to getFieldById; null yields null
 * @return the stored fail reason, or null if urlHash is null, no value was returned,
 *         or the returned value is the empty string
 * @throws IOException declared by this method; presumably raised by the connector lookup
 */
public String failReason(final String urlHash) throws IOException {
if (urlHash == null) return null;
String reason = (String) this.getDefaultConnector().getFieldById(urlHash, CollectionSchema.failreason_t.getSolrFieldName());
String reason = (String) this.getDefaultConnector().getFieldById(urlHash, CollectionSchema.failreason_s.getSolrFieldName());
if (reason == null) return null;
// an empty string is mapped to null as well
// NOTE(review): the "reason == null" test in the ternary is redundant after the guard above
return reason == null ? null : reason.length() == 0 ? null : reason;
}

@ -242,7 +242,7 @@ public class QueryGoal {
// add filter to prevent that results come from failed urls
q.append(" AND ").append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200");
//q.append(" AND -").append(YaCySchema.failreason_t.getSolrFieldName()).append(":[* TO *]");
//q.append(" AND -").append(YaCySchema.failreason_s.getSolrFieldName()).append(":[* TO *]");
return q;
}

@ -190,7 +190,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
final DigestURI digestURI = md.url();
boolean allAttr = this.isEmpty();
if (allAttr || contains(CollectionSchema.failreason_t)) add(doc, CollectionSchema.failreason_t, "");
if (allAttr || contains(CollectionSchema.failreason_s)) add(doc, CollectionSchema.failreason_s, "");
add(doc, CollectionSchema.id, ASCII.String(md.hash()));
String us = digestURI.toNormalform(true);
add(doc, CollectionSchema.sku, us);
@ -340,7 +340,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
add(doc, CollectionSchema.id, id);
if (allAttr || contains(CollectionSchema.failreason_t)) add(doc, CollectionSchema.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before)
if (allAttr || contains(CollectionSchema.failreason_s)) add(doc, CollectionSchema.failreason_s, ""); // overwrite a possible fail reason (in case that there was a fail reason before)
String docurl = digestURI.toNormalform(true);
add(doc, CollectionSchema.sku, docurl);
@ -587,14 +587,14 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (ie.alt() != null && ie.alt().length() > 0) withalt++;
}
if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, imgtags.size());
if (allAttr || contains(CollectionSchema.images_tag_txt)) add(doc, CollectionSchema.images_tag_txt, imgtags);
if (allAttr || contains(CollectionSchema.images_tag_sxt)) add(doc, CollectionSchema.images_tag_sxt, imgtags);
if (allAttr || contains(CollectionSchema.images_protocol_sxt)) add(doc, CollectionSchema.images_protocol_sxt, protocolList2indexedList(imgprots));
if (allAttr || contains(CollectionSchema.images_urlstub_txt)) add(doc, CollectionSchema.images_urlstub_txt, imgstubs);
if (allAttr || contains(CollectionSchema.images_urlstub_sxt)) add(doc, CollectionSchema.images_urlstub_sxt, imgstubs);
if (allAttr || contains(CollectionSchema.images_alt_txt)) add(doc, CollectionSchema.images_alt_txt, imgalts);
if (allAttr || contains(CollectionSchema.images_withalt_i)) add(doc, CollectionSchema.images_withalt_i, withalt);
// style sheets
if (allAttr || contains(CollectionSchema.css_tag_txt)) {
if (allAttr || contains(CollectionSchema.css_tag_sxt)) {
final Map<DigestURI, String> csss = html.getCSS();
final String[] css_tag = new String[csss.size()];
final String[] css_url = new String[csss.size()];
@ -610,12 +610,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
c++;
}
add(doc, CollectionSchema.csscount_i, css_tag.length);
if (css_tag.length > 0) add(doc, CollectionSchema.css_tag_txt, css_tag);
if (css_url.length > 0) add(doc, CollectionSchema.css_url_txt, css_url);
if (css_tag.length > 0) add(doc, CollectionSchema.css_tag_sxt, css_tag);
if (css_url.length > 0) add(doc, CollectionSchema.css_url_sxt, css_url);
}
// Scripts
if (allAttr || contains(CollectionSchema.scripts_txt)) {
if (allAttr || contains(CollectionSchema.scripts_sxt)) {
final Set<DigestURI> scriptss = html.getScript();
final String[] scripts = new String[scriptss.size()];
c = 0;
@ -625,7 +625,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
scripts[c++] = u.toNormalform(false);
}
add(doc, CollectionSchema.scriptscount_i, scripts.length);
if (scripts.length > 0) add(doc, CollectionSchema.scripts_txt, scripts);
if (scripts.length > 0) add(doc, CollectionSchema.scripts_sxt, scripts);
}
// Frames
@ -933,7 +933,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (contains(CollectionSchema.url_file_ext_s)) add(solrdoc, CollectionSchema.url_file_ext_s, digestURI.getFileExtension());
// fail reason and status
if (contains(CollectionSchema.failreason_t)) add(solrdoc, CollectionSchema.failreason_t, failReason);
if (contains(CollectionSchema.failreason_s)) add(solrdoc, CollectionSchema.failreason_s, failReason);
if (contains(CollectionSchema.failtype_s)) add(solrdoc, CollectionSchema.failtype_s, failType.name());
if (contains(CollectionSchema.httpstatus_i)) add(solrdoc, CollectionSchema.httpstatus_i, httpstatus);
return solrdoc;

@ -46,7 +46,7 @@ public enum CollectionSchema implements SchemaDeclaration {
fuzzy_signature_text_t(SolrType.text_general, true, true, false, false, false, "intermediate data produced in EnhancedTextProfileSignature: a list of word frequencies"),
fuzzy_signature_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if fuzzy_signature_l is unique at the time of document creation, used for double-check during search"),
size_i(SolrType.num_integer, true, true, false, false, false, "the size of the raw source"),// int size();
failreason_t(SolrType.text_general, true, true, false, false, false, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
failreason_s(SolrType.string, true, true, false, false, false, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
failtype_s(SolrType.string, true, true, false, false, false, "fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'"),
httpstatus_i(SolrType.num_integer, true, true, false, false, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
httpstatus_redirect_s(SolrType.num_integer, true, true, false, false, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
@ -98,10 +98,10 @@ public enum CollectionSchema implements SchemaDeclaration {
// optional values, not part of standard YaCy handling (but useful for external applications)
collection_sxt(SolrType.string, true, true, true, false, false, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"),
csscount_i(SolrType.num_integer, true, true, false, false, false, "number of entries in css_tag_txt and css_url_txt"),
css_tag_txt(SolrType.text_general, true, true, true, false, false, "full css tag with normalized url"),
css_url_txt(SolrType.text_general, true, true, true, false, false, "normalized urls within a css tag"),
scripts_txt(SolrType.text_general, true, true, true, false, false, "normalized urls within a scripts tag"),
scriptscount_i(SolrType.num_integer, true, true, false, false, false, "number of entries in scripts_txt"),
css_tag_sxt(SolrType.string, true, true, true, false, false, "full css tag with normalized url"),
css_url_sxt(SolrType.string, true, true, true, false, false, "normalized urls within a css tag"),
scripts_sxt(SolrType.string, true, true, true, false, false, "normalized urls within a scripts tag"),
scriptscount_i(SolrType.num_integer, true, true, false, false, false, "number of entries in scripts_sxt"),
// encoded as binary value into an integer:
// bit 0: "all" contained in html header meta
// bit 1: "index" contained in html header meta
@ -119,9 +119,9 @@ public enum CollectionSchema implements SchemaDeclaration {
outboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "external links, only the protocol"),
outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, false, false, "external links, the url only without the protocol"),
images_tag_txt(SolrType.text_general, true, true, true, false, true, " all image tags, encoded as <img> tag inclusive alt- and title property"),
images_urlstub_txt(SolrType.text_general, true, true, true, false, true, "all image links without the protocol and '://'"),
images_protocol_sxt(SolrType.text_general, true, true, true, false, false, "all image link protocols"),
images_tag_sxt(SolrType.string, true, true, true, false, true, " all image tags, encoded as <img> tag inclusive alt- and title property"),
images_urlstub_sxt(SolrType.string, true, true, true, false, true, "all image links without the protocol and '://'"),
images_protocol_sxt(SolrType.string, true, true, true, false, false, "all image link protocols"),
images_alt_txt(SolrType.text_general, true, true, true, false, true, "all image link alt tag"),
images_withalt_i(SolrType.num_integer, true, true, false, false, false, "number of image links with alt tag"),
htags_i(SolrType.num_integer, true, true, false, false, false, "binary pattern for the existance of h1..h6 headlines"),
@ -219,7 +219,7 @@ public enum CollectionSchema implements SchemaDeclaration {
this.omitNorms = omitNorms;
this.searchable = searchable;
this.comment = comment;
assert type.appropriateName(this.name(), this.multiValued) : "bad configuration: " + this.name();
assert type.appropriateName(this) : "bad configuration: " + this.name();
}
/**

@ -111,7 +111,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
this.omitNorms = omitNorms;
this.searchable = searchable;
this.comment = comment;
assert type.appropriateName(this.name(), this.multiValued) : "bad configuration: " + this.name();
assert type.appropriateName(this) : "bad configuration: " + this.name();
}
/**

Loading…
Cancel
Save