From efd2c4622d85c7ca8c060af1e4e3c1ce0c2b2690 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 23 Nov 2012 14:00:30 +0100 Subject: [PATCH] added a new fail type attribute for the index to distinguish two separate fail types: network fail and forced exclusion (i.e. by robots or forwarding rules). --- defaults/solr.keys.list | 4 ++- .../net/yacy/cora/federate/solr/FailType.java | 28 +++++++++++++++++++ .../yacy/cora/federate/solr/YaCySchema.java | 5 ++-- source/net/yacy/crawler/data/ZURL.java | 17 ++++++----- .../yacy/search/index/SolrConfiguration.java | 4 ++- 5 files changed, 47 insertions(+), 11 deletions(-) create mode 100644 source/net/yacy/cora/federate/solr/FailType.java diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index fe796e31c..2da0aa29e 100644 --- a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -56,13 +56,15 @@ process_s ## fail reason if a page was not loaded. if the page was loaded then this field is empty, text (mandatory field) failreason_t +## fail type if a page was not loaded. This field is either empty, 'excl' or 'fail' +failtype_s + ## html status return code (i.e. 
"200" for ok), -1 if not loaded (see content of failreason_t for this case), int (mandatory field) httpstatus_i ## redirect url if the error code is 299 < httpstatus_i < 310 #httpstatus_redirect_s - ### optional but highly recommended values, part of the index distribution process ## time when resource was loaded diff --git a/source/net/yacy/cora/federate/solr/FailType.java b/source/net/yacy/cora/federate/solr/FailType.java new file mode 100644 index 000000000..59a57e7f8 --- /dev/null +++ b/source/net/yacy/cora/federate/solr/FailType.java @@ -0,0 +1,28 @@ +/** + * FailType + * Copyright 2012 by Michael Peter Christen + * First released 23.11.2012 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see <http://www.gnu.org/licenses/>. 
+ */ + +package net.yacy.cora.federate.solr; + +public enum FailType { + + fail, // failed because of network failure + excl; // failed because content had to be excluded + +} diff --git a/source/net/yacy/cora/federate/solr/YaCySchema.java b/source/net/yacy/cora/federate/solr/YaCySchema.java index 32e90fec8..078e15355 100644 --- a/source/net/yacy/cora/federate/solr/YaCySchema.java +++ b/source/net/yacy/cora/federate/solr/YaCySchema.java @@ -27,7 +27,7 @@ import java.util.List; import org.apache.solr.common.SolrInputDocument; public enum YaCySchema implements Schema { - + // mandatory id(SolrType.string, true, true, false, "primary key of document, the URL hash **mandatory field**"), sku(SolrType.text_en_splitting_tight, true, true, false, true, "url of document"), @@ -44,6 +44,7 @@ public enum YaCySchema implements Schema { size_i(SolrType.num_integer, true, true, false, "the size of the raw source"),// int size(); process_s(SolrType.string, true, true, false, "index creation comment"), failreason_t(SolrType.text_general, true, true, false, "fail reason if a page was not loaded. if the page was loaded then this field is empty"), + failtype_s(SolrType.string, true, true, false, "fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'"), httpstatus_i(SolrType.num_integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"), httpstatus_redirect_s(SolrType.num_integer, true, true, false, "html status return code (i.e. 
\"200\" for ok), -1 if not loaded"), @@ -192,7 +193,7 @@ public enum YaCySchema implements Schema { ext_tracker_val(SolrType.num_integer, true, true, true, "number of attribute counts in ext_tracker_txt"), ext_title_txt(SolrType.text_general, true, true, true, "names matching title expressions"), ext_title_val(SolrType.num_integer, true, true, true, "number of matching title expressions"); - + private String solrFieldName = null; // solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() ) private final SolrType type; private final boolean indexed, stored; diff --git a/source/net/yacy/crawler/data/ZURL.java b/source/net/yacy/crawler/data/ZURL.java index 771caab8c..134c71c0d 100644 --- a/source/net/yacy/crawler/data/ZURL.java +++ b/source/net/yacy/crawler/data/ZURL.java @@ -38,6 +38,7 @@ import java.util.concurrent.LinkedBlockingQueue; import org.apache.solr.common.SolrInputDocument; import net.yacy.cora.document.UTF8; +import net.yacy.cora.federate.solr.FailType; import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.NaturalOrder; @@ -63,16 +64,18 @@ public class ZURL implements Iterable { public enum FailCategory { // TEMPORARY categories are such failure cases that should be tried again // FINAL categories are such failure cases that are final and should not be tried again - TEMPORARY_NETWORK_FAILURE(true), // an entity could not been loaded - FINAL_PROCESS_CONTEXT(false), // because of a processing context we do not want that url again (i.e. 
remote crawling) - FINAL_LOAD_CONTEXT(false), // the crawler configuration does not want to load the entity - FINAL_ROBOTS_RULE(true), // a remote server denies indexing or loading - FINAL_REDIRECT_RULE(true); // the remote server redirects this page, thus disallowing reading of content + TEMPORARY_NETWORK_FAILURE(true, FailType.fail), // an entity could not be loaded + FINAL_PROCESS_CONTEXT(false, FailType.excl), // because of a processing context we do not want that url again (i.e. remote crawling) + FINAL_LOAD_CONTEXT(false, FailType.excl), // the crawler configuration does not want to load the entity + FINAL_ROBOTS_RULE(true, FailType.excl), // a remote server denies indexing or loading + FINAL_REDIRECT_RULE(true, FailType.excl); // the remote server redirects this page, thus disallowing reading of content public final boolean store; + public final FailType failType; - private FailCategory(boolean store) { + private FailCategory(boolean store, FailType failType) { this.store = store; + this.failType = failType; } } @@ -180,7 +183,7 @@ public class ZURL implements Iterable { if (this.solrConnector != null && failCategory.store) { // send the error to solr try { - SolrInputDocument errorDoc = this.solrConfiguration.err(bentry.url(), failCategory.name() + " " + reason, httpcode); + SolrInputDocument errorDoc = this.solrConfiguration.err(bentry.url(), failCategory.name() + " " + reason, failCategory.failType, httpcode); this.solrConnector.add(errorDoc); } catch (final IOException e) { Log.logWarning("SOLR", "failed to send error " + bentry.url().toNormalform(true) + " to solr: " + e.getMessage()); diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index 7c0a9154e..7ce8d31dd 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ b/source/net/yacy/search/index/SolrConfiguration.java @@ -42,6 +42,7 @@ import java.util.Set; import net.yacy.cora.document.ASCII; import 
net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; +import net.yacy.cora.federate.solr.FailType; import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.federate.yacy.ConfigurationSet; import net.yacy.cora.protocol.Domains; @@ -822,7 +823,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable * @param httpstatus * @throws IOException */ - public SolrInputDocument err(final DigestURI digestURI, final String failReason, final int httpstatus) throws IOException { + public SolrInputDocument err(final DigestURI digestURI, final String failReason, final FailType failType, final int httpstatus) throws IOException { final SolrInputDocument solrdoc = new SolrInputDocument(); add(solrdoc, YaCySchema.id, ASCII.String(digestURI.hash())); add(solrdoc, YaCySchema.sku, digestURI.toNormalform(true)); @@ -836,6 +837,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable // fail reason and status if (contains(YaCySchema.failreason_t)) add(solrdoc, YaCySchema.failreason_t, failReason); + if (contains(YaCySchema.failtype_s)) add(solrdoc, YaCySchema.failtype_s, failType.name()); if (contains(YaCySchema.httpstatus_i)) add(solrdoc, YaCySchema.httpstatus_i, httpstatus); return solrdoc; }