added a new fail type attribute to the index to distinguish two

separate fail types: network failure and forced exclusion (e.g. by robots
or forwarding rules).
pull/1/head
Michael Peter Christen 12 years ago
parent 5e182a566f
commit efd2c4622d

@ -56,13 +56,15 @@ process_s
## fail reason if a page was not loaded. if the page was loaded then this field is empty, text (mandatory field) ## fail reason if a page was not loaded. if the page was loaded then this field is empty, text (mandatory field)
failreason_t failreason_t
## fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'
failtype_s
## html status return code (i.e. "200" for ok), -1 if not loaded (see content of failreason_t for this case), int (mandatory field) ## html status return code (i.e. "200" for ok), -1 if not loaded (see content of failreason_t for this case), int (mandatory field)
httpstatus_i httpstatus_i
## redirect url if the error code is 299 < httpstatus_i < 310 ## redirect url if the error code is 299 < httpstatus_i < 310
#httpstatus_redirect_s #httpstatus_redirect_s
### optional but highly recommended values, part of the index distribution process ### optional but highly recommended values, part of the index distribution process
## time when resource was loaded ## time when resource was loaded

@ -0,0 +1,28 @@
/**
* FailType
* Copyright 2012 by Michael Peter Christen
* First released 23.11.2012 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate.solr;
/**
 * Coarse classification of why a page could not be loaded.
 *
 * <p>The constant names are written verbatim (via {@code name()}) into the
 * {@code failtype_s} index field, so they are intentionally lower-case and
 * must not be renamed.
 */
public enum FailType {

    /** The page failed because of a network failure. */
    fail,

    /** The page failed because its content had to be excluded. */
    excl
}

@ -44,6 +44,7 @@ public enum YaCySchema implements Schema {
size_i(SolrType.num_integer, true, true, false, "the size of the raw source"),// int size(); size_i(SolrType.num_integer, true, true, false, "the size of the raw source"),// int size();
process_s(SolrType.string, true, true, false, "index creation comment"), process_s(SolrType.string, true, true, false, "index creation comment"),
failreason_t(SolrType.text_general, true, true, false, "fail reason if a page was not loaded. if the page was loaded then this field is empty"), failreason_t(SolrType.text_general, true, true, false, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
failtype_s(SolrType.string, true, true, false, "fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'"),
httpstatus_i(SolrType.num_integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"), httpstatus_i(SolrType.num_integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
httpstatus_redirect_s(SolrType.num_integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"), httpstatus_redirect_s(SolrType.num_integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),

@ -38,6 +38,7 @@ import java.util.concurrent.LinkedBlockingQueue;
import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.UTF8; import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.order.NaturalOrder;
@ -63,16 +64,18 @@ public class ZURL implements Iterable<ZURL.Entry> {
public enum FailCategory { public enum FailCategory {
// TEMPORARY categories are such failure cases that should be tried again // TEMPORARY categories are such failure cases that should be tried again
// FINAL categories are such failure cases that are final and should not be tried again // FINAL categories are such failure cases that are final and should not be tried again
TEMPORARY_NETWORK_FAILURE(true), // an entity could not been loaded TEMPORARY_NETWORK_FAILURE(true, FailType.fail), // an entity could not been loaded
FINAL_PROCESS_CONTEXT(false), // because of a processing context we do not want that url again (i.e. remote crawling) FINAL_PROCESS_CONTEXT(false, FailType.excl), // because of a processing context we do not want that url again (i.e. remote crawling)
FINAL_LOAD_CONTEXT(false), // the crawler configuration does not want to load the entity FINAL_LOAD_CONTEXT(false, FailType.excl), // the crawler configuration does not want to load the entity
FINAL_ROBOTS_RULE(true), // a remote server denies indexing or loading FINAL_ROBOTS_RULE(true, FailType.excl), // a remote server denies indexing or loading
FINAL_REDIRECT_RULE(true); // the remote server redirects this page, thus disallowing reading of content FINAL_REDIRECT_RULE(true, FailType.excl); // the remote server redirects this page, thus disallowing reading of content
public final boolean store; public final boolean store;
public final FailType failType;
private FailCategory(boolean store) { private FailCategory(boolean store, FailType failType) {
this.store = store; this.store = store;
this.failType = failType;
} }
} }
@ -180,7 +183,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
if (this.solrConnector != null && failCategory.store) { if (this.solrConnector != null && failCategory.store) {
// send the error to solr // send the error to solr
try { try {
SolrInputDocument errorDoc = this.solrConfiguration.err(bentry.url(), failCategory.name() + " " + reason, httpcode); SolrInputDocument errorDoc = this.solrConfiguration.err(bentry.url(), failCategory.name() + " " + reason, failCategory.failType, httpcode);
this.solrConnector.add(errorDoc); this.solrConnector.add(errorDoc);
} catch (final IOException e) { } catch (final IOException e) {
Log.logWarning("SOLR", "failed to send error " + bentry.url().toNormalform(true) + " to solr: " + e.getMessage()); Log.logWarning("SOLR", "failed to send error " + bentry.url().toNormalform(true) + " to solr: " + e.getMessage());

@ -42,6 +42,7 @@ import java.util.Set;
import net.yacy.cora.document.ASCII; import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8; import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.federate.yacy.ConfigurationSet; import net.yacy.cora.federate.yacy.ConfigurationSet;
import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.Domains;
@ -822,7 +823,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
* @param httpstatus * @param httpstatus
* @throws IOException * @throws IOException
*/ */
public SolrInputDocument err(final DigestURI digestURI, final String failReason, final int httpstatus) throws IOException { public SolrInputDocument err(final DigestURI digestURI, final String failReason, final FailType failType, final int httpstatus) throws IOException {
final SolrInputDocument solrdoc = new SolrInputDocument(); final SolrInputDocument solrdoc = new SolrInputDocument();
add(solrdoc, YaCySchema.id, ASCII.String(digestURI.hash())); add(solrdoc, YaCySchema.id, ASCII.String(digestURI.hash()));
add(solrdoc, YaCySchema.sku, digestURI.toNormalform(true)); add(solrdoc, YaCySchema.sku, digestURI.toNormalform(true));
@ -836,6 +837,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
// fail reason and status // fail reason and status
if (contains(YaCySchema.failreason_t)) add(solrdoc, YaCySchema.failreason_t, failReason); if (contains(YaCySchema.failreason_t)) add(solrdoc, YaCySchema.failreason_t, failReason);
if (contains(YaCySchema.failtype_s)) add(solrdoc, YaCySchema.failtype_s, failType.name());
if (contains(YaCySchema.httpstatus_i)) add(solrdoc, YaCySchema.httpstatus_i, httpstatus); if (contains(YaCySchema.httpstatus_i)) add(solrdoc, YaCySchema.httpstatus_i, httpstatus);
return solrdoc; return solrdoc;
} }

Loading…
Cancel
Save