From 96ed0c980e19622897766cfc01b0f900815554ae Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 23 Sep 2013 18:09:42 +0200 Subject: [PATCH] - added hosthash to all documents (also fail documents which is needed there for deletion), this fixes a problem for the deletion of old documents for new crawl starts - added clickdepth and citation computation for fail documents --- source/net/yacy/search/index/Segment.java | 2 +- .../schema/CollectionConfiguration.java | 62 +++++++++++-------- 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index abf4fc67e..548b513af 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -619,7 +619,7 @@ public class Segment { char docType = Response.docType(document.dc_format()); // CREATE SOLR DOCUMENT - final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(id, collections, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration()); + final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration()); // ENRICH DOCUMENT WITH RANKING INFORMATION if (this.connectedCitation()) { diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index e36eda46a..b78400f05 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -201,20 +201,21 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri * add uri attributes to solr document * @param doc * @param allAttr - * @param digestURI + * @param digestURL * @param doctype * @return the normalized url */ - public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURL digestURI, final char doctype) { - add(doc, CollectionSchema.id, ASCII.String(digestURI.hash())); - String us = digestURI.toNormalform(true); + public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURL digestURL, final char doctype) { + add(doc, CollectionSchema.id, ASCII.String(digestURL.hash())); + if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, digestURL.hosthash()); + String us = digestURL.toNormalform(true); add(doc, CollectionSchema.sku, us); if (allAttr || contains(CollectionSchema.ip_s)) { - final InetAddress address = digestURI.getInetAddress(); + final InetAddress address = digestURL.getInetAddress(); if (address != null) add(doc, CollectionSchema.ip_s, address.getHostAddress()); } String host = null; - if ((host = digestURI.getHost()) != null) { + if ((host = digestURL.getHost()) != null) { String dnc = Domains.getDNC(host); String subdomOrga = host.length() - dnc.length() <= 0 ? "" : host.substring(0, host.length() - dnc.length() - 1); int p = subdomOrga.lastIndexOf('.'); @@ -228,17 +229,17 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } // path elements of link - String filename = digestURI.getFileName(); + String filename = digestURL.getFileName(); String extension = MultiProtocolURL.getFileExtension(filename); if (allAttr || contains(CollectionSchema.url_chars_i)) add(doc, CollectionSchema.url_chars_i, us.length()); - if (allAttr || contains(CollectionSchema.url_protocol_s)) add(doc, CollectionSchema.url_protocol_s, digestURI.getProtocol()); - if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, digestURI.getPaths()); + if (allAttr || contains(CollectionSchema.url_protocol_s)) add(doc, CollectionSchema.url_protocol_s, digestURL.getProtocol()); + if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, digestURL.getPaths()); if (allAttr || contains(CollectionSchema.url_file_name_s)) add(doc, CollectionSchema.url_file_name_s, filename.toLowerCase().endsWith("." + extension) ? filename.substring(0, filename.length() - extension.length() - 1) : filename); if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, extension); if (allAttr || contains(CollectionSchema.content_type)) add(doc, CollectionSchema.content_type, Response.doctype2mime(extension, doctype)); - Map searchpart = digestURI.getSearchpartMap(); + Map searchpart = digestURL.getSearchpartMap(); if (searchpart == null) { if (allAttr || contains(CollectionSchema.url_parameter_i)) add(doc, CollectionSchema.url_parameter_i, 0); } else { @@ -309,7 +310,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // fields that are in URIMetadataRow additional to yacy2solr basic requirement if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, md.loaddate()); if (allAttr || contains(CollectionSchema.fresh_date_dt)) add(doc, CollectionSchema.fresh_date_dt, md.freshdate()); - if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, md.hosthash()); if ((allAttr || contains(CollectionSchema.referrer_id_s)) && md.referrerHash() != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(md.referrerHash())); if (allAttr || contains(CollectionSchema.md5_s)) add(doc, CollectionSchema.md5_s, md.md5()); if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, md.dc_publisher()); @@ -357,27 +357,25 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } public SolrVector yacy2solr( - final String id, final Map collections, final ResponseHeader responseHeader, + final Map collections, final ResponseHeader responseHeader, final Document document, final Condenser condenser, final DigestURL referrerURL, final String language, final IndexCell citations, final WebgraphConfiguration webgraph) { // we use the SolrCell design as index schema SolrVector doc = new SolrVector(); - final DigestURL digestURI = document.dc_source(); + final DigestURL digestURL = document.dc_source(); + final String id = ASCII.String(digestURL.hash()); boolean allAttr = this.isEmpty(); - String url = addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI)); + String url = addURIAttributes(doc, allAttr, digestURL, Response.docType(digestURL)); Set processTypes = new LinkedHashSet(); - add(doc, CollectionSchema.id, id); - String us = digestURI.toNormalform(true); + String us = digestURL.toNormalform(true); int clickdepth = 999; if ((allAttr || contains(CollectionSchema.clickdepth_i)) && citations != null) { - if (digestURI.probablyRootURL()) { - boolean lc = this.lazy; this.lazy = false; + if (digestURL.probablyRootURL()) { clickdepth = 0; - this.lazy = lc; } else { clickdepth = 999; } @@ -712,7 +710,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (refresh != null && refresh.length() > 0) { MultiProtocolURL refreshURL; try { - refreshURL = refresh.startsWith("http") ? new MultiProtocolURL(html.getRefreshPath()) : new MultiProtocolURL(digestURI, html.getRefreshPath()); + refreshURL = refresh.startsWith("http") ? new MultiProtocolURL(html.getRefreshPath()) : new MultiProtocolURL(digestURL, html.getRefreshPath()); if (refreshURL != null) { inboundLinks.remove(refreshURL); outboundLinks.remove(refreshURL); @@ -785,7 +783,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } String content = document.getTextString(); - String tokens = digestURI.toTokens(); + String tokens = digestURL.toTokens(); if (content == null || content.length() == 0) { content = tokens; } else { @@ -798,9 +796,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } } - if ((allAttr || contains(CollectionSchema.images_text_t)) && MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(digestURI.getFileName()))) { + if ((allAttr || contains(CollectionSchema.images_text_t)) && MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(digestURL.getFileName()))) { add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data from the image parser - content = digestURI.toTokens(); // remove all other entry but the url tokens + content = digestURL.toTokens(); // remove all other entry but the url tokens } // content (must be written after special parser data, since this can influence the content) @@ -824,7 +822,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // create a subgraph if (!containsCanonical) { // a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document - webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, images, true, document.getAnchors(), citations); + webgraph.addEdges(subgraph, digestURL, responseHeader, collections, clickdepth, images, true, document.getAnchors(), citations); } // list all links @@ -850,7 +848,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri int size = (int) Math.max(document.dc_source().length(), responseHeader == null ? 0 : responseHeader.getContentLength()); if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, loadDate); if (allAttr || contains(CollectionSchema.fresh_date_dt)) add(doc, CollectionSchema.fresh_date_dt, new Date(loadDate.getTime() + Math.max(0, loadDate.getTime() - modDate.getTime()) / 2)); // freshdate, computed with Proxy-TTL formula - if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, document.dc_source().hosthash()); if ((allAttr || contains(CollectionSchema.referrer_id_s)) && referrerURL != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(referrerURL.hash())); //if (allAttr || contains(SolrField.md5_s)) add(solrdoc, SolrField.md5_s, new byte[0]); if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, document.dc_publisher()); @@ -1269,6 +1266,21 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } configuration.add(doc, CollectionSchema.collection_sxt, cs); } + + // clickdepth, cr and postprocessing + Set processTypes = new LinkedHashSet(); + if ((allAttr || configuration.contains(CollectionSchema.clickdepth_i))) { + processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut + CollectionSchema.clickdepth_i.add(doc, digestURL.probablyRootURL() ? 0 : 999); // no lazy value checking to get a '0' into the index + } + if (allAttr || (configuration.contains(CollectionSchema.cr_host_chance_d) && configuration.contains(CollectionSchema.cr_host_count_i) && configuration.contains(CollectionSchema.cr_host_norm_i))) { + processTypes.add(ProcessType.CITATION); // postprocessing needed + } + if (allAttr || configuration.contains(CollectionSchema.process_sxt)) { + List p = new ArrayList(); + for (ProcessType t: processTypes) p.add(t.name()); + configuration.add(doc, CollectionSchema.process_sxt, p); + } return doc; }