diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list
index 921e7c7b5..32888e9c2 100644
--- a/defaults/solr.keys.list
+++ b/defaults/solr.keys.list
@@ -50,9 +50,6 @@ fuzzy_signature_unique_b
 
 ## the size of the raw source (mandatory field)
 size_i
 
-## index creation comment (mandatory field)
-process_s
-
 ## fail reason if a page was not loaded. if the page was loaded then this field is empty, text (mandatory field)
 failreason_t
 
@@ -71,6 +68,10 @@ references_i
 
 ## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url
 clickdepth_i
+## needed (post-)processing steps on this metadata set
+process_sxt
+
+
 ### optional but highly recommended values, part of the index distribution process
 
 ## time when resource was loaded
diff --git a/htroot/HostBrowser.html b/htroot/HostBrowser.html
index 391446f21..0dc590003 100644
--- a/htroot/HostBrowser.html
+++ b/htroot/HostBrowser.html
@@ -128,7 +128,7 @@ function updatepage(str) {
         #[url]#  
         #(stored)#
         #(load)#link, detected from context::load & index#(/load)#::
-        indexed::
+        indexed#[comment]#::
         loading::
         #[error]#
         #(/stored)#
diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java
index a68a7dc52..edba7f5e0 100644
--- a/htroot/HostBrowser.java
+++ b/htroot/HostBrowser.java
@@ -47,6 +47,8 @@ import net.yacy.crawler.data.NoticedURL.StackType;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
+import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.index.RowHandleMap;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.peers.graphics.WebStructureGraph.StructureEntry;
 import net.yacy.search.Switchboard;
@@ -251,13 +253,15 @@ public class HostBrowser {
                     YaCySchema.inboundlinks_protocol_sxt.getSolrFieldName(),
                     YaCySchema.inboundlinks_urlstub_txt.getSolrFieldName(),
                     YaCySchema.outboundlinks_protocol_sxt.getSolrFieldName(),
-                    YaCySchema.outboundlinks_urlstub_txt.getSolrFieldName()
+                    YaCySchema.outboundlinks_urlstub_txt.getSolrFieldName(),
+                    YaCySchema.clickdepth_i.getSolrFieldName()
                     );
                 SolrDocument doc;
                 Set<String> storedDocs = new HashSet<String>();
                 Map<String, FailType> errorDocs = new HashMap<String, FailType>();
                 Set<String> inboundLinks = new HashSet<String>();
                 Map<String, ReversibleScoreMap<String>> outboundHosts = new HashMap<String, ReversibleScoreMap<String>>();
+                RowHandleMap clickdepth = new RowHandleMap(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1, 100, "clickdepth");
                 int hostsize = 0;
                 final List<byte[]> deleteIDs = new ArrayList<byte[]>();
                 long timeout = System.currentTimeMillis() + TIMEOUT;
@@ -265,6 +269,8 @@ public class HostBrowser {
                     String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
                     String errortype = (String) doc.getFieldValue(YaCySchema.failtype_s.getSolrFieldName());
                     FailType error = errortype == null ? null : FailType.valueOf(errortype);
+                    Integer cd = (Integer) doc.getFieldValue(YaCySchema.clickdepth_i.getSolrFieldName());
+                    if (cd != null) clickdepth.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName())), cd.intValue());
                     if (u.startsWith(path)) {
                         if (delete) {
                             deleteIDs.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName())));
@@ -399,6 +405,10 @@ public class HostBrowser {
                 boolean dc = type != StoreType.INDEX && !error && !loading && list.containsKey(entry.getKey() + "/");
                 if (!dc) {
                     prop.put("files_list_" + c + "_type_stored", type == StoreType.INDEX ? 1 : error ? 3 : loading ? 2 : 0 /*linked*/);
+                    if (type == StoreType.INDEX) {
+                        long cd = clickdepth.get(uri.hash());
+                        prop.put("files_list_" + c + "_type_stored_comment", cd >= 0 ? "clickdepth = " + cd : "");
+                    }
                     prop.put("files_list_" + c + "_type_stored_load", loadRight ? 1 : 0);
                     if (error) {
                         FailType failType = errorDocs.get(entry.getKey());
diff --git a/source/net/yacy/cora/federate/solr/ProcessType.java b/source/net/yacy/cora/federate/solr/ProcessType.java
new file mode 100644
index 000000000..29365708a
--- /dev/null
+++ b/source/net/yacy/cora/federate/solr/ProcessType.java
@@ -0,0 +1,31 @@
+/**
+ *  ProcessType
+ *  Copyright 2013 by Michael Peter Christen
+ *  First released 02.01.2013 at http://yacy.net
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program in the file lgpl21.txt
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+package net.yacy.cora.federate.solr;
+
+/**
+ * this enum class is used to define (post-)processing steps that are attached to the solr dataset in the field process_sxt
+ */
+public enum ProcessType {
+
+    CLICKDEPTH;
+
+}
diff --git a/source/net/yacy/cora/federate/solr/YaCySchema.java b/source/net/yacy/cora/federate/solr/YaCySchema.java
index 9691f51fb..69f5b1e99 100644
--- a/source/net/yacy/cora/federate/solr/YaCySchema.java
+++ b/source/net/yacy/cora/federate/solr/YaCySchema.java
@@ -42,14 +42,14 @@ public enum YaCySchema implements Schema {
     fuzzy_signature_text_t(SolrType.text_general, true, true, false, "intermediate data produced in EnhancedTextProfileSignature: a list of word frequencies"),
     fuzzy_signature_unique_b(SolrType.bool, true, true, false, "flag shows if fuzzy_signature_l is unique at the time of document creation, used for double-check during search"),
     size_i(SolrType.num_integer, true, true, false, "the size of the raw source"),// int size();
-    process_s(SolrType.string, true, true, false, "index creation comment"),
     failreason_t(SolrType.text_general, true, true, false, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
     failtype_s(SolrType.string, true, true, false, "fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'"),
     httpstatus_i(SolrType.num_integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
    httpstatus_redirect_s(SolrType.num_integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
\"200\" for ok), -1 if not loaded"), references_i(SolrType.num_integer, true, true, false, "number of unique http references; used for ranking"), clickdepth_i(SolrType.num_integer, true, true, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"), - + process_sxt(SolrType.string, true, true, true, "needed (post-)processing steps on this metadata set"), + // optional but recommended, part of index distribution load_date_dt(SolrType.date, true, true, false, "time when resource was loaded"), fresh_date_dt(SolrType.date, true, true, false, "date until resource shall be considered as fresh"), diff --git a/source/net/yacy/kelondro/data/meta/DigestURI.java b/source/net/yacy/kelondro/data/meta/DigestURI.java index ab124716e..a43674d29 100644 --- a/source/net/yacy/kelondro/data/meta/DigestURI.java +++ b/source/net/yacy/kelondro/data/meta/DigestURI.java @@ -32,6 +32,7 @@ import java.io.Serializable; import java.net.MalformedURLException; import java.util.HashSet; import java.util.Set; +import java.util.regex.Pattern; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; @@ -279,10 +280,18 @@ public class DigestURI extends MultiProtocolURI implements Serializable { private static final char rootURLFlag0 = subdomPortPath("", 80, ""); private static final char rootURLFlag1 = subdomPortPath("www", 80, ""); + private static final char rootURLFlag2 = subdomPortPath("", 21, ""); + private static final char rootURLFlag3 = subdomPortPath("ftp", 21, ""); + + public final Pattern rootPattern = Pattern.compile("/|/index.htm(l?)|/index.php"); + + public final boolean probablyRootURL() { + return this.path.length() == 0 || rootPattern.matcher(this.path).matches() || probablyRootURL(this.hash); + } public static final boolean probablyRootURL(final byte[] urlHash) { - final char c = (char) urlHash[5]; - return c == rootURLFlag0 || c == rootURLFlag1; + final char c = (char) urlHash[5]; + return c == rootURLFlag0 || c == rootURLFlag1 || c == rootURLFlag2 || c == rootURLFlag3; } private static final String hosthash5(final String protocol, final String host, final int port) { diff --git a/source/net/yacy/kelondro/util/ByteBuffer.java b/source/net/yacy/kelondro/util/ByteBuffer.java index dbe868061..e4c6d2681 100644 --- a/source/net/yacy/kelondro/util/ByteBuffer.java +++ b/source/net/yacy/kelondro/util/ByteBuffer.java @@ -226,6 +226,12 @@ public final class ByteBuffer extends OutputStream { return true; } + public static boolean equals(final byte[] b0, final int off0, final byte[] b1, final int off1, final int length) { + if (b0.length - off0 < length || b1.length - off1 < length) return false; + for (int i = 0; i < length; i++) if (b0[off0 + i] != b1[off1 + i]) return false; + return true; + } + public void writeTo(final OutputStream dest) throws IOException { dest.write(this.buffer, this.offset, this.length); dest.flush(); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index c7fb42215..57e5d76ae 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -363,7 +363,7 @@ public class Segment { char docType = Response.docType(document.dc_format()); // CREATE SOLR DOCUMENT - final SolrInputDocument solrInputDoc = this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, condenser, referrerURL, language); + final SolrInputDocument solrInputDoc = 
this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, condenser, referrerURL, language, urlCitationIndex); // FIND OUT IF THIS IS A DOUBLE DOCUMENT for (YaCySchema[] checkfields: new YaCySchema[][]{ diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index b68330659..099967c28 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ b/source/net/yacy/search/index/SolrConfiguration.java @@ -34,32 +34,39 @@ import java.util.Collection; import java.util.Date; import java.util.HashMap; import java.util.Iterator; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Set; -import java.util.regex.Pattern; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.federate.solr.FailType; +import net.yacy.cora.federate.solr.ProcessType; import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.federate.yacy.ConfigurationSet; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.util.CommonPattern; +import net.yacy.cora.util.SpaceExceededException; import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.retrieval.Response; import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; +import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.rwi.IndexCell; +import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.util.Bitfield; +import net.yacy.kelondro.util.ByteBuffer; import org.apache.solr.common.SolrInputDocument; @@ -306,23 +313,40 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable text = text.trim(); if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.'); } - - private final Pattern rootPattern = Pattern.compile("/|/index.htm(l?)|/index.php"); - protected SolrInputDocument yacy2solr(final String id, final CrawlProfile profile, final ResponseHeader responseHeader, final Document document, Condenser condenser, DigestURI referrerURL, String language) { + protected SolrInputDocument yacy2solr( + final String id, final CrawlProfile profile, final ResponseHeader responseHeader, + final Document document, Condenser condenser, DigestURI referrerURL, String language, + IndexCell citations) { // we use the SolrCell design as index scheme final SolrInputDocument doc = new SolrInputDocument(); final DigestURI digestURI = DigestURI.toDigestURI(document.dc_source()); boolean allAttr = this.isEmpty(); + + Set processTypes = new LinkedHashSet(); + add(doc, YaCySchema.id, id); if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before) String docurl = digestURI.toNormalform(true); add(doc, YaCySchema.sku, docurl); if (allAttr || contains(YaCySchema.clickdepth_i)) { - String path = digestURI.getPath(); - boolean fronturl = path.length() == 0 || rootPattern.matcher(path).matches(); - add(doc, YaCySchema.clickdepth_i, 
+            boolean fronturl = digestURI.probablyRootURL();
+            if (fronturl) {
+                add(doc, YaCySchema.clickdepth_i, 0);
+            } else {
+                // search the citations for references
+                int clickdepth = -1;
+                try {
+                    clickdepth = getClickDepth(citations, digestURI.hash());
+                } catch (IOException e) {
+                    add(doc, YaCySchema.clickdepth_i, -1);
+                }
+                add(doc, YaCySchema.clickdepth_i, clickdepth);
+                if (clickdepth < 0 || clickdepth > 1) {
+                    processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
+                }
+            }
         }
 
         if (allAttr || contains(YaCySchema.ip_s)) {
@@ -800,10 +824,71 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
             Set<String> facetValues = facet.getValue();
             doc.setField(YaCySchema.VOCABULARY_PREFIX + facetName + YaCySchema.VOCABULARY_SUFFIX, facetValues.toArray(new String[facetValues.size()]));
         }
-
+
+        if (allAttr || contains(YaCySchema.process_sxt)) {
+            List<String> p = new ArrayList<String>();
+            for (ProcessType t: processTypes) p.add(t.name());
+            add(doc, YaCySchema.process_sxt, p);
+        }
         return doc;
     }
 
+    /**
+     * compute the click level using the citation reference database
+     * @param citations the citation database
+     * @param searchhash the hash of the url to be checked
+     * @return the clickdepth level or -1 if the root url cannot be found or a recursion limit is reached
+     * @throws IOException
+     */
+    private int getClickDepth(final IndexCell<CitationReference> citations, byte[] searchhash) throws IOException {
+
+        RowHandleSet ignore = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent endless loops
+        RowHandleSet levelhashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
+        try {levelhashes.put(searchhash);} catch (SpaceExceededException e) {throw new IOException(e);}
+        int leveldepth = 0; // the recursion depth and therefore the result depth-1. Shall be 0 for the first call
+        final byte[] hosthash = new byte[6]; // the host of the url to be checked
+        System.arraycopy(searchhash, 6, hosthash, 0, 6);
+
+        long timeout = System.currentTimeMillis() + 10000;
+        for (int maxdepth = 0; maxdepth < 10 && System.currentTimeMillis() < timeout; maxdepth++) {
+
+            RowHandleSet checknext = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100);
+
+            // loop over all hashes at this clickdepth; the first call to this loop should contain only one hash and a leveldepth = 0
+            checkloop: for (byte[] urlhash: levelhashes) {
+
+                // get all the citations for this url and iterate
+                ReferenceContainer<CitationReference> references = citations.get(urlhash, null);
+                if (references == null || references.size() == 0) continue checkloop; // don't know
+                Iterator<CitationReference> i = references.entries();
+                nextloop: while (i.hasNext()) {
+                    CitationReference ref = i.next();
+                    if (ref == null) continue nextloop;
+                    byte[] u = ref.urlhash();
+
+                    // check ignore
+                    if (ignore.has(u)) continue nextloop;
+
+                    // check if this is from the same host
+                    if (!ByteBuffer.equals(u, 6, hosthash, 0, 6)) continue nextloop;
+
+                    // check if the url is a root url
+                    if (DigestURI.probablyRootURL(u)) {
+                        return leveldepth + 1;
+                    }
+
+                    // step to next depth level
+                    try {checknext.put(u);} catch (SpaceExceededException e) {}
+                    try {ignore.put(u);} catch (SpaceExceededException e) {}
+                }
+            }
+            leveldepth++;
+            levelhashes = checknext;
+
+        }
+        return -1;
+    }
+
     /**
     * this method compresses a list of protocol names to an indexed list.
     * To do this, all 'http' entries are removed and considered as default.
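One way to read the new getClickDepth() method above: it is a breadth-first search over the per-host citation (reverse link) graph. Starting from the hash of the target URL, it walks backwards through the URLs that link to it, staying on the same host, until it reaches a URL that looks like a root page; the number of expanded levels is the click depth. The standalone sketch below reproduces that idea with plain Java collections instead of YaCy's RowHandleSet/IndexCell; the citations map, the root set and all values are invented for illustration and are not YaCy API.

import java.util.ArrayDeque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;

/** Illustrative sketch of the click-depth BFS; names and data are hypothetical. */
public class ClickDepthSketch {

    /** citations.get(url) returns the URLs (same host) that link to the given url. */
    static int clickDepth(Map<String, List<String>> citations, Set<String> rootUrls, String start) {
        if (rootUrls.contains(start)) return 0;          // the start page itself is a root page
        Set<String> seen = new HashSet<>();              // prevents endless loops on link cycles
        Queue<String> level = new ArrayDeque<>();        // all pages at the current depth
        level.add(start);
        for (int depth = 0; depth < 10; depth++) {       // iteration limit, as in the patch
            Queue<String> next = new ArrayDeque<>();
            for (String url : level) {
                for (String referrer : citations.getOrDefault(url, List.of())) {
                    if (!seen.add(referrer)) continue;   // already visited
                    if (rootUrls.contains(referrer)) return depth + 1; // reached a root page
                    next.add(referrer);
                }
            }
            level = next;
        }
        return -1; // root not reachable within the limit; the patch then schedules postprocessing
    }

    public static void main(String[] args) {
        Map<String, List<String>> citations = new HashMap<>();
        citations.put("/a/b.html", List.of("/a/index.html"));
        citations.put("/a/index.html", List.of("/index.html"));
        Set<String> roots = Set.of("/", "/index.html");
        System.out.println(clickDepth(citations, roots, "/a/b.html")); // prints 2
    }
}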
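The same-host filter inside getClickDepth() relies on the new ByteBuffer.equals(byte[], int, byte[], int, int) helper and, as the System.arraycopy call in the patch suggests, on bytes 6..11 of the 12-byte URL hash carrying the host part. A small self-contained demo of that comparison, with invented hash values and the helper logic copied from the patch rather than imported from YaCy:

import java.nio.charset.StandardCharsets;

/** Hypothetical demo of the byte-range comparison used for the same-host check. */
public class HostHashSketch {

    // same logic as the new ByteBuffer.equals(byte[], int, byte[], int, int) in the patch
    static boolean equals(byte[] b0, int off0, byte[] b1, int off1, int length) {
        if (b0.length - off0 < length || b1.length - off1 < length) return false;
        for (int i = 0; i < length; i++) if (b0[off0 + i] != b1[off1 + i]) return false;
        return true;
    }

    public static void main(String[] args) {
        // two fictional 12-byte url hashes; bytes 6..11 are assumed to be the host hash
        byte[] hashA = "AAAAAFhostXY".getBytes(StandardCharsets.US_ASCII);
        byte[] hashB = "BBBBBGhostXY".getBytes(StandardCharsets.US_ASCII);
        byte[] hosthash = new byte[6];
        System.arraycopy(hashA, 6, hosthash, 0, 6);
        // prints true: both hashes share the host part, so the citation would be followed
        System.out.println(equals(hashB, 6, hosthash, 0, 6));
    }
}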