From d6b82840f80c016e16be7ee3ad3f062e4fae1238 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 21 Nov 2012 18:46:49 +0100 Subject: [PATCH] added a feature to find similarities in documents. This uses an enhanced version of the Nutch/Solr TextProfileSignature. As a result, a signature of the document is written to the Solr search index. Additionally, each time a signature is written, it is checked whether the signature already exists in the index. If the signature does not exist, the document is marked as unique. The unique attribute can now be used to sort document lists and bring duplicates to the end of a result list. To enable this, a large portion of the search API to Solr had to be changed. This mainly affected the caching of 'exists' searches, so that the check for existing signatures can be done without actually running a Solr query. Because this is the first time a long number is used as a value in the Solr store, the value naming in the YaCySchema also had to be adapted and normalized. As a consequence, many files had to be changed. 
--- debian/changelog | 2 +- defaults/solr.keys.list | 15 ++ htroot/PerformanceMemory_p.java | 14 +- htroot/Ranking_p.java | 2 +- htroot/index.java | 4 +- htroot/yacy/search.java | 4 +- htroot/yacysearch.java | 4 +- htroot/yacysearchitem.java | 4 +- .../yacy/cora/document/MultiProtocolURI.java | 3 +- .../{ => analysis}/Classification.java | 6 +- .../EnhancedTextProfileSignature.java | 173 ++++++++++++++++ .../net/yacy/cora/federate/solr/SolrType.java | 22 +-- .../yacy/cora/federate/solr/YaCySchema.java | 120 +++++------ .../solr/connector/AbstractSolrConnector.java | 15 +- .../solr/connector/MirrorSolrConnector.java | 186 +++++++++++------- .../solr/connector/MultipleSolrConnector.java | 8 +- .../solr/connector/RetrySolrConnector.java | 14 +- .../solr/connector/ShardSolrConnector.java | 16 +- .../solr/connector/SolrConnector.java | 13 +- .../solr/connector/SolrServerConnector.java | 14 +- .../EnhancedXMLResponseWriter.java | 8 +- source/net/yacy/crawler/CrawlStacker.java | 2 +- .../yacy/crawler/retrieval/FileLoader.java | 2 +- .../net/yacy/crawler/retrieval/Response.java | 2 +- .../net/yacy/crawler/retrieval/SMBLoader.java | 2 +- source/net/yacy/document/Condenser.java | 66 +++++-- source/net/yacy/document/Document.java | 2 +- .../kelondro/data/meta/URIMetadataNode.java | 4 +- source/net/yacy/search/Switchboard.java | 4 +- source/net/yacy/search/index/Fulltext.java | 15 +- source/net/yacy/search/index/Segment.java | 24 ++- .../yacy/search/index/SolrConfiguration.java | 12 +- source/net/yacy/search/query/QueryGoal.java | 2 - source/net/yacy/search/query/QueryParams.java | 4 +- .../net/yacy/search/query/RankingProcess.java | 2 +- source/net/yacy/search/query/SearchEvent.java | 4 +- .../net/yacy/search/query/SnippetWorker.java | 2 +- .../yacy/search/ranking/RankingProfile.java | 4 +- .../net/yacy/search/snippet/MediaSnippet.java | 4 +- .../yacy/server/http/HTTPDFileHandler.java | 2 +- source/net/yacy/server/serverObjects.java | 4 +- 41 files changed, 558 insertions(+), 
252 deletions(-) rename source/net/yacy/cora/document/{ => analysis}/Classification.java (98%) create mode 100644 source/net/yacy/cora/document/analysis/EnhancedTextProfileSignature.java diff --git a/debian/changelog b/debian/changelog index 717f9f6ca..a149b3330 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,4 +1,4 @@ -yacy (*auto-git-version*) unstable; urgency=low +yacy (1.2.9018) unstable; urgency=low * SVN Update diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index 8f2e44b55..fe796e31c 100644 --- a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -32,6 +32,21 @@ host_id_s ## the md5 of the raw source (mandatory field) md5_s +## the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of text_t +exact_signature_l + +## flag shows if exact_signature_l is unique at the time of document creation, used for double-check during search +exact_signature_unique_b + +## 64 bit of the Lookup3Signature from EnhancedTextProfileSignature of text_t +fuzzy_signature_l + +## intermediate data produced in EnhancedTextProfileSignature: a list of word frequencies +#fuzzy_signature_text_t + +## flag shows if fuzzy_signature_l is unique at the time of document creation, used for double-check during search +fuzzy_signature_unique_b + ## the size of the raw source (mandatory field) size_i diff --git a/htroot/PerformanceMemory_p.java b/htroot/PerformanceMemory_p.java index 069c5a791..3b428d9af 100644 --- a/htroot/PerformanceMemory_p.java +++ b/htroot/PerformanceMemory_p.java @@ -30,6 +30,7 @@ import java.util.ConcurrentModificationException; import java.util.Iterator; import java.util.Map; +import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.federate.solr.connector.MirrorSolrConnector; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.RequestHeader; @@ -204,15 +205,16 @@ public class PerformanceMemory_p { // other caching structures final MirrorSolrConnector solr = (MirrorSolrConnector) 
Switchboard.getSwitchboard().index.fulltext().getSolr(); + final MirrorSolrConnector.HitMissCache hitMissCache = solr.getCache(YaCySchema.id.getSolrFieldName()); prop.putNum("solrcacheHit.size", solr.nameCacheHitSize()); - prop.putNum("solrcacheHit.Hit", solr.hitCache_Hit); - prop.putNum("solrcacheHit.Miss", solr.hitCache_Miss); - prop.putNum("solrcacheHit.Insert", solr.hitCache_Insert); + prop.putNum("solrcacheHit.Hit", hitMissCache.hitCache_Hit); + prop.putNum("solrcacheHit.Miss", hitMissCache.hitCache_Miss); + prop.putNum("solrcacheHit.Insert", hitMissCache.hitCache_Insert); prop.putNum("solrcacheMiss.size", solr.nameCacheMissSize()); - prop.putNum("solrcacheMiss.Hit", solr.missCache_Hit); - prop.putNum("solrcacheMiss.Miss", solr.missCache_Miss); - prop.putNum("solrcacheMiss.Insert", solr.missCache_Insert); + prop.putNum("solrcacheMiss.Hit", hitMissCache.missCache_Hit); + prop.putNum("solrcacheMiss.Miss", hitMissCache.missCache_Miss); + prop.putNum("solrcacheMiss.Insert", hitMissCache.missCache_Insert); prop.putNum("solrcacheDocument.size", solr.nameCacheDocumentSize()); prop.putNum("solrcacheDocument.Hit", solr.documentCache_Hit); diff --git a/htroot/Ranking_p.java b/htroot/Ranking_p.java index 4e02e4003..6a549a837 100644 --- a/htroot/Ranking_p.java +++ b/htroot/Ranking_p.java @@ -29,7 +29,7 @@ import java.util.LinkedHashMap; import java.util.Map; import java.util.Map.Entry; -import net.yacy.cora.document.Classification; +import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.search.Switchboard; import net.yacy.search.query.SearchEventCache; diff --git a/htroot/index.java b/htroot/index.java index 8304a12e9..845b8e78c 100644 --- a/htroot/index.java +++ b/htroot/index.java @@ -29,8 +29,8 @@ // if the shell's current path is HTROOT -import net.yacy.cora.document.Classification; -import net.yacy.cora.document.Classification.ContentDomain; +import net.yacy.cora.document.analysis.Classification; +import 
net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.protocol.RequestHeader; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 2ae486344..5f0b8d625 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -37,8 +37,8 @@ import java.util.TreeMap; import java.util.TreeSet; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.Classification; -import net.yacy.cora.document.Classification.ContentDomain; +import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.RSSMessage; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.protocol.Domains; diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index cd4f8cf54..41d64cefa 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -41,8 +41,8 @@ import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.Classification; -import net.yacy.cora.document.Classification.ContentDomain; +import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.RSSMessage; import net.yacy.cora.document.UTF8; import net.yacy.cora.federate.yacy.CacheStrategy; diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 279d6b63c..69729b5f6 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -29,8 +29,8 @@ import java.util.List; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.Classification; -import net.yacy.cora.document.Classification.ContentDomain; +import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.document.analysis.Classification.ContentDomain; import 
net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index 5d9775664..423ca0618 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -46,8 +46,9 @@ import java.util.regex.Pattern; import jcifs.smb.SmbException; import jcifs.smb.SmbFile; import jcifs.smb.SmbFileInputStream; -import net.yacy.cora.document.Classification.ContentDomain; import net.yacy.cora.document.Punycode.PunycodeException; +import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.TimeoutRequest; import net.yacy.cora.protocol.ftp.FTPClient; diff --git a/source/net/yacy/cora/document/Classification.java b/source/net/yacy/cora/document/analysis/Classification.java similarity index 98% rename from source/net/yacy/cora/document/Classification.java rename to source/net/yacy/cora/document/analysis/Classification.java index fbf326236..59ebde97a 100644 --- a/source/net/yacy/cora/document/Classification.java +++ b/source/net/yacy/cora/document/analysis/Classification.java @@ -18,7 +18,7 @@ * If not, see . 
*/ -package net.yacy.cora.document; +package net.yacy.cora.document.analysis; import java.io.BufferedInputStream; import java.io.File; @@ -28,6 +28,8 @@ import java.util.Map.Entry; import java.util.Properties; import java.util.Set; +import net.yacy.cora.document.MultiProtocolURI; + public class Classification { private static final Set textExtSet = new HashSet(); @@ -140,7 +142,7 @@ public class Classification { return ctrlExtSet.contains(ctrlExt.trim().toLowerCase()); } - protected static ContentDomain getContentDomain(final String ext) { + public static ContentDomain getContentDomain(final String ext) { if (isTextExtension(ext)) return ContentDomain.TEXT; if (isImageExtension(ext)) return ContentDomain.IMAGE; if (isAudioExtension(ext)) return ContentDomain.AUDIO; diff --git a/source/net/yacy/cora/document/analysis/EnhancedTextProfileSignature.java b/source/net/yacy/cora/document/analysis/EnhancedTextProfileSignature.java new file mode 100644 index 000000000..8c82e129f --- /dev/null +++ b/source/net/yacy/cora/document/analysis/EnhancedTextProfileSignature.java @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package net.yacy.cora.document.analysis; + +/* + * THIS CODE WAS COPIED FROM org.apache.solr.update.processor.TextProfileSignature + * - to get access to the 'newText' variable content which is otherwise lost in the process, used for debugging + * - to use the much faster Lookup3Signature instead of MD5Signature + */ + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Iterator; + +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.update.processor.Lookup3Signature; + +/** + *

<p>This implementation is copied from Apache Nutch. </p> + * <p>An implementation of a page signature. It calculates an MD5 hash + * of a plain text "profile" of a page.</p> + * <p>The algorithm to calculate a page "profile" takes the plain text version of + * a page and performs the following steps: + * <ul> + * <li>remove all characters except letters and digits, and bring all characters + * to lower case,</li> + * <li>split the text into tokens (all consecutive non-whitespace characters),</li> + * <li>discard tokens equal or shorter than MIN_TOKEN_LEN (default 2 characters),</li> + * <li>sort the list of tokens by decreasing frequency,</li> + * <li>round down the counts of tokens to the nearest multiple of QUANT + * (<code>QUANT = QUANT_RATE * maxFreq</code>, where <code>QUANT_RATE</code> is 0.01f + * by default, and <code>maxFreq</code> is the maximum token frequency). If + * <code>maxFreq</code> is higher than 1, then QUANT is always higher than 2 (which + * means that tokens with frequency 1 are always discarded).</li> + * <li>tokens, which frequency after quantization falls below QUANT, are discarded.</li> + * <li>create a list of tokens and their quantized frequency, separated by spaces, + * in the order of decreasing frequency.</li> + * </ul>
+ * This list is then submitted to an MD5 hash calculation.*/ +public class EnhancedTextProfileSignature extends Lookup3Signature { + + private float quantRate = 0.01f; + private float minTokenLen = 2; + private StringBuilder evalText = new StringBuilder(120); // start with some capacity, makes it much faster. + + @Override + public void init(SolrParams params) { + quantRate = params.getFloat("quantRate", 0.01f); + minTokenLen = params.getInt("minTokenLen", 2); + } + + @Override + public byte[] getSignature() { + return super.getSignature(); + } + + public StringBuilder getSignatureText() { + return evalText; + } + + @Override + public void add(String content) { + HashMap tokens = new HashMap(); + + StringBuilder curToken = new StringBuilder(); + int maxFreq = 0; + for (int i = 0; i < content.length(); i++) { + char c = content.charAt(i); + if (Character.isLetterOrDigit(c)) { + curToken.append(Character.toLowerCase(c)); + } else { + if (curToken.length() > 0) { + if (curToken.length() > minTokenLen) { + // add it + String s = curToken.toString(); + Token tok = tokens.get(s); + if (tok == null) { + tok = new Token(0, s); + tokens.put(s, tok); + } + tok.cnt++; + if (tok.cnt > maxFreq) + maxFreq = tok.cnt; + } + curToken.setLength(0); + } + } + } + // check the last token + if (curToken.length() > minTokenLen) { + // add it + String s = curToken.toString(); + Token tok = tokens.get(s); + if (tok == null) { + tok = new Token(0, s); + tokens.put(s, tok); + } + tok.cnt++; + if (tok.cnt > maxFreq) + maxFreq = tok.cnt; + } + Iterator it = tokens.values().iterator(); + ArrayList profile = new ArrayList(); + // calculate the QUANT value + int quant = Math.round(maxFreq * quantRate); + if (quant < 2) { + if (maxFreq > 1) + quant = 2; + else + quant = 1; + } + while (it.hasNext()) { + Token t = it.next(); + // round down to the nearest QUANT + t.qcnt = (t.cnt / quant) * quant; + // discard the frequencies below the QUANT + if (t.qcnt < quant) { + continue; + } + 
profile.add(t); + } + Collections.sort(profile, new TokenComparator()); + StringBuilder newText = new StringBuilder(120); + it = profile.iterator(); + while (it.hasNext()) { + Token t = it.next(); + if (newText.length() > 0) {newText.append(' ');evalText.append(' ');} + newText.append('(').append(t.val).append('-').append(t.qcnt).append(')'); + evalText.append('(').append(t.val).append('-').append(t.cnt).append('-').append(t.qcnt).append(')'); + } + + super.add(newText.toString()); + } + + private static class Token { + public int cnt, qcnt; + public String val; + + public Token(int cnt, String val) { + this.cnt = cnt; + this.val = val; + } + + @Override + public String toString() { + return val + " " + cnt; + } + } + + private static class TokenComparator implements Comparator { + public int compare(Token t1, Token t2) { + return t2.cnt - t1.cnt; + } + } + +} diff --git a/source/net/yacy/cora/federate/solr/SolrType.java b/source/net/yacy/cora/federate/solr/SolrType.java index b1d3ad8b1..b45575fb3 100644 --- a/source/net/yacy/cora/federate/solr/SolrType.java +++ b/source/net/yacy/cora/federate/solr/SolrType.java @@ -22,17 +22,17 @@ package net.yacy.cora.federate.solr; public enum SolrType { - string("s", "sxt"), // The type is not analyzed, but indexed/stored verbatim - text_general("t", "txt"), // tokenizes with StandardTokenizer, removes stop words from case-insensitive "stopwords.txt", down cases, applies synonyms. - text_en_splitting_tight(null, null),// can insert dashes in the wrong place and still match - location("p", null), // lat,lon - format: specialized field for geospatial search. If indexed, this fieldType must not be multivalued. 
- date("dt", null), // date format as in http://www.w3.org/TR/xmlschema-2/#dateTime with trailing 'Z' - integer("i", "val", "int"), - bool("b", null, "boolean"), - tlong(null, null, "long"), // not used in schema yet - tfloat(null, null, "float"), // not used in schema yet - tdouble(null, null, "double"); // not used in schema yet - + string("s", "sxt"), // The type is not analyzed, but indexed/stored verbatim + text_general("t", "txt"), // tokenizes with StandardTokenizer, removes stop words from case-insensitive "stopwords.txt", down cases, applies synonyms. + text_en_splitting_tight(null, null), // can insert dashes in the wrong place and still match + location("p", null), // lat,lon - format: specialized field for geospatial search. If indexed, this fieldType must not be multivalued. + date("dt", null), // date format as in http://www.w3.org/TR/xmlschema-2/#dateTime with trailing 'Z' + bool("b", "bs", "boolean"), + num_integer("i", "val", "int"), + num_long("l", "ls", "long"), + num_float("f", "fs", "float"), + num_double("d", "ds", "double"); + private String printName, singlevalExt, multivalExt; private SolrType(final String singlevalExt, final String multivalExt) { this.printName = this.name(); diff --git a/source/net/yacy/cora/federate/solr/YaCySchema.java b/source/net/yacy/cora/federate/solr/YaCySchema.java index 822d3e6f9..32e90fec8 100644 --- a/source/net/yacy/cora/federate/solr/YaCySchema.java +++ b/source/net/yacy/cora/federate/solr/YaCySchema.java @@ -36,11 +36,16 @@ public enum YaCySchema implements Schema { title(SolrType.text_general, true, true, true, "content of title tag"), host_id_s(SolrType.string, true, true, false, "id of the host, a 6-byte hash that is part of the document id"),// String hosthash(); md5_s(SolrType.string, true, true, false, "the md5 of the raw source"),// String md5(); - size_i(SolrType.integer, true, true, false, "the size of the raw source"),// int size(); + exact_signature_l(SolrType.num_long, true, true, false, "the 64 
bit hash of the org.apache.solr.update.processor.Lookup3Signature of text_t"), + exact_signature_unique_b(SolrType.bool, true, true, false, "flag shows if exact_signature_l is unique at the time of document creation, used for double-check during search"), + fuzzy_signature_l(SolrType.num_long, true, true, false, "64 bit of the Lookup3Signature from EnhancedTextProfileSignature of text_t"), + fuzzy_signature_text_t(SolrType.text_general, true, true, false, "intermediate data produced in EnhancedTextProfileSignature: a list of word frequencies"), + fuzzy_signature_unique_b(SolrType.bool, true, true, false, "flag shows if fuzzy_signature_l is unique at the time of document creation, used for double-check during search"), + size_i(SolrType.num_integer, true, true, false, "the size of the raw source"),// int size(); process_s(SolrType.string, true, true, false, "index creation comment"), failreason_t(SolrType.text_general, true, true, false, "fail reason if a page was not loaded. if the page was loaded then this field is empty"), - httpstatus_i(SolrType.integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"), - httpstatus_redirect_s(SolrType.integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"), + httpstatus_i(SolrType.num_integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"), + httpstatus_redirect_s(SolrType.num_integer, true, true, false, "html status return code (i.e. 
\"200\" for ok), -1 if not loaded"), // optional but recommended, part of index distribution load_date_dt(SolrType.date, true, true, false, "time when resource was loaded"), @@ -48,9 +53,9 @@ public enum YaCySchema implements Schema { referrer_id_txt(SolrType.string, true, true, true, "ids of referrer to this document"),// byte[] referrerHash(); publisher_t(SolrType.text_general, true, true, false, "the name of the publisher of the document"),// String dc_publisher(); language_s(SolrType.string, true, true, false, "the language used in the document"),// byte[] language(); - audiolinkscount_i(SolrType.integer, true, true, false, "number of links to audio resources"),// int laudio(); - videolinkscount_i(SolrType.integer, true, true, false, "number of links to video resources"),// int lvideo(); - applinkscount_i(SolrType.integer, true, true, false, "number of links to application resources"),// int lapp(); + audiolinkscount_i(SolrType.num_integer, true, true, false, "number of links to audio resources"),// int laudio(); + videolinkscount_i(SolrType.num_integer, true, true, false, "number of links to video resources"),// int lvideo(); + applinkscount_i(SolrType.num_integer, true, true, false, "number of links to application resources"),// int lapp(); // optional but recommended coordinate_p(SolrType.location, true, true, false, "point in degrees of latitude,longitude as declared in WSG84"), @@ -59,13 +64,13 @@ public enum YaCySchema implements Schema { description(SolrType.text_general, true, true, false, "content of description-tag"), keywords(SolrType.text_general, true, true, false, "content of keywords tag; words are separated by space"), charset_s(SolrType.string, true, true, false, "character encoding"), - wordcount_i(SolrType.integer, true, true, false, "number of words in visible area"), - inboundlinkscount_i(SolrType.integer, true, true, false, "total number of inbound links"), - inboundlinksnofollowcount_i(SolrType.integer, true, true, false, "number of 
inbound links with nofollow tag"), - outboundlinkscount_i(SolrType.integer, true, true, false, "external number of inbound links"), - outboundlinksnofollowcount_i(SolrType.integer, true, true, false, "number of external links with nofollow tag"), - imagescount_i(SolrType.integer, true, true, false, "number of images"), - responsetime_i(SolrType.integer, true, true, false, "response time of target server in milliseconds"), + wordcount_i(SolrType.num_integer, true, true, false, "number of words in visible area"), + inboundlinkscount_i(SolrType.num_integer, true, true, false, "total number of inbound links"), + inboundlinksnofollowcount_i(SolrType.num_integer, true, true, false, "number of inbound links with nofollow tag"), + outboundlinkscount_i(SolrType.num_integer, true, true, false, "external number of inbound links"), + outboundlinksnofollowcount_i(SolrType.num_integer, true, true, false, "number of external links with nofollow tag"), + imagescount_i(SolrType.num_integer, true, true, false, "number of images"), + responsetime_i(SolrType.num_integer, true, true, false, "response time of target server in milliseconds"), text_t(SolrType.text_general, true, true, false, "all visible text"), synonyms_sxt(SolrType.string, true, true, true, "additional synonyms to the words in the text"), h1_txt(SolrType.text_general, true, true, true, "h1 header"), @@ -77,11 +82,11 @@ public enum YaCySchema implements Schema { // optional values, not part of standard YaCy handling (but useful for external applications) collection_sxt(SolrType.string, true, true, true, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"), - csscount_i(SolrType.integer, true, true, false, "number of entries in css_tag_txt and css_url_txt"), + csscount_i(SolrType.num_integer, true, true, false, "number of entries in css_tag_txt and css_url_txt"), css_tag_txt(SolrType.text_general, true, true, true, "full css tag with normalized url"), 
css_url_txt(SolrType.text_general, true, true, true, "normalized urls within a css tag"), scripts_txt(SolrType.text_general, true, true, true, "normalized urls within a scripts tag"), - scriptscount_i(SolrType.integer, true, true, false, "number of entries in scripts_txt"), + scriptscount_i(SolrType.num_integer, true, true, false, "number of entries in scripts_txt"), // encoded as binary value into an integer: // bit 0: "all" contained in html header meta // bit 1: "index" contained in html header meta @@ -92,57 +97,57 @@ public enum YaCySchema implements Schema { // bit 10: "noindex" contained in http header properties // bit 11: "nofollow" contained in http header properties // bit 12: "unavailable_after" contained in http header properties - robots_i(SolrType.integer, true, true, false, "content of tag and the \"X-Robots-Tag\" HTTP property"), + robots_i(SolrType.num_integer, true, true, false, "content of tag and the \"X-Robots-Tag\" HTTP property"), metagenerator_t(SolrType.text_general, true, true, false, "content of tag"), inboundlinks_tag_txt(SolrType.text_general, true, true, true, "internal links, normalized (absolute URLs), as - tag with anchor text and nofollow"), inboundlinks_protocol_sxt(SolrType.string, true, true, true, "internal links, only the protocol"), inboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "internal links, the url only without the protocol"), inboundlinks_name_txt(SolrType.text_general, true, true, true, "internal links, the name property of the a-tag"), inboundlinks_rel_sxt(SolrType.string, true, true, true, "internal links, the rel property of the a-tag"), - inboundlinks_relflags_val(SolrType.integer, true, true, true, "internal links, the rel property of the a-tag, coded binary"), + inboundlinks_relflags_val(SolrType.num_integer, true, true, true, "internal links, the rel property of the a-tag, coded binary"), inboundlinks_text_txt(SolrType.text_general, true, true, true, "internal links, the text content of the 
a-tag"), - inboundlinks_text_chars_val(SolrType.integer, true, true, true, "internal links, the length of the a-tag as number of characters"), - inboundlinks_text_words_val(SolrType.integer, true, true, true, "internal links, the length of the a-tag as number of words"), + inboundlinks_text_chars_val(SolrType.num_integer, true, true, true, "internal links, the length of the a-tag as number of characters"), + inboundlinks_text_words_val(SolrType.num_integer, true, true, true, "internal links, the length of the a-tag as number of words"), inboundlinks_alttag_txt(SolrType.text_general, true, true, true, "if the link is an image link, this contains the alt tag if the image is also liked as img link"), outboundlinks_tag_txt(SolrType.text_general, true, true, true, "external links, normalized (absolute URLs), as - tag with anchor text and nofollow"), outboundlinks_protocol_sxt(SolrType.string, true, true, true, "external links, only the protocol"), outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "external links, the url only without the protocol"), outboundlinks_name_txt(SolrType.text_general, true, true, true, "external links, the name property of the a-tag"), outboundlinks_rel_sxt(SolrType.string, true, true, true, "external links, the rel property of the a-tag"), - outboundlinks_relflags_val(SolrType.integer, true, true, true, "external links, the rel property of the a-tag, coded binary"), + outboundlinks_relflags_val(SolrType.num_integer, true, true, true, "external links, the rel property of the a-tag, coded binary"), outboundlinks_text_txt(SolrType.text_general, true, true, true, "external links, the text content of the a-tag"), - outboundlinks_text_chars_val(SolrType.integer, true, true, true, "external links, the length of the a-tag as number of characters"), - outboundlinks_text_words_val(SolrType.integer, true, true, true, "external links, the length of the a-tag as number of words"), + outboundlinks_text_chars_val(SolrType.num_integer, true, 
true, true, "external links, the length of the a-tag as number of characters"), + outboundlinks_text_words_val(SolrType.num_integer, true, true, true, "external links, the length of the a-tag as number of words"), outboundlinks_alttag_txt(SolrType.text_general, true, true, true, "if the link is an image link, this contains the alt tag if the image is also liked as img link"), images_tag_txt(SolrType.text_general, true, true, true, " all image tags, encoded as tag inclusive alt- and title property"), images_urlstub_txt(SolrType.text_general, true, true, true, "all image links without the protocol and '://'"), images_protocol_sxt(SolrType.text_general, true, true, true, "all image link protocols"), images_alt_txt(SolrType.text_general, true, true, true, "all image link alt tag"), - images_withalt_i(SolrType.integer, true, true, false, "number of image links with alt tag"), - htags_i(SolrType.integer, true, true, false, "binary pattern for the existance of h1..h6 headlines"), + images_withalt_i(SolrType.num_integer, true, true, false, "number of image links with alt tag"), + htags_i(SolrType.num_integer, true, true, false, "binary pattern for the existance of h1..h6 headlines"), canonical_t(SolrType.text_general, true, true, false, "url inside the canonical link element"), refresh_s(SolrType.string, true, true, false, "link from the url property inside the refresh link element"), li_txt(SolrType.text_general, true, true, true, "all texts in
  • tags"), - licount_i(SolrType.integer, true, true, false, "number of
  • tags"), + licount_i(SolrType.num_integer, true, true, false, "number of
  • tags"), bold_txt(SolrType.text_general, true, true, true, "all texts inside of or tags. no doubles. listed in the order of number of occurrences in decreasing order"), - boldcount_i(SolrType.integer, true, true, false, "total number of occurrences of or "), + boldcount_i(SolrType.num_integer, true, true, false, "total number of occurrences of or "), italic_txt(SolrType.text_general, true, true, true, "all texts inside of tags. no doubles. listed in the order of number of occurrences in decreasing order"), - italiccount_i(SolrType.integer, true, true, false, "total number of occurrences of "), + italiccount_i(SolrType.num_integer, true, true, false, "total number of occurrences of "), underline_txt(SolrType.text_general, true, true, true, "all texts inside of tags. no doubles. listed in the order of number of occurrences in decreasing order"), - underlinecount_i(SolrType.integer, true, true, false, "total number of occurrences of "), + underlinecount_i(SolrType.num_integer, true, true, false, "total number of occurrences of "), flash_b(SolrType.bool, true, true, false, "flag that shows if a swf file is linked"), frames_txt(SolrType.text_general, true, true, true, "list of all links to frames"), - framesscount_i(SolrType.integer, true, true, false, "number of frames_txt"), + framesscount_i(SolrType.num_integer, true, true, false, "number of frames_txt"), iframes_txt(SolrType.text_general, true, true, true, "list of all links to iframes"), - iframesscount_i(SolrType.integer, true, true, false, "number of iframes_txt"), + iframesscount_i(SolrType.num_integer, true, true, false, "number of iframes_txt"), url_protocol_s(SolrType.string, true, true, false, "the protocol of the url"), url_paths_sxt(SolrType.string, true, true, true, "all path elements in the url"), url_file_ext_s(SolrType.string, true, true, false, "the file name extension"), - url_parameter_i(SolrType.integer, true, true, false, "number of key-value pairs in search part of the url"), + 
url_parameter_i(SolrType.num_integer, true, true, false, "number of key-value pairs in search part of the url"), url_parameter_key_sxt(SolrType.string, true, true, true, "the keys from key-value pairs in the search part of the url"), url_parameter_value_sxt(SolrType.string, true, true, true, "the values from key-value pairs in the search part of the url"), - url_chars_i(SolrType.integer, true, true, false, "number of all characters in the url == length of sku field"), + url_chars_i(SolrType.num_integer, true, true, false, "number of all characters in the url == length of sku field"), host_s(SolrType.string, true, true, false, "host of the url"), host_dnc_s(SolrType.string, true, true, false, "the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used."), @@ -150,43 +155,43 @@ public enum YaCySchema implements Schema { host_organizationdnc_s(SolrType.string, true, true, false, "the organization and dnc concatenated with '.'"), host_subdomain_s(SolrType.string, true, true, false, "the remaining part of the host without organizationdnc"), - title_count_i(SolrType.integer, true, true, false, "number of titles (counting the 'title' field) in the document"), - title_chars_val(SolrType.integer, true, true, true, "number of characters for each title"), - title_words_val(SolrType.integer, true, true, true, "number of words in each title"), + title_count_i(SolrType.num_integer, true, true, false, "number of titles (counting the 'title' field) in the document"), + title_chars_val(SolrType.num_integer, true, true, true, "number of characters for each title"), + title_words_val(SolrType.num_integer, true, true, true, "number of words in each title"), - description_count_i(SolrType.integer, true, true, false, "number of descriptions in the document. Its not counting the 'description' field since there is only one. 
But it counts the number of descriptions that appear in the document (if any)"), - description_chars_val(SolrType.integer, true, true, true, "number of characters for each description"), - description_words_val(SolrType.integer, true, true, true, "number of words in each description"), + description_count_i(SolrType.num_integer, true, true, false, "number of descriptions in the document. Its not counting the 'description' field since there is only one. But it counts the number of descriptions that appear in the document (if any)"), + description_chars_val(SolrType.num_integer, true, true, true, "number of characters for each description"), + description_words_val(SolrType.num_integer, true, true, true, "number of words in each description"), - h1_i(SolrType.integer, true, true, false, "number of h1 header lines"), - h2_i(SolrType.integer, true, true, false, "number of h2 header lines"), - h3_i(SolrType.integer, true, true, false, "number of h3 header lines"), - h4_i(SolrType.integer, true, true, false, "number of h4 header lines"), - h5_i(SolrType.integer, true, true, false, "number of h5 header lines"), - h6_i(SolrType.integer, true, true, false, "number of h6 header lines"), + h1_i(SolrType.num_integer, true, true, false, "number of h1 header lines"), + h2_i(SolrType.num_integer, true, true, false, "number of h2 header lines"), + h3_i(SolrType.num_integer, true, true, false, "number of h3 header lines"), + h4_i(SolrType.num_integer, true, true, false, "number of h4 header lines"), + h5_i(SolrType.num_integer, true, true, false, "number of h5 header lines"), + h6_i(SolrType.num_integer, true, true, false, "number of h6 header lines"), - schema_org_breadcrumb_i(SolrType.integer, true, true, false, "number of itemprop=\"breadcrumb\" appearances in div tags"), + schema_org_breadcrumb_i(SolrType.num_integer, true, true, false, "number of itemprop=\"breadcrumb\" appearances in div tags"), opengraph_title_t(SolrType.text_general, true, true, false, "Open Graph Metadata 
from og:title metadata field, see http://ogp.me/ns#"), opengraph_type_s(SolrType.text_general, true, true, false, "Open Graph Metadata from og:type metadata field, see http://ogp.me/ns#"), opengraph_url_s(SolrType.text_general, true, true, false, "Open Graph Metadata from og:url metadata field, see http://ogp.me/ns#"), opengraph_image_s(SolrType.text_general, true, true, false, "Open Graph Metadata from og:image metadata field, see http://ogp.me/ns#"), // special values; can only be used if '_val' type is defined in schema file; this is not standard - bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"), - italic_val(SolrType.integer, true, true, true, "number of occurrences of texts in italic_txt"), - underline_val(SolrType.integer, true, true, true, "number of occurrences of texts in underline_txt"), + bold_val(SolrType.num_integer, true, true, true, "number of occurrences of texts in bold_txt"), + italic_val(SolrType.num_integer, true, true, true, "number of occurrences of texts in italic_txt"), + underline_val(SolrType.num_integer, true, true, true, "number of occurrences of texts in underline_txt"), ext_cms_txt(SolrType.text_general, true, true, true, "names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias"), - ext_cms_val(SolrType.integer, true, true, true, "number of attributes that count for a specific cms in ext_cms_txt"), + ext_cms_val(SolrType.num_integer, true, true, true, "number of attributes that count for a specific cms in ext_cms_txt"), ext_ads_txt(SolrType.text_general, true, true, true, "names of ad-servers/ad-services"), - ext_ads_val(SolrType.integer, true, true, true, "number of attributes counts in ext_ads_txt"), + ext_ads_val(SolrType.num_integer, true, true, true, "number of attributes counts in ext_ads_txt"), ext_community_txt(SolrType.text_general, true, true, true, "names of recognized community functions"), - 
ext_community_val(SolrType.integer, true, true, true, "number of attribute counts in attr_community"), + ext_community_val(SolrType.num_integer, true, true, true, "number of attribute counts in attr_community"), ext_maps_txt(SolrType.text_general, true, true, true, "names of map services"), - ext_maps_val(SolrType.integer, true, true, true, "number of attribute counts in ext_maps_txt"), + ext_maps_val(SolrType.num_integer, true, true, true, "number of attribute counts in ext_maps_txt"), ext_tracker_txt(SolrType.text_general, true, true, true, "names of tracker server"), - ext_tracker_val(SolrType.integer, true, true, true, "number of attribute counts in ext_tracker_txt"), + ext_tracker_val(SolrType.num_integer, true, true, true, "number of attribute counts in ext_tracker_txt"), ext_title_txt(SolrType.text_general, true, true, true, "names matching title expressions"), - ext_title_val(SolrType.integer, true, true, true, "number of matching title expressions"); + ext_title_val(SolrType.num_integer, true, true, true, "number of matching title expressions"); private String solrFieldName = null; // solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() ) private final SolrType type; @@ -269,16 +274,19 @@ public enum YaCySchema implements Schema { public final void add(final SolrInputDocument doc, final Date value) { assert !this.isMultiValued(); + assert this.type == SolrType.date; doc.setField(this.getSolrFieldName(), value); } public final void add(final SolrInputDocument doc, final int value) { assert !this.isMultiValued(); + assert this.type == SolrType.num_integer; doc.setField(this.getSolrFieldName(), value); } public final void add(final SolrInputDocument doc, final long value) { assert !this.isMultiValued(); + assert this.type == SolrType.num_long; doc.setField(this.getSolrFieldName(), value); } @@ -295,7 +303,7 @@ public enum YaCySchema implements Schema { public final void add(final SolrInputDocument doc, final List 
value) { assert this.isMultiValued(); if (value == null || value.size() == 0) { - if (this.type == SolrType.integer) { + if (this.type == SolrType.num_integer) { doc.setField(this.getSolrFieldName(), new Integer[0]); } else if (this.type == SolrType.string) { doc.setField(this.getSolrFieldName(), new String[0]); @@ -305,7 +313,7 @@ public enum YaCySchema implements Schema { } return; } - if (this.type == SolrType.integer) { + if (this.type == SolrType.num_integer) { assert (value.iterator().next() instanceof Integer); doc.setField(this.getSolrFieldName(), value.toArray(new Integer[value.size()])); } else if (this.type == SolrType.string || this.type == SolrType.text_general) { diff --git a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java index 443113d68..fdf386d9d 100644 --- a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java @@ -51,18 +51,23 @@ public abstract class AbstractSolrConnector implements SolrConnector { } public final static SolrQuery catchSuccessQuery = new SolrQuery(); static { - catchSuccessQuery.setQuery("-" + YaCySchema.failreason_t.name() + ":[* TO *]"); + catchSuccessQuery.setQuery("-" + YaCySchema.failreason_t.getSolrFieldName() + ":[* TO *]"); catchSuccessQuery.setFields(YaCySchema.id.getSolrFieldName()); catchSuccessQuery.setRows(1); catchSuccessQuery.setStart(0); } private final static int pagesize = 100; - + @Override - public boolean exists(final String id) throws IOException { + public boolean exists(final String fieldName, final String key) throws IOException { + if (fieldName == null) return false; try { - final SolrDocument doc = get(id, YaCySchema.id.getSolrFieldName()); - return doc != null; + if (fieldName.equals(YaCySchema.id.getSolrFieldName())) { + final SolrDocument doc = getById(key, fieldName); + return doc != null; + } + long count = 
getQueryCount(fieldName + ":\"" + key + "\""); + return count > 0; } catch (final Throwable e) { log.warn(e); return false; diff --git a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java index ac3031011..f6fa85a8b 100644 --- a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java @@ -57,22 +57,49 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo private SolrConnector solr0; private SolrConnector solr1; - private final ARC hitCache, missCache; + private int hitCacheMax, missCacheMax, partitions; + private final Map hitMissCache; private final ARC documentCache; - public long hitCache_Hit = 0, hitCache_Miss = 0, hitCache_Insert = 0; // for statistics only; do not write - public long missCache_Hit = 0, missCache_Miss = 0, missCache_Insert = 0; // for statistics only; do not write public long documentCache_Hit = 0, documentCache_Miss = 0, documentCache_Insert = 0; // for statistics only; do not write + + public static class HitMissCache { + + public final ARC hitCache, missCache; + public long hitCache_Hit = 0, hitCache_Miss = 0, hitCache_Insert = 0; // for statistics only; do not write + public long missCache_Hit = 0, missCache_Miss = 0, missCache_Insert = 0; // for statistics only; do not write + + public HitMissCache(int hitCacheMax, int missCacheMax, int partitions) { + this.hitCache = new ConcurrentARC(hitCacheMax, partitions); + this.missCache = new ConcurrentARC(missCacheMax, partitions); + } + + public void clearCache() { + this.hitCache.clear(); + this.missCache.clear(); + } + } public MirrorSolrConnector(int hitCacheMax, int missCacheMax, int docCacheMax) { this.solr0 = null; this.solr1 = null; - int partitions = Runtime.getRuntime().availableProcessors() * 2; - this.hitCache = new ConcurrentARC(hitCacheMax, partitions); - this.missCache = new 
ConcurrentARC(missCacheMax, partitions); - this.documentCache = new ConcurrentARC(docCacheMax, partitions); + this.hitCacheMax = hitCacheMax; + this.missCacheMax = missCacheMax; + this.partitions = Runtime.getRuntime().availableProcessors() * 2; + this.hitMissCache = new HashMap(); + this.documentCache = new ConcurrentARC(docCacheMax, this.partitions); } + + public HitMissCache getCache(String field) { + HitMissCache c = this.hitMissCache.get(field); + if (c == null) { + c = new HitMissCache(this.hitCacheMax, this.missCacheMax, this.partitions); + this.hitMissCache.put(field, c); + } + return c; + } + public boolean isConnected0() { return this.solr0 != null; } @@ -110,8 +137,7 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo } public void clearCache() { - this.hitCache.clear(); - this.missCache.clear(); + for (HitMissCache c: hitMissCache.values()) c.clearCache(); this.documentCache.clear(); } @@ -163,9 +189,10 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo @Override public void delete(final String id) throws IOException { this.documentCache.remove(id); - this.hitCache.remove(id); - this.missCache.put(id, EXIST); - this.missCache_Insert++; + HitMissCache c = getCache("id"); + c.hitCache.remove(id); + c.missCache.put(id, EXIST); + c.missCache_Insert++; if (this.solr0 != null) this.solr0.delete(id); if (this.solr1 != null) this.solr1.delete(id); } @@ -179,9 +206,10 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo public void delete(final List ids) throws IOException { for (String id: ids) { this.documentCache.remove(id); - this.hitCache.remove(id); - this.missCache.put(id, EXIST); - this.missCache_Insert++; + HitMissCache c = getCache("id"); + c.hitCache.remove(id); + c.missCache.put(id, EXIST); + c.missCache_Insert++; } if (this.solr0 != null) this.solr0.delete(ids); if (this.solr1 != null) this.solr1.delete(ids); @@ -196,74 +224,63 @@ public class 
MirrorSolrConnector extends AbstractSolrConnector implements SolrCo return count; } - /** - * check if a given id exists in solr - * @param id - * @return true if any entry in solr exists - * @throws IOException - */ @Override - public boolean exists(final String id) throws IOException { - if (this.hitCache.containsKey(id)) { - this.hitCache_Hit++; - return true; + public boolean exists(final String fieldName, final String key) throws IOException { + HitMissCache c = getCache(fieldName); + if (c.hitCache.containsKey(key)) { + c.hitCache_Hit++; + return true; } - this.hitCache_Miss++; - if (this.documentCache.containsKey(id)) { + c.hitCache_Miss++; + if (this.documentCache.containsKey(key)) { this.documentCache_Hit++; - return true; + return true; } this.documentCache_Miss++; - if (this.missCache.containsKey(id)) { - this.missCache_Hit++; - return false; + if (c.missCache.containsKey(key)) { + c.missCache_Hit++; + return false; } - this.missCache_Miss++; - if ((solr0 != null && solr0.exists(id)) || (solr1 != null && solr1.exists(id))) { - this.missCache.remove(id); - this.hitCache.put(id, EXIST); - this.hitCache_Insert++; + c.missCache_Miss++; + if ((solr0 != null && solr0.exists(fieldName, key)) || (solr1 != null && solr1.exists(fieldName, key))) { + c.missCache.remove(key); + c.hitCache.put(key, EXIST); + c.hitCache_Insert++; return true; } - this.missCache.put(id, EXIST); - this.missCache_Insert++; + c.missCache.put(key, EXIST); + c.missCache_Insert++; return false; } - + @Override - public SolrDocument get(final String id, final String ... fields) throws IOException { - SolrDocument doc = fields.length == 0 ? this.documentCache.get(id) : null; + public SolrDocument getById(final String key, final String ... fields) throws IOException { + SolrDocument doc = fields.length == 0 ? 
this.documentCache.get(key) : null; if (doc != null) { this.documentCache_Hit++; return doc; } documentCache_Miss++; - if (this.missCache.containsKey(id)) { - this.missCache_Hit++; + HitMissCache c = this.getCache(YaCySchema.id.getSolrFieldName()); + if (c.missCache.containsKey(key)) { + c.missCache_Hit++; return null; } - missCache_Miss++; - if ((solr0 != null && ((doc = solr0.get(id, fields)) != null)) || (solr1 != null && ((doc = solr1.get(id, fields)) != null))) { - this.missCache.remove(id); - this.hitCache.put(id, EXIST); - this.hitCache_Insert++; - if (fields.length == 0) {this.documentCache.put(id, doc); this.documentCache_Insert++;} + c.missCache_Miss++; + if ((solr0 != null && ((doc = solr0.getById(key, fields)) != null)) || (solr1 != null && ((doc = solr1.getById(key, fields)) != null))) { + addToCache(doc, fields.length == 0); return doc; } // check if there is a autocommit problem - if (this.hitCache.containsKey(id)) { + if (c.hitCache.containsKey(key)) { // the document should be there, therefore make a commit and check again this.commit(); - if ((solr0 != null && ((doc = solr0.get(id, fields)) != null)) || (solr1 != null && ((doc = solr1.get(id, fields)) != null))) { - this.missCache.remove(id); - this.hitCache.put(id, EXIST); - this.hitCache_Insert++; - if (fields.length == 0) {this.documentCache.put(id, doc); this.documentCache_Insert++;} - return doc; + if ((solr0 != null && ((doc = solr0.getById(key, fields)) != null)) || (solr1 != null && ((doc = solr1.getById(key, fields)) != null))) { + addToCache(doc, fields.length == 0); } } - this.missCache.put(id, EXIST); - this.missCache_Insert++; + c.missCache.put(key, EXIST); + c.missCache_Insert++; return null; } @@ -277,11 +294,10 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo String id = (String) solrdoc.getFieldValue(YaCySchema.id.getSolrFieldName()); assert id != null; if (id == null) return; - this.missCache.remove(id); - this.documentCache.put(id, 
ClientUtils.toSolrDocument(solrdoc)); + SolrDocument doc = ClientUtils.toSolrDocument(solrdoc); + addToCache(doc, true); + this.documentCache.put(id, doc); this.documentCache_Insert++; - this.hitCache.put(id, EXIST); - this.hitCache_Insert++; if (this.solr0 != null) this.solr0.add(solrdoc); if (this.solr1 != null) this.solr1.add(solrdoc); } @@ -302,19 +318,19 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo if (this.solr0 == null && this.solr1 == null) return new SolrDocumentList(); if (offset == 0 && count == 1 && querystring.startsWith("id:")) { final SolrDocumentList list = new SolrDocumentList(); - SolrDocument doc = get(querystring.charAt(3) == '"' ? querystring.substring(4, querystring.length() - 1) : querystring.substring(3), fields); + SolrDocument doc = getById(querystring.charAt(3) == '"' ? querystring.substring(4, querystring.length() - 1) : querystring.substring(3), fields); list.add(doc); // no addToCache(list) here because that was already handlet in get(); return list; } if (this.solr0 != null && this.solr1 == null) { SolrDocumentList list = this.solr0.query(querystring, offset, count, fields); - if (fields.length == 0) addToCache(list); + addToCache(list, fields.length == 0); return list; } if (this.solr1 != null && this.solr0 == null) { SolrDocumentList list = this.solr1.query(querystring, offset, count, fields); - if (fields.length == 0) addToCache(list); + addToCache(list, fields.length == 0); return list; } @@ -338,7 +354,7 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo for (final SolrDocument d: l) list.add(d); // add caching - if (fields.length == 0) addToCache(list); + addToCache(list, fields.length == 0); return list; } @@ -441,33 +457,51 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo return facets0; } - private void addToCache(SolrDocumentList list) { + private void addToCache(SolrDocumentList list, boolean doccache) { if 
(MemoryControl.shortStatus()) clearCache(); for (final SolrDocument solrdoc: list) { - String id = (String) solrdoc.getFieldValue(YaCySchema.id.getSolrFieldName()); - if (id != null) { - this.hitCache.put(id, EXIST); - hitCache_Insert++; - this.documentCache.put(id, solrdoc); - documentCache_Insert++; + addToCache(solrdoc, doccache); + } + } + + private void addToCache(SolrDocument doc, boolean doccach) { + for (Map.Entry e: this.hitMissCache.entrySet()) { + Object keyo = doc.getFieldValue(e.getKey()); + String key = null; + if (keyo instanceof String) key = (String) keyo; + if (keyo instanceof Integer) key = ((Integer) keyo).toString(); + if (keyo instanceof Long) key = ((Long) keyo).toString(); + if (key != null) { + HitMissCache c = e.getValue(); + c.missCache.remove(key); + c.hitCache.put(key, EXIST); + c.hitCache_Insert++; } } + if (doccach) { + this.documentCache.put((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName()), doc); + this.documentCache_Insert++; + } } + @Override public long getSize() { long s = 0; if (this.solr0 != null) s += this.solr0.getSize(); if (this.solr1 != null) s += this.solr1.getSize(); - return Math.max(this.documentCache.size(), Math.max(this.hitCache.size(), s)); + HitMissCache c = getCache("id"); + return Math.max(this.documentCache.size(), Math.max(c.hitCache.size(), s)); } public int nameCacheHitSize() { - return this.hitCache.size(); + HitMissCache c = getCache("id"); + return c.hitCache.size(); } public int nameCacheMissSize() { - return this.missCache.size(); + HitMissCache c = getCache("id"); + return c.missCache.size(); } public int nameCacheDocumentSize() { diff --git a/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java index a85ea485b..6a5dd6143 100644 --- a/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java @@ -143,10 
+143,10 @@ public class MultipleSolrConnector extends AbstractSolrConnector implements Solr return this.solr.deleteByQuery(querystring); } - @Override - public SolrDocument get(final String id, final String ... fields) throws IOException { - return this.solr.get(id, fields); - } + @Override + public SolrDocument getById(final String key, final String ... fields) throws IOException { + return this.solr.getById(key, fields); + } @Override public void add(final SolrInputDocument solrdoc) throws IOException, SolrException { diff --git a/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java b/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java index d0d0106f7..143986bff 100644 --- a/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java @@ -130,11 +130,11 @@ public class RetrySolrConnector extends AbstractSolrConnector implements SolrCon } @Override - public boolean exists(final String id) throws IOException { + public boolean exists(final String fieldName, final String key) throws IOException { final long t = System.currentTimeMillis() + this.retryMaxTime; Throwable ee = null; while (System.currentTimeMillis() < t) try { - return this.solrConnector.exists(id); + return this.solrConnector.exists(fieldName, key); } catch (final Throwable e) { ee = e; try {Thread.sleep(10);} catch (final InterruptedException e1) {} @@ -144,12 +144,12 @@ public class RetrySolrConnector extends AbstractSolrConnector implements SolrCon return false; } - @Override - public SolrDocument get(final String id, final String ... fields) throws IOException { - final long t = System.currentTimeMillis() + this.retryMaxTime; + @Override + public SolrDocument getById(final String key, final String ... 
fields) throws IOException { + final long t = System.currentTimeMillis() + this.retryMaxTime; Throwable ee = null; while (System.currentTimeMillis() < t) try { - return this.solrConnector.get(id, fields); + return this.solrConnector.getById(key, fields); } catch (final Throwable e) { ee = e; try {Thread.sleep(10);} catch (final InterruptedException e1) {} @@ -157,7 +157,7 @@ public class RetrySolrConnector extends AbstractSolrConnector implements SolrCon } if (ee != null) throw (ee instanceof IOException) ? (IOException) ee : new IOException(ee.getMessage()); return null; - } + } @Override public void add(final SolrInputDocument solrdoc) throws IOException, SolrException { diff --git a/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java index 9d5cc0216..8374606c6 100644 --- a/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java @@ -125,21 +125,21 @@ public class ShardSolrConnector extends AbstractSolrConnector implements SolrCon * @throws IOException */ @Override - public boolean exists(final String id) throws IOException { + public boolean exists(final String fieldName, final String key) throws IOException { for (final SolrConnector connector: this.connectors) { - if (connector.exists(id)) return true; + if (connector.exists(fieldName, key)) return true; } return false; } - @Override - public SolrDocument get(String id, final String ... fields) throws IOException { - for (final SolrConnector connector: this.connectors) { - SolrDocument doc = connector.get(id, fields); - if (doc != null) return doc; + @Override + public SolrDocument getById(final String key, final String ... 
fields) throws IOException { + for (final SolrConnector connector: this.connectors) { + SolrDocument doc = connector.getById(key, fields); + if (doc != null) return doc; } return null; - } + } /** * add a Solr document diff --git a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java index 8f05a7e50..e6df1e59d 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java @@ -88,12 +88,13 @@ public interface SolrConnector extends Iterable /* Iterable of document public int deleteByQuery(final String querystring) throws IOException; /** - * check if a given id exists in solr - * @param id + * check if a given key exists in solr at the field fieldName + * @param fieldName + * @param key * @return true if any entry in solr exists * @throws IOException */ - public boolean exists(final String id) throws IOException; + public boolean exists(final String fieldName, final String key) throws IOException; /** * add a solr input document @@ -105,13 +106,13 @@ public interface SolrConnector extends Iterable /* Iterable of document public void add(final Collection solrdocs) throws IOException, SolrException; /** - * get a document from solr by given id - * @param id + * get a document from solr by given key for the id-field + * @param key * @param fields list of fields * @return one result or null if no result exists * @throws IOException */ - public SolrDocument get(final String id, final String ... fields) throws IOException; + public SolrDocument getById(final String key, final String ... 
fields) throws IOException; /** * get a query result from solr diff --git a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java index cd0e443c5..3946d3eb1 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java @@ -316,20 +316,14 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen private final char[] queryIDTemplate = "id:\" \"".toCharArray(); - /** - * get a document from solr by given id - * @param id - * @return one result or null if no result exists - * @throws IOException - */ @Override - public SolrDocument get(final String id, final String ... fields) throws IOException { - assert id.length() == 12; + public SolrDocument getById(final String key, final String ... fields) throws IOException { + final SolrQuery query = new SolrQuery(); + assert key.length() == 12; // construct query char[] q = new char[17]; System.arraycopy(this.queryIDTemplate, 0, q, 0, 17); - System.arraycopy(id.toCharArray(), 0, q, 4, 12); - final SolrQuery query = new SolrQuery(); + System.arraycopy(key.toCharArray(), 0, q, 4, 12); query.setQuery(new String(q)); query.setRows(1); query.setStart(0); diff --git a/source/net/yacy/cora/federate/solr/responsewriter/EnhancedXMLResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/EnhancedXMLResponseWriter.java index 65d82aa36..f6be6d8a2 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/EnhancedXMLResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/EnhancedXMLResponseWriter.java @@ -185,15 +185,15 @@ public class EnhancedXMLResponseWriter implements QueryResponseWriter { writeTag(writer, "str", name, value, true); } else if (typeName.equals(SolrType.bool.printName())) { writeTag(writer, "bool", name, "F".equals(value) ? 
"false" : "true", true); - } else if (typeName.equals(SolrType.integer.printName())) { + } else if (typeName.equals(SolrType.num_integer.printName())) { writeTag(writer, "int", name, value, true); - } else if (typeName.equals(SolrType.tlong.printName())) { + } else if (typeName.equals(SolrType.num_long.printName())) { writeTag(writer, "long", name, value, true); } else if (typeName.equals(SolrType.date.printName())) { writeTag(writer, "date", name, DateField.formatExternal(new Date(Long.parseLong(value))), true); - } else if (typeName.equals(SolrType.tfloat.printName())) { + } else if (typeName.equals(SolrType.num_float.printName())) { writeTag(writer, "float", name, value, true); - } else if (typeName.equals(SolrType.tdouble.printName())) { + } else if (typeName.equals(SolrType.num_double.printName())) { writeTag(writer, "double", name, value, true); } } diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index b97b49e97..bee51d884 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -41,7 +41,7 @@ import java.util.concurrent.atomic.AtomicInteger; import net.yacy.contentcontrol.ContentControlFilterUpdateThread; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.Classification.ContentDomain; +import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.order.Base64Order; diff --git a/source/net/yacy/crawler/retrieval/FileLoader.java b/source/net/yacy/crawler/retrieval/FileLoader.java index 706254256..526762c28 100644 --- a/source/net/yacy/crawler/retrieval/FileLoader.java +++ b/source/net/yacy/crawler/retrieval/FileLoader.java @@ -31,8 +31,8 @@ import java.util.Date; import java.util.List; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.Classification; import net.yacy.cora.document.UTF8; +import 
net.yacy.cora.document.analysis.Classification; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; diff --git a/source/net/yacy/crawler/retrieval/Response.java b/source/net/yacy/crawler/retrieval/Response.java index d7eda49f5..a70d63738 100644 --- a/source/net/yacy/crawler/retrieval/Response.java +++ b/source/net/yacy/crawler/retrieval/Response.java @@ -30,9 +30,9 @@ import java.util.Date; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.Classification; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; diff --git a/source/net/yacy/crawler/retrieval/SMBLoader.java b/source/net/yacy/crawler/retrieval/SMBLoader.java index 6a5223979..26c68fc04 100644 --- a/source/net/yacy/crawler/retrieval/SMBLoader.java +++ b/source/net/yacy/crawler/retrieval/SMBLoader.java @@ -39,9 +39,9 @@ import jcifs.smb.SmbException; import jcifs.smb.SmbFile; import jcifs.smb.SmbFileInputStream; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.Classification; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 77c4117db..77beb20e7 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -37,9 +37,13 @@ import java.util.Set; import java.util.SortedSet; import java.util.TreeMap; +import org.apache.solr.common.params.MapSolrParams; +import 
org.apache.solr.update.processor.Lookup3Signature; + import net.yacy.cora.document.ASCII; import net.yacy.cora.document.WordCache; -import net.yacy.cora.document.Classification.ContentDomain; +import net.yacy.cora.document.analysis.Classification.ContentDomain; +import net.yacy.cora.document.analysis.EnhancedTextProfileSignature; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.language.synonyms.SynonymLibrary; import net.yacy.cora.lod.vocabulary.Tagging; @@ -71,6 +75,8 @@ public final class Condenser { private final Map words; // a string (the words) to (indexWord) - relation private final Map> tags = new HashMap>(); // a set of tags, discovered from Autotagging private final Set synonyms; // a set of synonyms to the words + private long fuzzy_signature = 0, exact_signature = 0; // signatures for double-check detection + private String fuzzy_signature_text = null; // signatures for double-check detection public int RESULT_NUMB_WORDS = -1; public int RESULT_DIFF_WORDS = -1; @@ -84,7 +90,7 @@ public final class Condenser { final boolean indexText, final boolean indexMedia, final WordCache meaningLib, - final SynonymLibrary synonyms, + final SynonymLibrary synlib, final boolean doAutotagging ) { Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging @@ -209,14 +215,48 @@ public final class Condenser { if (!this.tags.isEmpty()) { document.addMetatags(this.tags); } + + if (synlib != null) { + for (String word: this.words.keySet()) { + Set syms = synlib.getSynonyms(word); + if (syms != null) this.synonyms.addAll(syms); + } + } + String text = document.getTextString(); // create the synonyms set if (synonyms != null) { for (String word: this.words.keySet()) { - Set syms = synonyms.getSynonyms(word); + Set syms = synlib.getSynonyms(word); if (syms != null) this.synonyms.addAll(syms); } } + + // create hashes for duplicate detection + // check dups with 
http://localhost:8090/solr/select?q=*:*&start=0&rows=3&fl=sku,fuzzy_signature_text_t,fuzzy_signature_l,fuzzy_signature_unique_b + EnhancedTextProfileSignature fuzzySignatureFactory = new EnhancedTextProfileSignature(); + Map sp = new HashMap(); + sp.put("quantRate", "0.5"); // for minTokenLen = 2 the value should not be below 0.24; for minTokenLen = 3 the value must be not below 0.5! + sp.put("minTokenLen", "3"); + fuzzySignatureFactory.init(new MapSolrParams(sp)); + fuzzySignatureFactory.add(text); + byte[] fuzzy_signature_hash = fuzzySignatureFactory.getSignature(); + long l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (fuzzy_signature_hash[i] & 0xff); + this.fuzzy_signature = l; + this.fuzzy_signature_text = fuzzySignatureFactory.getSignatureText().toString(); + Lookup3Signature exactSignatureFactory = new Lookup3Signature(); + exactSignatureFactory.add(text); + byte[] exact_signature_hash = exactSignatureFactory.getSignature(); + l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (exact_signature_hash[i] & 0xff); + this.exact_signature = l; + } + + public Condenser(final String text, final WordCache meaningLib, boolean doAutotagging) { + this.languageIdentificator = null; // we don't need that here + // analysis = new Properties(); + this.words = new TreeMap(); + this.synonyms = new HashSet(); + createCondensement(text, meaningLib, doAutotagging); } private void insertTextToWords( @@ -250,14 +290,6 @@ public final class Condenser { } } - public Condenser(final String text, final WordCache meaningLib, boolean doAutotagging) { - this.languageIdentificator = null; // we don't need that here - // analysis = new Properties(); - this.words = new TreeMap(); - this.synonyms = new HashSet(); - createCondensement(text, meaningLib, doAutotagging); - } - public int excludeWords(final SortedSet stopwords) { // subtracts the given stopwords from the word list // the word list shrinkes. 
This returns the number of shrinked words @@ -277,6 +309,18 @@ public final class Condenser { return l; } + public long fuzzySignature() { + return this.fuzzy_signature; + } + + public String fuzzySignatureText() { + return this.fuzzy_signature_text; + } + + public long exactSignature() { + return this.exact_signature; + } + public String language() { return this.languageIdentificator.getLanguage(); } diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index b1608bee9..a1fecbdaa 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -53,9 +53,9 @@ import java.util.Set; import java.util.TreeSet; import net.yacy.cora.date.ISO8601Formatter; -import net.yacy.cora.document.Classification; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.lod.JenaTripleStore; import net.yacy.cora.lod.vocabulary.DCTerms; import net.yacy.cora.lod.vocabulary.Owl; diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 33f8458e7..76575859a 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -433,7 +433,7 @@ public class URIMetadataNode { private int getInt(YaCySchema field) { assert !field.isMultiValued(); - assert field.getType() == SolrType.integer; + assert field.getType() == SolrType.num_integer; Object x = this.doc.getFieldValue(field.getSolrFieldName()); if (x == null) return 0; if (x instanceof Integer) return ((Integer) x).intValue(); @@ -480,7 +480,7 @@ public class URIMetadataNode { @SuppressWarnings("unchecked") private ArrayList getIntList(YaCySchema field) { assert field.isMultiValued(); - assert field.getType() == SolrType.integer; + assert field.getType() == SolrType.num_integer; Object r = 
this.doc.getFieldValue(field.getSolrFieldName()); if (r == null) return new ArrayList(0); if (r instanceof ArrayList) { diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 35fed436c..e73a899a8 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -83,13 +83,13 @@ import net.yacy.contentcontrol.ContentControlFilterUpdateThread; import net.yacy.contentcontrol.SMWListSyncThread; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.Classification; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.RSSFeed; import net.yacy.cora.document.RSSMessage; import net.yacy.cora.document.RSSReader; import net.yacy.cora.document.UTF8; import net.yacy.cora.document.WordCache; +import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.federate.solr.connector.ShardSelection; import net.yacy.cora.federate.solr.connector.ShardSolrConnector; @@ -392,7 +392,7 @@ public final class Switchboard extends serverSwitch { solrScheme.fill(backupScheme, true); // switch on some fields which are necessary for ranking and faceting for (YaCySchema field: new YaCySchema[]{ - YaCySchema.host_s, + YaCySchema.host_s, YaCySchema.load_date_dt, YaCySchema.url_file_ext_s, YaCySchema.last_modified, // needed for media search and /date operator YaCySchema.url_paths_sxt, YaCySchema.host_organization_s, // needed to search in the url YaCySchema.inboundlinks_protocol_sxt, YaCySchema.inboundlinks_urlstub_txt, // needed for HostBrowser diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 98cacb8f6..8fba0db54 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -42,7 +42,6 @@ import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.date.ISO8601Formatter; 
import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.federate.solr.SolrType; import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector; @@ -208,7 +207,7 @@ public final class Fulltext implements Iterable { } public void commit() { - if (this.forcedCommitTime + forcedCommitTimeout < System.currentTimeMillis()) return; + if (this.forcedCommitTime + forcedCommitTimeout > System.currentTimeMillis()) return; this.forcedCommitTime = Long.MAX_VALUE - forcedCommitTimeout; // set the time high to prevent that other processes get to this point meanwhile this.solr.commit(); this.forcedCommitTime = System.currentTimeMillis(); // set the exact time @@ -218,7 +217,7 @@ public final class Fulltext implements Iterable { if (urlHash == null) return null; SolrDocument doc; try { - doc = this.solr.get(urlHash, YaCySchema.load_date_dt.getSolrFieldName()); + doc = this.solr.getById(urlHash, YaCySchema.load_date_dt.getSolrFieldName()); } catch (IOException e) { return null; } @@ -249,7 +248,7 @@ public final class Fulltext implements Iterable { // get the metadata from Solr try { - SolrDocument doc = this.solr.get(ASCII.String(urlHash)); + SolrDocument doc = this.solr.getById(ASCII.String(urlHash)); if (doc != null) { if (this.urlIndexFile != null) this.urlIndexFile.remove(urlHash); return new URIMetadataNode(doc, wre, weight); @@ -279,7 +278,7 @@ public final class Fulltext implements Iterable { byte[] idb = ASCII.getBytes(id); try { if (this.urlIndexFile != null) this.urlIndexFile.remove(idb); - SolrDocument sd = this.solr.get(id); + SolrDocument sd = this.solr.getById(id, YaCySchema.last_modified.getSolrFieldName()); Date now = new Date(); Date sdDate = sd == null ? 
null : URIMetadataNode.getDate(sd, YaCySchema.last_modified); if (sdDate == null || sdDate.after(now)) sdDate = now; @@ -307,7 +306,7 @@ public final class Fulltext implements Iterable { String id = ASCII.String(idb); try { if (this.urlIndexFile != null) this.urlIndexFile.remove(idb); - SolrDocument sd = this.solr.get(id); + SolrDocument sd = this.solr.getById(id); if (sd == null || (new URIMetadataNode(sd)).isOlder(row)) { if (this.solrScheme.contains(YaCySchema.ip_s)) { // ip_s needs a dns lookup which causes blockings during search here @@ -471,7 +470,7 @@ public final class Fulltext implements Iterable { if (urlHash == null) return false; if (this.urlIndexFile != null && this.urlIndexFile.has(urlHash)) return true; try { - if (this.solr.exists(ASCII.String(urlHash))) return true; + if (this.solr.exists(YaCySchema.id.getSolrFieldName(), ASCII.String(urlHash))) return true; } catch (final Throwable e) { Log.logException(e); } @@ -480,7 +479,7 @@ public final class Fulltext implements Iterable { public String failReason(final String urlHash) throws IOException { if (urlHash == null) return null; - SolrDocument doc = this.solr.get(urlHash, YaCySchema.failreason_t.getSolrFieldName()); + SolrDocument doc = this.solr.getById(urlHash, YaCySchema.failreason_t.getSolrFieldName()); if (doc == null) return null; String reason = (String) doc.getFieldValue(YaCySchema.failreason_t.getSolrFieldName()); return reason == null ? null : reason.length() == 0 ? null : reason; diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 0b5d69478..864e3a901 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -202,7 +202,7 @@ public class Segment { public int getQueryCount(String word) { if (word == null || word.indexOf(':') >= 0 || word.indexOf(' ') >= 0 || word.indexOf('/') >= 0) return 0; int count = this.termIndex == null ? 
0 : this.termIndex.count(Word.word2hash(word)); - try {count += this.fulltext.getSolr().getQueryCount(YaCySchema.text_t.name() + ':' + word);} catch (IOException e) {} + try {count += this.fulltext.getSolr().getQueryCount(YaCySchema.text_t.getSolrFieldName() + ':' + word);} catch (IOException e) {} return count; } @@ -363,8 +363,28 @@ public class Segment { if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; char docType = Response.docType(document.dc_format()); - // STORE TO SOLR + // CREATE SOLR DOCUMENT final SolrInputDocument solrInputDoc = this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, condenser, referrerURL, language); + + // FIND OUT IF THIS IS A DOUBLE DOCUMENT + for (YaCySchema[] checkfields: new YaCySchema[][]{ + {YaCySchema.exact_signature_l, YaCySchema.exact_signature_unique_b}, + {YaCySchema.fuzzy_signature_l, YaCySchema.fuzzy_signature_unique_b}}) { + YaCySchema hashfield = checkfields[0]; + YaCySchema uniquefield = checkfields[1]; + if (this.fulltext.getSolrScheme().contains(hashfield) && this.fulltext.getSolrScheme().contains(uniquefield)) { + // lookup the document with the same signature + long signature = ((Long) solrInputDoc.getField(hashfield.getSolrFieldName()).getValue()).longValue(); + try { + if (this.fulltext.getSolr().exists(hashfield.getSolrFieldName(), Long.toString(signature))) { + // change unique attribut in content + solrInputDoc.setField(uniquefield.getSolrFieldName(), false); + } + } catch (IOException e) {} + } + } + + // STORE TO SOLR String error = null; tryloop: for (int i = 0; i < 20; i++) { try { diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index 6365b0add..7c0a9154e 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ b/source/net/yacy/search/index/SolrConfiguration.java @@ -139,13 +139,18 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if ((isEmpty() || 
contains(key)) && (!this.lazy || value != 0)) key.add(doc, value); } + private void add(final SolrInputDocument doc, final YaCySchema key, final long value) { + assert !key.isMultiValued(); + if ((isEmpty() || contains(key)) && (!this.lazy || value != 0)) key.add(doc, value); + } + private void add(final SolrInputDocument doc, final YaCySchema key, final boolean value) { assert !key.isMultiValued(); if (isEmpty() || contains(key)) key.add(doc, value); } protected static Date getDate(SolrInputDocument doc, final YaCySchema key) { - Date x = (Date) doc.getFieldValue(key.name()); + Date x = (Date) doc.getFieldValue(key.getSolrFieldName()); Date now = new Date(); return (x == null) ? new Date(0) : x.after(now) ? now : x; } @@ -384,6 +389,11 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable List synonyms = condenser.synonyms(); add(doc, YaCySchema.synonyms_sxt, synonyms); } + add(doc, YaCySchema.exact_signature_l, condenser.exactSignature()); + add(doc, YaCySchema.exact_signature_unique_b, true); // this must be corrected afterwards! + add(doc, YaCySchema.fuzzy_signature_l, condenser.fuzzySignature()); + add(doc, YaCySchema.fuzzy_signature_text_t, condenser.fuzzySignatureText()); + add(doc, YaCySchema.fuzzy_signature_unique_b, true); // this must be corrected afterwards! 
// path elements of link if (allAttr || contains(YaCySchema.url_paths_sxt)) add(doc, YaCySchema.url_paths_sxt, digestURI.getPaths()); diff --git a/source/net/yacy/search/query/QueryGoal.java b/source/net/yacy/search/query/QueryGoal.java index 695b99087..a26c07765 100644 --- a/source/net/yacy/search/query/QueryGoal.java +++ b/source/net/yacy/search/query/QueryGoal.java @@ -28,12 +28,10 @@ import java.util.LinkedHashMap; import java.util.Map; import java.util.SortedSet; -import net.yacy.cora.document.UTF8; import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.order.Base64Order; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.SpaceExceededException; -import net.yacy.document.Condenser; import net.yacy.document.parser.html.AbstractScraper; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.data.word.Word; diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 48d5248c7..4ad3def9d 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -43,8 +43,8 @@ import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrQuery.ORDER; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.Classification; -import net.yacy.cora.document.Classification.ContentDomain; +import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.federate.yacy.CacheStrategy; diff --git a/source/net/yacy/search/query/RankingProcess.java b/source/net/yacy/search/query/RankingProcess.java index a519be058..ed0d2c1a7 100644 --- a/source/net/yacy/search/query/RankingProcess.java +++ b/source/net/yacy/search/query/RankingProcess.java @@ -36,7 +36,7 @@ import java.util.concurrent.TimeUnit; import 
java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Pattern; -import net.yacy.cora.document.Classification.ContentDomain; +import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.federate.yacy.CacheStrategy; diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 09278d634..f4a8f8b53 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -41,8 +41,8 @@ import com.hp.hpl.jena.rdf.model.Resource; import net.yacy.contentcontrol.ContentControlFilterUpdateThread; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.Classification; -import net.yacy.cora.document.Classification.ContentDomain; +import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.federate.yacy.Distribution; diff --git a/source/net/yacy/search/query/SnippetWorker.java b/source/net/yacy/search/query/SnippetWorker.java index a759689d9..51a484087 100644 --- a/source/net/yacy/search/query/SnippetWorker.java +++ b/source/net/yacy/search/query/SnippetWorker.java @@ -23,8 +23,8 @@ package net.yacy.search.query; import java.util.Iterator; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.Classification; import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.sorting.ConcurrentScoreMap; import net.yacy.cora.sorting.ScoreMap; diff --git a/source/net/yacy/search/ranking/RankingProfile.java b/source/net/yacy/search/ranking/RankingProfile.java index 2e21fbdb9..b5e906ab7 100644 --- a/source/net/yacy/search/ranking/RankingProfile.java +++ 
b/source/net/yacy/search/ranking/RankingProfile.java @@ -30,8 +30,8 @@ import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; -import net.yacy.cora.document.Classification; -import net.yacy.cora.document.Classification.ContentDomain; +import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.util.NumberTools; import net.yacy.kelondro.logging.Log; diff --git a/source/net/yacy/search/snippet/MediaSnippet.java b/source/net/yacy/search/snippet/MediaSnippet.java index cd424551c..740290846 100644 --- a/source/net/yacy/search/snippet/MediaSnippet.java +++ b/source/net/yacy/search/snippet/MediaSnippet.java @@ -36,8 +36,8 @@ import java.util.SortedSet; import java.util.TreeSet; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.Classification; -import net.yacy.cora.document.Classification.ContentDomain; +import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; diff --git a/source/net/yacy/server/http/HTTPDFileHandler.java b/source/net/yacy/server/http/HTTPDFileHandler.java index f09ba268b..d81c06dd0 100644 --- a/source/net/yacy/server/http/HTTPDFileHandler.java +++ b/source/net/yacy/server/http/HTTPDFileHandler.java @@ -93,9 +93,9 @@ import java.util.zip.GZIPOutputStream; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.Classification; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; +import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.order.Digest; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; diff --git a/source/net/yacy/server/serverObjects.java 
b/source/net/yacy/server/serverObjects.java index 52b091ca1..02f084c91 100644 --- a/source/net/yacy/server/serverObjects.java +++ b/source/net/yacy/server/serverObjects.java @@ -455,7 +455,7 @@ public class serverObjects extends HashMap implements Cloneable public SolrParams toSolrParams(YaCySchema[] facets) { // check if all required post fields are there - if (!this.containsKey(CommonParams.DF)) this.put(CommonParams.DF, YaCySchema.text_t.name()); // set default field to the text field + if (!this.containsKey(CommonParams.DF)) this.put(CommonParams.DF, YaCySchema.text_t.getSolrFieldName()); // set default field to the text field if (!this.containsKey(CommonParams.START)) this.put(CommonParams.START, "0"); // set default start item if (!this.containsKey(CommonParams.ROWS)) this.put(CommonParams.ROWS, "10"); // set default number of search results @@ -466,7 +466,7 @@ public class serverObjects extends HashMap implements Cloneable if (facets != null && facets.length > 0) { m.put("facet", new String[]{"true"}); String[] fs = new String[facets.length]; - for (int i = 0; i < facets.length; i++) fs[i] = facets[i].name(); + for (int i = 0; i < facets.length; i++) fs[i] = facets[i].getSolrFieldName(); m.put("facet.field", fs); } final SolrParams solrParams = new MultiMapSolrParams(m);