/**
 * CollectionConfiguration
 * Copyright 2011 by Michael Peter Christen
 * First released 14.04.2011 at http://yacy.net
 *
 * $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $
 * $LastChangedRevision: 7654 $
 * $LastChangedBy: orbiter $
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program in the file lgpl21.txt
 * If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.search.schema;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.lang.reflect.Array;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;

import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.federate.solr.ProcessType;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.solr.SchemaConfiguration;
import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector.LoadTimeURL;
import net.yacy.cora.federate.solr.logic.BooleanLiteral;
import net.yacy.cora.federate.solr.logic.CatchallLiteral;
import net.yacy.cora.federate.solr.logic.Conjunction;
import net.yacy.cora.federate.solr.logic.Disjunction;
import net.yacy.cora.federate.solr.logic.LongLiteral;
import net.yacy.cora.federate.solr.logic.Negation;
import net.yacy.cora.federate.solr.logic.StringLiteral;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.ProbabilisticClassifier;
import net.yacy.document.SentenceReader;
import net.yacy.document.Tokenizer;
import net.yacy.document.content.DCEntry;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.index.RowHandleMap;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segment.ReferenceReport;
import net.yacy.search.index.Segment.ReferenceReportCache;
import net.yacy.search.query.QueryParams;

import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.eclipse.jetty.util.ConcurrentHashSet;

public class CollectionConfiguration extends SchemaConfiguration implements Serializable {

    private static final long serialVersionUID = -499100932212840385L;

    public static boolean UNIQUE_HEURISTIC_PREFER_HTTPS = false;
    public static boolean UNIQUE_HEURISTIC_PREFER_WWWPREFIX = true;

    private final ArrayList<Ranking> rankings;

    /**
     * initialize the schema with a given configuration file;
     * the configuration file simply contains a list of lines with keywords
     * or keyword = value lines (where the value is a custom Solr field name)
     * @param configurationFile
     * @throws IOException
     */
    public CollectionConfiguration(final File configurationFile, final boolean lazy) throws IOException {
        super(configurationFile);
        super.lazy = lazy;
        this.rankings = new ArrayList<>(4);
        for (int i = 0; i <= 3; i++) this.rankings.add(new Ranking());
        // check consistency: compare with YaCyField enum
        if (this.isEmpty()) return;
        Iterator<SchemaConfiguration.Entry> it = this.entryIterator();
        while (it.hasNext()) {
            SchemaConfiguration.Entry etr = it.next();
            try {
                CollectionSchema f = CollectionSchema.valueOf(etr.key());
                f.setSolrFieldName(etr.getValue());
            } catch (final IllegalArgumentException e) {
                ConcurrentLog.fine("SolrCollectionWriter", "solr schema file " + configurationFile.getAbsolutePath() + " defines unknown attribute '" + etr.toString() + "'");
                it.remove();
            }
        }
        // check consistency the other way: look if all enum constants in SolrField appear in the configuration file
        for (CollectionSchema field: CollectionSchema.values()) {
            if (this.get(field.name()) == null) {
                if (CollectionSchema.author_sxt.getSolrFieldName().endsWith(field.name())) continue; // exception for this: that is a copy-field
                if (CollectionSchema.coordinate_p_0_coordinate.getSolrFieldName().endsWith(field.name())) continue; // exception for this: automatically generated
                if (CollectionSchema.coordinate_p_1_coordinate.getSolrFieldName().endsWith(field.name())) continue; // exception for this: automatically generated
                ConcurrentLog.warn("SolrCollectionWriter", " solr schema file " + configurationFile.getAbsolutePath() + " is missing declaration for '" + field.name() + "'");
            }
        }
    }

    public String[] allFields() {
        ArrayList<String> a = new ArrayList<>(this.size());
        for (CollectionSchema f: CollectionSchema.values()) {
            if (this.contains(f)) a.add(f.getSolrFieldName());
        }
        return a.toArray(new String[a.size()]);
    }

    public Ranking getRanking(final int idx) {
        return this.rankings.get(idx % this.rankings.size()); // simply prevent an out-of-bounds exception (callers don't check for null)
    }

    /**
     * @param name The name of the ranking to get.
     * @return The corresponding Ranking object.
     */
    public Ranking getRanking(final String name) {
        if (name == null) return null;
        for (int i = 0; i < this.rankings.size(); i++) {
            Ranking currentRanking = this.rankings.get(i);
            if (name.equals(currentRanking.getName())) return currentRanking;
        }
        return null;
    }

    /**
     * save configuration to file and update enum SolrFields
     * @throws IOException
     */
    @Override
    public void commit() throws IOException {
        try {
            super.commit();
            // make sure the enum SolrField.SolrFieldName is current
            Iterator<SchemaConfiguration.Entry> it = this.entryIterator();
            while (it.hasNext()) {
                SchemaConfiguration.Entry etr = it.next();
                try {
                    SchemaDeclaration f = CollectionSchema.valueOf(etr.key());
                    f.setSolrFieldName(etr.getValue());
                } catch (final IllegalArgumentException e) {
                    continue;
                }
            }
        } catch (final IOException e) {}
    }

    private final static Set<String> omitFields = new HashSet<>(3);
    static {
        omitFields.add(CollectionSchema.author_sxt.getSolrFieldName());
        omitFields.add(CollectionSchema.coordinate_p_0_coordinate.getSolrFieldName());
        omitFields.add(CollectionSchema.coordinate_p_1_coordinate.getSolrFieldName());
    }

    public SolrInputDocument toSolrInputDocument(final SolrDocument doc) {
        return toSolrInputDocument(doc, omitFields);
    }

    public SolrDocument toSolrDocument(final SolrInputDocument doc) {
        return toSolrDocument(doc, omitFields);
    }

    /**
     * add uri attributes to solr document
     * @param doc
     * @param allAttr
     * @param digestURL
     * @return the normalized url
     */
    public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURL digestURL) {
        add(doc, CollectionSchema.id, ASCII.String(digestURL.hash()));
        if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, digestURL.hosthash());
        String us = digestURL.toNormalform(true);
        add(doc, CollectionSchema.sku, us);
        if (allAttr || contains(CollectionSchema.ip_s)) {
            final InetAddress address = digestURL.getInetAddress();
            if (address != null) add(doc, CollectionSchema.ip_s, address.getHostAddress());
        }
        String host = null;
        if ((host = digestURL.getHost()) != null) {
            String dnc = Domains.getDNC(host);
            String subdomOrga = host.length() - dnc.length() <= 0 ? "" : host.substring(0, host.length() - dnc.length() - 1);
            int p = subdomOrga.lastIndexOf('.');
            String subdom = (p < 0) ? "" : subdomOrga.substring(0, p);
            String orga = (p < 0) ? subdomOrga : subdomOrga.substring(p + 1);
            if (allAttr || contains(CollectionSchema.host_s)) add(doc, CollectionSchema.host_s, host);
            if (allAttr || contains(CollectionSchema.host_dnc_s)) add(doc, CollectionSchema.host_dnc_s, dnc);
            if (allAttr || contains(CollectionSchema.host_organization_s)) add(doc, CollectionSchema.host_organization_s, orga);
            if (allAttr || contains(CollectionSchema.host_organizationdnc_s)) add(doc, CollectionSchema.host_organizationdnc_s, orga + '.' + dnc);
            if (allAttr || contains(CollectionSchema.host_subdomain_s)) add(doc, CollectionSchema.host_subdomain_s, subdom);
        }
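        // Illustrative decomposition of the host fields above (assumption: Domains.getDNC
        // returns the domain name completion part, e.g. "co.uk" for this host):
        //   host_s                 = "www.example.co.uk"
        //   host_dnc_s             = "co.uk"
        //   host_organization_s    = "example"
        //   host_organizationdnc_s = "example.co.uk"
        //   host_subdomain_s       = "www"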
        // path elements of link
        String filename = digestURL.getFileName();
        String extension = MultiProtocolURL.getFileExtension(filename);
        String filenameStub = filename.toLowerCase().endsWith("." + extension) ? filename.substring(0, filename.length() - extension.length() - 1) : filename;
        // remove a possible jsession (or another url parameter, like "img.jpg;jsession=123")
        // TODO: consider implementing the ";jsession=123" check in getFileExtension()
        if (extension.indexOf(';') >= 0) extension = extension.substring(0, extension.indexOf(';'));
        if (allAttr || contains(CollectionSchema.url_chars_i)) add(doc, CollectionSchema.url_chars_i, us.length());
        if (allAttr || contains(CollectionSchema.url_protocol_s)) add(doc, CollectionSchema.url_protocol_s, digestURL.getProtocol());
        if (allAttr || contains(CollectionSchema.url_paths_sxt) || contains(CollectionSchema.url_paths_count_i)) {
            String[] paths = digestURL.getPaths();
            if (allAttr || contains(CollectionSchema.url_paths_count_i)) add(doc, CollectionSchema.url_paths_count_i, paths.length);
            if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, paths);
        }
        if (allAttr || contains(CollectionSchema.url_file_name_s)) add(doc, CollectionSchema.url_file_name_s, filenameStub);
        if (allAttr || contains(CollectionSchema.url_file_name_tokens_t)) add(doc, CollectionSchema.url_file_name_tokens_t, MultiProtocolURL.toTokens(filenameStub));
        if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, extension);
        Map<String, String> searchpart = digestURL.getSearchpartMap();
        if (searchpart == null) {
            if (allAttr || contains(CollectionSchema.url_parameter_i)) add(doc, CollectionSchema.url_parameter_i, 0);
        } else {
            if (allAttr || contains(CollectionSchema.url_parameter_i)) add(doc, CollectionSchema.url_parameter_i, searchpart.size());
            if (allAttr || contains(CollectionSchema.url_parameter_key_sxt)) add(doc, CollectionSchema.url_parameter_key_sxt, searchpart.keySet().toArray(new String[searchpart.size()]));
            if (allAttr || contains(CollectionSchema.url_parameter_value_sxt)) add(doc, CollectionSchema.url_parameter_value_sxt, searchpart.values().toArray(new String[searchpart.size()]));
        }
        return us;
    }
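    // Illustrative result of addURIAttributes (assumptions: getPaths() yields the directory
    // path elements and getSearchpartMap() the query parameters) for the hypothetical url
    // "http://example.org/a/b/img.jpg;jsession=123?x=1&y=2":
    //   url_paths_sxt = ["a", "b"], url_paths_count_i = 2
    //   url_file_name_s = "img", url_file_ext_s = "jpg" (the ";jsession=123" part is cut off above)
    //   url_parameter_i = 2, url_parameter_key_sxt = ["x", "y"], url_parameter_value_sxt = ["1", "2"]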
    public SolrInputDocument metadata2solr(final URIMetadataNode md) {
        SolrInputDocument doc = toSolrInputDocument(md);
        // URIMetadataNode stores some values in private fields; add them to the Solr document now
        boolean allAttr = this.isEmpty();
        addURIAttributes(doc, allAttr, md.url());

        String title = md.dc_title();
        if (allAttr || contains(CollectionSchema.title_count_i)) add(doc, CollectionSchema.title_count_i, 1);
        if (allAttr || contains(CollectionSchema.title_chars_val)) {
            Integer[] cv = new Integer[]{Integer.valueOf(title.length())};
            add(doc, CollectionSchema.title_chars_val, cv);
        }
        if (allAttr || contains(CollectionSchema.title_words_val)) {
            Integer[] cv = new Integer[]{Integer.valueOf(CommonPattern.SPACES.split(title).length)};
            add(doc, CollectionSchema.title_words_val, cv);
        }

        String description = md.snippet();
        boolean description_exist = description != null;
        if (description == null) description = "";
        if (allAttr || contains(CollectionSchema.description_txt)) add(doc, CollectionSchema.description_txt, description_exist ? new String[]{description} : new String[0]);
        if (allAttr || contains(CollectionSchema.description_count_i)) add(doc, CollectionSchema.description_count_i, description_exist ? 1 : 0);
        if (allAttr || contains(CollectionSchema.description_chars_val)) {
            add(doc, CollectionSchema.description_chars_val, description_exist ? new Integer[]{Integer.valueOf(description.length())} : new Integer[0]);
        }
        if (allAttr || contains(CollectionSchema.description_words_val)) {
            add(doc, CollectionSchema.description_words_val, description_exist ? new Integer[]{Integer.valueOf(description.length() == 0 ? 0 : CommonPattern.SPACES.split(description).length)} : new Integer[0]);
        }

        String keywords = md.dc_subject();
        Bitfield flags = md.flags();
        if (flags.get(Tokenizer.flag_cat_indexof)) {
            if (keywords == null || keywords.isEmpty()) keywords = "indexof";
            else {
                if (keywords.indexOf(',') > 0) keywords += ", indexof";
                else keywords += " indexof";
            }
        }
        if (allAttr || contains(CollectionSchema.keywords)) {
            add(doc, CollectionSchema.keywords, keywords);
        }

        if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, md.limage());
        if (allAttr || contains(CollectionSchema.linkscount_i)) add(doc, CollectionSchema.linkscount_i, md.llocal() + md.lother());
        if (allAttr || contains(CollectionSchema.inboundlinkscount_i)) add(doc, CollectionSchema.inboundlinkscount_i, md.llocal());
        if (allAttr || contains(CollectionSchema.outboundlinkscount_i)) add(doc, CollectionSchema.outboundlinkscount_i, md.lother());
        if (allAttr || contains(CollectionSchema.charset_s)) add(doc, CollectionSchema.charset_s, StandardCharsets.UTF_8.name());

        // coordinates
        if (md.lat() != 0.0 && md.lon() != 0.0) { // i.e. taken from geo metadata in the document
            if (allAttr || contains(CollectionSchema.coordinate_p)) {
                add(doc, CollectionSchema.coordinate_p, Double.toString(md.lat()) + "," + Double.toString(md.lon()));
            }
        }
        if (allAttr || contains(CollectionSchema.httpstatus_i)) add(doc, CollectionSchema.httpstatus_i, 200);
        if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, md.dc_publisher());

        // fields that are in URIMetadataRow additionally to the yacy2solr basic requirement
        if (allAttr || contains(CollectionSchema.audiolinkscount_i)) add(doc, CollectionSchema.audiolinkscount_i, md.laudio());
        if (allAttr || contains(CollectionSchema.videolinkscount_i)) add(doc, CollectionSchema.videolinkscount_i, md.lvideo());
        if (allAttr || contains(CollectionSchema.applinkscount_i)) add(doc, CollectionSchema.applinkscount_i, md.lapp());
        if (allAttr || contains(CollectionSchema.text_t)) {
            // construct the text from other metadata parts.
            // This is necessary here since that is used to search the link when no other data (parsed text body) is available
            StringBuilder sb = new StringBuilder(120);
            // accText(sb, md.dc_title());     // default search field via getQueryFields(), not needed for snippet (always displayed)
            // accText(sb, md.dc_creator());   // author is in default ranking/getQueryFields
            // accText(sb, md.dc_publisher()); // has its own metadata field publisher_t (not part of default queryfields) and mostly N/A
            // accText(sb, md.snippet());      // added to description_txt above; default search field via getQueryFields(), description_txt incl. in snippet calculation
            accText(sb, md.url().toTokens());
            // accText(sb, keywords);          // default search field via getQueryFields(), keywords not incl. in snippet calculation
            add(doc, CollectionSchema.text_t, sb.toString());
        }
        return doc;
    }

    private static void accText(final StringBuilder sb, String text) {
        if (text == null || text.length() == 0) return;
        if (sb.length() != 0) sb.append(' ');
        text = text.trim();
        if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text);
        else sb.append(text).append('.');
    }

    public static class Subgraph {
        public final ArrayList<String>[] urlProtocols, urlStubs, urlAnchorTexts;
        @SuppressWarnings("unchecked")
        public Subgraph(int inboundSize, int outboundSize) {
            this.urlProtocols = (ArrayList<String>[]) Array.newInstance(ArrayList.class, 2);
            this.urlProtocols[0] = new ArrayList<>(inboundSize);
            this.urlProtocols[1] = new ArrayList<>(outboundSize);
            this.urlStubs = (ArrayList<String>[]) Array.newInstance(ArrayList.class, 2);
            this.urlStubs[0] = new ArrayList<>(inboundSize);
            this.urlStubs[1] = new ArrayList<>(outboundSize);
            this.urlAnchorTexts = (ArrayList<String>[]) Array.newInstance(ArrayList.class, 2);
            this.urlAnchorTexts[0] = new ArrayList<>(inboundSize);
            this.urlAnchorTexts[1] = new ArrayList<>(outboundSize);
        }
    }

    public static boolean enrichSubgraph(final Subgraph subgraph, final DigestURL source_url, AnchorURL target_url) {
        final String text = target_url.getTextProperty(); // the text between the <a></a> tag
        String source_host = source_url.getHost();
        String target_host = target_url.getHost();
        boolean inbound =
                (source_host == null && target_host == null) ||
                (source_host != null && target_host != null &&
                 (target_host.equals(source_host) ||
                  target_host.equals("www." + source_host) ||
                  source_host.equals("www." + target_host))); // not everybody defines 'outbound' that way, but that is what is used here
        int ioidx = inbound ? 0 : 1;
        subgraph.urlProtocols[ioidx].add(target_url.getProtocol());
        subgraph.urlStubs[ioidx].add(target_url.urlstub(true, true));
        subgraph.urlAnchorTexts[ioidx].add(text);
        return inbound;
    }
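    // Illustrative cases for the inbound heuristic above:
    //   source "http://example.org/a" -> target "http://www.example.org/b" : inbound  (www-prefix match)
    //   source "http://example.org/a" -> target "http://example.org/c"     : inbound  (same host)
    //   source "http://example.org/a" -> target "http://other.net/"        : outbound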
    /**
     * A SolrVector is a SolrInputDocument with the additional ability to store
     * the webgraph that is associated with the web document in the Solr document.
     */
    public static class SolrVector extends SolrInputDocument {
        private static final long serialVersionUID = -210901881471714939L;
        private List<SolrInputDocument> webgraphDocuments;
        public SolrVector() {
            super();
            this.webgraphDocuments = new ArrayList<>();
        }
        public void addWebgraphDocument(SolrInputDocument webgraphDocument) {
            this.webgraphDocuments.add(webgraphDocument);
        }
        public List<SolrInputDocument> getWebgraphDocuments() {
            return this.webgraphDocuments;
        }
    }

    public SolrVector yacy2solr(
            final Segment segment,
            final Map<String, Pattern> collections,
            final ResponseHeader responseHeader,
            final Document document,
            final Condenser condenser,
            final DigestURL referrerURL,
            final String language,
            final boolean setUnique,
            final WebgraphConfiguration webgraph,
            final String sourceName) {
        // we use the SolrCell design as index schema
        SolrVector doc = new SolrVector();
        final DigestURL digestURL = document.dc_source();
        boolean allAttr = this.isEmpty();
        String url = addURIAttributes(doc, allAttr, digestURL);

        add(doc, CollectionSchema.content_type, new String[]{document.dc_format()}); // content_type (mime) is defined as a schema field and we rely on it in some queries like imagequery (which makes it mandatory, no need to check)

        Set<ProcessType> processTypes = new LinkedHashSet<>();
        String host = digestURL.getHost();

        int crawldepth = document.getDepth();
        if (allAttr || contains(CollectionSchema.crawldepth_i)) {
            CollectionSchema.crawldepth_i.add(doc, crawldepth);
        }

        if (allAttr || (contains(CollectionSchema.cr_host_chance_d) && contains(CollectionSchema.cr_host_count_i) && contains(CollectionSchema.cr_host_norm_i))) {
            processTypes.add(ProcessType.CITATION); // postprocessing needed
        }

        if ((allAttr || contains(CollectionSchema.collection_sxt)) && collections != null && collections.size() > 0) {
            List<String> cs = new ArrayList<>();
            for (Map.Entry<String, Pattern> e: collections.entrySet()) {
                if (e.getValue().matcher(url).matches()) cs.add(e.getKey());
            }
            add(doc, CollectionSchema.collection_sxt, cs);
        }
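        // Illustrative use of the collection patterns above (hypothetical map entry):
        // with "wiki" -> Pattern.compile(".*wiki.*"), the url "http://wiki.example.org/x"
        // matches and the document gets "wiki" added to collection_sxt.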
        char doctype = Response.docType(responseHeader == null ? null : responseHeader.getContentType()); // null returns DT_UNKNOWN

        List<String> titles = document.titles();
        if (allAttr || contains(CollectionSchema.title)) {
            if (doctype == Response.DT_IMAGE || doctype == Response.DT_AUDIO || doctype == Response.DT_MOVIE) {
                String mediatitle = responseHeader.get(HeaderFramework.X_YACY_MEDIA_TITLE, "");
                if (mediatitle.length() > 0) {
                    if (titles.size() == 0) titles.add(mediatitle); else titles.set(0, mediatitle);
                }
            }
            add(doc, CollectionSchema.title, titles);
            if ((allAttr || contains(CollectionSchema.title_exact_signature_l)) && titles.size() > 0) {
                add(doc, CollectionSchema.title_exact_signature_l, EnhancedTextProfileSignature.getSignatureLong(titles.get(0)));
            }
        }
        if (allAttr || contains(CollectionSchema.title_count_i)) add(doc, CollectionSchema.title_count_i, titles.size());
        if (allAttr || contains(CollectionSchema.title_chars_val)) {
            ArrayList<Integer> cv = new ArrayList<>(titles.size());
            for (String s: titles) cv.add(Integer.valueOf(s.length()));
            add(doc, CollectionSchema.title_chars_val, cv);
        }
        if (allAttr || contains(CollectionSchema.title_words_val)) {
            ArrayList<Integer> cv = new ArrayList<>(titles.size());
            for (String s: titles) cv.add(Integer.valueOf(CommonPattern.SPACES.split(s).length));
            add(doc, CollectionSchema.title_words_val, cv);
        }

        String[] descriptions = document.dc_description();
        if (allAttr || contains(CollectionSchema.description_txt)) {
            add(doc, CollectionSchema.description_txt, descriptions);
            if ((allAttr || contains(CollectionSchema.description_exact_signature_l)) && descriptions != null && descriptions.length > 0) {
                add(doc, CollectionSchema.description_exact_signature_l, EnhancedTextProfileSignature.getSignatureLong(descriptions));
            }
        }
        if (allAttr || contains(CollectionSchema.description_count_i)) add(doc, CollectionSchema.description_count_i, descriptions.length);
        if (allAttr || contains(CollectionSchema.description_chars_val)) {
            ArrayList<Integer> cv = new ArrayList<>(descriptions.length);
            for (String s: descriptions) cv.add(Integer.valueOf(s.length()));
            add(doc, CollectionSchema.description_chars_val, cv);
        }
        if (allAttr || contains(CollectionSchema.description_words_val)) {
            ArrayList<Integer> cv = new ArrayList<>(descriptions.length);
            for (String s: descriptions) cv.add(Integer.valueOf(CommonPattern.SPACES.split(s).length));
            add(doc, CollectionSchema.description_words_val, cv);
        }

        if (allAttr || contains(CollectionSchema.author)) {
            String author = document.dc_creator();
            if (author == null || author.length() == 0) author = document.dc_publisher();
            add(doc, CollectionSchema.author, author);
        }
        if (allAttr || contains(CollectionSchema.last_modified)) {
            Date lastModified = responseHeader == null ? new Date() : responseHeader.lastModified();
            if (lastModified == null) lastModified = new Date();
            if (document.getLastModified().before(lastModified)) lastModified = document.getLastModified();
            long firstSeen = segment.getFirstSeenTime(digestURL.hash());
            if (firstSeen > 0 && firstSeen < lastModified.getTime()) lastModified = new Date(firstSeen); // patch the date if we have seen the document earlier
            add(doc, CollectionSchema.last_modified, lastModified);
        }

        if (allAttr || contains(CollectionSchema.dates_in_content_dts) || contains(CollectionSchema.dates_in_content_count_i)) {
            LinkedHashSet<Date> dates_in_content = condenser.dates_in_content;
            if (allAttr || contains(CollectionSchema.dates_in_content_count_i)) {
                add(doc, CollectionSchema.dates_in_content_count_i, dates_in_content.size());
            }
            if (dates_in_content.size() > 0 && (allAttr || contains(CollectionSchema.dates_in_content_dts))) {
                add(doc, CollectionSchema.dates_in_content_dts, dates_in_content.toArray(new Date[dates_in_content.size()]));
            }
        }

        if (allAttr || contains(CollectionSchema.keywords)) {
            String keywords = document.dc_subject(' ');
            if (doctype == Response.DT_IMAGE || doctype == Response.DT_AUDIO || doctype == Response.DT_MOVIE) {
                keywords = responseHeader.get(HeaderFramework.X_YACY_MEDIA_KEYWORDS, keywords);
            }
            add(doc, CollectionSchema.keywords, keywords);
        }

        // unique fields; these values must be corrected during postprocessing
        // (the following logic is !^ (not-xor), but I prefer to write it that way as it is)
        add(doc, CollectionSchema.http_unique_b, setUnique || UNIQUE_HEURISTIC_PREFER_HTTPS ? digestURL.isHTTPS() : digestURL.isHTTP()); // this must be corrected afterwards during storage!
        add(doc, CollectionSchema.www_unique_b, setUnique || host != null && (UNIQUE_HEURISTIC_PREFER_WWWPREFIX ? host.startsWith("www.") : !host.startsWith("www."))); // this must be corrected afterwards during storage!

        add(doc, CollectionSchema.exact_signature_l, condenser.exactSignature());
        add(doc, CollectionSchema.exact_signature_unique_b, true); // this must be corrected afterwards during storage!
        add(doc, CollectionSchema.exact_signature_copycount_i, 0); // this must be corrected afterwards during postprocessing!
        add(doc, CollectionSchema.fuzzy_signature_l, condenser.fuzzySignature());
        add(doc, CollectionSchema.fuzzy_signature_text_t, condenser.fuzzySignatureText());
        add(doc, CollectionSchema.fuzzy_signature_unique_b, true); // this must be corrected afterwards during storage!
        add(doc, CollectionSchema.fuzzy_signature_copycount_i, 0); // this must be corrected afterwards during postprocessing!
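        // Illustrative effect of the heuristics above with the default flag values
        // (UNIQUE_HEURISTIC_PREFER_HTTPS = false, UNIQUE_HEURISTIC_PREFER_WWWPREFIX = true)
        // and setUnique = false:
        //   "http://www.example.org/" -> http_unique_b = true,  www_unique_b = true
        //   "https://example.org/"    -> http_unique_b = false, www_unique_b = false
        // Both values are only provisional and are corrected during postprocessing.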
        if (this.contains(CollectionSchema.exact_signature_unique_b) || this.contains(CollectionSchema.exact_signature_copycount_i) ||
            this.contains(CollectionSchema.fuzzy_signature_l) || this.contains(CollectionSchema.fuzzy_signature_copycount_i) ||
            this.contains(CollectionSchema.http_unique_b) || this.contains(CollectionSchema.www_unique_b)) {
            processTypes.add(ProcessType.UNIQUE);
        }

        // get the list of all links; they will be shrunk by urls that appear in other fields of the solr schema
        LinkedHashMap<DigestURL, String> inboundLinks = document.inboundLinks();
        LinkedHashMap<DigestURL, String> outboundLinks = document.outboundLinks();

        Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size());
        int c = 0;
        final Object parser = document.getParserObject();
        boolean containsCanonical = false;
        DigestURL canonical = null;
        if (parser instanceof ContentScraper) {
            final ContentScraper html = (ContentScraper) parser;
            List<ImageEntry> images = html.getImages();

            // header tags
            int h = 0;
            int f = 1;
            String[] hs;
            hs = html.getHeadlines(1); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, CollectionSchema.h1_txt, hs); add(doc, CollectionSchema.h1_i, hs.length);
            hs = html.getHeadlines(2); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, CollectionSchema.h2_txt, hs); add(doc, CollectionSchema.h2_i, hs.length);
            hs = html.getHeadlines(3); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, CollectionSchema.h3_txt, hs); add(doc, CollectionSchema.h3_i, hs.length);
            hs = html.getHeadlines(4); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, CollectionSchema.h4_txt, hs); add(doc, CollectionSchema.h4_i, hs.length);
            hs = html.getHeadlines(5); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, CollectionSchema.h5_txt, hs); add(doc, CollectionSchema.h5_i, hs.length);
            hs = html.getHeadlines(6); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, CollectionSchema.h6_txt, hs); add(doc, CollectionSchema.h6_i, hs.length);
            add(doc, CollectionSchema.htags_i, h);
            add(doc, CollectionSchema.schema_org_breadcrumb_i, html.breadcrumbCount());

            // meta tags: Open Graph properties
            String og;
            og = html.getMetas().get("og:title"); if (og != null) add(doc, CollectionSchema.opengraph_title_t, og);
            og = html.getMetas().get("og:type");  if (og != null) add(doc, CollectionSchema.opengraph_type_s, og);
            og = html.getMetas().get("og:url");   if (og != null) add(doc, CollectionSchema.opengraph_url_s, og);
            og = html.getMetas().get("og:image"); if (og != null) add(doc, CollectionSchema.opengraph_image_s, og);

            // noindex and nofollow attributes
            // from HTML (meta-tag in HTML header: robots)
            // and HTTP header (X-Robots-Tag property)
            // coded as binary value:
            // bit  0: "all" contained in html header meta
            // bit  1: "index" contained in html header meta
            // bit  2: "follow" contained in html header meta
            // bit  3: "noindex" contained in html header meta
            // bit  4: "nofollow" contained in html header meta
            // bit  5: "noarchive" contained in html header meta
            // bit  8: "all" contained in http header X-Robots-Tag
            // bit  9: "noindex" contained in http header X-Robots-Tag
            // bit 10: "nofollow" contained in http header X-Robots-Tag
            // bit 11: "noarchive" contained in http header X-Robots-Tag
            // bit 12: "nosnippet" contained in http header X-Robots-Tag
            // bit 13: "noodp" contained in http header X-Robots-Tag
            // bit 14: "notranslate" contained in http header X-Robots-Tag
            // bit 15: "noimageindex" contained in http header X-Robots-Tag
            // bit 16: "unavailable_after" contained in http header X-Robots-Tag
            int b = 0;
            String robots_meta = html.getMetas().get("robots"); // this tag may have the values: all, index, noindex, nofollow; see http://www.robotstxt.org/meta.html
            if (robots_meta != null) {
                robots_meta = robots_meta.toLowerCase();
                if (robots_meta.indexOf("all", 0) >= 0) b += 1; // set bit 0
                if (robots_meta.indexOf("index", 0) == 0 || robots_meta.indexOf(" index", 0) >= 0 || robots_meta.indexOf(",index", 0) >= 0) b += 2; // set bit 1
                if (robots_meta.indexOf("follow", 0) == 0 || robots_meta.indexOf(" follow", 0) >= 0 || robots_meta.indexOf(",follow", 0) >= 0) b += 4; // set bit 2
                if (robots_meta.indexOf("noindex", 0) >= 0) b += 8; // set bit 3
                if (robots_meta.indexOf("nofollow", 0) >= 0) b += 16; // set bit 4
                if (robots_meta.indexOf("noarchive", 0) >= 0) b += 32; // set bit 5
            }
            String x_robots_tag = responseHeader == null ? "" : responseHeader.getXRobotsTag();
            if (!x_robots_tag.isEmpty()) {
                // this tag may have the values: all, noindex, nofollow, noarchive, nosnippet, noodp, notranslate, noimageindex, unavailable_after, none;
                // see https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag?hl=de
                if (x_robots_tag.indexOf("all", 0) >= 0) b += 1 << 8; // set bit 8
                if (x_robots_tag.indexOf("noindex", 0) >= 0 || x_robots_tag.indexOf("none", 0) >= 0) b += 1 << 9; // set bit 9
                if (x_robots_tag.indexOf("nofollow", 0) >= 0 || x_robots_tag.indexOf("none", 0) >= 0) b += 1 << 10; // set bit 10
                if (x_robots_tag.indexOf("noarchive", 0) >= 0) b += 1 << 11; // set bit 11
                if (x_robots_tag.indexOf("nosnippet", 0) >= 0) b += 1 << 12; // set bit 12
                if (x_robots_tag.indexOf("noodp", 0) >= 0) b += 1 << 13; // set bit 13
                if (x_robots_tag.indexOf("notranslate", 0) >= 0) b += 1 << 14; // set bit 14
                if (x_robots_tag.indexOf("noimageindex", 0) >= 0) b += 1 << 15; // set bit 15
                if (x_robots_tag.indexOf("unavailable_after", 0) >= 0) b += 1 << 16; // set bit 16
            }
            add(doc, CollectionSchema.robots_i, b);
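            // Sketch (assumes only the bit layout documented above): how a consumer of
            // robots_i could test the combined noindex/nofollow signals.
            //   boolean noindex  = ((b & (1 << 3)) != 0) || ((b & (1 << 9)) != 0);  // meta or X-Robots-Tag noindex
            //   boolean nofollow = ((b & (1 << 4)) != 0) || ((b & (1 << 10)) != 0); // meta or X-Robots-Tag nofollow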
            // meta tags: generator
            final String generator = html.getMetas().get("generator");
            if (generator != null) add(doc, CollectionSchema.metagenerator_t, generator);

            // bold, italic
            final String[] bold = html.getBold();
            add(doc, CollectionSchema.boldcount_i, bold.length);
            if (bold.length > 0) {
                add(doc, CollectionSchema.bold_txt, bold);
                if (allAttr || contains(CollectionSchema.bold_val)) {
                    add(doc, CollectionSchema.bold_val, html.getBoldCount(bold));
                }
            }
            final String[] italic = html.getItalic();
            add(doc, CollectionSchema.italiccount_i, italic.length);
            if (italic.length > 0) {
                add(doc, CollectionSchema.italic_txt, italic);
                if (allAttr || contains(CollectionSchema.italic_val)) {
                    add(doc, CollectionSchema.italic_val, html.getItalicCount(italic));
                }
            }
            final String[] underline = html.getUnderline();
            add(doc, CollectionSchema.underlinecount_i, underline.length);
            if (underline.length > 0) {
                add(doc, CollectionSchema.underline_txt, underline);
                if (allAttr || contains(CollectionSchema.underline_val)) {
                    add(doc, CollectionSchema.underline_val, html.getUnderlineCount(underline));
                }
            }
            final String[] li = html.getLi();
            add(doc, CollectionSchema.licount_i, li.length);
            if (li.length > 0) add(doc, CollectionSchema.li_txt, li);
            final String[] dt = html.getDt();
            add(doc, CollectionSchema.dtcount_i, dt.length);
            if (dt.length > 0) add(doc, CollectionSchema.dt_txt, dt);
            final String[] dd = html.getDd();
            add(doc, CollectionSchema.ddcount_i, dd.length);
            if (dd.length > 0) add(doc, CollectionSchema.dd_txt, dd);
            final List<Date> startDates = html.getStartDates();
            if (startDates.size() > 0) add(doc, CollectionSchema.startDates_dts, startDates.toArray(new Date[startDates.size()]));
            final List<Date> endDates = html.getStartDates(); // note: this reuses the start dates; an end-date accessor appears to be intended here
            if (endDates.size() > 0) add(doc, CollectionSchema.endDates_dts, endDates.toArray(new Date[endDates.size()]));
            final List<String> articles = html.getArticles();
            add(doc, CollectionSchema.articlecount_i, articles.size());
            if (articles.size() > 0) add(doc, CollectionSchema.article_txt, articles);

            // images
            final ArrayList<String> imgprots = new ArrayList<>(images.size());
            final Integer[] imgheights = new Integer[images.size()];
            final Integer[] imgwidths = new Integer[images.size()];
            final Integer[] imgpixels = new Integer[images.size()];
            final String[] imgstubs = new String[images.size()];
            final String[] imgalts = new String[images.size()];
            int withalt = 0;
            int i = 0;
            LinkedHashSet<String> images_text_map = new LinkedHashSet<>();
            for (final ImageEntry ie: images) {
                final MultiProtocolURL uri = ie.url();
                inboundLinks.remove(uri);
                outboundLinks.remove(uri);
                imgheights[i] = ie.height();
                imgwidths[i] = ie.width();
                imgpixels[i] = ie.height() < 0 || ie.width() < 0 ? -1 : ie.height() * ie.width();
                String protocol = uri.getProtocol();
                imgprots.add(protocol);
                imgstubs[i] = uri.toString().substring(protocol.length() + 3);
                imgalts[i] = ie.alt();
                for (String it: CommonPattern.SPACE.split(uri.toTokens())) images_text_map.add(it);
                if (ie.alt() != null && ie.alt().length() > 0) {
                    SentenceReader sr = new SentenceReader(ie.alt());
                    while (sr.hasNext()) images_text_map.add(sr.next().toString());
                    withalt++;
                }
                i++;
            }
            StringBuilder images_text = new StringBuilder(images_text_map.size() * 6 + 1);
            for (String s: images_text_map) images_text.append(s.trim()).append(' ');
            if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, images.size());
            if (allAttr || contains(CollectionSchema.images_protocol_sxt)) add(doc, CollectionSchema.images_protocol_sxt, protocolList2indexedList(imgprots));
            if (allAttr || contains(CollectionSchema.images_urlstub_sxt)) add(doc, CollectionSchema.images_urlstub_sxt, imgstubs);
            if (allAttr || contains(CollectionSchema.images_alt_sxt)) add(doc, CollectionSchema.images_alt_sxt, imgalts);
            if (allAttr || contains(CollectionSchema.images_height_val)) add(doc, CollectionSchema.images_height_val, imgheights);
            if (allAttr || contains(CollectionSchema.images_width_val)) add(doc, CollectionSchema.images_width_val, imgwidths);
            if (allAttr || contains(CollectionSchema.images_pixel_val)) add(doc, CollectionSchema.images_pixel_val, imgpixels);
            if (allAttr || contains(CollectionSchema.images_withalt_i)) add(doc, CollectionSchema.images_withalt_i, withalt);
            if (allAttr || contains(CollectionSchema.images_text_t)) add(doc, CollectionSchema.images_text_t, images_text.toString().trim());

            // style sheets
            if (allAttr || contains(CollectionSchema.css_tag_sxt)) {
                final Map<DigestURL, String> csss = html.getCSS();
                final String[] css_tag = new String[csss.size()];
                final String[] css_url = new String[csss.size()];
                c = 0;
                for (final Map.Entry<DigestURL, String> entry: csss.entrySet()) {
                    final String cssurl = entry.getKey().toNormalform(false);
                    inboundLinks.remove(entry.getKey());
                    outboundLinks.remove(entry.getKey());
                    // rebuild a <link rel="stylesheet"> tag representation for this style sheet (assumed form, from the collected url and media attribute)
                    css_tag[c] = "<link rel=\"stylesheet\" type=\"text/css\" media=\"" + entry.getValue() + "\" href=\"" + cssurl + "\" />";
                    css_url[c] = cssurl;
                    c++;
                }
                add(doc, CollectionSchema.csscount_i, css_tag.length);
                if (css_tag.length > 0) add(doc, CollectionSchema.css_tag_sxt, css_tag);
                if (css_url.length > 0) add(doc, CollectionSchema.css_url_sxt, css_url);
            }

            // Scripts
            if (allAttr || contains(CollectionSchema.scripts_sxt)) {
                final Set<AnchorURL> scriptss = html.getScript();
                final String[] scripts = new String[scriptss.size()];
                c = 0;
                for (final AnchorURL u: scriptss) {
                    inboundLinks.remove(u);
                    outboundLinks.remove(u);
                    scripts[c++] = u.toNormalform(false);
                }
                add(doc, CollectionSchema.scriptscount_i, scripts.length);
                if (scripts.length > 0) add(doc, CollectionSchema.scripts_sxt, scripts);
            }

            // Frames
            if (allAttr || contains(CollectionSchema.frames_sxt)) {
                final Set<AnchorURL> framess = html.getFrames();
                final String[] frames = new String[framess.size()];
                c = 0;
                for (final AnchorURL u: framess) {
                    inboundLinks.remove(u);
                    outboundLinks.remove(u);
                    frames[c++] = u.toNormalform(false);
                }
                add(doc, CollectionSchema.framesscount_i, frames.length);
                if (frames.length > 0) {
                    add(doc, CollectionSchema.frames_sxt, frames);
                    //webgraph.addEdges(subgraph, digestURI, responseHeader, collections, crawldepth, alllinks, images, true, framess, citations); // add here because links have been removed from remaining inbound/outbound
                }
            }
            // IFrames
            if (allAttr || contains(CollectionSchema.iframes_sxt)) {
                final Set<AnchorURL> iframess = html.getIFrames();
                final String[] iframes = new String[iframess.size()];
                c = 0;
                for (final AnchorURL u: iframess) {
                    inboundLinks.remove(u);
                    outboundLinks.remove(u);
                    iframes[c++] = u.toNormalform(false);
                }
                add(doc, CollectionSchema.iframesscount_i, iframes.length);
                if (iframes.length > 0) {
                    add(doc, CollectionSchema.iframes_sxt, iframes);
                    //webgraph.addEdges(subgraph, digestURI, responseHeader, collections, crawldepth, alllinks, images, true, iframess, citations); // add here because links have been removed from remaining inbound/outbound
                }
            }

            // canonical tag
            if (allAttr || contains(CollectionSchema.canonical_s)) {
                canonical = html.getCanonical();
                // if there is no canonical in the html then look into the http header:
                if (canonical == null && responseHeader != null) {
                    String link = responseHeader.get("Link", null);
                    int p;
                    if (link != null && ((p = link.indexOf("rel=\"canonical\"")) > 0)) {
                        link = link.substring(0, p).trim();
                        p = link.indexOf('<');
                        int q = link.lastIndexOf('>');
                        if (p >= 0 && q > 0) {
                            link = link.substring(p + 1, q);
                            try {
                                canonical = new DigestURL(link);
                            } catch (MalformedURLException e) {}
                        }
                    }
                }
                if (canonical != null) {
                    containsCanonical = true;
                    inboundLinks.remove(canonical);
                    outboundLinks.remove(canonical);
                    add(doc, CollectionSchema.canonical_s, canonical.toNormalform(false));
                    // set a flag if this is equal to sku
                    if (contains(CollectionSchema.canonical_equal_sku_b)) {
                        add(doc, CollectionSchema.canonical_equal_sku_b, canonical.equals(digestURL));
                    }
                }
            }
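            // Illustrative input for the HTTP-header fallback above: a response header
            //   Link: <http://example.org/page>; rel="canonical"
            // is cut at rel="canonical" to "<http://example.org/page>; ", from which the
            // part between '<' and '>' is taken as the canonical URL.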
            // meta refresh tag
            if (allAttr || contains(CollectionSchema.refresh_s)) {
                String refresh = html.getRefreshPath();
                if (refresh != null && refresh.length() > 0) {
                    MultiProtocolURL refreshURL;
                    try {
                        refreshURL = refresh.startsWith("http") ? new MultiProtocolURL(html.getRefreshPath()) : new MultiProtocolURL(digestURL, html.getRefreshPath());
                        if (refreshURL != null) {
                            inboundLinks.remove(refreshURL);
                            outboundLinks.remove(refreshURL);
                            add(doc, CollectionSchema.refresh_s, refreshURL.toNormalform(false));
                        }
                    } catch (final MalformedURLException e) {
                        add(doc, CollectionSchema.refresh_s, refresh);
                    }
                }
            }

            // flash embedded
            if (allAttr || contains(CollectionSchema.flash_b)) {
                MultiProtocolURL[] flashURLs = html.getFlash();
                for (MultiProtocolURL u: flashURLs) {
                    // remove all flash links from inbound/outbound links
                    inboundLinks.remove(u);
                    outboundLinks.remove(u);
                }
                add(doc, CollectionSchema.flash_b, flashURLs.length > 0);
            }

            // generic evaluation pattern
            for (final String model: html.getEvaluationModelNames()) {
                if (allAttr || contains("ext_" + model + "_txt")) {
                    final String[] scorenames = html.getEvaluationModelScoreNames(model);
                    if (scorenames.length > 0) {
                        add(doc, CollectionSchema.valueOf("ext_" + model + "_txt"), scorenames);
                        add(doc, CollectionSchema.valueOf("ext_" + model + "_val"), html.getEvaluationModelScoreCounts(model, scorenames));
                    }
                }
            }

            // response time
            add(doc, CollectionSchema.responsetime_i, responseHeader == null ? 0 : Integer.parseInt(responseHeader.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0")));

            // hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077
            if (allAttr || (contains(CollectionSchema.hreflang_url_sxt) && contains(CollectionSchema.hreflang_cc_sxt))) {
                final String[] ccs = new String[html.getHreflang().size()];
                final String[] urls = new String[html.getHreflang().size()];
                c = 0;
                for (Map.Entry<String, DigestURL> e: html.getHreflang().entrySet()) {
                    ccs[c] = e.getKey();
                    urls[c] = e.getValue().toNormalform(true);
                    c++;
                }
                add(doc, CollectionSchema.hreflang_cc_sxt, ccs);
                add(doc, CollectionSchema.hreflang_url_sxt, urls);
            }

            // page navigation url, see http://googlewebmastercentral.blogspot.de/2011/09/pagination-with-relnext-and-relprev.html
            if (allAttr || (contains(CollectionSchema.navigation_url_sxt) && contains(CollectionSchema.navigation_type_sxt))) {
                final String[] navs = new String[html.getNavigation().size()];
                final String[] urls = new String[html.getNavigation().size()];
                c = 0;
                for (Map.Entry<String, DigestURL> e: html.getNavigation().entrySet()) {
                    navs[c] = e.getKey();
                    urls[c] = e.getValue().toNormalform(true);
                    c++;
                }
                add(doc, CollectionSchema.navigation_type_sxt, navs);
                add(doc, CollectionSchema.navigation_url_sxt, urls);
            }

            // publisher url as defined in http://support.google.com/plus/answer/1713826?hl=de
            if ((allAttr || contains(CollectionSchema.publisher_url_s)) && html.getPublisherLink() != null) {
                add(doc, CollectionSchema.publisher_url_s, html.getPublisherLink().toNormalform(true));
            }
        }

        if (parser instanceof DCEntry) {
            // the document was created by surrogate parsing; overwrite all "md:" entries to Solr
            DCEntry dcentry = (DCEntry) parser;
            for (Map.Entry<String, String[]> entry: dcentry.getMap().entrySet()) {
                String tag = entry.getKey();
                if (!tag.startsWith("md:") || tag.length() < 4) continue;
                CollectionSchema solr_field;
                try {
                    solr_field = CollectionSchema.valueOf(tag.substring(3));
                } catch (final IllegalArgumentException e) {
                    continue; // valueOf() throws instead of returning null for unknown fields
                }
                String[] values = entry.getValue();
                if (values == null || values.length == 0) continue;
                if (allAttr || contains(solr_field)) {
                    add(doc, solr_field, values);
                }
            }
        }

        String content = document.getTextString();
        String tokens = digestURL.toTokens();
        if (content == null || content.length() == 0) {
            content = tokens;
        } else {
            String[] t = CommonPattern.SPACE.split(tokens);
            for (String r: t) {
                if (r.length() > 0 && content.indexOf(" " + r + " ") < 0 && !content.startsWith(r + " ") && !content.endsWith(" " + r)) content += " " + r;
            }
        }
        // handle image source metadata
        if (document.getContentDomain() == ContentDomain.IMAGE) {
            // add image pixel size if known
            Iterator<ImageEntry> imgit = document.getImages().values().iterator();
            List<Integer> heights = new ArrayList<>();
            List<Integer> widths = new ArrayList<>();
            List<Integer> pixels = new ArrayList<>();
            while (imgit.hasNext()) {
                ImageEntry img = imgit.next();
                int imgpixels = (img.height() < 0 || img.width() < 0) ? -1 : img.height() * img.width();
                if (imgpixels > 0 && (allAttr || (contains(CollectionSchema.images_height_val) && contains(CollectionSchema.images_width_val) && contains(CollectionSchema.images_pixel_val)))) {
                    heights.add(img.height());
                    widths.add(img.width());
                    pixels.add(imgpixels);
                }
            }
            if (heights.size() > 0) {
                add(doc, CollectionSchema.images_height_val, heights);
                add(doc, CollectionSchema.images_width_val, widths);
                add(doc, CollectionSchema.images_pixel_val, pixels);
            }
            if (allAttr || contains(CollectionSchema.images_text_t)) {
                add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data from the image parser
                content = digestURL.toTokens(); // remove everything else but the url tokens
            }
        }

        // content (must be written after the special parser data, since this can influence the content)
        if (allAttr || contains(CollectionSchema.text_t)) add(doc, CollectionSchema.text_t, content);
        if (allAttr || contains(CollectionSchema.wordcount_i)) {
            if (content.length() == 0) {
                add(doc, CollectionSchema.wordcount_i, 0);
            } else {
                int contentwc = 1;
                for (int i = content.length() - 1; i >= 0; i--) if (content.charAt(i) == ' ') contentwc++;
                add(doc, CollectionSchema.wordcount_i, contentwc);
            }
        }
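        // Illustrative example of the word count above: content = "foo bar baz" contains
        // two spaces, so contentwc = 1 + 2 = 3 words; consecutive spaces would each be
        // counted, so the value is an upper-bound approximation of the word count.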
        // statistics about the links
        if (allAttr || contains(CollectionSchema.linkscount_i)) add(doc, CollectionSchema.linkscount_i, inboundLinks.size() + outboundLinks.size());
        if (allAttr || contains(CollectionSchema.linksnofollowcount_i)) add(doc, CollectionSchema.linksnofollowcount_i, document.inboundLinkNofollowCount() + document.outboundLinkNofollowCount());
        if (allAttr || contains(CollectionSchema.inboundlinkscount_i)) add(doc, CollectionSchema.inboundlinkscount_i, inboundLinks.size());
        if (allAttr || contains(CollectionSchema.inboundlinksnofollowcount_i)) add(doc, CollectionSchema.inboundlinksnofollowcount_i, document.inboundLinkNofollowCount());
        if (allAttr || contains(CollectionSchema.outboundlinkscount_i)) add(doc, CollectionSchema.outboundlinkscount_i, outboundLinks.size());
        if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount());

        // create a subgraph
        Boolean canonical_equal_sku = canonical == null ? null : canonical.toNormalform(true).equals(url);
        if (webgraph != null && (!containsCanonical || (canonical_equal_sku != null && canonical_equal_sku.booleanValue()))) {
            // a document with a canonical tag should not get a webgraph relation, because that belongs to the canonical document
            List<SolrInputDocument> edges = webgraph.getEdges(subgraph, digestURL, responseHeader, collections, crawldepth, processTypes, document.getHyperlinks().keySet(), sourceName); // this also enriches the subgraph
            doc.webgraphDocuments.addAll(edges);
        } else {
            if (allAttr ||
                contains(CollectionSchema.inboundlinks_protocol_sxt) || contains(CollectionSchema.inboundlinks_urlstub_sxt) || contains(CollectionSchema.inboundlinks_anchortext_txt) ||
                contains(CollectionSchema.outboundlinks_protocol_sxt) || contains(CollectionSchema.outboundlinks_urlstub_sxt) || contains(CollectionSchema.outboundlinks_anchortext_txt)) {
                for (final AnchorURL target_url: document.getHyperlinks().keySet()) {
                    enrichSubgraph(subgraph, digestURL, target_url);
                }
            }
        }

        // attach the subgraph content
        if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0]));
        if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_sxt)) add(doc, CollectionSchema.inboundlinks_urlstub_sxt, subgraph.urlStubs[0]);
        if (allAttr || contains(CollectionSchema.inboundlinks_anchortext_txt)) add(doc, CollectionSchema.inboundlinks_anchortext_txt, subgraph.urlAnchorTexts[0]);
        if (allAttr || contains(CollectionSchema.outboundlinks_protocol_sxt)) add(doc, CollectionSchema.outboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[1]));
        if (allAttr || contains(CollectionSchema.outboundlinks_urlstub_sxt)) add(doc, CollectionSchema.outboundlinks_urlstub_sxt, subgraph.urlStubs[1]);
        if (allAttr || contains(CollectionSchema.outboundlinks_anchortext_txt)) add(doc, CollectionSchema.outboundlinks_anchortext_txt, subgraph.urlAnchorTexts[1]);

        // charset
        if (allAttr || contains(CollectionSchema.charset_s)) add(doc, CollectionSchema.charset_s, document.getCharset());

        // coordinates
        if (document.lat() != 0.0 && document.lon() != 0.0) {
            if (allAttr || contains(CollectionSchema.coordinate_p)) add(doc, CollectionSchema.coordinate_p, Double.toString(document.lat()) + "," + Double.toString(document.lon()));
        }
        if (allAttr || contains(CollectionSchema.httpstatus_i)) add(doc, CollectionSchema.httpstatus_i, responseHeader == null ? 200 : responseHeader.getStatusCode());
        // fields that were additionally in URIMetadataRow
        Date loadDate = new Date();
        Date modDate = responseHeader == null ? new Date() : responseHeader.lastModified();
        if (modDate.getTime() > loadDate.getTime()) modDate = loadDate;
        int size = (int) Math.max(document.dc_source().length(), responseHeader == null ? 0 : responseHeader.getContentLength());
        if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, loadDate);
        if (allAttr || contains(CollectionSchema.fresh_date_dt)) add(doc, CollectionSchema.fresh_date_dt, new Date(loadDate.getTime() + Math.max(0, loadDate.getTime() - modDate.getTime()) / 2)); // freshdate, computed with the Proxy-TTL formula
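        // Worked example of the Proxy-TTL style formula above: if the document was last
        // modified 10 days before loadDate, then fresh_date_dt = loadDate + 5 days,
        // i.e. the document is considered fresh for half of its current age.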
        if ((allAttr || contains(CollectionSchema.referrer_id_s)) && referrerURL != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(referrerURL.hash()));
        //if (allAttr || contains(SolrField.md5_s)) add(solrdoc, SolrField.md5_s, new byte[0]);
        if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, document.dc_publisher());
        if ((allAttr || contains(CollectionSchema.language_s)) && language != null) add(doc, CollectionSchema.language_s, language);
        if (allAttr || contains(CollectionSchema.size_i)) add(doc, CollectionSchema.size_i, size);
        if (allAttr || contains(CollectionSchema.audiolinkscount_i)) add(doc, CollectionSchema.audiolinkscount_i, document.getAudiolinks().size());
        if (allAttr || contains(CollectionSchema.videolinkscount_i)) add(doc, CollectionSchema.videolinkscount_i, document.getVideolinks().size());
        if (allAttr || contains(CollectionSchema.applinkscount_i)) add(doc, CollectionSchema.applinkscount_i, document.getApplinks().size());

        // document post-processing
        if ((allAttr || contains(CollectionSchema.process_sxt)) && processTypes.size() > 0) {
            List<String> p = new ArrayList<>();
            for (ProcessType t: processTypes) p.add(t.name());
            add(doc, CollectionSchema.process_sxt, p);
            if (allAttr || contains(CollectionSchema.harvestkey_s)) {
                add(doc, CollectionSchema.harvestkey_s, sourceName);
            }
        }

        // document enrichments (synonyms, facets)
        enrich(doc, condenser.synonyms(), document.getGenericFacets());
        return doc;
    }

    /**
     * attach additional information to the document to enable navigation features
     * @param doc the document to be enriched
     * @param synonyms a list of synonyms detected for the text content
     * @param genericFacets a map where the key is the navigator name and the value is the set of attribute names
     */
    public void enrich(SolrInputDocument doc, List<String> synonyms, Map<String, Set<String>> genericFacets) {
        remove(doc, CollectionSchema.vocabularies_sxt); // delete old values
        for (SolrInputField sif: doc) {
            if (sif.getName().startsWith(CollectionSchema.VOCABULARY_PREFIX)) remove(doc, sif.getName());
        }
        if (this.isEmpty() || contains(CollectionSchema.vocabularies_sxt)) {
            // write generic navigation:
            // there are no pre-defined solr fields for navigation because the vocabulary is generic;
            // we use dynamically allocated solr fields for this.
            // It must be a multi-value string/token field, therefore we use _sxt extensions for the field names.

            // add to genericFacets the probabilistic categories
            String text = (String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
            Map<String, String> classification = ProbabilisticClassifier.getClassification(text);
            for (Map.Entry<String, String> entry: classification.entrySet()) {
                Set<String> facetAttributes = new HashSet<>();
                facetAttributes.add(entry.getValue());
                genericFacets.put(entry.getKey(), facetAttributes);
            }

            // compute the document field values
            List<String> vocabularies = new ArrayList<>();
            for (Map.Entry<String, Set<String>> facet: genericFacets.entrySet()) {
                String facetName = facet.getKey();
                Set<String> facetValues = facet.getValue();
                int count = facetValues.size();
                if (count == 0) continue;
                int logcount = (int) (Math.log(count) / Math.log(2));
                Integer[] counts = new Integer[logcount + 1];
                for (int i = 0; i <= logcount; i++) counts[i] = i;
                doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_TERMS_SUFFIX, facetValues.toArray(new String[count]));
                doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_COUNT_SUFFIX, facetValues.size());
                doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNT_SUFFIX, logcount);
                doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNTS_SUFFIX, counts);
                vocabularies.add(facetName);
            }
            if (vocabularies.size() > 0) add(doc, CollectionSchema.vocabularies_sxt, vocabularies);
        }
        remove(doc, CollectionSchema.synonyms_sxt); // delete old values
        if (this.isEmpty() || contains(CollectionSchema.synonyms_sxt)) {
            if (synonyms.size() > 0) add(doc, CollectionSchema.synonyms_sxt, synonyms);
        }
    }

    public static boolean postprocessingRunning = false;
    public static String postprocessingActivity = "";
    // if started, the following values are assigned:
    public static long postprocessingStartTime = 0;       // the start time of the processing; not started = 0
    public static int postprocessingCollection1Count = 0; // number of collection documents to be processed
    public static int postprocessingWebgraphCount = 0;    // number of webgraph documents to be processed

    public static final String collection1query(final Segment segment, final String harvestkey) {
        return (harvestkey == null || !segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.harvestkey_s) ? "" :
                CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
                CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM;
    }

    public static final String webgraphquery(final Segment segment, final String harvestkey) {
        return (harvestkey == null || !segment.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.harvestkey_s) ? "" :
                WebgraphSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
                WebgraphSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM;
    }

    /**
     * post-processing steps for all entries that have a process tag assigned
     * @param segment
     * @param rrCache
     * @param harvestkey
     * @param byPartialUpdate
     * @return the number of post-processed documents
     */
    public int postprocessing(final Segment segment, final ReferenceReportCache rrCache, final String harvestkey, final boolean byPartialUpdate) {
        if (!this.contains(CollectionSchema.process_sxt)) return 0;
        if (!segment.connectedCitation() && !segment.fulltext().useWebgraph()) return 0;
        final SolrConnector collectionConnector = segment.fulltext().getDefaultConnector();
        collectionConnector.commit(false); // make sure that we have the latest information that can be found
        if (segment.fulltext().useWebgraph()) segment.fulltext().getWebgraphConnector().commit(false);
        final CollectionConfiguration collection = segment.fulltext().getDefaultConfiguration();
        final WebgraphConfiguration webgraph = segment.fulltext().getWebgraphConfiguration();

        // calculate the number of documents to be processed
        String collection1query = collection1query(segment, harvestkey);
        String webgraphquery = webgraphquery(segment, harvestkey);
        postprocessingRunning = true;
        postprocessingStartTime = System.currentTimeMillis();
        postprocessingActivity = "collecting counts";
        ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
        try {
            postprocessingCollection1Count = (int) collectionConnector.getCountByQuery("{!cache=false}" + collection1query);
            postprocessingWebgraphCount = segment.fulltext().useWebgraph() ? (int) segment.fulltext().getWebgraphConnector().getCountByQuery("{!cache=false}" + webgraphquery) : 0;
        } catch (IOException e) {
            postprocessingCollection1Count = -1;
            postprocessingWebgraphCount = -1;
        }

        // collect hosts from the index which shall take part in the citation computation
        postprocessingActivity = "collecting host facets for collection";
        ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
        ReversibleScoreMap<String> collection1hosts;
        try {
            Map<String, ReversibleScoreMap<String>> hostfacet = collectionConnector.getFacets("{!cache=false}" + collection1query, 10000000, CollectionSchema.host_s.getSolrFieldName());
            collection1hosts = hostfacet.get(CollectionSchema.host_s.getSolrFieldName());
        } catch (final IOException e2) {
            ConcurrentLog.logException(e2);
            collection1hosts = new ClusteredScoreMap<>(true);
        }

        postprocessingActivity = "create ranking map";
        ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
        boolean shallComputeCR =
                (segment.fulltext().useWebgraph() &&
                 ((webgraph.contains(WebgraphSchema.source_id_s) && webgraph.contains(WebgraphSchema.source_cr_host_norm_i)) ||
                  (webgraph.contains(WebgraphSchema.target_id_s) && webgraph.contains(WebgraphSchema.target_cr_host_norm_i)))) ||
                (collection.contains(CollectionSchema.cr_host_count_i) && collection.contains(CollectionSchema.cr_host_chance_d) && collection.contains(CollectionSchema.cr_host_norm_i));
        // create the ranking map
        final Map rankings = new ConcurrentHashMap();
        if (shallComputeCR) try {
            int concurrency = Math.min(collection1hosts.size(), Runtime.getRuntime().availableProcessors());
            postprocessingActivity = "collecting cr for " + collection1hosts.size() + " hosts, concurrency = " + concurrency;
            ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
            int countcheck = 0;
            for (String host: collection1hosts.keyList(true)) {
                // Patch the citation index for links with canonical tags.
// This shall fulfill the following requirement: // If a document A links to B and B contains a 'canonical C', then the citation rank computation shall consider that A links to C and B does not link to C. // To do so, we first must collect all canonical links, find all references to them, get the anchor list of the documents and patch the citation reference of these links String patchquery = CollectionSchema.host_s.getSolrFieldName() + ":" + host + " AND " + CollectionSchema.canonical_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM; long patchquerycount = collectionConnector.getCountByQuery("{!cache=false}" + patchquery); BlockingQueue documents_with_canonical_tag = collectionConnector.concurrentDocumentsByQuery(patchquery, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 100000000, Long.MAX_VALUE, 20, 1, true, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.canonical_s.getSolrFieldName()); SolrDocument doc_B; int patchquerycountcheck = 0; try { while ((doc_B = documents_with_canonical_tag.take()) != AbstractSolrConnector.POISON_DOCUMENT) { // find all documents which link to the canonical doc DigestURL doc_C_url = new DigestURL((String) doc_B.getFieldValue(CollectionSchema.canonical_s.getSolrFieldName())); byte[] doc_B_id = ASCII.getBytes(((String) doc_B.getFieldValue(CollectionSchema.id.getSolrFieldName()))); // we remove all references to B, because these become references to C if (segment.connectedCitation()) { ReferenceContainer doc_A_ids = segment.urlCitation().remove(doc_B_id); if (doc_A_ids == null) { //System.out.println("*** document with canonical but no referrer: " + doc_B.getFieldValue(CollectionSchema.sku.getSolrFieldName())); continue; // the document has a canonical tag but no referrer? 
            }
            Iterator<CitationReference> doc_A_ids_iterator = doc_A_ids.entries();
            // for each referrer A of B, set A as a referrer of C
            while (doc_A_ids_iterator.hasNext()) {
                CitationReference doc_A_citation = doc_A_ids_iterator.next();
                segment.urlCitation().add(doc_C_url.hash(), doc_A_citation);
            }
        }
        patchquerycountcheck++;
        if (MemoryControl.shortStatus()) {
            ConcurrentLog.warn("CollectionConfiguration", "terminated canonical collection during postprocessing because of short memory");
            break;
        }
    }
    } catch (InterruptedException e) {
        ConcurrentLog.logException(e);
    } catch (SpaceExceededException e) {
        ConcurrentLog.logException(e);
    }
    if (patchquerycount != patchquerycountcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous patchquery count for host " + host + ": expected=" + patchquerycount + ", counted=" + patchquerycountcheck);

    // do the citation rank computation
    if (collection1hosts.get(host) <= 0) continue;
    // select all documents for each host
    CRHost crh = new CRHost(segment, rrCache, host, 0.85d, 6);
    int convergence_attempts = 0;
    while (convergence_attempts++ < 30) {
        ConcurrentLog.info("CollectionConfiguration", "convergence step " + convergence_attempts + " for host " + host + " ...");
        if (crh.convergenceStep()) break;
        if (MemoryControl.shortStatus()) {
            ConcurrentLog.warn("CollectionConfiguration", "terminated convergenceStep during postprocessing because of short memory");
            break;
        }
    }
    ConcurrentLog.info("CollectionConfiguration", "convergence for host " + host + " after " + convergence_attempts + " steps");
    // we now have the cr for all documents of a specific host; we store them for later use
    Map<String, CRV> crn = crh.normalize();
    //crh.log(crn);
    rankings.putAll(crn); // accumulate this here for usage in the document update later
    if (MemoryControl.shortStatus()) {
        ConcurrentLog.warn("CollectionConfiguration", "terminated crn accumulation during postprocessing because of short memory");
        break;
    }
    countcheck++;
}
if (collection1hosts.size() != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous host count: expected=" + collection1hosts.size() + ", counted=" + countcheck);
} catch (final IOException e2) {
    ConcurrentLog.logException(e2);
    collection1hosts = new ClusteredScoreMap<String>(true);
}

// process all documents at the webgraph for the outgoing links of this document
final AtomicInteger allcount = new AtomicInteger(0);
if (segment.fulltext().useWebgraph() && shallComputeCR) {
    postprocessingActivity = "collecting host facets for webgraph cr calculation";
    ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
    final Set<String> omitFields = new HashSet<String>();
    omitFields.add(WebgraphSchema.process_sxt.getSolrFieldName());
    omitFields.add(WebgraphSchema.harvestkey_s.getSolrFieldName());
    // collect hosts from the index which shall take part in the citation computation
    ReversibleScoreMap<String> webgraphhosts;
    try {
        Map<String, ReversibleScoreMap<String>> hostfacet = segment.fulltext().getWebgraphConnector().getFacets(webgraphquery, 10000000, WebgraphSchema.source_host_s.getSolrFieldName());
        webgraphhosts = hostfacet.get(WebgraphSchema.source_host_s.getSolrFieldName());
    } catch (final IOException e2) {
        ConcurrentLog.logException(e2);
        webgraphhosts = new ClusteredScoreMap<String>(true);
    }
    try {
        final long start = System.currentTimeMillis();
        for (String host: webgraphhosts.keyList(true)) {
            if (webgraphhosts.get(host) <= 0) continue;
            final String hostfinal = host;
            // select all webgraph edges and modify their cr value
            postprocessingActivity = "writing cr values to webgraph for host " + host;
            ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
            String patchquery = WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + host + "\" AND " + WebgraphSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM;
            final long count = segment.fulltext().getWebgraphConnector().getCountByQuery("{!cache=false}" + patchquery);
            int concurrency = Math.min((int) count, Math.max(1, Runtime.getRuntime().availableProcessors() / 4));
            ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the webgraph, concurrency = " + concurrency);
            final BlockingQueue<SolrDocument> docs = segment.fulltext().getWebgraphConnector().concurrentDocumentsByQuery(
                    patchquery,
                    WebgraphSchema.source_chars_i.getSolrFieldName() + " asc",
                    0, 100000000, Long.MAX_VALUE, concurrency + 1, concurrency, true
                    // TODO: add field list and do partial updates
                    );
            final AtomicInteger proccount = new AtomicInteger(0);
            Thread[] t = new Thread[concurrency];
            for (final AtomicInteger i = new AtomicInteger(0); i.get() < t.length; i.incrementAndGet()) {
                t[i.get()] = new Thread() {
                    private String name = "CollectionConfiguration.postprocessing.webgraph-" + i.get();
                    @Override
                    public void run() {
                        Thread.currentThread().setName(name);
                        SolrDocument doc;
                        String id;
                        try {
                            processloop: while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                                try {
                                    SolrInputDocument sid = webgraph.toSolrInputDocument(doc, omitFields);
                                    Collection<Object> proctags = doc.getFieldValues(WebgraphSchema.process_sxt.getSolrFieldName());
                                    for (Object tag: proctags) try {
                                        // switch over tag types
                                        ProcessType tagtype = ProcessType.valueOf((String) tag);
                                        // set cr values
                                        if (tagtype == ProcessType.CITATION) {
                                            if (segment.fulltext().useWebgraph() && webgraph.contains(WebgraphSchema.source_id_s) && webgraph.contains(WebgraphSchema.source_cr_host_norm_i)) {
                                                id = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName());
                                                CRV crv = rankings.get(id);
                                                if (crv != null) {
                                                    sid.setField(WebgraphSchema.source_cr_host_norm_i.getSolrFieldName(), crv.crn);
                                                }
                                            }
                                            if (webgraph.contains(WebgraphSchema.target_id_s) && webgraph.contains(WebgraphSchema.target_cr_host_norm_i)) {
                                                id = (String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName());
                                                CRV crv = rankings.get(id);
                                                if (crv != null) {
                                                    sid.setField(WebgraphSchema.target_cr_host_norm_i.getSolrFieldName(), crv.crn);
                                                }
                                            }
                                        }
                                    } catch (IllegalArgumentException e) {
                                        ConcurrentLog.logException(e);
                                    }
                                    // write the document back to the index
                                    try {
                                        sid.removeField(WebgraphSchema.process_sxt.getSolrFieldName());
                                        sid.removeField(WebgraphSchema.harvestkey_s.getSolrFieldName());
                                        //segment.fulltext().getWebgraphConnector().deleteById((String) sid.getFieldValue(WebgraphSchema.id.getSolrFieldName()));
                                        segment.fulltext().getWebgraphConnector().add(sid);
                                    } catch (SolrException e) {
                                        ConcurrentLog.logException(e);
                                    } catch (IOException e) {
                                        ConcurrentLog.logException(e);
                                    }
                                    proccount.incrementAndGet();
                                    allcount.incrementAndGet();
                                    if (proccount.get() % 1000 == 0) {
                                        postprocessingActivity = "writing cr values to webgraph for host " + hostfinal + ": postprocessed " + proccount + " from " + count + " documents; " +
                                                (proccount.get() * 1000 / (System.currentTimeMillis() - start)) + " docs/second; " +
                                                ((System.currentTimeMillis() - start) * (count - proccount.get()) / proccount.get() / 60000) + " minutes remaining";
                                        ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
                                    }
                                } catch (Throwable e) {
                                    ConcurrentLog.logException(e);
                                    continue processloop;
                                }
                            }
                        } catch (InterruptedException e) {
                            ConcurrentLog.warn("CollectionConfiguration", e.getMessage(), e);
                        }
                    }
                };
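                // A worked example of the throughput/ETA arithmetic in the progress log above (illustrative numbers only):
                //   elapsed = 100000 ms, proccount = 2000, count = 10000
                //   docs/second       = 2000 * 1000 / 100000 = 20
                //   minutes remaining = 100000 * (10000 - 2000) / 2000 / 60000 = 6 (integer division)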
t[i.get()].start(); } for (int i = 0; i < t.length; i++) try { t[i].join(10000); if (t[i].isAlive()) t[i].interrupt(); } catch (InterruptedException e) {} if (count != proccount.get()) ConcurrentLog.warn("CollectionConfiguration", "ambiguous webgraph document count for host " + host + ": expected=" + count + ", counted=" + proccount); } } catch (final IOException e2) { ConcurrentLog.warn("CollectionConfiguration", e2.getMessage(), e2); } } // process all documents in collection final Map hostExtentCache = new HashMap(); // a mapping from the host id to the number of documents which contain this host-id final Set uniqueURLs = new ConcurrentHashSet(); // will be used in a concurrent environment final Set omitFields = new HashSet(); omitFields.add(CollectionSchema.process_sxt.getSolrFieldName()); omitFields.add(CollectionSchema.harvestkey_s.getSolrFieldName()); final Collection failids = new ArrayList(); final AtomicInteger countcheck = new AtomicInteger(0); final AtomicInteger proccount = new AtomicInteger(); final AtomicInteger proccount_referencechange = new AtomicInteger(); final AtomicInteger proccount_citationchange = new AtomicInteger(); try { // partitioning of the index, get a facet for a partitioning key final long count = collectionConnector.getCountByQuery("{!cache=false}" + collection1query); String partitioningKey = CollectionSchema.responsetime_i.getSolrFieldName(); postprocessingActivity = "collecting " + count + " documents from the collection for harvestkey " + harvestkey + ", partitioned by " + partitioningKey; if (count > 0) { Map> partitioningFacet = collectionConnector.getFacets("{!cache=false}" + collection1query, 100000, partitioningKey); ReversibleScoreMap partitioning = partitioningFacet.get(partitioningKey); long emptyCount = collectionConnector.getCountByQuery("{!cache=false}" + "-" + partitioningKey + AbstractSolrConnector.CATCHALL_DTERM + " AND (" + collection1query + ")"); if (emptyCount > 0) partitioning.inc("", (int) emptyCount); final long start = System.currentTimeMillis(); List querystrings = new ArrayList<>(partitioning.size()); for (String partitioningValue: partitioning) { String partitioningQuery = "{!cache=false}" + ((partitioningValue.length() == 0) ? "-" + partitioningKey + AbstractSolrConnector.CATCHALL_DTERM + " AND (" + collection1query + ")" : partitioningKey + ":" + partitioningValue + " AND (" + collection1query + ")"); querystrings.add(partitioningQuery); } // start collection of documents final int concurrency = Math.max(1, Math.min((int) (MemoryControl.available() / (100L * 1024L * 1024L)), Runtime.getRuntime().availableProcessors())); //final int concurrency = 1; final boolean reference_computation = this.contains(CollectionSchema.references_i) && this.contains(CollectionSchema.references_internal_i) && this.contains(CollectionSchema.references_external_i) && this.contains(CollectionSchema.references_exthosts_i); ConcurrentLog.info("CollectionConfiguration", postprocessingActivity); final BlockingQueue docs = collectionConnector.concurrentDocumentsByQueries( querystrings, (this.contains(CollectionSchema.http_unique_b) || this.contains(CollectionSchema.www_unique_b)) ? 
            CollectionSchema.host_subdomain_s.getSolrFieldName() + " asc," + // sort on subdomain to get hosts without subdomain first; that gives an opportunity to set the www_unique_b flag to false
            CollectionSchema.url_protocol_s.getSolrFieldName() + " asc" // sort on protocol to get http before https; that gives an opportunity to set the http_unique_b flag to false
            : null, // null sort is faster!
        0, 100000000, Long.MAX_VALUE, concurrency + 1, concurrency, true,
        byPartialUpdate ?
            new String[]{
                // the following fields are needed to perform the postprocessing
                // and should only be used for partial updates; for full updates use a
                // full list of fields to avoid LazyInstantiation which has poor performance
                CollectionSchema.id.getSolrFieldName(),
                CollectionSchema.sku.getSolrFieldName(),
                CollectionSchema.harvestkey_s.getSolrFieldName(),
                CollectionSchema.process_sxt.getSolrFieldName(),
                CollectionSchema.canonical_equal_sku_b.getSolrFieldName(),
                CollectionSchema.canonical_s.getSolrFieldName(),
                CollectionSchema.exact_signature_l.getSolrFieldName(),
                CollectionSchema.fuzzy_signature_l.getSolrFieldName(),
                CollectionSchema.title_exact_signature_l.getSolrFieldName(),
                CollectionSchema.description_exact_signature_l.getSolrFieldName(),
                CollectionSchema.host_id_s.getSolrFieldName(),
                CollectionSchema.host_s.getSolrFieldName(),
                CollectionSchema.host_subdomain_s.getSolrFieldName(),
                CollectionSchema.url_chars_i.getSolrFieldName(),
                CollectionSchema.url_protocol_s.getSolrFieldName(),
                CollectionSchema.httpstatus_i.getSolrFieldName(),
                CollectionSchema.inboundlinkscount_i.getSolrFieldName(),
                CollectionSchema.robots_i.getSolrFieldName()}
            : this.allFields());
    final Thread rewriteThread[] = new Thread[concurrency];
    for (int rewrite_start = 0; rewrite_start < concurrency; rewrite_start++) {
        rewriteThread[rewrite_start] = new Thread() {
            @Override
            public void run() {
                SolrDocument doc;
                try {
                    while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                        // for each to-be-processed entry work on the process tag
                        Collection<Object> proctags = doc.getFieldValues(CollectionSchema.process_sxt.getSolrFieldName());
                        final String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
                        final String i = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
                        if (proctags == null || proctags.size() == 0) {
                            // this should not happen since we collected the documents using a process_sxt:[* TO *] term
                            ConcurrentLog.warn("CollectionConfiguration", "no process_sxt entry for url " + u + ", id=" + i);
                            continue;
                        }
                        try {
                            DigestURL url = new DigestURL(u, ASCII.getBytes(i));
                            byte[] id = url.hash();
                            SolrInputDocument sid = byPartialUpdate ? new SolrInputDocument() : collection.toSolrInputDocument(doc, omitFields);
                            sid.setField(CollectionSchema.id.getSolrFieldName(), i);
                            for (Object tag: proctags) try {
                                // switch over tag types
                                ProcessType tagtype = ProcessType.valueOf((String) tag);
                                if (tagtype == ProcessType.CITATION &&
                                    collection.contains(CollectionSchema.cr_host_count_i) &&
                                    collection.contains(CollectionSchema.cr_host_chance_d) &&
                                    collection.contains(CollectionSchema.cr_host_norm_i)) {
                                    CRV crv = rankings.remove(ASCII.String(id)); // instead of 'get'ting the CRV, we also remove it because we will not need it again and can free some memory here
                                    if (crv != null) {
                                        sid.setField(CollectionSchema.cr_host_count_i.getSolrFieldName(), crv.count);
                                        sid.setField(CollectionSchema.cr_host_chance_d.getSolrFieldName(), crv.cr);
                                        sid.setField(CollectionSchema.cr_host_norm_i.getSolrFieldName(), crv.crn);
                                        proccount_citationchange.incrementAndGet();
                                    }
                                }
                                if (tagtype == ProcessType.UNIQUE) {
                                    postprocessing_http_unique(segment, doc, sid, url);
                                    postprocessing_www_unique(segment, doc, sid, url);
                                    postprocessing_doublecontent(segment, uniqueURLs, doc, sid, url);
                                }
                            } catch (IllegalArgumentException e) {}
                            // compute references
                            if (reference_computation) {
                                String hosthash = url.hosthash();
                                if (!hostExtentCache.containsKey(hosthash)) {
                                    StringBuilder q = new StringBuilder();
                                    q.append(CollectionSchema.host_id_s.getSolrFieldName()).append(":\"").append(hosthash).append("\" AND ").append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200");
                                    long hostExtentCount = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString());
                                    hostExtentCache.put(hosthash, hostExtentCount);
                                }
                                if (postprocessing_references(rrCache, sid, url, hostExtentCache)) proccount_referencechange.incrementAndGet();
                            }
                            // all processing steps checked, remove the processing and harvesting key
                            if (byPartialUpdate) {
                                sid.setField(CollectionSchema.process_sxt.getSolrFieldName(), null); // setting this to null will cause a removal when doing a partial update
                                sid.setField(CollectionSchema.harvestkey_s.getSolrFieldName(), null);
                            } else {
                                sid.removeField(CollectionSchema.process_sxt.getSolrFieldName());
                                sid.removeField(CollectionSchema.harvestkey_s.getSolrFieldName());
                            }
                            // with standard solr fields selected, the sid now contains the fields
                            // id, http_unique_b, www_unique_b, references_i, references_internal_i, references_external_i, references_exthosts_i, host_extent_i
                            // and the value for host_extent_i is by default 2147483647
                            // send back to index
                            //collectionConnector.deleteById(i);
                            if (byPartialUpdate) {
                                collectionConnector.update(sid);
                            } else {
                                collectionConnector.add(sid);
                            }
                            long thiscount = proccount.incrementAndGet();
                            allcount.incrementAndGet();
                            if (thiscount % 100 == 0) {
                                postprocessingActivity = "postprocessed " + thiscount + " from " + count + " collection documents; " +
                                        (thiscount * 60000L / (System.currentTimeMillis() - start)) + " ppm; " +
                                        ((System.currentTimeMillis() - start) * (count - thiscount) / thiscount / 60000) + " minutes remaining";
                                ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
                            }
                        } catch (final Throwable e1) {
                            ConcurrentLog.logException(e1);
                            failids.add(i);
                        }
                        countcheck.incrementAndGet();
                    }
                } catch (InterruptedException e) {
                    ConcurrentLog.logException(e);
                }
            }
        };
        rewriteThread[rewrite_start].start();
    }
    // wait for termination
    for (int rewrite_start = 0; rewrite_start < concurrency; rewrite_start++) rewriteThread[rewrite_start].join();
    if (failids.size() > 0) {
        ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: deleting " + failids.size() + " documents which had permanent execution failures");
        collectionConnector.deleteByIds(failids);
    }
    if (count != countcheck.get()) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck + "; countquery=" + collection1query); // big gap for harvestkey = null
    ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount + " new documents, " +
            proccount_referencechange + " reference-count changes, " +
            proccount_citationchange + " citation ranking changes.");
}
} catch (final InterruptedException e2) {
    ConcurrentLog.warn("CollectionConfiguration", e2.getMessage(), e2);
} catch (IOException e3) {
    ConcurrentLog.warn("CollectionConfiguration", e3.getMessage(), e3);
}
collectionConnector.commit(true); // make changes available directly to prevent that the process repeats again
postprocessingCollection1Count = 0;
postprocessingWebgraphCount = 0;
postprocessingActivity = "postprocessing terminated";
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
postprocessingRunning = false;
return allcount.get();
}

public void postprocessing_http_unique(final Segment segment, final SolrDocument doc, final SolrInputDocument sid, final DigestURL url) {
    if (!this.contains(CollectionSchema.http_unique_b)) return;
    if (!url.isHTTPS() && !url.isHTTP()) return;
    try {
        DigestURL u = new DigestURL((url.isHTTP() ? "https://" : "http://") + url.urlstub(true, true));
        SolrDocument d = segment.fulltext().getDefaultConnector().getDocumentById(ASCII.String(u.hash()), CollectionSchema.http_unique_b.getSolrFieldName());
        set_unique_flag(CollectionSchema.http_unique_b, doc, sid, d);
    } catch (final IOException e) {}
}

public void postprocessing_www_unique(final Segment segment, final SolrDocument doc, final SolrInputDocument sid, final DigestURL url) {
    if (!this.contains(CollectionSchema.www_unique_b)) return;
    final String us = url.urlstub(true, true);
    try {
        DigestURL u = new DigestURL(url.getProtocol() + (us.startsWith("www.") ? "://" + us.substring(4) : "://www." + us));
        SolrDocument d = segment.fulltext().getDefaultConnector().getDocumentById(ASCII.String(u.hash()), CollectionSchema.www_unique_b.getSolrFieldName());
        set_unique_flag(CollectionSchema.www_unique_b, doc, sid, d);
    } catch (final IOException e) {}
}

private void set_unique_flag(CollectionSchema field, final SolrDocument doc, final SolrInputDocument sid, final SolrDocument d) {
    Object sb = doc.getFieldValue(field.getSolrFieldName());
    boolean sbb = sb != null && ((Boolean) sb).booleanValue();
    Object ob = d == null ?
        null : d.getFieldValue(field.getSolrFieldName());
    boolean obb = ob != null && ((Boolean) ob).booleanValue();
    if (sbb == obb) sid.setField(field.getSolrFieldName(), !sbb);
}

public void postprocessing_doublecontent(Segment segment, Set<String> uniqueURLs, SolrDocument doc, final SolrInputDocument sid, final DigestURL url) {
    // FIND OUT IF THIS IS A DOUBLE DOCUMENT
    // term to describe documents which are indexable:
    // - no noindex in the html meta or in the X-Robots header
    // - no canonical tag pointing to a different document
    Conjunction ValidDocTermTemplate = new Conjunction();
    ValidDocTermTemplate.addOperand(new LongLiteral(CollectionSchema.httpstatus_i, 200));
    ValidDocTermTemplate.addOperand(new Disjunction(new Negation(new CatchallLiteral(CollectionSchema.canonical_equal_sku_b)), new BooleanLiteral(CollectionSchema.canonical_equal_sku_b, true)));
    ValidDocTermTemplate.addOperand(new Negation(new LongLiteral(CollectionSchema.robots_i, 8))); // bit 3 (noindex)
    ValidDocTermTemplate.addOperand(new Negation(new LongLiteral(CollectionSchema.robots_i, 24))); // bit 3 + 4 (noindex + nofollow)
    ValidDocTermTemplate.addOperand(new Negation(new LongLiteral(CollectionSchema.robots_i, 512))); // bit 9 (noindex)
    ValidDocTermTemplate.addOperand(new Negation(new LongLiteral(CollectionSchema.robots_i, 1536))); // bit 9 + 10 (noindex + nofollow)
    String urlhash = ASCII.String(url.hash());
    String hostid = url.hosthash();
    Disjunction dnf = new Disjunction();
    CollectionSchema[][] doccheckschema = new CollectionSchema[][]{
        {CollectionSchema.exact_signature_l, CollectionSchema.exact_signature_unique_b, CollectionSchema.exact_signature_copycount_i},
        {CollectionSchema.fuzzy_signature_l, CollectionSchema.fuzzy_signature_unique_b, CollectionSchema.fuzzy_signature_copycount_i}};
    uniquecheck: for (CollectionSchema[] checkfields: doccheckschema) {
        CollectionSchema signaturefield = checkfields[0];
        CollectionSchema uniquefield = checkfields[1];
        CollectionSchema countfield = checkfields[2];
        if (this.contains(signaturefield) && this.contains(uniquefield) && this.contains(countfield)) {
            // lookup the document with the same signature
            Long signature = (Long) doc.getFieldValue(signaturefield.getSolrFieldName());
            if (signature == null) continue uniquecheck;
            //con.addOperand(new Negation(new Literal(CollectionSchema.id, urlhash)));
            //con.addOperand(new Literal(CollectionSchema.host_id_s, hostid));
            dnf.addOperand(new LongLiteral(signaturefield, signature));
        }
    }
    Conjunction con = (Conjunction) ValidDocTermTemplate.clone();
    con.addOperand(dnf);
    con.addOperand(new Negation(new StringLiteral(CollectionSchema.id, urlhash)));
    con.addOperand(new StringLiteral(CollectionSchema.host_id_s, hostid));
    String query = con.toString();
    SolrDocumentList docsAkk;
    try {
        docsAkk = segment.fulltext().getDefaultConnector().getDocumentListByQuery(query, null, 0, 1000, CollectionSchema.id.getSolrFieldName(), CollectionSchema.exact_signature_l.getSolrFieldName(), CollectionSchema.fuzzy_signature_l.getSolrFieldName());
    } catch (final IOException e) {
        ConcurrentLog.logException(e);
        docsAkk = new SolrDocumentList();
    }
    if (docsAkk.getNumFound() > 0) uniquecheck: for (CollectionSchema[] checkfields: doccheckschema) {
        CollectionSchema signaturefield = checkfields[0];
        CollectionSchema uniquefield = checkfields[1];
        CollectionSchema countfield = checkfields[2];
        if (this.contains(signaturefield) && this.contains(uniquefield) && this.contains(countfield)) {
            // lookup the document with the same signature
            Long signature = (Long) doc.getFieldValue(signaturefield.getSolrFieldName());
            if (signature == null) continue uniquecheck;
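            // Worked example for the robots_i literals in the ValidDocTermTemplate at the top of this method;
            // the bit meanings are taken from the inline comments there (bit 3/4 = noindex/nofollow from the
            // html meta, bit 9/10 = noindex/nofollow from the X-Robots header), not re-verified against the parser:
            //   robots_i = 8 (bit 3)        -> meta noindex
            //   robots_i = 24 (bits 3+4)    -> meta noindex + nofollow
            //   robots_i = 512 (bit 9)      -> header noindex
            //   robots_i = 1536 (bits 9+10) -> header noindex + nofollow
            // Note that the template negates these exact values only; a combined value such as 520 (= 8 + 512)
            // is not excluded by it, while the bitwise check further below does catch such combinations.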
            SolrDocumentList docs = new StringLiteral(signaturefield, signature.toString()).apply(docsAkk);
            if (docs.getNumFound() == 0) {
                sid.setField(uniquefield.getSolrFieldName(), true);
                sid.setField(countfield.getSolrFieldName(), 1);
            } else {
                boolean firstappearance = true;
                for (SolrDocument d: docs) {
                    if (uniqueURLs.contains(d.getFieldValue(CollectionSchema.id.getSolrFieldName()))) {
                        firstappearance = false;
                        break;
                    }
                }
                sid.setField(uniquefield.getSolrFieldName(), firstappearance);
                sid.setField(countfield.getSolrFieldName(), docs.getNumFound() + 1); // the current url was excluded from the search but is included in the count
            }
        }
    }

    // CHECK IF TITLE AND DESCRIPTION ARE UNIQUE (this is by default not switched on)
    // if the document does not have status code 200, has a noindex attribute,
    // or has a canonical tag which does not point to the document itself,
    // then the unique-field is not written at all!
    Integer robots_i = this.contains(CollectionSchema.robots_i) ? (Integer) doc.getFieldValue(CollectionSchema.robots_i.getSolrFieldName()) : null;
    Integer httpstatus_i = this.contains(CollectionSchema.httpstatus_i) ? (Integer) doc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName()) : null;
    String canonical_s = this.contains(CollectionSchema.canonical_s) ? (String) doc.getFieldValue(CollectionSchema.canonical_s.getSolrFieldName()) : null;
    Boolean canonical_equal_sku_b = this.contains(CollectionSchema.canonical_equal_sku_b) ? (Boolean) doc.getFieldValue(CollectionSchema.canonical_equal_sku_b.getSolrFieldName()) : null;
    CollectionSchema[][] metadatacheckschema = new CollectionSchema[][]{
        {CollectionSchema.title, CollectionSchema.title_exact_signature_l, CollectionSchema.title_unique_b},
        {CollectionSchema.description_txt, CollectionSchema.description_exact_signature_l, CollectionSchema.description_unique_b}};
    if (segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.host_id_s) &&
        (robots_i == null || (robots_i.intValue() & (1 << 9)) == 0 /*noindex in http X-ROBOTS*/ && (robots_i.intValue() & (1 << 3)) == 0 /*noindex in html metas*/ ) &&
        (canonical_s == null || canonical_s.length() == 0 || (canonical_equal_sku_b != null && canonical_equal_sku_b.booleanValue()) || url.toNormalform(true).equals(canonical_s)) &&
        (httpstatus_i == null || httpstatus_i.intValue() == 200)) {
        uniquecheck: for (CollectionSchema[] checkfields: metadatacheckschema) {
            CollectionSchema checkfield = checkfields[0];
            CollectionSchema signaturefield = checkfields[1];
            CollectionSchema uniquefield = checkfields[2];
            if (this.contains(checkfield) && this.contains(signaturefield) && this.contains(uniquefield)) {
                // lookup in the index within the same host for the same title or description
                //String checkstring = checkfield == CollectionSchema.title ? document.dc_title() : document.dc_description();
                Long signature = (Long) doc.getFieldValue(signaturefield.getSolrFieldName());
                if (signature == null) {
                    continue uniquecheck;
                }
                try {
                    Conjunction doccountterm = (Conjunction) ValidDocTermTemplate.clone();
                    doccountterm.addOperand(new Negation(new StringLiteral(CollectionSchema.id, urlhash)));
                    doccountterm.addOperand(new StringLiteral(CollectionSchema.host_id_s, hostid));
                    doccountterm.addOperand(new LongLiteral(signaturefield, signature));
                    long doccount = segment.fulltext().getDefaultConnector().getCountByQuery("{!cache=false}" + doccountterm.toString());
                    sid.setField(uniquefield.getSolrFieldName(), doccount == 0);
                } catch (final IOException e) {}
            }
        }
    }
    uniqueURLs.add(urlhash);
}

public boolean postprocessing_references(final ReferenceReportCache rrCache, final SolrInputDocument sid, final DigestURL url, final Map<String, Long> hostExtentCount) {
    if (!(this.contains(CollectionSchema.references_i) ||
          this.contains(CollectionSchema.references_internal_i) ||
          this.contains(CollectionSchema.references_external_i) ||
          this.contains(CollectionSchema.references_exthosts_i))) return false;
    Integer all_old = sid == null ? null : (Integer) sid.getFieldValue(CollectionSchema.references_i.getSolrFieldName());
    Integer internal_old = sid == null ? null : (Integer) sid.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName());
    Integer external_old = sid == null ? null : (Integer) sid.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName());
    Integer exthosts_old = sid == null ? null : (Integer) sid.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName());
    Integer hostextc_old = sid == null ? null : (Integer) sid.getFieldValue(CollectionSchema.host_extent_i.getSolrFieldName());
    try {
        ReferenceReport rr = rrCache.getReferenceReport(ASCII.String(url.hash()), false);
        List<String> internalIDs = new ArrayList<String>();
        HandleSet iids = rr.getInternallIDs();
        for (byte[] b: iids) internalIDs.add(ASCII.String(b));
        boolean change = false;
        int all = rr.getExternalCount() + rr.getInternalCount();
        if (this.contains(CollectionSchema.references_i) && (all_old == null || all_old.intValue() != all)) {
            sid.setField(CollectionSchema.references_i.getSolrFieldName(), all);
            change = true;
        }
        if (this.contains(CollectionSchema.references_internal_i) && (internal_old == null || internal_old.intValue() != rr.getInternalCount())) {
            sid.setField(CollectionSchema.references_internal_i.getSolrFieldName(), rr.getInternalCount());
            change = true;
        }
        if (this.contains(CollectionSchema.references_external_i) && (external_old == null || external_old.intValue() != rr.getExternalCount())) {
            sid.setField(CollectionSchema.references_external_i.getSolrFieldName(), rr.getExternalCount());
            change = true;
        }
        if (this.contains(CollectionSchema.references_exthosts_i) && (exthosts_old == null || exthosts_old.intValue() != rr.getExternalHostIDs().size())) {
            sid.setField(CollectionSchema.references_exthosts_i.getSolrFieldName(), rr.getExternalHostIDs().size());
            change = true;
        }
        Long hostExtent = hostExtentCount == null ?
            Long.MAX_VALUE : hostExtentCount.get(url.hosthash());
        if (this.contains(CollectionSchema.host_extent_i) && (hostextc_old == null || hostextc_old.intValue() != hostExtent)) {
            sid.setField(CollectionSchema.host_extent_i.getSolrFieldName(), hostExtent.intValue());
            change = true;
        }
        return change;
    } catch (final IOException e) {
    }
    return false;
}

private static final class CRV {
    public double cr;
    public int crn, count;
    public CRV(final int count, final double cr, final int crn) {this.count = count; this.cr = cr; this.crn = crn;}
    @Override
    public String toString() {
        return "count=" + count + ", cr=" + cr + ", crn=" + crn;
    }
}

/**
* The CRHost class is a container for all ranking values of a specific host.
* Objects of that class are needed as an environment for repeated convergenceStep() computations,
* which are iterative citation rank computations that are repeated until the ranking values
* converge to stable values.
* The class also contains normalization methods to compute simple integer ranking values out of the
* double relevance values.
*/
private static final class CRHost {
    private final Segment segment;
    private final Map<String, double[]> crt;
    private final int cr_host_count;
    private final RowHandleMap internal_links_counter;
    private double damping;
    private int converge_eq_factor;
    private ReferenceReportCache rrCache;
    public CRHost(final Segment segment, final ReferenceReportCache rrCache, final String host, final double damping, final int converge_digits) {
        this.segment = segment;
        this.damping = damping;
        this.rrCache = rrCache;
        this.converge_eq_factor = (int) Math.pow(10.0d, converge_digits);
        SolrConnector connector = segment.fulltext().getDefaultConnector();
        this.crt = new ConcurrentHashMap<String, double[]>();
        try {
            // select all documents for each host
            BlockingQueue<String> ids = connector.concurrentIDsByQuery("{!cache=false raw f=" + CollectionSchema.host_s.getSolrFieldName() + "}" + host, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 100000000, 86400000, 200, 1);
            String id;
            while ((id = ids.take()) != AbstractSolrConnector.POISON_ID) {
                this.crt.put(id, new double[]{0.0d,0.0d}); //{old value, new value}
                if (MemoryControl.shortStatus()) {
                    ConcurrentLog.warn("CollectionConfiguration", "terminated CRHost collection during postprocessing because of short memory");
                    break;
                }
            }
        } catch (final InterruptedException e2) {
        }
        this.cr_host_count = this.crt.size();
        double initval = 1.0d / cr_host_count;
        for (Map.Entry<String, double[]> entry: this.crt.entrySet()) entry.getValue()[0] = initval;
        this.internal_links_counter = new RowHandleMap(12, Base64Order.enhancedCoder, 8, 100, "internal_links_counter");
    }

    /**
    * produce a map from IDs to CRV records, normalization entries containing the values that are stored to solr.
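    * The ids are bucketed by ascending cr value with bucket sizes roughly halving in each step, and the
    * resulting rank index is finally shifted so that the best documents end at crn = 10. A worked example,
    * derived by hand from the code below (assuming 8 documents with pairwise distinct cr values): the 4
    * lowest-ranked ids get crn = 7, the next 2 get crn = 8, the next 1 gets crn = 9, and the single best
    * id gets crn = 10.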
    * @return
    */
    public Map<String, CRV> normalize() {
        final TreeMap<Double, List<byte[]>> reorder = new TreeMap<Double, List<byte[]>>();
        for (Map.Entry<String, double[]> entry: this.crt.entrySet()) {
            Double d = entry.getValue()[0];
            List<byte[]> ds = reorder.get(d);
            if (ds == null) {ds = new ArrayList<byte[]>(); reorder.put(d, ds);}
            ds.add(ASCII.getBytes(entry.getKey()));
        }
        int nextcount = (this.cr_host_count + 1) / 2;
        int nextcrn = 0;
        Map<String, CRV> r = new HashMap<String, CRV>();
        while (reorder.size() > 0) {
            int count = nextcount;
            while (reorder.size() > 0 && count > 0) {
                Map.Entry<Double, List<byte[]>> next = reorder.pollFirstEntry();
                List<byte[]> ids = next.getValue();
                count -= ids.size();
                double cr = next.getKey();
                for (byte[] id: ids) r.put(ASCII.String(id), new CRV(this.cr_host_count, cr, nextcrn));
            }
            nextcrn++;
            nextcount = Math.max(1, (nextcount + count + 1) / 2);
        }
        // finally, increase the crn number in such a way that the maximum is always 10
        int inc = 11 - nextcrn; // nextcrn is +1
        for (Map.Entry<String, CRV> entry: r.entrySet()) entry.getValue().crn += inc;
        return r;
    }

    /**
    * log out a complete CRHost set of urls and ranking values
    * @param rm
    */
    @SuppressWarnings("unused")
    public void log(final Map<byte[], CRV> rm) {
        // print out all urls with their cr-values
        SolrConnector connector = segment.fulltext().getDefaultConnector();
        for (Map.Entry<byte[], CRV> entry: rm.entrySet()) {
            if (entry == null || entry.getValue() == null) continue;
            try {
                LoadTimeURL md = connector.getLoadTimeURL(ASCII.String(entry.getKey()));
                ConcurrentLog.info("CollectionConfiguration", "CR for " + md.url);
                ConcurrentLog.info("CollectionConfiguration", ">> " + entry.getValue().toString());
            } catch (final IOException e) {
                ConcurrentLog.logException(e);
            }
        }
    }

    /**
    * Calculate the number of internal links from a specific document, denoted by the document ID.
    * This is a very important attribute for the ranking computation because it is the divisor for the previous ranking attribute.
    * The internalLinks value will be requested several times for the same id during the convergenceStep() iterations; therefore it should use a cache.
    * This cache is part of the CRHost data structure.
    * @param id
    * @return the number of links from the document, denoted by the ID, to documents within the same domain
    */
    public int getInternalLinks(final byte[] id) {
        int il = (int) this.internal_links_counter.get(id);
        if (il >= 0) return il;
        SolrConnector connector = this.segment.fulltext().getDefaultConnector();
        if (connector == null) return 0;
        try {
            SolrDocument doc = connector.getDocumentById(ASCII.String(id), CollectionSchema.inboundlinkscount_i.getSolrFieldName());
            if (doc == null) {
                this.internal_links_counter.put(id, 0);
                return 0;
            }
            Object x = doc.getFieldValue(CollectionSchema.inboundlinkscount_i.getSolrFieldName());
            il = (x == null) ? 0 : (x instanceof Integer) ? ((Integer) x).intValue() : (x instanceof Long) ? ((Long) x).intValue() : 0;
            this.internal_links_counter.put(id, il);
            return il;
        } catch (final IOException e) {
            ConcurrentLog.logException(e);
        } catch (final SpaceExceededException e) {
            ConcurrentLog.logException(e);
        }
        try {this.internal_links_counter.put(id, 0);} catch (final SpaceExceededException e) {}
        return 0;
    }

    /**
    * Use the crt cache to compute the next generation of crt values.
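    * One step computes, for every document p of the host (a restatement of the loop below, not an external formula):
    *   cr_new(p) = (1 - damping) / cr_host_count + damping * sum over all referrers q of p of (cr_old(q) / internalLinks(q))
    * which is the usual power-iteration form of a host-internal citation rank, with the damping factor
    * passed in at the call site in postprocessing() (0.85 there).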
    * @return true if the new values are equal to the previous values (the computation converged)
    */
    public boolean convergenceStep() {
        boolean convergence = true;
        double df = (1.0d - damping) / this.cr_host_count;
        try {
            for (Map.Entry<String, double[]> entry: this.crt.entrySet()) {
                String id = entry.getKey();
                ReferenceReport rr = this.rrCache.getReferenceReport(id, false);
                // sum up the cr of the internal links
                HandleSet iids = rr.getInternallIDs();
                double ncr = 0.0d;
                for (byte[] iid: iids) {
                    int ilc = getInternalLinks(iid);
                    if (ilc > 0) { // if (ilc == 0) then the reference report is wrong!
                        double[] d = this.crt.get(ASCII.String(iid));
                        // d[] could be empty in some situations
                        if (d != null && d.length > 0) {
                            ncr += d[0] / ilc;
                        } else {
                            // output a warning that d[] is empty
                            ConcurrentLog.warn("COLLECTION", "d[] is empty, iid=" + ASCII.String(iid));
                            break;
                        }
                    }
                }
                ncr = df + damping * ncr;
                if (convergence && !eqd(ncr, entry.getValue()[0])) convergence = false;
                entry.getValue()[1] = ncr;
            }
            // after the loop, replace the old value with the new value in crt
            for (Map.Entry<String, double[]> entry: this.crt.entrySet()) {
                entry.getValue()[0] = entry.getValue()[1];
            }
        } catch (final IOException e) {
        }
        return convergence;
    }

    /**
    * helper method to check if two doubles are equal using a specific number of digits
    * @param a
    * @param b
    * @return
    */
    private boolean eqd(final double a, final double b) {
        return ((int) (a * this.converge_eq_factor)) == ((int) (b * this.converge_eq_factor));
    }
}

/**
* this method compresses a list of protocol names to an indexed list.
* To do this, all 'http' entries are removed and considered as default.
* The remaining entries are indexed as follows: a list of <index>-<protocol> entries is produced, where
* <index> is an index pointing to the original position of the protocol entry and
* <protocol>
is the protocol entry itself. * The entry is formatted as a 3-digit decimal number with leading zero digits. * @param protocol * @return a list of indexed protocol entries */ public static List protocolList2indexedList(final List protocol) { List a = new ArrayList(); String p; for (int i = 0; i < protocol.size(); i++) { p = protocol.get(i); if (!p.equals("http")) { String c = Integer.toString(i); while (c.length() < 3) c = "0" + c; a.add(c + "-" + p); } } return a; } public static List indexedList2protocolList(Collection iplist, int dimension) { List a = new ArrayList(dimension); for (int i = 0; i < dimension; i++) a.add("http"); if (iplist == null) return a; for (Object ip : iplist) { // ip format is 001-https but can be 4 digits 1011-https int i = ((String) ip).indexOf('-'); a.set(Integer.parseInt(((String) ip).substring(0, i)), ((String) ip).substring(i+1)); } return a; } /** * encode a string containing attributes from anchor rel properties binary: * bit 0: "me" contained in rel * bit 1: "nofollow" contained in rel * @param rel * @return binary encoded information about rel */ /* private static List relEval(final List rel) { List il = new ArrayList(rel.size()); for (final String s: rel) { int i = 0; final String s0 = s.toLowerCase().trim(); if ("me".equals(s0)) i += 1; if ("nofollow".equals(s0)) i += 2; il.add(i); } return il; } */ public static class FailDoc { DigestURL digestURL; final Map collections; final String failReason; final FailType failType; final int httpstatus; final Date failtime; final int crawldepth; public FailDoc(final DigestURL digestURL, final Map collections, final String failReason, final FailType failType, final int httpstatus, final int crawldepth) { this.digestURL = digestURL; this.collections = collections; this.failReason = failReason; this.failType = failType; this.httpstatus = httpstatus; this.failtime = new Date(); this.crawldepth = crawldepth; } public FailDoc(final SolrDocument doc) { try { this.digestURL = new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())); } catch (MalformedURLException e) { this.digestURL = null; } this.collections = new HashMap(); Collection c = doc.getFieldValues(CollectionSchema.collection_sxt.getSolrFieldName()); if (c != null) for (Object cn: c) if (cn != null) this.collections.put((String) cn, QueryParams.catchall_pattern); this.failReason = (String) doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName()); String fts = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName()); if (fts == null) ConcurrentLog.warn("CollectionConfiguration", "no fail type given for URL " + this.digestURL.toNormalform(true)); this.failType = fts == null ? FailType.fail : FailType.valueOf(fts); this.httpstatus = (Integer) doc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName()); this.failtime = (Date) doc.getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName()); Integer cd = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName()); this.crawldepth = cd == null ? 
0 : cd.intValue(); } public DigestURL getDigestURL() { return digestURL; } public Map getCollections() { return collections; } public String getFailReason() { return failReason; } public FailType getFailType() { return failType; } public Date getFailDate() { return this.failtime; } public int getHttpstatus() { return httpstatus; } public SolrInputDocument toSolr(CollectionConfiguration configuration) { boolean allAttr = configuration.isEmpty(); assert allAttr || configuration.contains(CollectionSchema.failreason_s); final SolrInputDocument doc = new SolrInputDocument(); String url = configuration.addURIAttributes(doc, allAttr, this.getDigestURL()); // content_type (mime) is defined a schema field and we rely on it in some queries like imagequery (makes it mandatory, no need to check) CollectionSchema.content_type.add(doc, new String[]{Classification.url2mime(this.digestURL)}); if (allAttr || configuration.contains(CollectionSchema.load_date_dt)) configuration.add(doc, CollectionSchema.load_date_dt, getFailDate()); if (allAttr || configuration.contains(CollectionSchema.crawldepth_i)) configuration.add(doc, CollectionSchema.crawldepth_i, this.crawldepth); // fail reason and status if (allAttr || configuration.contains(CollectionSchema.failreason_s)) configuration.add(doc, CollectionSchema.failreason_s, this.getFailReason()); if (allAttr || configuration.contains(CollectionSchema.failtype_s)) configuration.add(doc, CollectionSchema.failtype_s, this.getFailType().name()); if (allAttr || configuration.contains(CollectionSchema.httpstatus_i)) configuration.add(doc, CollectionSchema.httpstatus_i, this.getHttpstatus()); if (allAttr || configuration.contains(CollectionSchema.collection_sxt) && this.getCollections() != null && this.getCollections().size() > 0) { List cs = new ArrayList(); for (Map.Entry e: this.getCollections().entrySet()) { if (e.getValue().matcher(url).matches()) cs.add(e.getKey()); } configuration.add(doc, CollectionSchema.collection_sxt, cs); } // cr and postprocessing Set processTypes = new LinkedHashSet(); if (allAttr || (configuration.contains(CollectionSchema.cr_host_chance_d) && configuration.contains(CollectionSchema.cr_host_count_i) && configuration.contains(CollectionSchema.cr_host_norm_i))) { processTypes.add(ProcessType.CITATION); // postprocessing needed } if (allAttr || configuration.contains(CollectionSchema.process_sxt)) { List p = new ArrayList(); for (ProcessType t: processTypes) p.add(t.name()); configuration.add(doc, CollectionSchema.process_sxt, p); } return doc; } } }
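/*
 * Usage sketch for the protocol-list compression above; the values follow directly from
 * protocolList2indexedList/indexedList2protocolList, the variable names are illustrative only:
 *
 *   List<String> protocol = Arrays.asList("http", "https", "http", "ftp");
 *   List<String> indexed = CollectionConfiguration.protocolList2indexedList(protocol);
 *   // indexed == ["001-https", "003-ftp"]; "http" entries are the implicit default
 *   List<String> restored = CollectionConfiguration.indexedList2protocolList(indexed, 4);
 *   // restored == ["http", "https", "http", "ftp"]
 */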