/**
* CollectionConfiguration
* Copyright 2011 by Michael Peter Christen
* First released 14.04.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7654 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.search.schema;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.lang.reflect.Array;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.eclipse.jetty.util.ConcurrentHashSet;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.federate.solr.ProcessType;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.solr.SchemaConfiguration;
import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector.LoadTimeURL;
import net.yacy.cora.federate.solr.logic.BooleanLiteral;
import net.yacy.cora.federate.solr.logic.CatchallLiteral;
import net.yacy.cora.federate.solr.logic.Conjunction;
import net.yacy.cora.federate.solr.logic.Disjunction;
import net.yacy.cora.federate.solr.logic.LongLiteral;
import net.yacy.cora.federate.solr.logic.Negation;
import net.yacy.cora.federate.solr.logic.StringLiteral;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.ProbabilisticClassifier;
import net.yacy.document.SentenceReader;
import net.yacy.document.Tokenizer;
import net.yacy.document.content.DCEntry;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.IconEntry;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.index.RowHandleMap;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segment.ReferenceReport;
import net.yacy.search.index.Segment.ReferenceReportCache;
import net.yacy.search.query.QueryParams;
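/**
 * Schema configuration for the main 'collection' Solr index.
 * This class maps parsed documents and URI metadata onto Solr input documents
 * and implements the postprocessing steps that are applied after indexing.
 */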
public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
private static final long serialVersionUID=-499100932212840385L;
public static boolean UNIQUE_HEURISTIC_PREFER_HTTPS = false;
public static boolean UNIQUE_HEURISTIC_PREFER_WWWPREFIX = true;
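// heuristic defaults for the http_unique_b/www_unique_b fields written in yacy2solr();
// the values written there are guesses that are corrected later during postprocessing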
private final ArrayList rankings;
/**
* initialize the schema with a given configuration file
* the configuration file simply contains a list of lines with keywords
* or keyword = value lines (where the value is a custom Solr field name)
* @param configurationFile
* @throws IOException
*/
public CollectionConfiguration(final File configurationFile, final boolean lazy) throws IOException {
super(configurationFile);
super.lazy = lazy;
this.rankings = new ArrayList<Ranking>(4);
for (int i = 0; i <= 3; i++) rankings.add(new Ranking());
// check consistency: compare with YaCyField enum
if (this.isEmpty()) return;
Iterator<SchemaConfiguration.Entry> it = this.entryIterator();
while (it.hasNext()) { // iterate with an explicit hasNext() check so that the last entry is processed as well
SchemaConfiguration.Entry etr = it.next();
try {
CollectionSchema f = CollectionSchema.valueOf(etr.key());
f.setSolrFieldName(etr.getValue());
} catch (final IllegalArgumentException e) {
ConcurrentLog.fine("SolrCollectionWriter", "solr schema file " + configurationFile.getAbsolutePath() + " defines unknown attribute '" + etr.toString() + "'");
it.remove();
}
}
// check consistency the other way: look if all enum constants in SolrField appear in the configuration file
for (CollectionSchema field: CollectionSchema.values()) {
if (this.get(field.name()) == null) {
if (CollectionSchema.author_sxt.getSolrFieldName().endsWith(field.name())) continue; // exception for this: that is a copy-field
if (CollectionSchema.coordinate_p_0_coordinate.getSolrFieldName().endsWith(field.name())) continue; // exception for this: automatically generated
if (CollectionSchema.coordinate_p_1_coordinate.getSolrFieldName().endsWith(field.name())) continue; // exception for this: automatically generated
ConcurrentLog.warn("SolrCollectionWriter", " solr schema file " + configurationFile.getAbsolutePath() + " is missing declaration for '" + field.name() + "'");
}
}
}
public String[] allFields() {
ArrayList<String> a = new ArrayList<>(this.size());
for (CollectionSchema f: CollectionSchema.values()) {
if (this.contains(f)) a.add(f.getSolrFieldName());
}
return a.toArray(new String[a.size()]);
}
public Ranking getRanking(final int idx) {
return this.rankings.get(idx % this.rankings.size()); // simply prevent an out-of-bounds exception (callers don't check for null)
}
/**
* @param name The name of the ranking to get.
* @return The corresponding Ranking-object.
*/
public Ranking getRanking(final String name) {
if (name == null) return null;
for (int i = 0; i < this.rankings.size(); i++) {
Ranking currentRanking = this.rankings.get(i);
if (name.equals(currentRanking.getName())) return currentRanking;
}
return null;
}
/**
* save configuration to file and update enum SolrFields
* @throws IOException
*/
@Override
public void commit() throws IOException {
try {
super.commit();
// make sure the enum SolrField.SolrFieldName is current
Iterator<SchemaConfiguration.Entry> it = this.entryIterator();
while (it.hasNext()) { // iterate with an explicit hasNext() check so that the last entry is processed as well
SchemaConfiguration.Entry etr = it.next();
try {
SchemaDeclaration f = CollectionSchema.valueOf(etr.key());
f.setSolrFieldName(etr.getValue());
} catch (final IllegalArgumentException e) {
continue;
}
}
} catch (final IOException e) {}
}
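// fields that shall be omitted when copying documents: author_sxt is a copy-field
// and the coordinate_p_* fields are generated automatically (see the consistency check in the constructor)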
private final static Set<String> omitFields = new HashSet<String>(3);
static {
omitFields.add(CollectionSchema.author_sxt.getSolrFieldName());
omitFields.add(CollectionSchema.coordinate_p_0_coordinate.getSolrFieldName());
omitFields.add(CollectionSchema.coordinate_p_1_coordinate.getSolrFieldName());
}
public SolrInputDocument toSolrInputDocument(final SolrDocument doc) {
return toSolrInputDocument(doc, omitFields);
}
public SolrDocument toSolrDocument(final SolrInputDocument doc) {
return toSolrDocument(doc, omitFields);
}
/**
* add uri attributes to solr document
* @param doc the solr document to fill
* @param allAttr true if all configured attributes shall be added
* @param digestURL the url of the document
* @return the normalized url
*/
public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURL digestURL) {
add(doc, CollectionSchema.id, ASCII.String(digestURL.hash()));
if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, digestURL.hosthash());
String us = digestURL.toNormalform(true);
add(doc, CollectionSchema.sku, us);
if (allAttr || contains(CollectionSchema.ip_s)) {
final InetAddress address = digestURL.getInetAddress();
if (address != null) add(doc, CollectionSchema.ip_s, address.getHostAddress());
}
String host = null;
if ((host = digestURL.getHost()) != null) {
String dnc = Domains.getDNC(host);
String subdomOrga = host.length() - dnc.length() <= 0 ? "" : host.substring(0, host.length() - dnc.length() - 1);
int p = subdomOrga.lastIndexOf('.');
String subdom = (p < 0) ? "" : subdomOrga.substring(0, p);
String orga = (p < 0) ? subdomOrga : subdomOrga.substring(p + 1);
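// example of the decomposition (assuming getDNC returns the public-suffix part):
// host "search.example.org" yields dnc = "org", subdomOrga = "search.example",
// subdom = "search" and orga = "example"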
if (allAttr || contains(CollectionSchema.host_s)) add(doc, CollectionSchema.host_s, host);
if (allAttr || contains(CollectionSchema.host_dnc_s)) add(doc, CollectionSchema.host_dnc_s, dnc);
if (allAttr || contains(CollectionSchema.host_organization_s)) add(doc, CollectionSchema.host_organization_s, orga);
if (allAttr || contains(CollectionSchema.host_organizationdnc_s)) add(doc, CollectionSchema.host_organizationdnc_s, orga + '.' + dnc);
if (allAttr || contains(CollectionSchema.host_subdomain_s)) add(doc, CollectionSchema.host_subdomain_s, subdom);
}
// path elements of link
String filename = digestURL.getFileName();
String extension = MultiProtocolURL.getFileExtension(filename);
String filenameStub = filename.toLowerCase().endsWith("." + extension) ? filename.substring(0, filename.length() - extension.length() - 1) : filename;
// remove possible jsession (or other url parm like "img.jpg;jsession=123")
// TODO: consider to implement ";jsession=123" check in getFileExtension()
if (extension.indexOf(';') >= 0) extension = extension.substring(0,extension.indexOf(';'));
if (allAttr || contains(CollectionSchema.url_chars_i)) add(doc, CollectionSchema.url_chars_i, us.length());
if (allAttr || contains(CollectionSchema.url_protocol_s)) add(doc, CollectionSchema.url_protocol_s, digestURL.getProtocol());
if (allAttr || contains(CollectionSchema.url_paths_sxt) || contains(CollectionSchema.url_paths_count_i)) {
String[] paths = digestURL.getPaths();
if (allAttr || contains(CollectionSchema.url_paths_count_i)) add(doc, CollectionSchema.url_paths_count_i, paths.length);
if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, paths);
}
if (allAttr || contains(CollectionSchema.url_file_name_s)) add(doc, CollectionSchema.url_file_name_s, filenameStub);
if (allAttr || contains(CollectionSchema.url_file_name_tokens_t)) add(doc, CollectionSchema.url_file_name_tokens_t, MultiProtocolURL.toTokens(filenameStub));
if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, extension);
Map<String, String> searchpart = digestURL.getSearchpartMap();
if (searchpart == null) {
if (allAttr || contains(CollectionSchema.url_parameter_i)) add(doc, CollectionSchema.url_parameter_i, 0);
} else {
if (allAttr || contains(CollectionSchema.url_parameter_i)) add(doc, CollectionSchema.url_parameter_i, searchpart.size());
if (allAttr || contains(CollectionSchema.url_parameter_key_sxt)) add(doc, CollectionSchema.url_parameter_key_sxt, searchpart.keySet().toArray(new String[searchpart.size()]));
if (allAttr || contains(CollectionSchema.url_parameter_value_sxt)) add(doc, CollectionSchema.url_parameter_value_sxt, searchpart.values().toArray(new String[searchpart.size()]));
}
return us;
}
public SolrInputDocument metadata2solr(final URIMetadataNode md) {
SolrInputDocument doc = toSolrInputDocument(md); // URIMetadataNode stores some values in private fields; add them to the Solr document now
boolean allAttr = this.isEmpty();
addURIAttributes(doc, allAttr, md.url());
String title = md.dc_title();
if (allAttr || contains(CollectionSchema.title_count_i)) add(doc, CollectionSchema.title_count_i, 1);
if (allAttr || contains(CollectionSchema.title_chars_val)) {
Integer[] cv = new Integer[]{new Integer(title.length())};
add(doc, CollectionSchema.title_chars_val, cv);
}
if (allAttr || contains(CollectionSchema.title_words_val)) {
Integer[] cv = new Integer[]{new Integer(CommonPattern.SPACES.split(title).length)};
add(doc, CollectionSchema.title_words_val, cv);
}
String description = md.snippet();
boolean description_exist = description != null;
if (description == null) description = "";
if (allAttr || contains(CollectionSchema.description_txt)) add(doc, CollectionSchema.description_txt, description_exist ? new String[]{description} : new String[0]);
if (allAttr || contains(CollectionSchema.description_count_i)) add(doc, CollectionSchema.description_count_i, description_exist ? 1 : 0);
if (allAttr || contains(CollectionSchema.description_chars_val)) {
add(doc, CollectionSchema.description_chars_val, description_exist ? new Integer[]{new Integer(description.length())} : new Integer[0]);
}
if (allAttr || contains(CollectionSchema.description_words_val)) {
add(doc, CollectionSchema.description_words_val, description_exist ? new Integer[]{new Integer(description.length() == 0 ? 0 : CommonPattern.SPACES.split(description).length)} : new Integer[0]);
}
String keywords = md.dc_subject();
Bitfield flags = md.flags();
if (flags.get(Tokenizer.flag_cat_indexof)) {
if (keywords == null || keywords.isEmpty()) keywords = "indexof"; else {
if (keywords.indexOf(',') > 0) keywords += ", indexof"; else keywords += " indexof";
}
}
if (allAttr || contains(CollectionSchema.keywords)) {
add(doc, CollectionSchema.keywords, keywords);
}
/* Metadata node may contain one favicon url when transmitted as dht chunk */
processIcons(doc, allAttr, md.getIcons());
if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, md.limage());
if (allAttr || contains(CollectionSchema.linkscount_i)) add(doc, CollectionSchema.linkscount_i, md.llocal() + md.lother());
if (allAttr || contains(CollectionSchema.inboundlinkscount_i)) add(doc, CollectionSchema.inboundlinkscount_i, md.llocal());
if (allAttr || contains(CollectionSchema.outboundlinkscount_i)) add(doc, CollectionSchema.outboundlinkscount_i, md.lother());
if (allAttr || contains(CollectionSchema.charset_s)) add(doc, CollectionSchema.charset_s, StandardCharsets.UTF_8.name());
// coordinates
if (md.lat() != 0.0 && md.lon() != 0.0) {
// i.e. from geo metadata embedded in the document, such as <meta name="ICBM" ...> or <meta name="geo.position" ...> tags
if (allAttr || contains(CollectionSchema.coordinate_p)) {
add(doc, CollectionSchema.coordinate_p, Double.toString(md.lat()) + "," + Double.toString(md.lon()));
}
}
if (allAttr || contains(CollectionSchema.httpstatus_i)) add(doc, CollectionSchema.httpstatus_i, 200);
if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, md.dc_publisher());
// fields that are in URIMetadataRow additional to yacy2solr basic requirement
if (allAttr || contains(CollectionSchema.audiolinkscount_i)) add(doc, CollectionSchema.audiolinkscount_i, md.laudio());
if (allAttr || contains(CollectionSchema.videolinkscount_i)) add(doc, CollectionSchema.videolinkscount_i, md.lvideo());
if (allAttr || contains(CollectionSchema.applinkscount_i)) add(doc, CollectionSchema.applinkscount_i, md.lapp());
return doc;
}
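/**
 * Container for the link structure of a single document: index 0 of each
 * array holds the inbound links (same host), index 1 the outbound links
 * (foreign host); see enrichSubgraph() for the inbound/outbound decision.
 */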
public static class Subgraph {
public final ArrayList<String>[] urlProtocols, urlStubs, urlAnchorTexts;
@SuppressWarnings("unchecked")
public Subgraph(int inboundSize, int outboundSize) {
this.urlProtocols = (ArrayList<String>[]) Array.newInstance(ArrayList.class, 2);
this.urlProtocols[0] = new ArrayList<String>(inboundSize);
this.urlProtocols[1] = new ArrayList<String>(outboundSize);
this.urlStubs = (ArrayList<String>[]) Array.newInstance(ArrayList.class, 2);
this.urlStubs[0] = new ArrayList<String>(inboundSize);
this.urlStubs[1] = new ArrayList<String>(outboundSize);
this.urlAnchorTexts = (ArrayList<String>[]) Array.newInstance(ArrayList.class, 2);
this.urlAnchorTexts[0] = new ArrayList<String>(inboundSize);
this.urlAnchorTexts[1] = new ArrayList<String>(outboundSize);
}
}
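/**
 * Sort one link of a document into the inbound/outbound arrays of a subgraph.
 * A link counts as inbound when source and target share the same host name,
 * where a "www." prefix on either side is ignored.
 * @param subgraph the subgraph to enrich
 * @param source_url the url of the document containing the link
 * @param target_url the target of the link
 * @return true if the link is inbound, false otherwise
 */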
public static boolean enrichSubgraph(final Subgraph subgraph, final DigestURL source_url, AnchorURL target_url) {
final String text = target_url.getTextProperty(); // the text between the <a> and </a> tags
String source_host = source_url.getHost();
String target_host = target_url.getHost();
boolean inbound =
(source_host == null && target_host == null) ||
(source_host != null && target_host != null &&
(target_host.equals(source_host) ||
target_host.equals("www." + source_host) ||
source_host.equals("www." + target_host))); // well, not everybody defines 'outbound' that way but however, thats used here.
int ioidx = inbound ? 0 : 1;
subgraph.urlProtocols[ioidx].add(target_url.getProtocol());
subgraph.urlStubs[ioidx].add(target_url.urlstub(true, true));
subgraph.urlAnchorTexts[ioidx].add(text);
return inbound;
}
/**
* a SolrVector is a SolrInputDocument with the ability
* to store also the webgraph that is associated with
* the web document in the Solr document.
*/
public static class SolrVector extends SolrInputDocument {
private static final long serialVersionUID = -210901881471714939L;
private List<SolrInputDocument> webgraphDocuments;
public SolrVector() {
super();
this.webgraphDocuments = new ArrayList<SolrInputDocument>();
}
public void addWebgraphDocument(SolrInputDocument webgraphDocument) {
this.webgraphDocuments.add(webgraphDocument);
}
public List<SolrInputDocument> getWebgraphDocuments() {
return this.webgraphDocuments;
}
}
public SolrVector yacy2solr(
final Segment segment,
final Map<String, Pattern> collections, final ResponseHeader responseHeader,
final Document document, final Condenser condenser, final DigestURL referrerURL, final String language, final boolean setUnique,
final WebgraphConfiguration webgraph, final String sourceName) {
// we use the SolrCell design as index schema
SolrVector doc = new SolrVector();
final DigestURL digestURL = document.dc_source();
boolean allAttr = this.isEmpty();
String url = addURIAttributes(doc, allAttr, digestURL);
add(doc, CollectionSchema.content_type, new String[]{document.dc_format()}); // content_type (mime) is defined as a schema field and we rely on it in some queries like the image query (this makes it mandatory, no need to check)
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
String host = digestURL.getHost();
int crawldepth = document.getDepth();
if ((allAttr || contains(CollectionSchema.crawldepth_i))) {
CollectionSchema.crawldepth_i.add(doc, crawldepth);
}
if (allAttr || (contains(CollectionSchema.cr_host_chance_d) && contains(CollectionSchema.cr_host_count_i) && contains(CollectionSchema.cr_host_norm_i))) {
processTypes.add(ProcessType.CITATION); // postprocessing needed
}
if ((allAttr || contains(CollectionSchema.collection_sxt)) && collections != null && collections.size() > 0) {
List<String> cs = new ArrayList<String>();
for (Map.Entry<String, Pattern> e: collections.entrySet()) {
if (e.getValue().matcher(url).matches()) cs.add(e.getKey());
}
add(doc, CollectionSchema.collection_sxt, cs);
}
List<String> titles = document.titles();
if (allAttr || contains(CollectionSchema.title)) {
add(doc, CollectionSchema.title, titles);
if ((allAttr || contains(CollectionSchema.title_exact_signature_l)) && titles.size() > 0) {
add(doc, CollectionSchema.title_exact_signature_l, EnhancedTextProfileSignature.getSignatureLong(titles.get(0)));
}
}
if (allAttr || contains(CollectionSchema.title_count_i)) add(doc, CollectionSchema.title_count_i, titles.size());
if (allAttr || contains(CollectionSchema.title_chars_val)) {
ArrayList<Integer> cv = new ArrayList<Integer>(titles.size());
for (String s: titles) cv.add(new Integer(s.length()));
add(doc, CollectionSchema.title_chars_val, cv);
}
if (allAttr || contains(CollectionSchema.title_words_val)) {
ArrayList<Integer> cv = new ArrayList<Integer>(titles.size());
for (String s: titles) cv.add(new Integer(CommonPattern.SPACES.split(s).length));
add(doc, CollectionSchema.title_words_val, cv);
}
String[] descriptions = document.dc_description();
if (allAttr || contains(CollectionSchema.description_txt)) {
add(doc, CollectionSchema.description_txt, descriptions);
if ((allAttr || contains(CollectionSchema.description_exact_signature_l)) && descriptions != null && descriptions.length > 0) {
add(doc, CollectionSchema.description_exact_signature_l, EnhancedTextProfileSignature.getSignatureLong(descriptions));
}
}
if (allAttr || contains(CollectionSchema.description_count_i)) add(doc, CollectionSchema.description_count_i, descriptions.length);
if (allAttr || contains(CollectionSchema.description_chars_val)) {
ArrayList<Integer> cv = new ArrayList<Integer>(descriptions.length);
for (String s: descriptions) cv.add(new Integer(s.length()));
add(doc, CollectionSchema.description_chars_val, cv);
}
if (allAttr || contains(CollectionSchema.description_words_val)) {
ArrayList<Integer> cv = new ArrayList<Integer>(descriptions.length);
for (String s: descriptions) cv.add(new Integer(CommonPattern.SPACES.split(s).length));
add(doc, CollectionSchema.description_words_val, cv);
}
if (allAttr || contains(CollectionSchema.author)) {
String author = document.dc_creator();
if (author == null || author.length() == 0) author = document.dc_publisher();
add(doc, CollectionSchema.author, author);
}
if (allAttr || contains(CollectionSchema.last_modified)) {
Date lastModified = responseHeader == null ? new Date() : responseHeader.lastModified();
if (lastModified == null) lastModified = new Date();
if (document.getLastModified().before(lastModified)) lastModified = document.getLastModified();
long firstSeen = segment.getFirstSeenTime(digestURL.hash());
if (firstSeen > 0 && firstSeen < lastModified.getTime()) lastModified = new Date(firstSeen); // patch the date if we have seen the document earlier
add(doc, CollectionSchema.last_modified, lastModified);
}
if (allAttr || contains(CollectionSchema.dates_in_content_dts) || contains(CollectionSchema.dates_in_content_count_i)) {
LinkedHashSet<Date> dates_in_content = condenser.dates_in_content;
if (allAttr || contains(CollectionSchema.dates_in_content_count_i)) {
add(doc, CollectionSchema.dates_in_content_count_i, dates_in_content.size());
}
if (dates_in_content.size() > 0 && (allAttr || contains(CollectionSchema.dates_in_content_dts))) {
add(doc, CollectionSchema.dates_in_content_dts, dates_in_content.toArray(new Date[dates_in_content.size()]));
}
}
if (allAttr || contains(CollectionSchema.keywords)) {
String keywords = document.dc_subject(' ');
add(doc, CollectionSchema.keywords, keywords);
}
// unique-fields; these values must be corrected during postprocessing. (the logic below is a not-xor (!^), written out explicitly for readability)
add(doc, CollectionSchema.http_unique_b, setUnique || UNIQUE_HEURISTIC_PREFER_HTTPS ? digestURL.isHTTPS() : digestURL.isHTTP()); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.www_unique_b, setUnique || host != null && (UNIQUE_HEURISTIC_PREFER_WWWPREFIX ? host.startsWith("www.") : !host.startsWith("www."))); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.exact_signature_l, condenser.exactSignature());
add(doc, CollectionSchema.exact_signature_unique_b, true); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.exact_signature_copycount_i, 0); // this must be corrected afterwards during postprocessing!
add(doc, CollectionSchema.fuzzy_signature_l, condenser.fuzzySignature());
add(doc, CollectionSchema.fuzzy_signature_text_t, condenser.fuzzySignatureText());
add(doc, CollectionSchema.fuzzy_signature_unique_b, true); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.fuzzy_signature_copycount_i, 0); // this must be corrected afterwards during postprocessing!
if (this.contains(CollectionSchema.exact_signature_unique_b) || this.contains(CollectionSchema.exact_signature_copycount_i) ||
this.contains(CollectionSchema.fuzzy_signature_l) || this.contains(CollectionSchema.fuzzy_signature_copycount_i) ||
this.contains(CollectionSchema.http_unique_b) || this.contains(CollectionSchema.www_unique_b)) {
processTypes.add(ProcessType.UNIQUE);
}
// get the list of all links; it will be shrunk by removing the urls that appear in other fields of the solr schema
LinkedHashMap<DigestURL, String> inboundLinks = document.inboundLinks();
LinkedHashMap<DigestURL, String> outboundLinks = document.outboundLinks();
Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size());
int c = 0;
final Object scraper = document.getScraperObject();
boolean containsCanonical = false;
DigestURL canonical = null;
processIcons(doc, allAttr, inboundLinks, outboundLinks, document.getIcons().values());
if (scraper instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) scraper;
List<ImageEntry> images = html.getImages();
// header tags
int h = 0;
int f = 1;
String[] hs;
hs = html.getHeadlines(1); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, CollectionSchema.h1_txt, hs); add(doc, CollectionSchema.h1_i, hs.length);
hs = html.getHeadlines(2); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, CollectionSchema.h2_txt, hs); add(doc, CollectionSchema.h2_i, hs.length);
hs = html.getHeadlines(3); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, CollectionSchema.h3_txt, hs); add(doc, CollectionSchema.h3_i, hs.length);
hs = html.getHeadlines(4); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, CollectionSchema.h4_txt, hs); add(doc, CollectionSchema.h4_i, hs.length);
hs = html.getHeadlines(5); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, CollectionSchema.h5_txt, hs); add(doc, CollectionSchema.h5_i, hs.length);
hs = html.getHeadlines(6); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, CollectionSchema.h6_txt, hs); add(doc, CollectionSchema.h6_i, hs.length);
add(doc, CollectionSchema.htags_i, h);
add(doc, CollectionSchema.schema_org_breadcrumb_i, html.breadcrumbCount());
// meta tags: Open Graph properties
String og;
og = html.getMetas().get("og:title"); if (og != null) add(doc, CollectionSchema.opengraph_title_t, og);
og = html.getMetas().get("og:type"); if (og != null) add(doc, CollectionSchema.opengraph_type_s, og);
og = html.getMetas().get("og:url"); if (og != null) add(doc, CollectionSchema.opengraph_url_s, og);
og = html.getMetas().get("og:image"); if (og != null) add(doc, CollectionSchema.opengraph_image_s, og);
// noindex and nofollow attributes
// from HTML (meta-tag in HTML header: robots)
// and HTTP header (X-Robots-Tag property)
// coded as binary value:
// bit 0: "all" contained in html header meta
// bit 1: "index" contained in html header meta
// bit 2: "follow" contained in html header meta
// bit 3: "noindex" contained in html header meta
// bit 4: "nofollow" contained in html header meta
// bit 5: "noarchive" contained in html header meta
// bit 8: "all" contained in http header X-Robots-Tag
// bit 9: "noindex" contained in http header X-Robots-Tag
// bit 10: "nofollow" contained in http header X-Robots-Tag
// bit 11: "noarchive" contained in http header X-Robots-Tag
// bit 12: "nosnippet" contained in http header X-Robots-Tag
// bit 13: "noodp" contained in http header X-Robots-Tag
// bit 14: "notranslate" contained in http header X-Robots-Tag
// bit 15: "noimageindex" contained in http header X-Robots-Tag
// bit 16: "unavailable_after" contained in http header X-Robots-Tag
int b = 0;
String robots_meta = html.getMetas().get("robots");
// this tag may have values: all, index, noindex, nofollow; see http://www.robotstxt.org/meta.html
if (robots_meta != null) {
robots_meta = robots_meta.toLowerCase();
if (robots_meta.indexOf("all",0) >= 0) b += 1; // set bit 0
if (robots_meta.indexOf("index",0) == 0 || robots_meta.indexOf(" index",0) >= 0 || robots_meta.indexOf(",index",0) >= 0 ) b += 2; // set bit 1
if (robots_meta.indexOf("follow",0) == 0 || robots_meta.indexOf(" follow",0) >= 0 || robots_meta.indexOf(",follow",0) >= 0 ) b += 4; // set bit 2
if (robots_meta.indexOf("noindex",0) >= 0) b += 8; // set bit 3
if (robots_meta.indexOf("nofollow",0) >= 0) b += 16; // set bit 4
if (robots_meta.indexOf("noarchive",0) >= 0) b += 32; // set bit 5
}
String x_robots_tag = responseHeader == null ? "" : responseHeader.getXRobotsTag();
if (!x_robots_tag.isEmpty()) {
// this tag may have values: all, noindex, nofollow, noarchive, nosnippet, noodp, notranslate, noimageindex, unavailable_after, none; see https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag?hl=de
if (x_robots_tag.indexOf("all",0) >= 0) b += 1<<8; // set bit 8
if (x_robots_tag.indexOf("noindex",0) >= 0||x_robots_tag.indexOf("none",0) >= 0) b += 1<<9; // set bit 9
if (x_robots_tag.indexOf("nofollow",0) >= 0||x_robots_tag.indexOf("none",0) >= 0) b += 1<<10; // set bit 10
if (x_robots_tag.indexOf("noarchive",0) >= 0) b += 1<<11; // set bit 11
if (x_robots_tag.indexOf("nosnippet",0) >= 0) b += 1<<12; // set bit 12
if (x_robots_tag.indexOf("noodp",0) >= 0) b += 1<<13; // set bit 13
if (x_robots_tag.indexOf("notranslate",0) >= 0) b += 1<<14; // set bit 14
if (x_robots_tag.indexOf("noimageindex",0) >= 0) b += 1<<15; // set bit 15
if (x_robots_tag.indexOf("unavailable_after",0) >= 0) b += 1<<16; // set bit 16
}
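// example: a html meta tag "noindex, nofollow" without any X-Robots-Tag header
// sets bits 3 and 4 only, i.e. b = 8 + 16 = 24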
add(doc, CollectionSchema.robots_i, b);
// meta tags: generator
final String generator = html.getMetas().get("generator");
if (generator != null) add(doc, CollectionSchema.metagenerator_t, generator);
// bold, italic
final String[] bold = html.getBold();
add(doc, CollectionSchema.boldcount_i, bold.length);
if (bold.length > 0) {
add(doc, CollectionSchema.bold_txt, bold);
if (allAttr || contains(CollectionSchema.bold_val)) {
add(doc, CollectionSchema.bold_val, html.getBoldCount(bold));
}
}
final String[] italic = html.getItalic();
add(doc, CollectionSchema.italiccount_i, italic.length);
if (italic.length > 0) {
add(doc, CollectionSchema.italic_txt, italic);
if (allAttr || contains(CollectionSchema.italic_val)) {
add(doc, CollectionSchema.italic_val, html.getItalicCount(italic));
}
}
final String[] underline = html.getUnderline();
add(doc, CollectionSchema.underlinecount_i, underline.length);
if (underline.length > 0) {
add(doc, CollectionSchema.underline_txt, underline);
if (allAttr || contains(CollectionSchema.underline_val)) {
add(doc, CollectionSchema.underline_val, html.getUnderlineCount(underline));
}
}
final String[] li = html.getLi();
add(doc, CollectionSchema.licount_i, li.length);
if (li.length > 0) add(doc, CollectionSchema.li_txt, li);
final String[] dt = html.getDt();
add(doc, CollectionSchema.dtcount_i, dt.length);
if (dt.length > 0) add(doc, CollectionSchema.dt_txt, dt);
final String[] dd = html.getDd();
add(doc, CollectionSchema.ddcount_i, dd.length);
if (dd.length > 0) add(doc, CollectionSchema.dd_txt, dd);
final List<Date> startDates = html.getStartDates();
if (startDates.size() > 0) add(doc, CollectionSchema.startDates_dts, startDates.toArray(new Date[startDates.size()]));
final List<Date> endDates = html.getEndDates();
if (endDates.size() > 0) add(doc, CollectionSchema.endDates_dts, endDates.toArray(new Date[endDates.size()]));
final List<String> articles = html.getArticles();
add(doc, CollectionSchema.articlecount_i, articles.size());
if (articles.size() > 0) add(doc, CollectionSchema.article_txt, articles);
// images
processImages(doc, allAttr, inboundLinks, outboundLinks, images);
// style sheets
if (allAttr || contains(CollectionSchema.css_tag_sxt)) {
final Map<DigestURL, String> csss = html.getCSS();
final String[] css_tag = new String[csss.size()];
final String[] css_url = new String[csss.size()];
c = 0;
for (final Map.Entry<DigestURL, String> entry: csss.entrySet()) {
final String cssurl = entry.getKey().toNormalform(false);
inboundLinks.remove(entry.getKey());
outboundLinks.remove(entry.getKey());
css_tag[c] =
"<link rel=\"stylesheet\" type=\"text/css\" media=\"" + entry.getValue() + "\" href=\"" + cssurl + "\" />"; // the css map value holds the media attribute
css_url[c] = cssurl;
c++;
}
add(doc, CollectionSchema.csscount_i, css_tag.length);
if (css_tag.length > 0) add(doc, CollectionSchema.css_tag_sxt, css_tag);
if (css_url.length > 0) add(doc, CollectionSchema.css_url_sxt, css_url);
}
// Scripts
if (allAttr || contains(CollectionSchema.scripts_sxt)) {
final Set<AnchorURL> scriptss = html.getScript();
final String[] scripts = new String[scriptss.size()];
c = 0;
for (final AnchorURL u: scriptss) {
inboundLinks.remove(u);
outboundLinks.remove(u);
scripts[c++] = u.toNormalform(false);
}
add(doc, CollectionSchema.scriptscount_i, scripts.length);
if (scripts.length > 0) add(doc, CollectionSchema.scripts_sxt, scripts);
}
// Frames
if (allAttr || contains(CollectionSchema.frames_sxt)) {
final Set<AnchorURL> framess = html.getFrames();
final String[] frames = new String[framess.size()];
c = 0;
for (final AnchorURL u: framess) {
inboundLinks.remove(u);
outboundLinks.remove(u);
frames[c++] = u.toNormalform(false);
}
add(doc, CollectionSchema.framesscount_i, frames.length);
if (frames.length > 0) {
add(doc, CollectionSchema.frames_sxt, frames);
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, crawldepth, alllinks, images, true, framess, citations); // add here because links have been removed from remaining inbound/outbound
}
}
// IFrames
if (allAttr || contains(CollectionSchema.iframes_sxt)) {
final Set<AnchorURL> iframess = html.getIFrames();
final String[] iframes = new String[iframess.size()];
c = 0;
for (final AnchorURL u: iframess) {
inboundLinks.remove(u);
outboundLinks.remove(u);
iframes[c++] = u.toNormalform(false);
}
add(doc, CollectionSchema.iframesscount_i, iframes.length);
if (iframes.length > 0) {
add(doc, CollectionSchema.iframes_sxt, iframes);
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, crawldepth, alllinks, images, true, iframess, citations); // add here because links have been removed from remaining inbound/outbound
}
}
// canonical tag
if (allAttr || contains(CollectionSchema.canonical_s)) {
canonical = html.getCanonical();
// if there is no canonical in the html then look into the http header:
if (canonical == null && responseHeader != null) {
String link = responseHeader.get("Link", null);
int p;
if (link != null && ((p = link.indexOf("rel=\"canonical\"")) > 0)) {
link = link.substring(0, p).trim();
p = link.indexOf('<');
int q = link.lastIndexOf('>');
if (p >= 0 && q > 0) {
link = link.substring(p + 1, q);
try {
canonical = new DigestURL(link);
} catch (MalformedURLException e) {}
}
}
}
if (canonical != null) {
containsCanonical = true;
inboundLinks.remove(canonical);
outboundLinks.remove(canonical);
add(doc, CollectionSchema.canonical_s, canonical.toNormalform(false));
// set a flag if this is equal to sku
if (contains(CollectionSchema.canonical_equal_sku_b)) {
add(doc, CollectionSchema.canonical_equal_sku_b, canonical.equals(digestURL));
}
}
}
// meta refresh tag
if (allAttr || contains(CollectionSchema.refresh_s)) {
String refresh = html.getRefreshPath();
if (refresh != null && refresh.length() > 0) {
MultiProtocolURL refreshURL;
try {
refreshURL = refresh.startsWith("http") ? new MultiProtocolURL(html.getRefreshPath()) : new MultiProtocolURL(digestURL, html.getRefreshPath());
if (refreshURL != null) {
inboundLinks.remove(refreshURL);
outboundLinks.remove(refreshURL);
add(doc, CollectionSchema.refresh_s, refreshURL.toNormalform(false));
}
} catch (final MalformedURLException e) {
add(doc, CollectionSchema.refresh_s, refresh);
}
}
}
// flash embedded
if (allAttr || contains(CollectionSchema.flash_b)) {
MultiProtocolURL[] flashURLs = html.getFlash();
for (MultiProtocolURL u: flashURLs) {
// remove all flash links from inbound/outbound links
inboundLinks.remove(u);
outboundLinks.remove(u);
}
add(doc, CollectionSchema.flash_b, flashURLs.length > 0);
}
// generic evaluation pattern
for (final String model: html.getEvaluationModelNames()) {
if (allAttr || contains("ext_" + model + "_txt")) {
final String[] scorenames = html.getEvaluationModelScoreNames(model);
if (scorenames.length > 0) {
add(doc, CollectionSchema.valueOf("ext_" + model + "_txt"), scorenames);
add(doc, CollectionSchema.valueOf("ext_" + model + "_val"), html.getEvaluationModelScoreCounts(model, scorenames));
}
}
}
// response time
add(doc, CollectionSchema.responsetime_i, responseHeader == null ? 0 : Integer.parseInt(responseHeader.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0")));
// hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077
if (allAttr || (contains(CollectionSchema.hreflang_url_sxt) && contains(CollectionSchema.hreflang_cc_sxt))) {
final String[] ccs = new String[html.getHreflang().size()];
final String[] urls = new String[html.getHreflang().size()];
c = 0;
for (Map.Entry<String, DigestURL> e: html.getHreflang().entrySet()) {
ccs[c] = e.getKey();
urls[c] = e.getValue().toNormalform(true);
c++;
}
add(doc, CollectionSchema.hreflang_cc_sxt, ccs);
add(doc, CollectionSchema.hreflang_url_sxt, urls);
}
// page navigation url, see http://googlewebmastercentral.blogspot.de/2011/09/pagination-with-relnext-and-relprev.html
if (allAttr || (contains(CollectionSchema.navigation_url_sxt) && contains(CollectionSchema.navigation_type_sxt))) {
final String[] navs = new String[html.getNavigation().size()];
final String[] urls = new String[html.getNavigation().size()];
c = 0;
for (Map.Entry<String, DigestURL> e: html.getNavigation().entrySet()) {
navs[c] = e.getKey();
urls[c] = e.getValue().toNormalform(true);
c++;
}
add(doc, CollectionSchema.navigation_type_sxt, navs);
add(doc, CollectionSchema.navigation_url_sxt, urls);
}
// publisher url as defined in http://support.google.com/plus/answer/1713826?hl=de
if ((allAttr || contains(CollectionSchema.publisher_url_s)) && html.getPublisherLink() != null) {
add(doc, CollectionSchema.publisher_url_s, html.getPublisherLink().toNormalform(true));
}
}
if (scraper instanceof DCEntry) {
// the document was created by a surrogate parser; write all md:* entries to Solr
DCEntry dcentry = (DCEntry) scraper;
for (Map.Entry<String, String[]> entry: dcentry.getMap().entrySet()) {
String tag = entry.getKey();
if (!tag.startsWith("md:") || tag.length() < 4) continue;
CollectionSchema solr_field;
try {
solr_field = CollectionSchema.valueOf(tag.substring(3));
} catch (final IllegalArgumentException e) {
continue; // the md:* tag does not match a known schema field
}
String[] values = entry.getValue();
if (values == null || values.length == 0) continue;
if (allAttr || contains(solr_field)) {
add(doc, solr_field, values);
}
}
}
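// document text: fall back to the url tokens if the text is empty,
// otherwise append those url tokens that do not yet occur in the text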
String content = document.getTextString();
String tokens = digestURL.toTokens();
if (content == null || content.length() == 0) {
content = tokens;
} else {
String[] t = CommonPattern.SPACE.split(tokens);
for (String r: t) {
if (r.length() > 0 &&
content.indexOf(" " + r + " ") < 0 &&
!content.startsWith(r + " ") &&
!content.endsWith(" " + r)) content += " " + r;
}
}
// handle image source meta data
if (document.getContentDomain() == ContentDomain.IMAGE) {
// add image pixel size if known
Iterator<ImageEntry> imgit = document.getImages().values().iterator();
List<Integer> heights = new ArrayList<>();
List<Integer> widths = new ArrayList<>();
List<Integer> pixels = new ArrayList<>();
while (imgit.hasNext()) {
ImageEntry img = imgit.next();
int imgpixels = (img.height() < 0 || img.width() < 0) ? -1 : img.height() * img.width();
if (imgpixels > 0 && (allAttr || (contains(CollectionSchema.images_height_val) && contains(CollectionSchema.images_width_val) && contains(CollectionSchema.images_pixel_val)))) {
heights.add(img.height());
widths.add(img.width());
pixels.add(imgpixels);
}
}
if (heights.size() > 0) {
add(doc, CollectionSchema.images_height_val, heights);
add(doc, CollectionSchema.images_width_val, widths);
add(doc, CollectionSchema.images_pixel_val, pixels);
}
if (allAttr || contains(CollectionSchema.images_text_t)) {
add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data from the image parser
content = digestURL.toTokens(); // keep only the url tokens, all other content is dropped
}
}
// content (must be written after special parser data, since this can influence the content)
if (allAttr || contains(CollectionSchema.text_t)) add(doc, CollectionSchema.text_t, content);
if (allAttr || contains(CollectionSchema.wordcount_i)) {
if (content.length() == 0) {
add(doc, CollectionSchema.wordcount_i, 0);
} else {
int contentwc = 1;
for (int i = content.length() - 1; i >= 0; i--) if (content.charAt(i) == ' ') contentwc++;
add(doc, CollectionSchema.wordcount_i, contentwc);
}
}
// statistics about the links
if (allAttr || contains(CollectionSchema.linkscount_i)) add(doc, CollectionSchema.linkscount_i, inboundLinks.size() + outboundLinks.size());
if (allAttr || contains(CollectionSchema.linksnofollowcount_i)) add(doc, CollectionSchema.linksnofollowcount_i, document.inboundLinkNofollowCount() + document.outboundLinkNofollowCount());
if (allAttr || contains(CollectionSchema.inboundlinkscount_i)) add(doc, CollectionSchema.inboundlinkscount_i, inboundLinks.size());
if (allAttr || contains(CollectionSchema.inboundlinksnofollowcount_i)) add(doc, CollectionSchema.inboundlinksnofollowcount_i, document.inboundLinkNofollowCount());
if (allAttr || contains(CollectionSchema.outboundlinkscount_i)) add(doc, CollectionSchema.outboundlinkscount_i, outboundLinks.size());
if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount());
// create a subgraph
Boolean canonical_equal_sku = canonical == null ? null : canonical.toNormalform(true).equals(url);
if (webgraph != null && (!containsCanonical || (canonical_equal_sku != null && (canonical_equal_sku.booleanValue())))) {
// a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document
List<SolrInputDocument> edges = webgraph.getEdges(subgraph, digestURL, responseHeader, collections, crawldepth, processTypes, document.getHyperlinks().keySet(), sourceName);
// this also enriches the subgraph
doc.webgraphDocuments.addAll(edges);
} else {
if (allAttr ||
contains(CollectionSchema.inboundlinks_protocol_sxt) ||
contains(CollectionSchema.inboundlinks_urlstub_sxt) ||
contains(CollectionSchema.inboundlinks_anchortext_txt) ||
contains(CollectionSchema.outboundlinks_protocol_sxt) ||
contains(CollectionSchema.outboundlinks_urlstub_sxt) ||
contains(CollectionSchema.outboundlinks_anchortext_txt)) {
for (final AnchorURL target_url: document.getHyperlinks().keySet()) {
enrichSubgraph(subgraph, digestURL, target_url);
}
}
}
// attach the subgraph content
if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0]));
if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_sxt)) add(doc, CollectionSchema.inboundlinks_urlstub_sxt, subgraph.urlStubs[0]);
if (allAttr || contains(CollectionSchema.inboundlinks_anchortext_txt)) add(doc, CollectionSchema.inboundlinks_anchortext_txt, subgraph.urlAnchorTexts[0]);
if (allAttr || contains(CollectionSchema.outboundlinks_protocol_sxt)) add(doc, CollectionSchema.outboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[1]));
if (allAttr || contains(CollectionSchema.outboundlinks_urlstub_sxt)) add(doc, CollectionSchema.outboundlinks_urlstub_sxt, subgraph.urlStubs[1]);
if (allAttr || contains(CollectionSchema.outboundlinks_anchortext_txt)) add(doc, CollectionSchema.outboundlinks_anchortext_txt, subgraph.urlAnchorTexts[1]);
// charset
if (allAttr || contains(CollectionSchema.charset_s)) add(doc, CollectionSchema.charset_s, document.getCharset());
// coordinates
if (document.lat() != 0.0 && document.lon() != 0.0) {
if (allAttr || contains(CollectionSchema.coordinate_p)) add(doc, CollectionSchema.coordinate_p, Double.toString(document.lat()) + "," + Double.toString(document.lon()));
}
if (allAttr || contains(CollectionSchema.httpstatus_i)) add(doc, CollectionSchema.httpstatus_i, responseHeader == null ? 200 : responseHeader.getStatusCode());
// fields that were additionally in URIMetadataRow
Date loadDate = new Date();
Date modDate = responseHeader == null ? new Date() : responseHeader.lastModified();
if (modDate == null) modDate = new Date(); // the response header may not carry a last-modified date
if (modDate.getTime() > loadDate.getTime()) modDate = loadDate;
int size = (int) Math.max(document.dc_source().length(), responseHeader == null ? 0 : responseHeader.getContentLength());
if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, loadDate);
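// example for the Proxy-TTL formula below: a document that was modified 10 days
// before it was loaded is considered fresh for another 5 days after loading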
if (allAttr || contains(CollectionSchema.fresh_date_dt)) add(doc, CollectionSchema.fresh_date_dt, new Date(loadDate.getTime() + Math.max(0, loadDate.getTime() - modDate.getTime()) / 2)); // freshdate, computed with Proxy-TTL formula
if ((allAttr || contains(CollectionSchema.referrer_id_s)) && referrerURL != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(referrerURL.hash()));
//if (allAttr || contains(SolrField.md5_s)) add(solrdoc, SolrField.md5_s, new byte[0]);
if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, document.dc_publisher());
if ((allAttr || contains(CollectionSchema.language_s)) && language != null) add(doc, CollectionSchema.language_s, language);
if (allAttr || contains(CollectionSchema.size_i)) add(doc, CollectionSchema.size_i, size);
if (allAttr || contains(CollectionSchema.audiolinkscount_i)) add(doc, CollectionSchema.audiolinkscount_i, document.getAudiolinks().size());
if (allAttr || contains(CollectionSchema.videolinkscount_i)) add(doc, CollectionSchema.videolinkscount_i, document.getVideolinks().size());
if (allAttr || contains(CollectionSchema.applinkscount_i)) add(doc, CollectionSchema.applinkscount_i, document.getApplinks().size());
// document post-processing
if ((allAttr || contains(CollectionSchema.process_sxt)) && processTypes.size() > 0) {
List<String> p = new ArrayList<String>();
for (ProcessType t: processTypes) p.add(t.name());
add(doc, CollectionSchema.process_sxt, p);
if (allAttr || contains(CollectionSchema.harvestkey_s)) {
add(doc, CollectionSchema.harvestkey_s, sourceName);
}
}
// document enrichments (synonyms, facets)
enrich(doc, condenser.synonyms(), document.getGenericFacets());
return doc;
}
/**
* Add icons metadata to Solr doc when corresponding schema attributes are
* enabled.
*
* @param doc
* solr document to fill. Must not be null.
* @param allAttr
* all attributes are enabled.
* @param icons
* document icon entries.
*/
private void processIcons(SolrInputDocument doc, boolean allAttr, Collection<IconEntry> icons) {
processIcons(doc, allAttr, null, null, icons);
}
/**
* Add icons metadata to Solr doc when corresponding schema attributes are
* enabled. Removes icon urls from inboundLinks and outboundLinks.
*
* @param doc
* solr document to fill. Must not be null.
* @param allAttr
* all attributes are enabled.
* @param inboundLinks
* all document inbound links.
* @param outboundLinks
* all document outbound links.
* @param icons
* document icon entries.
*/
private void processIcons(SolrInputDocument doc, boolean allAttr, LinkedHashMap<DigestURL, String> inboundLinks,
LinkedHashMap<DigestURL, String> outboundLinks, Collection<IconEntry> icons) {
if (icons != null) {
final List<String> protocols = new ArrayList<String>(icons.size());
final String[] sizes = new String[icons.size()];
final String[] stubs = new String[icons.size()];
final String[] rels = new String[icons.size()];
int i = 0;
/* Prepare solr field values */
for (final IconEntry ie : icons) {
final DigestURL url = ie.getUrl();
if(inboundLinks != null) {
inboundLinks.remove(url);
}
if(outboundLinks != null) {
outboundLinks.remove(url);
}
String protocol = url.getProtocol();
protocols.add(protocol);
/*
* There may be multiple sizes and multiple rels for one icon:
* we store these as flat strings as Solr currently doesn't
* support multidimensional array fields
*/
sizes[i] = ie.sizesToString();
stubs[i] = url.toString().substring(protocol.length() + 3);
rels[i] = ie.relToString();
i++;
}
if (allAttr || contains(CollectionSchema.icons_protocol_sxt)) {
add(doc, CollectionSchema.icons_protocol_sxt, protocolList2indexedList(protocols));
}
if (allAttr || contains(CollectionSchema.icons_urlstub_sxt)) {
add(doc, CollectionSchema.icons_urlstub_sxt, stubs);
}
if (allAttr || contains(CollectionSchema.icons_rel_sxt)) {
add(doc, CollectionSchema.icons_rel_sxt, rels);
}
if (allAttr || contains(CollectionSchema.icons_sizes_sxt)) {
add(doc, CollectionSchema.icons_sizes_sxt, sizes);
}
}
}
/**
* Add images metadata to Solr doc when corresponding schema attributes are enabled.
* Removes image urls from inboundLinks and outboundLinks.
* @param doc solr document to fill
* @param allAttr all attributes are enabled
* @param inboundLinks all document inbound links
* @param outboundLinks all document outbound links
* @param images document images
*/
private void processImages(SolrVector doc, boolean allAttr, LinkedHashMap<DigestURL, String> inboundLinks,
LinkedHashMap<DigestURL, String> outboundLinks, List<ImageEntry> images) {
final ArrayList<String> imgprots = new ArrayList<String>(images.size());
final Integer[] imgheights = new Integer[images.size()];
final Integer[] imgwidths = new Integer[images.size()];
final Integer[] imgpixels = new Integer[images.size()];
final String[] imgstubs = new String[images.size()];
final String[] imgalts = new String[images.size()];
int withalt = 0;
int i = 0;
LinkedHashSet<String> images_text_map = new LinkedHashSet<String>();
/* Prepare flat solr field values */
for (final ImageEntry ie: images) {
final MultiProtocolURL uri = ie.url();
inboundLinks.remove(uri);
outboundLinks.remove(uri);
imgheights[i] = ie.height();
imgwidths[i] = ie.width();
imgpixels[i] = ie.height() < 0 || ie.width() < 0 ? -1 : ie.height() * ie.width();
String protocol = uri.getProtocol();
imgprots.add(protocol);
imgstubs[i] = uri.toString().substring(protocol.length() + 3);
imgalts[i] = ie.alt();
for (String it: CommonPattern.SPACE.split(uri.toTokens())) images_text_map.add(it);
if (ie.alt() != null && ie.alt().length() > 0) {
SentenceReader sr = new SentenceReader(ie.alt());
while (sr.hasNext()) images_text_map.add(sr.next().toString());
withalt++;
}
i++;
}
StringBuilder images_text = new StringBuilder(images_text_map.size() * 6 + 1);
for (String s: images_text_map) images_text.append(s.trim()).append(' ');
if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, images.size());
if (allAttr || contains(CollectionSchema.images_protocol_sxt)) add(doc, CollectionSchema.images_protocol_sxt, protocolList2indexedList(imgprots));
if (allAttr || contains(CollectionSchema.images_urlstub_sxt)) add(doc, CollectionSchema.images_urlstub_sxt, imgstubs);
if (allAttr || contains(CollectionSchema.images_alt_sxt)) add(doc, CollectionSchema.images_alt_sxt, imgalts);
if (allAttr || contains(CollectionSchema.images_height_val)) add(doc, CollectionSchema.images_height_val, imgheights);
if (allAttr || contains(CollectionSchema.images_width_val)) add(doc, CollectionSchema.images_width_val, imgwidths);
if (allAttr || contains(CollectionSchema.images_pixel_val)) add(doc, CollectionSchema.images_pixel_val, imgpixels);
if (allAttr || contains(CollectionSchema.images_withalt_i)) add(doc, CollectionSchema.images_withalt_i, withalt);
if (allAttr || contains(CollectionSchema.images_text_t)) add(doc, CollectionSchema.images_text_t, images_text.toString().trim());
}
/**
* attach additional information to the document to enable navigation features
* @param doc the document to be enriched
* @param synonyms a list of synonyms detected for the text content
* @param genericFacets a map where the key is the navigator name and the value is the set of attributes names
*/
public void enrich(SolrInputDocument doc, List<String> synonyms, Map<String, Set<String>> genericFacets) {
remove(doc, CollectionSchema.vocabularies_sxt); // delete old values
for (SolrInputField sif: doc) {
if (sif.getName().startsWith(CollectionSchema.VOCABULARY_PREFIX)) remove(doc, sif.getName());
}
if (this.isEmpty() || contains(CollectionSchema.vocabularies_sxt)) {
// write generic navigation
// there are no pre-defined solr fields for navigation because the vocabulary is generic
// we use dynamically allocated solr fields for this.
// It must be a multi-value string/token field, therefore we use _sxt extensions for the field names
// add to genericFacets the probabilistic categories
String text = (String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
Map<String, String> classification = ProbabilisticClassifier.getClassification(text);
for (Map.Entry<String, String> entry: classification.entrySet()) {
Set<String> facetAttributes = new HashSet<>();
facetAttributes.add(entry.getValue());
genericFacets.put(entry.getKey(), facetAttributes);
}
// compute the document field values
List<String> vocabularies = new ArrayList<>();
for (Map.Entry<String, Set<String>> facet: genericFacets.entrySet()) {
String facetName = facet.getKey();
Set<String> facetValues = facet.getValue();
int count = facetValues.size();
if (count == 0) continue;
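// logcount is the binary logarithm of the term count,
// e.g. count = 5 -> logcount = 2 and counts = {0, 1, 2}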
int logcount = (int) (Math.log(count) / Math.log(2));
Integer[] counts = new Integer[logcount + 1]; for (int i = 0; i <= logcount; i++) counts[i] = i;
doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_TERMS_SUFFIX, facetValues.toArray(new String[count]));
doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_COUNT_SUFFIX, facetValues.size());
doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNT_SUFFIX, logcount);
doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNTS_SUFFIX, counts);
vocabularies.add(facetName);
}
if (vocabularies.size() > 0) add(doc, CollectionSchema.vocabularies_sxt, vocabularies);
}
remove(doc, CollectionSchema.synonyms_sxt); // delete old values
if (this.isEmpty() || contains(CollectionSchema.synonyms_sxt)) {
if (synonyms.size() > 0) add(doc, CollectionSchema.synonyms_sxt, synonyms);
}
}
public static boolean postprocessingRunning = false;
public static String postprocessingActivity = "";
// if started, the following values are assigned
public static long postprocessingStartTime = 0; // the start time for the processing; not started = 0
public static int postprocessingCollection1Count = 0; // number of documents to be processed
public static int postprocessingWebgraphCount = 0; // number of documents to be processed
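/**
 * Build the Solr query which selects all documents in the main collection
 * that still carry a process_sxt tag, optionally restricted to documents
 * marked with the given harvest key.
 * @param segment the Solr segment
 * @param harvestkey key from a harvest process, or null to select all pending documents
 * @return the Solr query string
 */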
public static final String collection1query(final Segment segment, final String harvestkey) {
return (harvestkey == null || !segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.harvestkey_s) ?
"" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM;
}
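/**
 * Same selection as collection1query(), but for the webgraph index:
 * all webgraph documents that still carry a process_sxt tag, optionally
 * restricted to edges marked with the given harvest key.
 * @param segment the Solr segment
 * @param harvestkey key from a harvest process, or null to select all pending documents
 * @return the Solr query string
 */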
public static final String webgraphquery(final Segment segment, final String harvestkey) {
return (harvestkey == null || !segment.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.harvestkey_s) ?
"" : WebgraphSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") +
WebgraphSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM;
}
/**
* Performs post-processing steps for all entries that have a process tag assigned
* @param segment Solr segment. Must not be null.
* @param rrCache reference report cache for the segment.
* @param harvestkey key from a harvest process, used to mark documents needing post-processing
* @param byPartialUpdate when true, perform partial updates on documents
* @return the number of post processed documents
*/
public int postprocessing(final Segment segment, final ReferenceReportCache rrCache, final String harvestkey, final boolean byPartialUpdate) {
if (!this.contains(CollectionSchema.process_sxt)) return 0;
if (!segment.connectedCitation() && !segment.fulltext().useWebgraph()) return 0;
final SolrConnector collectionConnector = segment.fulltext().getDefaultConnector();
collectionConnector.commit(false); // make sure that we have latest information that can be found
if (segment.fulltext().useWebgraph()) segment.fulltext().getWebgraphConnector().commit(false);
final CollectionConfiguration collection = segment.fulltext().getDefaultConfiguration();
final WebgraphConfiguration webgraph = segment.fulltext().getWebgraphConfiguration();
// calculate the number of documents to be processed
String collection1query = collection1query(segment, harvestkey);
String webgraphquery = webgraphquery(segment, harvestkey);
postprocessingRunning = true;
postprocessingStartTime = System.currentTimeMillis();
postprocessingActivity = "collecting counts";
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
try {
postprocessingCollection1Count = (int) collectionConnector.getCountByQuery("{!cache=false}" + collection1query);
postprocessingWebgraphCount = segment.fulltext().useWebgraph() ? (int) segment.fulltext().getWebgraphConnector().getCountByQuery("{!cache=false}" + webgraphquery) : 0;
} catch (IOException e) {
postprocessingCollection1Count = -1;
postprocessingWebgraphCount = -1;
}
postprocessingActivity = "create ranking map";
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
boolean shallComputeCR = (segment.fulltext().useWebgraph() &&
((webgraph.contains(WebgraphSchema.source_id_s) && webgraph.contains(WebgraphSchema.source_cr_host_norm_i)) ||
(webgraph.contains(WebgraphSchema.target_id_s) && webgraph.contains(WebgraphSchema.target_cr_host_norm_i))) ||
(collection.contains(CollectionSchema.cr_host_count_i) &&
collection.contains(CollectionSchema.cr_host_chance_d) &&
collection.contains(CollectionSchema.cr_host_norm_i)));
// create the ranking map
final Map<String, CRV> rankings;
if(shallComputeCR) {
// collect hosts from index which shall take part in citation computation
postprocessingActivity = "collecting host facets for collection";
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
ReversibleScoreMap<String> collection1hosts;
try {
Map<String, ReversibleScoreMap<String>> hostfacet = collectionConnector.getFacets("{!cache=false}" + collection1query, 10000000, CollectionSchema.host_s.getSolrFieldName());
collection1hosts = hostfacet.get(CollectionSchema.host_s.getSolrFieldName());
} catch (final IOException e2) {
ConcurrentLog.logException(e2);
collection1hosts = new ClusteredScoreMap<String>(true);
}
rankings = createRankingMap(segment, rrCache, collectionConnector, collection1hosts);
} else {
rankings = new ConcurrentHashMap<String, CRV>();
}
// process all documents at the webgraph for the outgoing links of this document
final AtomicInteger allcount = new AtomicInteger(0);
if (segment.fulltext().useWebgraph() && shallComputeCR) {
postprocessWebgraph(segment, webgraph, webgraphquery, rankings, allcount);
}
// process all documents in collection
postprocessDocuments(segment, rrCache, harvestkey, byPartialUpdate, collectionConnector, collection,
collection1query, rankings, allcount);
postprocessingCollection1Count = 0;
postprocessingWebgraphCount = 0;
postprocessingActivity = "postprocessing terminated";
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
postprocessingRunning = false;
return allcount.get();
}
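// Hypothetical usage (a sketch; the actual parameter values depend on the caller):
//   CollectionConfiguration config = segment.fulltext().getDefaultConfiguration();
//   int processed = config.postprocessing(segment, rrCache, null, true);
//   // processed == number of documents whose process tags were resolved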
/**
* Performs post-processing steps on the main documents collection.
* @param segment Solr segment.
* @param rrCache reference report cache for the segment.
* @param harvestkey key from a harvest process, used to mark documents needing post-processing
* @param byPartialUpdate when true, perform partial updates on documents
* @param collectionConnector connector to the main Solr collection
* @param collection schema configuration for the collection
* @param collection1query query used to harvest items to postprocess in the main collection
* @param rankings map from document id to computed citation ranking (CR) values
* @param allcount global postprocessed documents count
*/
private void postprocessDocuments(final Segment segment, final ReferenceReportCache rrCache,
final String harvestkey, final boolean byPartialUpdate, final SolrConnector collectionConnector,
final CollectionConfiguration collection, final String collection1query, final Map<String, CRV> rankings,
final AtomicInteger allcount) {
final Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
final Set<String> uniqueURLs = new ConcurrentHashSet<String>(); // will be used in a concurrent environment
final Set<String> omitFields = new HashSet<String>();
omitFields.add(CollectionSchema.process_sxt.getSolrFieldName());
omitFields.add(CollectionSchema.harvestkey_s.getSolrFieldName());
final Collection<String> failids = new ConcurrentHashSet<String>();
final AtomicInteger countcheck = new AtomicInteger(0);
final AtomicInteger proccount = new AtomicInteger();
final AtomicInteger proccount_referencechange = new AtomicInteger();
final AtomicInteger proccount_citationchange = new AtomicInteger();
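// the proccount_* counters track progress: documents processed overall, and
// documents whose reference counts or citation rankings were actually changed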
try {
// partitioning of the index, get a facet for a partitioning key
final long count = collectionConnector.getCountByQuery("{!cache=false}" + collection1query);
String partitioningKey = CollectionSchema.responsetime_i.getSolrFieldName();
postprocessingActivity = "collecting " + count + " documents from the collection for harvestkey " + harvestkey + ", partitioned by " + partitioningKey;
if (count > 0) {
Map<String, ReversibleScoreMap<String>> partitioningFacet = collectionConnector.getFacets("{!cache=false}" + collection1query, 100000, partitioningKey);
ReversibleScoreMap<String> partitioning = partitioningFacet.get(partitioningKey);
long emptyCount = collectionConnector.getCountByQuery("{!cache=false}" + "-" + partitioningKey + AbstractSolrConnector.CATCHALL_DTERM + " AND (" + collection1query + ")");
if (emptyCount > 0) partitioning.inc("", (int) emptyCount);
final long start = System.currentTimeMillis();
List<String> querystrings = new ArrayList<>(partitioning.size());
for (String partitioningValue: partitioning) {
String partitioningQuery = "{!cache=false}" + ((partitioningValue.length() == 0) ?
"-" + partitioningKey + AbstractSolrConnector.CATCHALL_DTERM + " AND (" + collection1query + ")" :
partitioningKey + ":" + partitioningValue + " AND (" + collection1query + ")");
querystrings.add(partitioningQuery);
}
// start collection of documents
final int concurrency = Math.max(1, Math.min((int) (MemoryControl.available() / (100L * 1024L * 1024L)), Runtime.getRuntime().availableProcessors()));
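// concurrency heuristic: one rewrite thread per ~100 MB of available memory,
// at least one and at most one per processor core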
//final int concurrency = 1;
final boolean reference_computation = this.contains(CollectionSchema.references_i) &&
this.contains(CollectionSchema.references_internal_i) &&
this.contains(CollectionSchema.references_external_i) &&
this.contains(CollectionSchema.references_exthosts_i);
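// reference counts can only be recomputed if all four reference fields
// are enabled in the collection schema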
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
final BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQueries(
querystrings,
(this.contains(CollectionSchema.http_unique_b) || this.contains(CollectionSchema.www_unique_b)) ?
CollectionSchema.host_subdomain_s.getSolrFieldName() + " asc," + // sort on subdomain to get hosts without subdomain first; that gives an opportunity to set www_unique_b flag to false
CollectionSchema.url_protocol_s.getSolrFieldName() + " asc" // sort on protocol to get http before https; that gives an opportunity to set http_unique_b flag to false
: null, // null sort is faster!
0, 100000000, Long.MAX_VALUE, concurrency + 1, concurrency, true,
byPartialUpdate ?
new String[]{
// the following fields are needed to perform the postprocessing
// and should only be used for partial updates; for full updates use a
// full list of fields to avoid LazyInstantiation, which has poor performance
CollectionSchema.id.getSolrFieldName(),
CollectionSchema.sku.getSolrFieldName(),
CollectionSchema.harvestkey_s.getSolrFieldName(),
CollectionSchema.process_sxt.getSolrFieldName(),
CollectionSchema.canonical_equal_sku_b.getSolrFieldName(),
CollectionSchema.canonical_s.getSolrFieldName(),
CollectionSchema.exact_signature_l.getSolrFieldName(),
CollectionSchema.fuzzy_signature_l.getSolrFieldName(),
CollectionSchema.title_exact_signature_l.getSolrFieldName(),
CollectionSchema.description_exact_signature_l.getSolrFieldName(),
CollectionSchema.host_id_s.getSolrFieldName(),
CollectionSchema.host_s.getSolrFieldName(),
CollectionSchema.host_subdomain_s.getSolrFieldName(),
CollectionSchema.url_chars_i.getSolrFieldName(),
CollectionSchema.url_protocol_s.getSolrFieldName(),
CollectionSchema.httpstatus_i.getSolrFieldName(),
CollectionSchema.inboundlinkscount_i.getSolrFieldName(),
CollectionSchema.robots_i.getSolrFieldName()} :
this.allFields());
final Thread[] rewriteThread = new Thread[concurrency];
for (int rewrite_start = 0; rewrite_start < concurrency; rewrite_start++) {
rewriteThread[rewrite_start] = new Thread("CollectionConfiguration.postprocessing.rewriteThread-" + rewrite_start) {
@Override
public void run() {
SolrDocument doc;
try {
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
// for each to-be-processed entry work on the process tag
Collection