/**
* CollectionConfiguration
* Copyright 2011 by Michael Peter Christen
* First released 14.04.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7654 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.search.schema;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.lang.reflect.Array;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.federate.solr.ProcessType;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.solr.SchemaConfiguration;
import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector.LoadTimeURL;
import net.yacy.cora.federate.solr.logic.BooleanLiteral;
import net.yacy.cora.federate.solr.logic.CatchallLiteral;
import net.yacy.cora.federate.solr.logic.Conjunction;
import net.yacy.cora.federate.solr.logic.Disjunction;
import net.yacy.cora.federate.solr.logic.LongLiteral;
import net.yacy.cora.federate.solr.logic.Negation;
import net.yacy.cora.federate.solr.logic.StringLiteral;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.ProbabilisticClassifier;
import net.yacy.document.SentenceReader;
import net.yacy.document.Tokenizer;
import net.yacy.document.content.DCEntry;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.index.RowHandleMap;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segment.ReferenceReport;
import net.yacy.search.index.Segment.ReferenceReportCache;
import net.yacy.search.query.QueryParams;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.eclipse.jetty.util.ConcurrentHashSet;
public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
private static final long serialVersionUID=-499100932212840385L;
/**
 * Heuristic switch read by yacy2solr for the preliminary value of http_unique_b:
 * if this flag (or the setUnique argument) is true, the value is taken from
 * digestURL.isHTTPS(), otherwise from digestURL.isHTTP().
 */
public static boolean UNIQUE_HEURISTIC_PREFER_HTTPS = false;
/**
 * Heuristic switch read by yacy2solr for the preliminary value of www_unique_b:
 * if true, hosts starting with "www." are preferred, otherwise hosts without that prefix.
 */
public static boolean UNIQUE_HEURISTIC_PREFER_WWWPREFIX = true;
/** the rankings of this configuration; the constructor fills it with four Ranking objects */
private final ArrayList<Ranking> rankings;
/**
 * Initialize the schema with a given configuration file.
 * The configuration file simply contains a list of lines with keywords
 * or keyword = value lines (where value is a custom Solr field name).
 * Every entry is checked against the CollectionSchema enum: known entries
 * set the solr field name of the enum constant, unknown entries are logged
 * and removed. Afterwards a warning is logged for every enum constant that
 * is missing in the configuration.
 * @param configurationFile the file holding the schema configuration
 * @param lazy value assigned to the 'lazy' field of the superclass
 * @throws IOException
 */
public CollectionConfiguration(final File configurationFile, final boolean lazy) throws IOException {
    super(configurationFile);
    super.lazy = lazy;
    this.rankings = new ArrayList<>(4);
    for (int i = 0; i <= 3; i++) this.rankings.add(new Ranking());
    // check consistency: compare with the CollectionSchema enum
    if (this.isEmpty()) return;
    final Iterator<SchemaConfiguration.Entry> it = this.entryIterator();
    // test hasNext() BEFORE fetching the entry, otherwise the last entry would never be processed
    while (it.hasNext()) {
        final SchemaConfiguration.Entry etr = it.next();
        try {
            final CollectionSchema f = CollectionSchema.valueOf(etr.key());
            f.setSolrFieldName(etr.getValue());
        } catch (final IllegalArgumentException e) {
            // valueOf() does not know this key: drop the entry from the configuration
            ConcurrentLog.fine("SolrCollectionWriter", "solr schema file " + configurationFile.getAbsolutePath() + " defines unknown attribute '" + etr.toString() + "'");
            it.remove();
        }
    }
    // check consistency the other way: look if all enum constants in CollectionSchema appear in the configuration file
    for (final CollectionSchema field: CollectionSchema.values()) {
        if (this.get(field.name()) == null) {
            if (CollectionSchema.author_sxt.getSolrFieldName().endsWith(field.name())) continue; // exception for this: that is a copy-field
            if (CollectionSchema.coordinate_p_0_coordinate.getSolrFieldName().endsWith(field.name())) continue; // exception for this: automatically generated
            if (CollectionSchema.coordinate_p_1_coordinate.getSolrFieldName().endsWith(field.name())) continue; // exception for this: automatically generated
            ConcurrentLog.warn("SolrCollectionWriter", " solr schema file " + configurationFile.getAbsolutePath() + " is missing declaration for '" + field.name() + "'");
        }
    }
}
/**
 * @return the solr field names of all CollectionSchema constants that are
 *         contained in this configuration, in enum declaration order
 */
public String[] allFields() {
    // typed list, so toArray() hands back a String[] without casts
    final List<String> names = new ArrayList<>(this.size());
    for (final CollectionSchema f: CollectionSchema.values()) {
        if (this.contains(f)) names.add(f.getSolrFieldName());
    }
    return names.toArray(new String[names.size()]);
}
/**
 * Get a ranking by index. The index is wrapped into the range of available
 * rankings, so any int (including negative values) yields a Ranking;
 * callers do not check for null or catch out-of-bound exceptions.
 * @param idx the index of the ranking
 * @return the Ranking at position (idx modulo number of rankings)
 */
public Ranking getRanking(final int idx) {
    final int n = this.rankings.size();
    // the % operator keeps the sign of idx; shift the remainder into [0, n) so negative indexes do not throw
    return this.rankings.get(((idx % n) + n) % n);
}
/**
 * Look up a ranking by its name.
 * @param name The name of the ranking to get.
 * @return The first Ranking whose name equals the given name,
 *         or null if the name is null or no ranking has that name.
 */
public Ranking getRanking(final String name) {
    if (name != null) {
        int pos = 0;
        while (pos < this.rankings.size()) {
            final Ranking candidate = this.rankings.get(pos);
            if (name.equals(candidate.getName())) return candidate;
            pos++;
        }
    }
    return null;
}
/**
 * Save the configuration to file and update the solr field names held by
 * the CollectionSchema enum from the configuration entries.
 * An IOException raised while saving is logged and not propagated; in that
 * case the enum is not updated. The throws clause is kept for callers.
 * @throws IOException
 */
@Override
public void commit() throws IOException {
    try {
        super.commit();
        // make sure the solr field names in the CollectionSchema enum are current
        final Iterator<SchemaConfiguration.Entry> it = this.entryIterator();
        // hasNext() is tested before next(): works for an empty configuration and includes the last entry
        while (it.hasNext()) {
            final SchemaConfiguration.Entry etr = it.next();
            try {
                final SchemaDeclaration f = CollectionSchema.valueOf(etr.key());
                f.setSolrFieldName(etr.getValue());
            } catch (final IllegalArgumentException e) {
                // the key has no counterpart in CollectionSchema: nothing to update
                continue;
            }
        }
    } catch (final IOException e) {
        // saving stays best-effort, but the failure must not vanish silently
        ConcurrentLog.warn("SolrCollectionWriter", "failed to commit solr schema configuration: " + e.getMessage());
    }
}
/**
 * Solr field names handed to the two-argument toSolrInputDocument / toSolrDocument
 * conversions. It holds the author_sxt copy-field and the two automatically
 * generated coordinate sub-fields.
 */
private final static Set<String> omitFields = new HashSet<>(3);
static {
    omitFields.add(CollectionSchema.author_sxt.getSolrFieldName());
    omitFields.add(CollectionSchema.coordinate_p_0_coordinate.getSolrFieldName());
    omitFields.add(CollectionSchema.coordinate_p_1_coordinate.getSolrFieldName());
}
/**
 * Convert a SolrDocument into a SolrInputDocument by delegating to the
 * two-argument conversion with the omitFields set of this class.
 * @param doc the document to convert
 * @return the result of the delegated conversion
 */
public SolrInputDocument toSolrInputDocument(final SolrDocument doc) {
    final SolrInputDocument converted = toSolrInputDocument(doc, omitFields);
    return converted;
}
/**
 * Convert a SolrInputDocument into a SolrDocument by delegating to the
 * two-argument conversion with the omitFields set of this class.
 * @param doc the input document to convert
 * @return the result of the delegated conversion
 */
public SolrDocument toSolrDocument(final SolrInputDocument doc) {
    final SolrDocument converted = toSolrDocument(doc, omitFields);
    return converted;
}
/**
 * Add the attributes that are derived from the url to the solr document:
 * id, host hash, normalized url (sku), ip, host parts, protocol, path
 * elements, file name/extension and the url parameters.
 * The id and sku fields are always written; every other field is written
 * only if allAttr is set or the field is contained in this configuration.
 * @param doc the solr document that receives the fields
 * @param allAttr if true, all fields are written regardless of the configuration
 * @param digestURL the url the attributes are taken from
 * @return the normalized url
 */
public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURL digestURL) {
    add(doc, CollectionSchema.id, ASCII.String(digestURL.hash()));
    if (allAttr || contains(CollectionSchema.host_id_s)) add(doc, CollectionSchema.host_id_s, digestURL.hosthash());
    final String us = digestURL.toNormalform(true);
    add(doc, CollectionSchema.sku, us);
    if (allAttr || contains(CollectionSchema.ip_s)) {
        final InetAddress address = digestURL.getInetAddress();
        if (address != null) add(doc, CollectionSchema.ip_s, address.getHostAddress());
    }

    final String host = digestURL.getHost();
    if (host != null) {
        final String dnc = Domains.getDNC(host);
        // number of host characters in front of the domain class, including the separating dot (if any)
        final int stubLength = host.length() - dnc.length();
        final String subdomOrga;
        if (stubLength <= 0) {
            subdomOrga = "";
        } else if (dnc.isEmpty()) {
            // no domain class was reported, so there is no separating dot to cut off;
            // cutting one character anyway would truncate the host name
            subdomOrga = host;
        } else {
            subdomOrga = host.substring(0, stubLength - 1);
        }
        // split "sub.domains.organization" at the last dot
        final int p = subdomOrga.lastIndexOf('.');
        final String subdom = (p < 0) ? "" : subdomOrga.substring(0, p);
        final String orga = (p < 0) ? subdomOrga : subdomOrga.substring(p + 1);
        if (allAttr || contains(CollectionSchema.host_s)) add(doc, CollectionSchema.host_s, host);
        if (allAttr || contains(CollectionSchema.host_dnc_s)) add(doc, CollectionSchema.host_dnc_s, dnc);
        if (allAttr || contains(CollectionSchema.host_organization_s)) add(doc, CollectionSchema.host_organization_s, orga);
        if (allAttr || contains(CollectionSchema.host_organizationdnc_s)) add(doc, CollectionSchema.host_organizationdnc_s, orga + '.' + dnc);
        if (allAttr || contains(CollectionSchema.host_subdomain_s)) add(doc, CollectionSchema.host_subdomain_s, subdom);
    }

    // path elements of link
    final String filename = digestURL.getFileName();
    String extension = MultiProtocolURL.getFileExtension(filename);
    // the stub is computed with the unmodified extension, so a trailing ";jsession=..." part is cut from the stub as well
    final String filenameStub = filename.toLowerCase().endsWith("." + extension) ? filename.substring(0, filename.length() - extension.length() - 1) : filename;
    // remove possible jsession (or other url parm like "img.jpg;jsession=123")
    // TODO: consider to implement ";jsession=123" check in getFileExtension()
    final int semicolon = extension.indexOf(';');
    if (semicolon >= 0) extension = extension.substring(0, semicolon);

    if (allAttr || contains(CollectionSchema.url_chars_i)) add(doc, CollectionSchema.url_chars_i, us.length());
    if (allAttr || contains(CollectionSchema.url_protocol_s)) add(doc, CollectionSchema.url_protocol_s, digestURL.getProtocol());
    if (allAttr || contains(CollectionSchema.url_paths_sxt) || contains(CollectionSchema.url_paths_count_i)) {
        final String[] paths = digestURL.getPaths();
        if (allAttr || contains(CollectionSchema.url_paths_count_i)) add(doc, CollectionSchema.url_paths_count_i, paths.length);
        if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, paths);
    }
    if (allAttr || contains(CollectionSchema.url_file_name_s)) add(doc, CollectionSchema.url_file_name_s, filenameStub);
    if (allAttr || contains(CollectionSchema.url_file_name_tokens_t)) add(doc, CollectionSchema.url_file_name_tokens_t, MultiProtocolURL.toTokens(filenameStub));
    if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, extension);

    // url parameters; a null map is recorded as a parameter count of 0
    final Map<String, String> searchpart = digestURL.getSearchpartMap();
    if (searchpart == null) {
        if (allAttr || contains(CollectionSchema.url_parameter_i)) add(doc, CollectionSchema.url_parameter_i, 0);
    } else {
        if (allAttr || contains(CollectionSchema.url_parameter_i)) add(doc, CollectionSchema.url_parameter_i, searchpart.size());
        if (allAttr || contains(CollectionSchema.url_parameter_key_sxt)) add(doc, CollectionSchema.url_parameter_key_sxt, searchpart.keySet().toArray(new String[searchpart.size()]));
        if (allAttr || contains(CollectionSchema.url_parameter_value_sxt)) add(doc, CollectionSchema.url_parameter_value_sxt, searchpart.values().toArray(new String[searchpart.size()]));
    }
    return us;
}
/**
 * Create a solr input document from a metadata node.
 * The node is converted with toSolrInputDocument, the url attributes are
 * added with addURIAttributes and the remaining fields (title and
 * description statistics, keywords, link counters, charset, coordinates,
 * http status, publisher, text) are filled from the node's accessors.
 * A field is written if the configuration is empty (all attributes) or
 * contains that field.
 * @param md the metadata node
 * @return the solr input document
 */
public SolrInputDocument metadata2solr(final URIMetadataNode md) {
    // urimetadatanode stores some values in private fields, add now to solr document
    final SolrInputDocument solrDoc = toSolrInputDocument(md);
    final boolean all = this.isEmpty();
    addURIAttributes(solrDoc, all, md.url());

    // title: the node carries exactly one title
    final String title = md.dc_title();
    if (all || contains(CollectionSchema.title_count_i)) {
        add(solrDoc, CollectionSchema.title_count_i, 1);
    }
    if (all || contains(CollectionSchema.title_chars_val)) {
        final Integer[] chars = new Integer[]{Integer.valueOf(title.length())};
        add(solrDoc, CollectionSchema.title_chars_val, chars);
    }
    if (all || contains(CollectionSchema.title_words_val)) {
        final Integer[] words = new Integer[]{Integer.valueOf(CommonPattern.SPACES.split(title).length)};
        add(solrDoc, CollectionSchema.title_words_val, words);
    }

    // description: taken from the snippet; a missing snippet produces empty arrays and a count of 0
    final String snippet = md.snippet();
    final boolean hasDescription = snippet != null;
    final String description = hasDescription ? snippet : "";
    if (all || contains(CollectionSchema.description_txt)) {
        if (hasDescription) {
            add(solrDoc, CollectionSchema.description_txt, new String[]{description});
        } else {
            add(solrDoc, CollectionSchema.description_txt, new String[0]);
        }
    }
    if (all || contains(CollectionSchema.description_count_i)) {
        final int descriptionCount = hasDescription ? 1 : 0;
        add(solrDoc, CollectionSchema.description_count_i, descriptionCount);
    }
    if (all || contains(CollectionSchema.description_chars_val)) {
        final Integer[] chars = hasDescription ? new Integer[]{Integer.valueOf(description.length())} : new Integer[0];
        add(solrDoc, CollectionSchema.description_chars_val, chars);
    }
    if (all || contains(CollectionSchema.description_words_val)) {
        final Integer[] words;
        if (hasDescription) {
            // an empty description counts as zero words
            final int wordCount = description.length() == 0 ? 0 : CommonPattern.SPACES.split(description).length;
            words = new Integer[]{Integer.valueOf(wordCount)};
        } else {
            words = new Integer[0];
        }
        add(solrDoc, CollectionSchema.description_words_val, words);
    }

    // keywords: documents flagged as "index of" get the extra keyword "indexof"
    String keywords = md.dc_subject();
    final Bitfield flags = md.flags();
    if (flags.get(Tokenizer.flag_cat_indexof)) {
        if (keywords == null || keywords.isEmpty()) {
            keywords = "indexof";
        } else {
            // keep the separator style of the existing keyword list
            keywords = keywords + (keywords.indexOf(',') > 0 ? ", indexof" : " indexof");
        }
    }
    if (all || contains(CollectionSchema.keywords)) {
        add(solrDoc, CollectionSchema.keywords, keywords);
    }

    // link counters
    if (all || contains(CollectionSchema.imagescount_i)) add(solrDoc, CollectionSchema.imagescount_i, md.limage());
    if (all || contains(CollectionSchema.linkscount_i)) add(solrDoc, CollectionSchema.linkscount_i, md.llocal() + md.lother());
    if (all || contains(CollectionSchema.inboundlinkscount_i)) add(solrDoc, CollectionSchema.inboundlinkscount_i, md.llocal());
    if (all || contains(CollectionSchema.outboundlinkscount_i)) add(solrDoc, CollectionSchema.outboundlinkscount_i, md.lother());
    if (all || contains(CollectionSchema.charset_s)) add(solrDoc, CollectionSchema.charset_s, StandardCharsets.UTF_8.name());

    // coordinates: only written when both latitude and longitude are non-zero
    if (md.lat() != 0.0 && md.lon() != 0.0) {
        if (all || contains(CollectionSchema.coordinate_p)) {
            add(solrDoc, CollectionSchema.coordinate_p, md.lat() + "," + md.lon());
        }
    }
    if (all || contains(CollectionSchema.httpstatus_i)) add(solrDoc, CollectionSchema.httpstatus_i, 200);
    if (all || contains(CollectionSchema.publisher_t)) add(solrDoc, CollectionSchema.publisher_t, md.dc_publisher());

    // fields that are in URIMetadataRow additional to yacy2solr basic requirement
    if (all || contains(CollectionSchema.audiolinkscount_i)) add(solrDoc, CollectionSchema.audiolinkscount_i, md.laudio());
    if (all || contains(CollectionSchema.videolinkscount_i)) add(solrDoc, CollectionSchema.videolinkscount_i, md.lvideo());
    if (all || contains(CollectionSchema.applinkscount_i)) add(solrDoc, CollectionSchema.applinkscount_i, md.lapp());

    if (all || contains(CollectionSchema.text_t)) {
        // construct the text from other metadata parts.
        // This is necessary here since that is used to search the link when no other data (parsed text body) is available.
        // Only the url tokens are accumulated: title, author, snippet (description_txt) and keywords are
        // default search fields via getQueryFields(), and the publisher has its own field publisher_t.
        final StringBuilder text = new StringBuilder(120);
        accText(text, md.url().toTokens());
        add(solrDoc, CollectionSchema.text_t, text.toString());
    }
    return solrDoc;
}
/**
 * Accumulate a text fragment as a sentence: the trimmed text is appended to
 * the builder, separated from existing content by a single space and
 * terminated with a '.' unless it already ends with one.
 * Null, empty and whitespace-only text leaves the builder untouched.
 * @param sb the builder that collects the sentences
 * @param text the text to append, may be null
 */
private static void accText(final StringBuilder sb, String text) {
    if (text == null) return;
    // trim first: whitespace-only text must not add a separator and a lone '.'
    text = text.trim();
    if (text.isEmpty()) return;
    if (sb.length() != 0) sb.append(' ');
    sb.append(text);
    if (text.charAt(text.length() - 1) != '.') sb.append('.');
}
/**
 * Container for the links of a document, split into inbound and outbound links.
 * Each of the three arrays has exactly two elements: index 0 holds the
 * inbound entries, index 1 the outbound entries. The lists at the same array
 * index are filled in parallel by enrichSubgraph (protocol, url stub and
 * anchor text of one link share the same list position).
 */
public static class Subgraph {
    public final ArrayList<String>[] urlProtocols, urlStubs, urlAnchorTexts;

    /**
     * @param inboundSize initial capacity of the inbound lists
     * @param outboundSize initial capacity of the outbound lists
     */
    public Subgraph(int inboundSize, int outboundSize) {
        this.urlProtocols = listPair(inboundSize, outboundSize);
        this.urlStubs = listPair(inboundSize, outboundSize);
        this.urlAnchorTexts = listPair(inboundSize, outboundSize);
    }

    /** create a two-element array holding an inbound (index 0) and an outbound (index 1) list */
    @SuppressWarnings("unchecked")
    private static ArrayList<String>[] listPair(final int inboundSize, final int outboundSize) {
        // generic arrays cannot be created directly; the cast is safe because only this method fills the array
        final ArrayList<String>[] pair = (ArrayList<String>[]) new ArrayList<?>[2];
        pair[0] = new ArrayList<String>(inboundSize);
        pair[1] = new ArrayList<String>(outboundSize);
        return pair;
    }
}
/**
 * Add one link to the subgraph. The link counts as inbound if both hosts
 * are null, or if both are non-null and equal or differ only by a leading
 * "www." on either side; otherwise it is outbound. Protocol, url stub and
 * text property of the target are appended to the inbound (index 0) or
 * outbound (index 1) lists of the subgraph.
 * @param subgraph the subgraph that receives the link
 * @param source_url the url of the document containing the link
 * @param target_url the url the link points to
 * @return true if the link was classified as inbound
 */
public static boolean enrichSubgraph(final Subgraph subgraph, final DigestURL source_url, AnchorURL target_url) {
    final String anchorText = target_url.getTextProperty(); // the text property attached to the link
    final String sourceHost = source_url.getHost();
    final String targetHost = target_url.getHost();
    final boolean inbound;
    if (sourceHost == null || targetHost == null) {
        // a missing host only matches another missing host
        inbound = sourceHost == null && targetHost == null;
    } else {
        // well, not everybody defines 'outbound' that way but however, thats used here.
        inbound = targetHost.equals(sourceHost)
                || targetHost.equals("www." + sourceHost)
                || sourceHost.equals("www." + targetHost);
    }
    final int slot;
    if (inbound) slot = 0; else slot = 1;
    subgraph.urlProtocols[slot].add(target_url.getProtocol());
    subgraph.urlStubs[slot].add(target_url.urlstub(true, true));
    subgraph.urlAnchorTexts[slot].add(anchorText);
    return inbound;
}
/**
 * a SolrVector is a SolrInputDocument with the ability
 * to store also the webgraph that is associated with
 * the web document in the Solr document.
 */
public static class SolrVector extends SolrInputDocument {
    private static final long serialVersionUID = -210901881471714939L;
    /** the webgraph documents attached to this document, in the order they were added */
    private final List<SolrInputDocument> webgraphDocuments;

    /** create an empty document without webgraph documents */
    public SolrVector() {
        super();
        this.webgraphDocuments = new ArrayList<>();
    }

    /**
     * @param webgraphDocument the webgraph document to append to the list of this vector
     */
    public void addWebgraphDocument(SolrInputDocument webgraphDocument) {
        this.webgraphDocuments.add(webgraphDocument);
    }

    /**
     * @return the internal (live, mutable) list of attached webgraph documents
     */
    public List<SolrInputDocument> getWebgraphDocuments() {
        return this.webgraphDocuments;
    }
}
public SolrVector yacy2solr(
final Segment segment,
final Map collections, final ResponseHeader responseHeader,
final Document document, final Condenser condenser, final DigestURL referrerURL, final String language, final boolean setUnique,
final WebgraphConfiguration webgraph, final String sourceName) {
// we use the SolrCell design as index schema
SolrVector doc = new SolrVector();
final DigestURL digestURL = document.dc_source();
boolean allAttr = this.isEmpty();
String url = addURIAttributes(doc, allAttr, digestURL);
add(doc, CollectionSchema.content_type, new String[]{document.dc_format()}); // content_type (mime) is defined a schema field and we rely on it in some queries like imagequery (makes it mandatory, no need to check)
Set processTypes = new LinkedHashSet();
String host = digestURL.getHost();
int crawldepth = document.getDepth();
if ((allAttr || contains(CollectionSchema.crawldepth_i))) {
CollectionSchema.crawldepth_i.add(doc, crawldepth);
}
if (allAttr || (contains(CollectionSchema.cr_host_chance_d) && contains(CollectionSchema.cr_host_count_i) && contains(CollectionSchema.cr_host_norm_i))) {
processTypes.add(ProcessType.CITATION); // postprocessing needed
}
if (allAttr || contains(CollectionSchema.collection_sxt) && collections != null && collections.size() > 0) {
List cs = new ArrayList();
for (Map.Entry e: collections.entrySet()) {
if (e.getValue().matcher(url).matches()) cs.add(e.getKey());
}
add(doc, CollectionSchema.collection_sxt, cs);
}
char doctype = Response.docType(responseHeader== null ? null : responseHeader.getContentType()); // null returns DT_UNKNOWN
List titles = document.titles();
if (allAttr || contains(CollectionSchema.title)) {
if (doctype == Response.DT_IMAGE || doctype == Response.DT_AUDIO || doctype == Response.DT_MOVIE) {
String mediatitle = responseHeader.get(HeaderFramework.X_YACY_MEDIA_TITLE, "");
if (mediatitle.length() > 0) {
if (titles.size() == 0) titles.add(mediatitle); else titles.set(0, mediatitle);
}
}
add(doc, CollectionSchema.title, titles);
if ((allAttr || contains(CollectionSchema.title_exact_signature_l)) && titles.size() > 0) {
add(doc, CollectionSchema.title_exact_signature_l, EnhancedTextProfileSignature.getSignatureLong(titles.get(0)));
}
}
if (allAttr || contains(CollectionSchema.title_count_i)) add(doc, CollectionSchema.title_count_i, titles.size());
if (allAttr || contains(CollectionSchema.title_chars_val)) {
ArrayList cv = new ArrayList(titles.size());
for (String s: titles) cv.add(new Integer(s.length()));
add(doc, CollectionSchema.title_chars_val, cv);
}
if (allAttr || contains(CollectionSchema.title_words_val)) {
ArrayList cv = new ArrayList(titles.size());
for (String s: titles) cv.add(new Integer(CommonPattern.SPACES.split(s).length));
add(doc, CollectionSchema.title_words_val, cv);
}
String[] descriptions = document.dc_description();
if (allAttr || contains(CollectionSchema.description_txt)) {
add(doc, CollectionSchema.description_txt, descriptions);
if ((allAttr || contains(CollectionSchema.description_exact_signature_l)) && descriptions != null && descriptions.length > 0) {
add(doc, CollectionSchema.description_exact_signature_l, EnhancedTextProfileSignature.getSignatureLong(descriptions));
}
}
if (allAttr || contains(CollectionSchema.description_count_i)) add(doc, CollectionSchema.description_count_i, descriptions.length);
if (allAttr || contains(CollectionSchema.description_chars_val)) {
ArrayList cv = new ArrayList(descriptions.length);
for (String s: descriptions) cv.add(new Integer(s.length()));
add(doc, CollectionSchema.description_chars_val, cv);
}
if (allAttr || contains(CollectionSchema.description_words_val)) {
ArrayList cv = new ArrayList(descriptions.length);
for (String s: descriptions) cv.add(new Integer(CommonPattern.SPACES.split(s).length));
add(doc, CollectionSchema.description_words_val, cv);
}
if (allAttr || contains(CollectionSchema.author)) {
String author = document.dc_creator();
if (author == null || author.length() == 0) author = document.dc_publisher();
add(doc, CollectionSchema.author, author);
}
if (allAttr || contains(CollectionSchema.last_modified)) {
Date lastModified = responseHeader == null ? new Date() : responseHeader.lastModified();
if (lastModified == null) lastModified = new Date();
if (document.getLastModified().before(lastModified)) lastModified = document.getLastModified();
long firstSeen = segment.getFirstSeenTime(digestURL.hash());
if (firstSeen > 0 && firstSeen < lastModified.getTime()) lastModified = new Date(firstSeen); // patch the date if we have seen the document earlier
add(doc, CollectionSchema.last_modified, lastModified);
}
if (allAttr || contains(CollectionSchema.dates_in_content_dts) || contains(CollectionSchema.dates_in_content_count_i)) {
LinkedHashSet dates_in_content = condenser.dates_in_content;
if (allAttr || contains(CollectionSchema.dates_in_content_count_i)) {
add(doc, CollectionSchema.dates_in_content_count_i, dates_in_content.size());
}
if (dates_in_content.size() > 0 && (allAttr || contains(CollectionSchema.dates_in_content_dts))) {
add(doc, CollectionSchema.dates_in_content_dts, dates_in_content.toArray(new Date[dates_in_content.size()]));
}
}
if (allAttr || contains(CollectionSchema.keywords)) {
String keywords = document.dc_subject(' ');
if (doctype == Response.DT_IMAGE || doctype == Response.DT_AUDIO || doctype == Response.DT_MOVIE) {
keywords = responseHeader.get(HeaderFramework.X_YACY_MEDIA_KEYWORDS, keywords);
}
add(doc, CollectionSchema.keywords, keywords);
}
// unique-fields; these values must be corrected during postprocessing. (the following logic is !^ (not-xor) but I prefer to write it that way as it is)
add(doc, CollectionSchema.http_unique_b, setUnique || UNIQUE_HEURISTIC_PREFER_HTTPS ? digestURL.isHTTPS() : digestURL.isHTTP()); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.www_unique_b, setUnique || host != null && (UNIQUE_HEURISTIC_PREFER_WWWPREFIX ? host.startsWith("www.") : !host.startsWith("www."))); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.exact_signature_l, condenser.exactSignature());
add(doc, CollectionSchema.exact_signature_unique_b, true); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.exact_signature_copycount_i, 0); // this must be corrected afterwards during postprocessing!
add(doc, CollectionSchema.fuzzy_signature_l, condenser.fuzzySignature());
add(doc, CollectionSchema.fuzzy_signature_text_t, condenser.fuzzySignatureText());
add(doc, CollectionSchema.fuzzy_signature_unique_b, true); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.fuzzy_signature_copycount_i, 0); // this must be corrected afterwards during postprocessing!
if (this.contains(CollectionSchema.exact_signature_unique_b) || this.contains(CollectionSchema.exact_signature_copycount_i) ||
this.contains(CollectionSchema.fuzzy_signature_l) || this.contains(CollectionSchema.fuzzy_signature_copycount_i) ||
this.contains(CollectionSchema.http_unique_b) || this.contains(CollectionSchema.www_unique_b)) {
processTypes.add(ProcessType.UNIQUE);
}
// get list of all links; they will be shrinked by urls that appear in other fields of the solr schema
LinkedHashMap inboundLinks = document.inboundLinks();
LinkedHashMap outboundLinks = document.outboundLinks();
Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size());
int c = 0;
final Object parser = document.getParserObject();
boolean containsCanonical = false;
DigestURL canonical = null;
if (parser instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) parser;
List images = html.getImages();
// header tags
int h = 0;
int f = 1;
String[] hs;
hs = html.getHeadlines(1); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, CollectionSchema.h1_txt, hs); add(doc, CollectionSchema.h1_i, hs.length);
hs = html.getHeadlines(2); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, CollectionSchema.h2_txt, hs); add(doc, CollectionSchema.h2_i, hs.length);
hs = html.getHeadlines(3); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, CollectionSchema.h3_txt, hs); add(doc, CollectionSchema.h3_i, hs.length);
hs = html.getHeadlines(4); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, CollectionSchema.h4_txt, hs); add(doc, CollectionSchema.h4_i, hs.length);
hs = html.getHeadlines(5); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, CollectionSchema.h5_txt, hs); add(doc, CollectionSchema.h5_i, hs.length);
hs = html.getHeadlines(6); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, CollectionSchema.h6_txt, hs); add(doc, CollectionSchema.h6_i, hs.length);
add(doc, CollectionSchema.htags_i, h);
add(doc, CollectionSchema.schema_org_breadcrumb_i, html.breadcrumbCount());
// meta tags: Open Graph properties
String og;
og = html.getMetas().get("og:title"); if (og != null) add(doc, CollectionSchema.opengraph_title_t, og);
og = html.getMetas().get("og:type"); if (og != null) add(doc, CollectionSchema.opengraph_type_s, og);
og = html.getMetas().get("og:url"); if (og != null) add(doc, CollectionSchema.opengraph_url_s, og);
og = html.getMetas().get("og:image"); if (og != null) add(doc, CollectionSchema.opengraph_image_s, og);
// noindex and nofollow attributes
// from HTML (meta-tag in HTML header: robots)
// and HTTP header (X-Robots-Tag property)
// coded as binary value:
// bit 0: "all" contained in html header meta
// bit 1: "index" contained in html header meta
// bit 2: "follow" contained in html header meta
// bit 3: "noindex" contained in html header meta
// bit 4: "nofollow" contained in html header meta
// bit 5: "noarchive" contained in html header meta
// bit 8: "all" contained in http header X-Robots-Tag
// bit 9: "noindex" contained in http header X-Robots-Tag
// bit 10: "nofollow" contained in http header X-Robots-Tag
// bit 11: "noarchive" contained in http header X-Robots-Tag
// bit 12: "nosnippet" contained in http header X-Robots-Tag
// bit 13: "noodp" contained in http header X-Robots-Tag
// bit 14: "notranslate" contained in http header X-Robots-Tag
// bit 15: "noimageindex" contained in http header X-Robots-Tag
// bit 16: "unavailable_after" contained in http header X-Robots-Tag
int b = 0;
String robots_meta = html.getMetas().get("robots");
// this tag may have values: all, index, noindex, nofollow; see http://www.robotstxt.org/meta.html
if (robots_meta != null) {
robots_meta = robots_meta.toLowerCase();
if (robots_meta.indexOf("all",0) >= 0) b += 1; // set bit 0
if (robots_meta.indexOf("index",0) == 0 || robots_meta.indexOf(" index",0) >= 0 || robots_meta.indexOf(",index",0) >= 0 ) b += 2; // set bit 1
if (robots_meta.indexOf("follow",0) == 0 || robots_meta.indexOf(" follow",0) >= 0 || robots_meta.indexOf(",follow",0) >= 0 ) b += 4; // set bit 2
if (robots_meta.indexOf("noindex",0) >= 0) b += 8; // set bit 3
if (robots_meta.indexOf("nofollow",0) >= 0) b += 16; // set bit 4
if (robots_meta.indexOf("noarchive",0) >= 0) b += 32; // set bit 5
}
String x_robots_tag = responseHeader == null ? "" : responseHeader.getXRobotsTag();
if (!x_robots_tag.isEmpty()) {
// this tag may have values: all, noindex, nofollow, noarchive, nosnippet, noodp, notranslate, noimageindex, unavailable_after, none; see https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag?hl=de
if (x_robots_tag.indexOf("all",0) >= 0) b += 1<<8; // set bit 8
if (x_robots_tag.indexOf("noindex",0) >= 0||x_robots_tag.indexOf("none",0) >= 0) b += 1<<9; // set bit 9
if (x_robots_tag.indexOf("nofollow",0) >= 0||x_robots_tag.indexOf("none",0) >= 0) b += 1<<10; // set bit 10
if (x_robots_tag.indexOf("noarchive",0) >= 0) b += 1<<11; // set bit 11
if (x_robots_tag.indexOf("nosnippet",0) >= 0) b += 1<<12; // set bit 12
if (x_robots_tag.indexOf("noodp",0) >= 0) b += 1<<13; // set bit 13
if (x_robots_tag.indexOf("notranslate",0) >= 0) b += 1<<14; // set bit 14
if (x_robots_tag.indexOf("noimageindex",0) >= 0) b += 1<<15; // set bit 15
if (x_robots_tag.indexOf("unavailable_after",0) >= 0) b += 1<<16; // set bit 16
}
add(doc, CollectionSchema.robots_i, b);
// meta tags: generator
final String generator = html.getMetas().get("generator");
if (generator != null) add(doc, CollectionSchema.metagenerator_t, generator);
// bold, italic
final String[] bold = html.getBold();
add(doc, CollectionSchema.boldcount_i, bold.length);
if (bold.length > 0) {
add(doc, CollectionSchema.bold_txt, bold);
if (allAttr || contains(CollectionSchema.bold_val)) {
add(doc, CollectionSchema.bold_val, html.getBoldCount(bold));
}
}
final String[] italic = html.getItalic();
add(doc, CollectionSchema.italiccount_i, italic.length);
if (italic.length > 0) {
add(doc, CollectionSchema.italic_txt, italic);
if (allAttr || contains(CollectionSchema.italic_val)) {
add(doc, CollectionSchema.italic_val, html.getItalicCount(italic));
}
}
final String[] underline = html.getUnderline();
add(doc, CollectionSchema.underlinecount_i, underline.length);
if (underline.length > 0) {
add(doc, CollectionSchema.underline_txt, underline);
if (allAttr || contains(CollectionSchema.underline_val)) {
add(doc, CollectionSchema.underline_val, html.getUnderlineCount(underline));
}
}
final String[] li = html.getLi();
add(doc, CollectionSchema.licount_i, li.length);
if (li.length > 0) add(doc, CollectionSchema.li_txt, li);
final String[] dt = html.getDt();
add(doc, CollectionSchema.dtcount_i, dt.length);
if (dt.length > 0) add(doc, CollectionSchema.dt_txt, dt);
final String[] dd = html.getDd();
add(doc, CollectionSchema.ddcount_i, dd.length);
if (dd.length > 0) add(doc, CollectionSchema.dd_txt, dd);
final List startDates = html.getStartDates();
if (startDates.size() > 0) add(doc, CollectionSchema.startDates_dts, startDates.toArray(new Date[startDates.size()]));
final List endDates = html.getStartDates();
if (endDates.size() > 0) add(doc, CollectionSchema.endDates_dts, endDates.toArray(new Date[endDates.size()]));
final List articles = html.getArticles();
add(doc, CollectionSchema.articlecount_i, articles.size());
if (articles.size() > 0) add(doc, CollectionSchema.article_txt, articles);
// images
final ArrayList imgprots = new ArrayList(images.size());
final Integer[] imgheights = new Integer[images.size()];
final Integer[] imgwidths = new Integer[images.size()];
final Integer[] imgpixels = new Integer[images.size()];
final String[] imgstubs = new String[images.size()];
final String[] imgalts = new String[images.size()];
int withalt = 0;
int i = 0;
LinkedHashSet images_text_map = new LinkedHashSet();
for (final ImageEntry ie: images) {
final MultiProtocolURL uri = ie.url();
inboundLinks.remove(uri);
outboundLinks.remove(uri);
imgheights[i] = ie.height();
imgwidths[i] = ie.width();
imgpixels[i] = ie.height() < 0 || ie.width() < 0 ? -1 : ie.height() * ie.width();
String protocol = uri.getProtocol();
imgprots.add(protocol);
imgstubs[i] = uri.toString().substring(protocol.length() + 3);
imgalts[i] = ie.alt();
for (String it: CommonPattern.SPACE.split(uri.toTokens())) images_text_map.add(it);
if (ie.alt() != null && ie.alt().length() > 0) {
SentenceReader sr = new SentenceReader(ie.alt());
while (sr.hasNext()) images_text_map.add(sr.next().toString());
withalt++;
}
i++;
}
StringBuilder images_text = new StringBuilder(images_text_map.size() * 6 + 1);
for (String s: images_text_map) images_text.append(s.trim()).append(' ');
if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, images.size());
if (allAttr || contains(CollectionSchema.images_protocol_sxt)) add(doc, CollectionSchema.images_protocol_sxt, protocolList2indexedList(imgprots));
if (allAttr || contains(CollectionSchema.images_urlstub_sxt)) add(doc, CollectionSchema.images_urlstub_sxt, imgstubs);
if (allAttr || contains(CollectionSchema.images_alt_sxt)) add(doc, CollectionSchema.images_alt_sxt, imgalts);
if (allAttr || contains(CollectionSchema.images_height_val)) add(doc, CollectionSchema.images_height_val, imgheights);
if (allAttr || contains(CollectionSchema.images_width_val)) add(doc, CollectionSchema.images_width_val, imgwidths);
if (allAttr || contains(CollectionSchema.images_pixel_val)) add(doc, CollectionSchema.images_pixel_val, imgpixels);
if (allAttr || contains(CollectionSchema.images_withalt_i)) add(doc, CollectionSchema.images_withalt_i, withalt);
if (allAttr || contains(CollectionSchema.images_text_t)) add(doc, CollectionSchema.images_text_t, images_text.toString().trim());
// style sheets
if (allAttr || contains(CollectionSchema.css_tag_sxt)) {
final Map csss = html.getCSS();
final String[] css_tag = new String[csss.size()];
final String[] css_url = new String[csss.size()];
c = 0;
for (final Map.Entry entry: csss.entrySet()) {
final String cssurl = entry.getKey().toNormalform(false);
inboundLinks.remove(entry.getKey());
outboundLinks.remove(entry.getKey());
css_tag[c] =
"";
css_url[c] = cssurl;
c++;
}
add(doc, CollectionSchema.csscount_i, css_tag.length);
if (css_tag.length > 0) add(doc, CollectionSchema.css_tag_sxt, css_tag);
if (css_url.length > 0) add(doc, CollectionSchema.css_url_sxt, css_url);
}
// Scripts
if (allAttr || contains(CollectionSchema.scripts_sxt)) {
final Set scriptss = html.getScript();
final String[] scripts = new String[scriptss.size()];
c = 0;
for (final AnchorURL u: scriptss) {
inboundLinks.remove(u);
outboundLinks.remove(u);
scripts[c++] = u.toNormalform(false);
}
add(doc, CollectionSchema.scriptscount_i, scripts.length);
if (scripts.length > 0) add(doc, CollectionSchema.scripts_sxt, scripts);
}
// Frames
if (allAttr || contains(CollectionSchema.frames_sxt)) {
final Set framess = html.getFrames();
final String[] frames = new String[framess.size()];
c = 0;
for (final AnchorURL u: framess) {
inboundLinks.remove(u);
outboundLinks.remove(u);
frames[c++] = u.toNormalform(false);
}
add(doc, CollectionSchema.framesscount_i, frames.length);
if (frames.length > 0) {
add(doc, CollectionSchema.frames_sxt, frames);
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, crawldepth, alllinks, images, true, framess, citations); // add here because links have been removed from remaining inbound/outbound
}
}
// IFrames
if (allAttr || contains(CollectionSchema.iframes_sxt)) {
final Set iframess = html.getIFrames();
final String[] iframes = new String[iframess.size()];
c = 0;
for (final AnchorURL u: iframess) {
inboundLinks.remove(u);
outboundLinks.remove(u);
iframes[c++] = u.toNormalform(false);
}
add(doc, CollectionSchema.iframesscount_i, iframes.length);
if (iframes.length > 0) {
add(doc, CollectionSchema.iframes_sxt, iframes);
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, crawldepth, alllinks, images, true, iframess, citations); // add here because links have been removed from remaining inbound/outbound
}
}
// canonical tag
if (allAttr || contains(CollectionSchema.canonical_s)) {
canonical = html.getCanonical();
// if there is no canonical in the html then look into the http header:
if (canonical == null && responseHeader != null) {
String link = responseHeader.get("Link", null);
int p;
if (link != null && ((p = link.indexOf("rel=\"canonical\"")) > 0)) {
link = link.substring(0, p).trim();
p = link.indexOf('<');
int q = link.lastIndexOf('>');
if (p >= 0 && q > 0) {
link = link.substring(p + 1, q);
try {
canonical = new DigestURL(link);
} catch (MalformedURLException e) {}
}
}
}
if (canonical != null) {
containsCanonical = true;
inboundLinks.remove(canonical);
outboundLinks.remove(canonical);
add(doc, CollectionSchema.canonical_s, canonical.toNormalform(false));
// set a flag if this is equal to sku
if (contains(CollectionSchema.canonical_equal_sku_b)) {
add(doc, CollectionSchema.canonical_equal_sku_b, canonical.equals(digestURL));
}
}
}
// meta refresh tag
if (allAttr || contains(CollectionSchema.refresh_s)) {
String refresh = html.getRefreshPath();
if (refresh != null && refresh.length() > 0) {
MultiProtocolURL refreshURL;
try {
refreshURL = refresh.startsWith("http") ? new MultiProtocolURL(html.getRefreshPath()) : new MultiProtocolURL(digestURL, html.getRefreshPath());
if (refreshURL != null) {
inboundLinks.remove(refreshURL);
outboundLinks.remove(refreshURL);
add(doc, CollectionSchema.refresh_s, refreshURL.toNormalform(false));
}
} catch (final MalformedURLException e) {
add(doc, CollectionSchema.refresh_s, refresh);
}
}
}
// flash embedded
if (allAttr || contains(CollectionSchema.flash_b)) {
MultiProtocolURL[] flashURLs = html.getFlash();
for (MultiProtocolURL u: flashURLs) {
// remove all flash links from ibound/outbound links
inboundLinks.remove(u);
outboundLinks.remove(u);
}
add(doc, CollectionSchema.flash_b, flashURLs.length > 0);
}
// generic evaluation pattern
for (final String model: html.getEvaluationModelNames()) {
if (allAttr || contains("ext_" + model + "_txt")) {
final String[] scorenames = html.getEvaluationModelScoreNames(model);
if (scorenames.length > 0) {
add(doc, CollectionSchema.valueOf("ext_" + model + "_txt"), scorenames);
add(doc, CollectionSchema.valueOf("ext_" + model + "_val"), html.getEvaluationModelScoreCounts(model, scorenames));
}
}
}
// response time
add(doc, CollectionSchema.responsetime_i, responseHeader == null ? 0 : Integer.parseInt(responseHeader.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0")));
// hreflang link tag, see http://support.google.com/webmasters/bin/answer.py?hl=de&answer=189077
if (allAttr || (contains(CollectionSchema.hreflang_url_sxt) && contains(CollectionSchema.hreflang_cc_sxt))) {
final String[] ccs = new String[html.getHreflang().size()];
final String[] urls = new String[html.getHreflang().size()];
c = 0;
for (Map.Entry e: html.getHreflang().entrySet()) {
ccs[c] = e.getKey();
urls[c] = e.getValue().toNormalform(true);
c++;
}
add(doc, CollectionSchema.hreflang_cc_sxt, ccs);
add(doc, CollectionSchema.hreflang_url_sxt, urls);
}
// page navigation url, see http://googlewebmastercentral.blogspot.de/2011/09/pagination-with-relnext-and-relprev.html
if (allAttr || (contains(CollectionSchema.navigation_url_sxt) && contains(CollectionSchema.navigation_type_sxt))) {
final String[] navs = new String[html.getNavigation().size()];
final String[] urls = new String[html.getNavigation().size()];
c = 0;
for (Map.Entry e: html.getNavigation().entrySet()) {
navs[c] = e.getKey();
urls[c] = e.getValue().toNormalform(true);
c++;
}
add(doc, CollectionSchema.navigation_type_sxt, navs);
add(doc, CollectionSchema.navigation_url_sxt, urls);
}
// publisher url as defined in http://support.google.com/plus/answer/1713826?hl=de
if (allAttr || contains(CollectionSchema.publisher_url_s) && html.getPublisherLink() != null) {
add(doc, CollectionSchema.publisher_url_s, html.getPublisherLink().toNormalform(true));
}
}
if (parser instanceof DCEntry) {
// the document was created with a surrogate parsing; overwrite all md: -entries to Solr
DCEntry dcentry = (DCEntry) parser;
for (Map.Entry entry: dcentry.getMap().entrySet()) {
String tag = entry.getKey();
if (!tag.startsWith("md:") || tag.length() < 4) continue;
CollectionSchema solr_field = CollectionSchema.valueOf(tag.substring(3));
if (solr_field == null) continue;
String[] values = entry.getValue();
if (values == null || values.length == 0) continue;
if (allAttr || contains(solr_field)) {
add(doc, solr_field, values);
}
}
}
String content = document.getTextString();
String tokens = digestURL.toTokens();
if (content == null || content.length() == 0) {
content = tokens;
} else {
String[] t = CommonPattern.SPACE.split(tokens);
for (String r: t) {
if (r.length() > 0 &&
content.indexOf(" " + r + " ") < 0 &&
!content.startsWith(r + " ") &&
!content.endsWith(" " + r)) content += " " + r;
}
}
// handle image source meta data
if (document.getContentDomain() == ContentDomain.IMAGE) {
// add image pixel size if known
Iterator imgit = document.getImages().values().iterator();
List heights = new ArrayList<>();
List widths = new ArrayList<>();
List pixels = new ArrayList<>();
while (imgit.hasNext()) {
ImageEntry img = imgit.next();
int imgpixels = (img.height() < 0 || img.width() < 0) ? -1 : img.height() * img.width();
if (imgpixels > 0 && (allAttr || (contains(CollectionSchema.images_height_val) && contains(CollectionSchema.images_width_val) && contains(CollectionSchema.images_pixel_val)))) {
heights.add(img.height());
widths.add(img.width());
pixels.add(imgpixels);
}
}
if (heights.size() > 0) {
add(doc, CollectionSchema.images_height_val, heights);
add(doc, CollectionSchema.images_width_val, widths);
add(doc, CollectionSchema.images_pixel_val, pixels);
}
if (allAttr || contains(CollectionSchema.images_text_t)) {
add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data from the image parser
content = digestURL.toTokens(); // remove all other entry but the url tokens
}
}
// content (must be written after special parser data, since this can influence the content)
if (allAttr || contains(CollectionSchema.text_t)) add(doc, CollectionSchema.text_t, content);
if (allAttr || contains(CollectionSchema.wordcount_i)) {
if (content.length() == 0) {
add(doc, CollectionSchema.wordcount_i, 0);
} else {
int contentwc = 1;
for (int i = content.length() - 1; i >= 0; i--) if (content.charAt(i) == ' ') contentwc++;
add(doc, CollectionSchema.wordcount_i, contentwc);
}
}
// statistics about the links
if (allAttr || contains(CollectionSchema.linkscount_i)) add(doc, CollectionSchema.linkscount_i, inboundLinks.size() + outboundLinks.size());
if (allAttr || contains(CollectionSchema.linksnofollowcount_i)) add(doc, CollectionSchema.linksnofollowcount_i, document.inboundLinkNofollowCount() + document.outboundLinkNofollowCount());
if (allAttr || contains(CollectionSchema.inboundlinkscount_i)) add(doc, CollectionSchema.inboundlinkscount_i, inboundLinks.size());
if (allAttr || contains(CollectionSchema.inboundlinksnofollowcount_i)) add(doc, CollectionSchema.inboundlinksnofollowcount_i, document.inboundLinkNofollowCount());
if (allAttr || contains(CollectionSchema.outboundlinkscount_i)) add(doc, CollectionSchema.outboundlinkscount_i, outboundLinks.size());
if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount());
// create a subgraph
Boolean canonical_equal_sku = canonical == null ? null : canonical.toNormalform(true).equals(url);
if (webgraph != null && (!containsCanonical || (canonical_equal_sku != null && (canonical_equal_sku.booleanValue())))) {
// a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document
List edges = webgraph.getEdges(subgraph, digestURL, responseHeader, collections, crawldepth, processTypes, document.getHyperlinks().keySet(), sourceName);
// this also enriched the subgraph
doc.webgraphDocuments.addAll(edges);
} else {
if (allAttr ||
contains(CollectionSchema.inboundlinks_protocol_sxt) ||
contains(CollectionSchema.inboundlinks_urlstub_sxt) ||
contains(CollectionSchema.inboundlinks_anchortext_txt) ||
contains(CollectionSchema.outboundlinks_protocol_sxt) ||
contains(CollectionSchema.outboundlinks_urlstub_sxt) ||
contains(CollectionSchema.outboundlinks_anchortext_txt)) {
for (final AnchorURL target_url: document.getHyperlinks().keySet()) {
enrichSubgraph(subgraph, digestURL, target_url);
}
}
}
// attach the subgraph content
if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0]));
if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_sxt)) add(doc, CollectionSchema.inboundlinks_urlstub_sxt, subgraph.urlStubs[0]);
if (allAttr || contains(CollectionSchema.inboundlinks_anchortext_txt)) add(doc, CollectionSchema.inboundlinks_anchortext_txt, subgraph.urlAnchorTexts[0]);
if (allAttr || contains(CollectionSchema.outboundlinks_protocol_sxt)) add(doc, CollectionSchema.outboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[1]));
if (allAttr || contains(CollectionSchema.outboundlinks_urlstub_sxt)) add(doc, CollectionSchema.outboundlinks_urlstub_sxt, subgraph.urlStubs[1]);
if (allAttr || contains(CollectionSchema.outboundlinks_anchortext_txt)) add(doc, CollectionSchema.outboundlinks_anchortext_txt, subgraph.urlAnchorTexts[1]);
// charset
if (allAttr || contains(CollectionSchema.charset_s)) add(doc, CollectionSchema.charset_s, document.getCharset());
// coordinates
if (document.lat() != 0.0 && document.lon() != 0.0) {
if (allAttr || contains(CollectionSchema.coordinate_p)) add(doc, CollectionSchema.coordinate_p, Double.toString(document.lat()) + "," + Double.toString(document.lon()));
}
if (allAttr || contains(CollectionSchema.httpstatus_i)) add(doc, CollectionSchema.httpstatus_i, responseHeader == null ? 200 : responseHeader.getStatusCode());
// fields that were additionally in URIMetadataRow
Date loadDate = new Date();
Date modDate = responseHeader == null ? new Date() : responseHeader.lastModified();
if (modDate.getTime() > loadDate.getTime()) modDate = loadDate;
int size = (int) Math.max(document.dc_source().length(), responseHeader == null ? 0 : responseHeader.getContentLength());
if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, loadDate);
if (allAttr || contains(CollectionSchema.fresh_date_dt)) add(doc, CollectionSchema.fresh_date_dt, new Date(loadDate.getTime() + Math.max(0, loadDate.getTime() - modDate.getTime()) / 2)); // freshdate, computed with Proxy-TTL formula
if ((allAttr || contains(CollectionSchema.referrer_id_s)) && referrerURL != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(referrerURL.hash()));
//if (allAttr || contains(SolrField.md5_s)) add(solrdoc, SolrField.md5_s, new byte[0]);
if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, document.dc_publisher());
if ((allAttr || contains(CollectionSchema.language_s)) && language != null) add(doc, CollectionSchema.language_s, language);
if (allAttr || contains(CollectionSchema.size_i)) add(doc, CollectionSchema.size_i, size);
if (allAttr || contains(CollectionSchema.audiolinkscount_i)) add(doc, CollectionSchema.audiolinkscount_i, document.getAudiolinks().size());
if (allAttr || contains(CollectionSchema.videolinkscount_i)) add(doc, CollectionSchema.videolinkscount_i, document.getVideolinks().size());
if (allAttr || contains(CollectionSchema.applinkscount_i)) add(doc, CollectionSchema.applinkscount_i, document.getApplinks().size());
// document post-processing
if ((allAttr || contains(CollectionSchema.process_sxt)) && processTypes.size() > 0) {
List p = new ArrayList();
for (ProcessType t: processTypes) p.add(t.name());
add(doc, CollectionSchema.process_sxt, p);
if (allAttr || contains(CollectionSchema.harvestkey_s)) {
add(doc, CollectionSchema.harvestkey_s, sourceName);
}
}
// document enrichments (synonyms, facets)
enrich(doc, condenser.synonyms(), document.getGenericFacets());
return doc;
}
/**
* attach additional information to the document to enable navigation features
* @param doc the document to be enriched
* @param synonyms a list of synonyms detected for the text content
* @param genericFacets a map where the key is the navigator name and the value is the set of attributes names
*/
public void enrich(SolrInputDocument doc, List synonyms, Map> genericFacets) {
remove(doc, CollectionSchema.vocabularies_sxt); // delete old values
for (SolrInputField sif: doc) {
if (sif.getName().startsWith(CollectionSchema.VOCABULARY_PREFIX)) remove(doc, sif.getName());
}
if (this.isEmpty() || contains(CollectionSchema.vocabularies_sxt)) {
// write generic navigation
// there are no pre-defined solr fields for navigation because the vocabulary is generic
// we use dynamically allocated solr fields for this.
// It must be a multi-value string/token field, therefore we use _sxt extensions for the field names
// add to genericFacets the probabilistic categories
String text = (String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
Map classification = ProbabilisticClassifier.getClassification(text);
for (Map.Entry entry: classification.entrySet()) {
Set facetAttrbutes = new HashSet<>();
facetAttrbutes.add(entry.getValue());
genericFacets.put(entry.getKey(), facetAttrbutes);
}
// compute the document field values
List vocabularies = new ArrayList<>();
for (Map.Entry> facet: genericFacets.entrySet()) {
String facetName = facet.getKey();
Set facetValues = facet.getValue();
int count = facetValues.size();
if (count == 0) continue;
int logcount = (int) (Math.log(count) / Math.log(2));
Integer[] counts = new Integer[logcount + 1]; for (int i = 0; i <= logcount; i++) counts[i] = i;
doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_TERMS_SUFFIX, facetValues.toArray(new String[count]));
doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_COUNT_SUFFIX, facetValues.size());
doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNT_SUFFIX, logcount);
doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNTS_SUFFIX, counts);
vocabularies.add(facetName);
}
if (vocabularies.size() > 0) add(doc, CollectionSchema.vocabularies_sxt, vocabularies);
}
remove(doc, CollectionSchema.synonyms_sxt); // delete old values
if (this.isEmpty() || contains(CollectionSchema.synonyms_sxt)) {
if (synonyms.size() > 0) add(doc, CollectionSchema.synonyms_sxt, synonyms);
}
}
// Status of the post-processing run, kept in mutable public static fields.
// NOTE(review): these are plain (non-volatile) statics; if they are read from threads other than
// the one executing postprocessing(), visibility of updates is not guaranteed - confirm whether that matters.
public static boolean postprocessingRunning = false; // set to true by postprocessing() before it starts collecting counts
public static String postprocessingActivity = ""; // description of the current step; postprocessing() assigns and logs it
// if started, the following values are assigned
public static long postprocessingStartTime = 0; // the start time for the processing; not started = 0
public static int postprocessingCollection1Count = 0; // number of collection documents to be processed; -1 if the count query failed
public static int postprocessingWebgraphCount = 0; // number of webgraph documents to be processed; 0 if the webgraph is not used, -1 if the count query failed
/**
 * Builds the query string that selects the collection documents which take part in post-processing:
 * the solr field name of {@code process_sxt} followed by {@code AbstractSolrConnector.CATCHALL_DTERM}.
 * If a harvest key is given and the default configuration contains the {@code harvestkey_s} field,
 * the query is prefixed with a quoted match on that harvest key, joined with {@code AND}.
 *
 * @param segment the segment whose default configuration is checked for the harvestkey field;
 *                only accessed when {@code harvestkey} is not null
 * @param harvestkey the harvest key to restrict the query to, or null for no restriction
 * @return the query string
 */
public static final String collection1query(final Segment segment, final String harvestkey) {
    String harvestClause = "";
    if (harvestkey != null && segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.harvestkey_s)) {
        // the harvest key is embedded as a quoted phrase, exactly as given
        harvestClause = CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ";
    }
    return harvestClause + CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM;
}
/**
 * Builds the query string that selects the webgraph documents which take part in post-processing:
 * the solr field name of {@code WebgraphSchema.process_sxt} followed by
 * {@code AbstractSolrConnector.CATCHALL_DTERM}. If a harvest key is given and the webgraph
 * configuration contains the {@code harvestkey_s} field, the query is prefixed with a quoted
 * match on that harvest key, joined with {@code AND}.
 *
 * @param segment the segment whose webgraph configuration is checked for the harvestkey field;
 *                only accessed when {@code harvestkey} is not null
 * @param harvestkey the harvest key to restrict the query to, or null for no restriction
 * @return the query string
 */
public static final String webgraphquery(final Segment segment, final String harvestkey) {
    final boolean restrictToHarvestKey;
    if (harvestkey == null) {
        restrictToHarvestKey = false;
    } else {
        restrictToHarvestKey = segment.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.harvestkey_s);
    }
    if (!restrictToHarvestKey) {
        // no harvest key restriction: select by process tag only
        return "" + WebgraphSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM;
    }
    final String harvestPrefix = WebgraphSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ";
    return harvestPrefix + WebgraphSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM;
}
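/**
* post-processing steps for all entries that have a process tag assigned
* @param segment the index segment; provides the collection and webgraph connectors and configurations and the citation index
* @param rrCache the reference report cache that is handed to the citation rank computation
* @param harvestkey if not null (and the schema contains a harvestkey field), only entries with this harvest key are selected
* @param byPartialUpdate NOTE(review): presumably selects partial document updates when writing results - confirm against the method body
* @return 0 if the process field is not part of the schema or if neither the citation index nor the webgraph is available; otherwise presumably the number of processed entries - confirm against the end of the method
*/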
public int postprocessing(final Segment segment, final ReferenceReportCache rrCache, final String harvestkey, final boolean byPartialUpdate) {
if (!this.contains(CollectionSchema.process_sxt)) return 0;
if (!segment.connectedCitation() && !segment.fulltext().useWebgraph()) return 0;
final SolrConnector collectionConnector = segment.fulltext().getDefaultConnector();
collectionConnector.commit(false); // make sure that we have latest information that can be found
if (segment.fulltext().useWebgraph()) segment.fulltext().getWebgraphConnector().commit(false);
final CollectionConfiguration collection = segment.fulltext().getDefaultConfiguration();
final WebgraphConfiguration webgraph = segment.fulltext().getWebgraphConfiguration();
// calculate the number of documents to be processed
String collection1query = collection1query(segment, harvestkey);
String webgraphquery = webgraphquery(segment, harvestkey);
postprocessingRunning = true;
postprocessingStartTime = System.currentTimeMillis();
postprocessingActivity = "collecting counts";
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
try {
postprocessingCollection1Count = (int) collectionConnector.getCountByQuery("{!cache=false}" + collection1query);
postprocessingWebgraphCount = segment.fulltext().useWebgraph() ? (int) segment.fulltext().getWebgraphConnector().getCountByQuery("{!cache=false}" + webgraphquery) : 0;
} catch (IOException e) {
postprocessingCollection1Count = -1;
postprocessingWebgraphCount = -1;
}
// collect hosts from index which shall take part in citation computation
postprocessingActivity = "collecting host facets for collection";
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
ReversibleScoreMap collection1hosts;
try {
Map> hostfacet = collectionConnector.getFacets("{!cache=false}" + collection1query, 10000000, CollectionSchema.host_s.getSolrFieldName());
collection1hosts = hostfacet.get(CollectionSchema.host_s.getSolrFieldName());
} catch (final IOException e2) {
ConcurrentLog.logException(e2);
collection1hosts = new ClusteredScoreMap(true);
}
postprocessingActivity = "create ranking map";
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
boolean shallComputeCR = (segment.fulltext().useWebgraph() &&
((webgraph.contains(WebgraphSchema.source_id_s) && webgraph.contains(WebgraphSchema.source_cr_host_norm_i)) ||
(webgraph.contains(WebgraphSchema.target_id_s) && webgraph.contains(WebgraphSchema.target_cr_host_norm_i))) ||
(collection.contains(CollectionSchema.cr_host_count_i) &&
collection.contains(CollectionSchema.cr_host_chance_d) &&
collection.contains(CollectionSchema.cr_host_norm_i)));
// create the ranking map
final Map rankings = new ConcurrentHashMap();
if (shallComputeCR) try {
int concurrency = Math.min(collection1hosts.size(), Runtime.getRuntime().availableProcessors());
postprocessingActivity = "collecting cr for " + collection1hosts.size() + " hosts, concurrency = " + concurrency;
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
int countcheck = 0;
for (String host: collection1hosts.keyList(true)) {
// Patch the citation index for links with canonical tags.
// This shall fulfill the following requirement:
// If a document A links to B and B contains a 'canonical C', then the citation rank computation shall consider that A links to C and B does not link to C.
// To do so, we first must collect all canonical links, find all references to them, get the anchor list of the documents and patch the citation reference of these links
String patchquery = CollectionSchema.host_s.getSolrFieldName() + ":" + host + " AND " + CollectionSchema.canonical_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM;
long patchquerycount = collectionConnector.getCountByQuery("{!cache=false}" + patchquery);
BlockingQueue documents_with_canonical_tag = collectionConnector.concurrentDocumentsByQuery(patchquery, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 100000000, Long.MAX_VALUE, 20, 1, true,
CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.canonical_s.getSolrFieldName());
SolrDocument doc_B;
int patchquerycountcheck = 0;
try {
while ((doc_B = documents_with_canonical_tag.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
// find all documents which link to the canonical doc
DigestURL doc_C_url = new DigestURL((String) doc_B.getFieldValue(CollectionSchema.canonical_s.getSolrFieldName()));
byte[] doc_B_id = ASCII.getBytes(((String) doc_B.getFieldValue(CollectionSchema.id.getSolrFieldName())));
// we remove all references to B, because these become references to C
if (segment.connectedCitation()) {
ReferenceContainer doc_A_ids = segment.urlCitation().remove(doc_B_id);
if (doc_A_ids == null) {
//System.out.println("*** document with canonical but no referrer: " + doc_B.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
continue; // the document has a canonical tag but no referrer?
}
Iterator doc_A_ids_iterator = doc_A_ids.entries();
// for each of the referrer A of B, set A as a referrer of C
while (doc_A_ids_iterator.hasNext()) {
CitationReference doc_A_citation = doc_A_ids_iterator.next();
segment.urlCitation().add(doc_C_url.hash(), doc_A_citation);
}
}
patchquerycountcheck++;
if (MemoryControl.shortStatus()) {
ConcurrentLog.warn("CollectionConfiguration", "terminated canonical collection during postprocessing because of short memory");
break;
}
}
} catch (InterruptedException e) {
ConcurrentLog.logException(e);
} catch (SpaceExceededException e) {
ConcurrentLog.logException(e);
}
if (patchquerycount != patchquerycountcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous patchquery count for host " + host + ": expected=" + patchquerycount + ", counted=" + patchquerycountcheck);
// do the citation rank computation
if (collection1hosts.get(host) <= 0) continue;
// select all documents for each host
CRHost crh = new CRHost(segment, rrCache, host, 0.85d, 6);
int convergence_attempts = 0;
while (convergence_attempts++ < 30) {
ConcurrentLog.info("CollectionConfiguration", "convergence step " + convergence_attempts + " for host " + host + " ...");
if (crh.convergenceStep()) break;
if (MemoryControl.shortStatus()) {
ConcurrentLog.warn("CollectionConfiguration", "terminated convergenceStep during postprocessing because of short memory");
break;
}
}
ConcurrentLog.info("CollectionConfiguration", "convergence for host " + host + " after " + convergence_attempts + " steps");
// we have now the cr for all documents of a specific host; we store them for later use
Map crn = crh.normalize();
//crh.log(crn);
rankings.putAll(crn); // accumulate this here for usage in document update later
if (MemoryControl.shortStatus()) {
ConcurrentLog.warn("CollectionConfiguration", "terminated crn akkumulation during postprocessing because of short memory");
break;
}
countcheck++;
}
if (collection1hosts.size() != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous host count: expected=" + collection1hosts.size() + ", counted=" + countcheck);
} catch (final IOException e2) {
ConcurrentLog.logException(e2);
collection1hosts = new ClusteredScoreMap(true);
}
// process all documents at the webgraph for the outgoing links of this document
final AtomicInteger allcount = new AtomicInteger(0);
if (segment.fulltext().useWebgraph() && shallComputeCR) {
postprocessingActivity = "collecting host facets for webgraph cr calculation";
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
final Set omitFields = new HashSet();
omitFields.add(WebgraphSchema.process_sxt.getSolrFieldName());
omitFields.add(WebgraphSchema.harvestkey_s.getSolrFieldName());
// collect hosts from index which shall take part in citation computation
ReversibleScoreMap webgraphhosts;
try {
Map> hostfacet = segment.fulltext().getWebgraphConnector().getFacets(webgraphquery, 10000000, WebgraphSchema.source_host_s.getSolrFieldName());
webgraphhosts = hostfacet.get(WebgraphSchema.source_host_s.getSolrFieldName());
} catch (final IOException e2) {
ConcurrentLog.logException(e2);
webgraphhosts = new ClusteredScoreMap(true);
}
try {
final long start = System.currentTimeMillis();
for (String host: webgraphhosts.keyList(true)) {
if (webgraphhosts.get(host) <= 0) continue;
final String hostfinal = host;
// select all webgraph edges and modify their cr value
postprocessingActivity = "writing cr values to webgraph for host " + host;
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
String patchquery = WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + host + "\" AND " + WebgraphSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM;
final long count = segment.fulltext().getWebgraphConnector().getCountByQuery("{!cache=false}" + patchquery);
int concurrency = Math.min((int) count, Math.max(1, Runtime.getRuntime().availableProcessors() / 4));
ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the webgraph, concurrency = " + concurrency);
final BlockingQueue docs = segment.fulltext().getWebgraphConnector().concurrentDocumentsByQuery(
patchquery,
WebgraphSchema.source_chars_i.getSolrFieldName() + " asc",
0, 100000000, Long.MAX_VALUE, concurrency + 1, concurrency, true
// TODO: add field list and do partial updates
);
final AtomicInteger proccount = new AtomicInteger(0);
Thread[] t = new Thread[concurrency];
for (final AtomicInteger i = new AtomicInteger(0); i.get() < t.length; i.incrementAndGet()) {
t[i.get()] = new Thread() {
private String name = "CollectionConfiguration.postprocessing.webgraph-" + i.get();
@Override
public void run() {
Thread.currentThread().setName(name);
SolrDocument doc; String id;
try {
processloop: while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
try {
SolrInputDocument sid = webgraph.toSolrInputDocument(doc, omitFields);
Collection