You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
998 lines
56 KiB
998 lines
56 KiB
/**
|
|
* SolrScheme
|
|
* Copyright 2011 by Michael Peter Christen
|
|
* First released 14.04.2011 at http://yacy.net
|
|
*
|
|
* $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $
|
|
* $LastChangedRevision: 7654 $
|
|
* $LastChangedBy: orbiter $
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public License
|
|
* along with this program in the file lgpl21.txt
|
|
* If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
package net.yacy.search.index;
|
|
|
|
import java.io.File;
|
|
import java.io.IOException;
|
|
import java.io.Serializable;
|
|
import java.net.InetAddress;
|
|
import java.net.MalformedURLException;
|
|
import java.util.ArrayList;
|
|
import java.util.Collection;
|
|
import java.util.Date;
|
|
import java.util.HashMap;
|
|
import java.util.Iterator;
|
|
import java.util.LinkedHashSet;
|
|
import java.util.List;
|
|
import java.util.Map;
|
|
import java.util.Properties;
|
|
import java.util.Set;
|
|
|
|
import net.yacy.cora.document.ASCII;
|
|
import net.yacy.cora.document.MultiProtocolURI;
|
|
import net.yacy.cora.document.UTF8;
|
|
import net.yacy.cora.federate.solr.FailType;
|
|
import net.yacy.cora.federate.solr.ProcessType;
|
|
import net.yacy.cora.federate.solr.YaCySchema;
|
|
import net.yacy.cora.federate.yacy.ConfigurationSet;
|
|
import net.yacy.cora.protocol.Domains;
|
|
import net.yacy.cora.protocol.HeaderFramework;
|
|
import net.yacy.cora.protocol.ResponseHeader;
|
|
import net.yacy.cora.util.CommonPattern;
|
|
import net.yacy.cora.util.SpaceExceededException;
|
|
import net.yacy.crawler.data.CrawlProfile;
|
|
import net.yacy.crawler.retrieval.Response;
|
|
import net.yacy.document.Condenser;
|
|
import net.yacy.document.Document;
|
|
import net.yacy.document.parser.html.ContentScraper;
|
|
import net.yacy.document.parser.html.ImageEntry;
|
|
import net.yacy.kelondro.data.citation.CitationReference;
|
|
import net.yacy.kelondro.data.meta.DigestURI;
|
|
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
|
import net.yacy.kelondro.index.RowHandleSet;
|
|
import net.yacy.kelondro.logging.Log;
|
|
import net.yacy.kelondro.rwi.IndexCell;
|
|
import net.yacy.kelondro.rwi.ReferenceContainer;
|
|
import net.yacy.kelondro.util.Bitfield;
|
|
import net.yacy.kelondro.util.ByteBuffer;
|
|
|
|
import org.apache.solr.common.SolrInputDocument;
|
|
|
|
|
|
public class SolrConfiguration extends ConfigurationSet implements Serializable {
|
|
|
|
private static final long serialVersionUID=-499100932212840385L;
|
|
|
|
private boolean lazy;
|
|
|
|
/**
 * Create a configuration on top of an empty ConfigurationSet.
 * An empty set is treated as "all index attributes are used".
 * Lazy mode is switched off for this instance.
 */
public SolrConfiguration() {
    super();
    lazy = false;
}
|
|
|
|
/**
|
|
* initialize the scheme with a given configuration file
|
|
* the configuration file simply contains a list of lines with keywords
|
|
* or keyword = value lines (while value is a custom Solr field name
|
|
* @param configurationFile
|
|
*/
|
|
public SolrConfiguration(final File configurationFile, boolean lazy) {
|
|
super(configurationFile);
|
|
this.lazy = lazy;
|
|
// check consistency: compare with YaCyField enum
|
|
if (this.isEmpty()) return;
|
|
Iterator<Entry> it = this.entryIterator();
|
|
for (ConfigurationSet.Entry etr = it.next(); it.hasNext(); etr = it.next()) {
|
|
try {
|
|
YaCySchema f = YaCySchema.valueOf(etr.key());
|
|
f.setSolrFieldName(etr.getValue());
|
|
} catch (IllegalArgumentException e) {
|
|
Log.logFine("SolrScheme", "solr scheme file " + configurationFile.getAbsolutePath() + " defines unknown attribute '" + etr.toString() + "'");
|
|
it.remove();
|
|
}
|
|
}
|
|
// check consistency the other way: look if all enum constants in SolrField appear in the configuration file
|
|
for (YaCySchema field: YaCySchema.values()) {
|
|
if (this.get(field.name()) == null) {
|
|
Log.logWarning("SolrScheme", " solr scheme file " + configurationFile.getAbsolutePath() + " is missing declaration for '" + field.name() + "'");
|
|
}
|
|
}
|
|
}
|
|
|
|
public boolean contains(YaCySchema field) {
|
|
return this.contains(field.name());
|
|
}
|
|
|
|
private void add(final SolrInputDocument doc, final YaCySchema key, final String value) {
|
|
assert !key.isMultiValued();
|
|
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && !value.isEmpty()))) key.add(doc, value);
|
|
}
|
|
|
|
private void add(final SolrInputDocument doc, final YaCySchema key, final Date value) {
|
|
assert !key.isMultiValued();
|
|
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.getTime() > 0))) key.add(doc, value);
|
|
}
|
|
|
|
private void add(final SolrInputDocument doc, final YaCySchema key, final String[] value) {
|
|
assert key.isMultiValued();
|
|
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.length > 0))) key.add(doc, value);
|
|
}
|
|
|
|
private void add(final SolrInputDocument doc, final YaCySchema key, final Integer[] value) {
|
|
assert key.isMultiValued();
|
|
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.length > 0))) key.add(doc, value);
|
|
}
|
|
|
|
private void add(final SolrInputDocument doc, final YaCySchema key, final List<?> values) {
|
|
assert key.isMultiValued();
|
|
if ((isEmpty() || contains(key)) && (!this.lazy || (values != null && !values.isEmpty()))) key.add(doc, values);
|
|
}
|
|
|
|
private void add(final SolrInputDocument doc, final YaCySchema key, final int value) {
|
|
assert !key.isMultiValued();
|
|
if ((isEmpty() || contains(key)) && (!this.lazy || value != 0)) key.add(doc, value);
|
|
}
|
|
|
|
private void add(final SolrInputDocument doc, final YaCySchema key, final long value) {
|
|
assert !key.isMultiValued();
|
|
if ((isEmpty() || contains(key)) && (!this.lazy || value != 0)) key.add(doc, value);
|
|
}
|
|
|
|
private void add(final SolrInputDocument doc, final YaCySchema key, final boolean value) {
|
|
assert !key.isMultiValued();
|
|
if (isEmpty() || contains(key)) key.add(doc, value);
|
|
}
|
|
|
|
protected static Date getDate(SolrInputDocument doc, final YaCySchema key) {
|
|
Date x = (Date) doc.getFieldValue(key.getSolrFieldName());
|
|
Date now = new Date();
|
|
return (x == null) ? new Date(0) : x.after(now) ? now : x;
|
|
}
|
|
|
|
/**
|
|
* save configuration to file and update enum SolrFields
|
|
* @throws IOException
|
|
*/
|
|
@Override
|
|
public void commit() throws IOException {
|
|
try {
|
|
super.commit();
|
|
// make sure the enum SolrField.SolrFieldName is current
|
|
Iterator<Entry> it = this.entryIterator();
|
|
for (ConfigurationSet.Entry etr = it.next(); it.hasNext(); etr = it.next()) {
|
|
try {
|
|
YaCySchema f = YaCySchema.valueOf(etr.key());
|
|
f.setSolrFieldName(etr.getValue());
|
|
} catch (IllegalArgumentException e) {
|
|
continue;
|
|
}
|
|
}
|
|
} catch (final IOException e) {}
|
|
}
|
|
|
|
protected SolrInputDocument metadata2solr(final URIMetadataRow md) {
|
|
|
|
final SolrInputDocument doc = new SolrInputDocument();
|
|
final DigestURI digestURI = DigestURI.toDigestURI(md.url());
|
|
boolean allAttr = this.isEmpty();
|
|
|
|
if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, "");
|
|
add(doc, YaCySchema.id, ASCII.String(md.hash()));
|
|
String us = digestURI.toNormalform(true);
|
|
add(doc, YaCySchema.sku, us);
|
|
if (allAttr || contains(YaCySchema.ip_s)) {
|
|
final InetAddress address = digestURI.getInetAddress();
|
|
if (address != null) add(doc, YaCySchema.ip_s, address.getHostAddress());
|
|
}
|
|
if (allAttr || contains(YaCySchema.url_protocol_s)) add(doc, YaCySchema.url_protocol_s, digestURI.getProtocol());
|
|
Map<String, String> searchpart = digestURI.getSearchpartMap();
|
|
if (searchpart == null) {
|
|
if (allAttr || contains(YaCySchema.url_parameter_i)) add(doc, YaCySchema.url_parameter_i, 0);
|
|
} else {
|
|
if (allAttr || contains(YaCySchema.url_parameter_i)) add(doc, YaCySchema.url_parameter_i, searchpart.size());
|
|
if (allAttr || contains(YaCySchema.url_parameter_key_sxt)) add(doc, YaCySchema.url_parameter_key_sxt, searchpart.keySet().toArray(new String[searchpart.size()]));
|
|
if (allAttr || contains(YaCySchema.url_parameter_value_sxt)) add(doc, YaCySchema.url_parameter_value_sxt, searchpart.values().toArray(new String[searchpart.size()]));
|
|
}
|
|
if (allAttr || contains(YaCySchema.url_chars_i)) add(doc, YaCySchema.url_chars_i, us.length());
|
|
String host = null;
|
|
if ((host = digestURI.getHost()) != null) {
|
|
String dnc = Domains.getDNC(host);
|
|
String subdomOrga = host.length() - dnc.length() <= 0 ? "" : host.substring(0, host.length() - dnc.length() - 1);
|
|
int p = subdomOrga.lastIndexOf('.');
|
|
String subdom = (p < 0) ? "" : subdomOrga.substring(0, p);
|
|
String orga = (p < 0) ? subdomOrga : subdomOrga.substring(p + 1);
|
|
if (allAttr || contains(YaCySchema.host_s)) add(doc, YaCySchema.host_s, host);
|
|
if (allAttr || contains(YaCySchema.host_dnc_s)) add(doc, YaCySchema.host_dnc_s, dnc);
|
|
if (allAttr || contains(YaCySchema.host_organization_s)) add(doc, YaCySchema.host_organization_s, orga);
|
|
if (allAttr || contains(YaCySchema.host_organizationdnc_s)) add(doc, YaCySchema.host_organizationdnc_s, orga + '.' + dnc);
|
|
if (allAttr || contains(YaCySchema.host_subdomain_s)) add(doc, YaCySchema.host_subdomain_s, subdom);
|
|
}
|
|
|
|
String title = md.dc_title();
|
|
if (allAttr || contains(YaCySchema.title)) add(doc, YaCySchema.title, new String[]{title});
|
|
if (allAttr || contains(YaCySchema.title_count_i)) add(doc, YaCySchema.title_count_i, 1);
|
|
if (allAttr || contains(YaCySchema.title_chars_val)) {
|
|
Integer[] cv = new Integer[]{new Integer(title.length())};
|
|
add(doc, YaCySchema.title_chars_val, cv);
|
|
}
|
|
if (allAttr || contains(YaCySchema.title_words_val)) {
|
|
Integer[] cv = new Integer[]{new Integer(CommonPattern.SPACE.split(title).length)};
|
|
add(doc, YaCySchema.title_words_val, cv);
|
|
}
|
|
|
|
String description = md.snippet(); if (description == null) description = "";
|
|
if (allAttr || contains(YaCySchema.description)) add(doc, YaCySchema.description, description);
|
|
if (allAttr || contains(YaCySchema.description_count_i)) add(doc, YaCySchema.description_count_i, 1);
|
|
if (allAttr || contains(YaCySchema.description_chars_val)) {
|
|
Integer[] cv = new Integer[]{new Integer(description.length())};
|
|
add(doc, YaCySchema.description_chars_val, cv);
|
|
}
|
|
if (allAttr || contains(YaCySchema.description_words_val)) {
|
|
Integer[] cv = new Integer[]{new Integer(CommonPattern.SPACE.split(description).length)};
|
|
add(doc, YaCySchema.description_words_val, cv);
|
|
}
|
|
|
|
if (allAttr || contains(YaCySchema.author)) add(doc, YaCySchema.author, md.dc_creator());
|
|
if (allAttr || contains(YaCySchema.content_type)) add(doc, YaCySchema.content_type, Response.doctype2mime(digestURI.getFileExtension(), md.doctype()));
|
|
if (allAttr || contains(YaCySchema.last_modified)) add(doc, YaCySchema.last_modified, md.moddate());
|
|
if (allAttr || contains(YaCySchema.wordcount_i)) add(doc, YaCySchema.wordcount_i, md.wordCount());
|
|
|
|
String keywords = md.dc_subject();
|
|
Bitfield flags = md.flags();
|
|
if (flags.get(Condenser.flag_cat_indexof)) {
|
|
if (keywords == null || keywords.isEmpty()) keywords = "indexof"; else {
|
|
if (keywords.indexOf(',') > 0) keywords += ", indexof"; else keywords += " indexof";
|
|
}
|
|
}
|
|
if (allAttr || contains(YaCySchema.keywords)) {
|
|
add(doc, YaCySchema.keywords, keywords);
|
|
}
|
|
|
|
// path elements of link
|
|
if (allAttr || contains(YaCySchema.url_paths_sxt)) add(doc, YaCySchema.url_paths_sxt, digestURI.getPaths());
|
|
if (allAttr || contains(YaCySchema.url_file_ext_s)) add(doc, YaCySchema.url_file_ext_s, digestURI.getFileExtension());
|
|
|
|
if (allAttr || contains(YaCySchema.imagescount_i)) add(doc, YaCySchema.imagescount_i, md.limage());
|
|
if (allAttr || contains(YaCySchema.inboundlinkscount_i)) add(doc, YaCySchema.inboundlinkscount_i, md.llocal());
|
|
if (allAttr || contains(YaCySchema.outboundlinkscount_i)) add(doc, YaCySchema.outboundlinkscount_i, md.lother());
|
|
if (allAttr || contains(YaCySchema.charset_s)) add(doc, YaCySchema.charset_s, "UTF8");
|
|
|
|
// coordinates
|
|
if (md.lat() != 0.0 && md.lon() != 0.0) {
|
|
if (allAttr || contains(YaCySchema.coordinate_p)) add(doc, YaCySchema.coordinate_p, Double.toString(md.lat()) + "," + Double.toString(md.lon()));
|
|
}
|
|
if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, 200);
|
|
|
|
// fields that are in URIMetadataRow additional to yacy2solr basic requirement
|
|
if (allAttr || contains(YaCySchema.load_date_dt)) add(doc, YaCySchema.load_date_dt, md.loaddate());
|
|
if (allAttr || contains(YaCySchema.fresh_date_dt)) add(doc, YaCySchema.fresh_date_dt, md.freshdate());
|
|
if (allAttr || contains(YaCySchema.host_id_s)) add(doc, YaCySchema.host_id_s, md.hosthash());
|
|
if ((allAttr || contains(YaCySchema.referrer_id_txt)) && md.referrerHash() != null) add(doc, YaCySchema.referrer_id_txt, new String[]{ASCII.String(md.referrerHash())});
|
|
if (allAttr || contains(YaCySchema.md5_s)) add(doc, YaCySchema.md5_s, md.md5());
|
|
if (allAttr || contains(YaCySchema.publisher_t)) add(doc, YaCySchema.publisher_t, md.dc_publisher());
|
|
if ((allAttr || contains(YaCySchema.language_s)) && md.language() != null) add(doc, YaCySchema.language_s, UTF8.String(md.language()));
|
|
if (allAttr || contains(YaCySchema.size_i)) add(doc, YaCySchema.size_i, md.size());
|
|
if (allAttr || contains(YaCySchema.audiolinkscount_i)) add(doc, YaCySchema.audiolinkscount_i, md.laudio());
|
|
if (allAttr || contains(YaCySchema.videolinkscount_i)) add(doc, YaCySchema.videolinkscount_i, md.lvideo());
|
|
if (allAttr || contains(YaCySchema.applinkscount_i)) add(doc, YaCySchema.applinkscount_i, md.lapp());
|
|
if (allAttr || contains(YaCySchema.text_t)) {
|
|
// construct the text from other metadata parts.
|
|
// This is necessary here since that is used to search the link when no other data (parsed text body) is available
|
|
StringBuilder sb = new StringBuilder(120);
|
|
accText(sb, md.dc_title());
|
|
accText(sb, md.dc_creator());
|
|
accText(sb, md.dc_publisher());
|
|
accText(sb, md.snippet());
|
|
accText(sb, digestURI.toTokens());
|
|
accText(sb, keywords);
|
|
add(doc, YaCySchema.text_t, sb.toString());
|
|
}
|
|
|
|
return doc;
|
|
}
|
|
|
|
private static void accText(final StringBuilder sb, String text) {
|
|
if (text == null || text.length() == 0) return;
|
|
if (sb.length() != 0) sb.append(' ');
|
|
text = text.trim();
|
|
if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.');
|
|
}
|
|
|
|
protected SolrInputDocument yacy2solr(
|
|
final String id, final CrawlProfile profile, final ResponseHeader responseHeader,
|
|
final Document document, Condenser condenser, DigestURI referrerURL, String language,
|
|
IndexCell<CitationReference> citations) {
|
|
// we use the SolrCell design as index scheme
|
|
final SolrInputDocument doc = new SolrInputDocument();
|
|
final DigestURI digestURI = DigestURI.toDigestURI(document.dc_source());
|
|
boolean allAttr = this.isEmpty();
|
|
|
|
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
|
|
|
|
add(doc, YaCySchema.id, id);
|
|
if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before)
|
|
String docurl = digestURI.toNormalform(true);
|
|
add(doc, YaCySchema.sku, docurl);
|
|
|
|
if ((allAttr || contains(YaCySchema.clickdepth_i)) && citations != null) {
|
|
if (digestURI.probablyRootURL()) {
|
|
boolean lc = this.lazy; this.lazy = false;
|
|
add(doc, YaCySchema.clickdepth_i, 0);
|
|
this.lazy = lc;
|
|
} else {
|
|
// search the citations for references
|
|
int clickdepth = -1;
|
|
try {
|
|
clickdepth = getClickDepth(citations, digestURI);
|
|
} catch (IOException e) {
|
|
add(doc, YaCySchema.clickdepth_i, -1);
|
|
}
|
|
add(doc, YaCySchema.clickdepth_i, clickdepth);
|
|
if (clickdepth < 0 || clickdepth > 1) {
|
|
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
|
|
}
|
|
}
|
|
}
|
|
|
|
if (allAttr || contains(YaCySchema.ip_s)) {
|
|
final InetAddress address = digestURI.getInetAddress();
|
|
if (address != null) add(doc, YaCySchema.ip_s, address.getHostAddress());
|
|
}
|
|
if (allAttr || contains(YaCySchema.collection_sxt) && profile != null) add(doc, YaCySchema.collection_sxt, profile.collections());
|
|
if (allAttr || contains(YaCySchema.url_protocol_s)) add(doc, YaCySchema.url_protocol_s, digestURI.getProtocol());
|
|
Map<String, String> searchpart = digestURI.getSearchpartMap();
|
|
if (searchpart == null) {
|
|
if (allAttr || contains(YaCySchema.url_parameter_i)) add(doc, YaCySchema.url_parameter_i, 0);
|
|
} else {
|
|
if (allAttr || contains(YaCySchema.url_parameter_i)) add(doc, YaCySchema.url_parameter_i, searchpart.size());
|
|
if (allAttr || contains(YaCySchema.url_parameter_key_sxt)) add(doc, YaCySchema.url_parameter_key_sxt, searchpart.keySet().toArray(new String[searchpart.size()]));
|
|
if (allAttr || contains(YaCySchema.url_parameter_value_sxt)) add(doc, YaCySchema.url_parameter_value_sxt, searchpart.values().toArray(new String[searchpart.size()]));
|
|
}
|
|
if (allAttr || contains(YaCySchema.url_chars_i)) add(doc, YaCySchema.url_chars_i, docurl.length());
|
|
String host = null;
|
|
if ((host = digestURI.getHost()) != null) {
|
|
String dnc = Domains.getDNC(host);
|
|
String subdomOrga = host.length() - dnc.length() <= 0 ? "" : host.substring(0, host.length() - dnc.length() - 1);
|
|
int p = subdomOrga.lastIndexOf('.');
|
|
String subdom = (p < 0) ? "" : subdomOrga.substring(0, p);
|
|
String orga = (p < 0) ? subdomOrga : subdomOrga.substring(p + 1);
|
|
if (allAttr || contains(YaCySchema.host_s)) add(doc, YaCySchema.host_s, host);
|
|
if (allAttr || contains(YaCySchema.host_dnc_s)) add(doc, YaCySchema.host_dnc_s, dnc);
|
|
if (allAttr || contains(YaCySchema.host_organization_s)) add(doc, YaCySchema.host_organization_s, orga);
|
|
if (allAttr || contains(YaCySchema.host_organizationdnc_s)) add(doc, YaCySchema.host_organizationdnc_s, orga + '.' + dnc);
|
|
if (allAttr || contains(YaCySchema.host_subdomain_s)) add(doc, YaCySchema.host_subdomain_s, subdom);
|
|
}
|
|
|
|
List<String> titles = document.titles();
|
|
if (allAttr || contains(YaCySchema.title)) add(doc, YaCySchema.title, titles);
|
|
if (allAttr || contains(YaCySchema.title_count_i)) add(doc, YaCySchema.title_count_i, titles.size());
|
|
if (allAttr || contains(YaCySchema.title_chars_val)) {
|
|
ArrayList<Integer> cv = new ArrayList<Integer>(titles.size());
|
|
for (String s: titles) cv.add(new Integer(s.length()));
|
|
add(doc, YaCySchema.title_chars_val, cv);
|
|
}
|
|
if (allAttr || contains(YaCySchema.title_words_val)) {
|
|
ArrayList<Integer> cv = new ArrayList<Integer>(titles.size());
|
|
for (String s: titles) cv.add(new Integer(CommonPattern.SPACE.split(s).length));
|
|
add(doc, YaCySchema.title_words_val, cv);
|
|
}
|
|
|
|
String description = document.dc_description();
|
|
List<String> descriptions = new ArrayList<String>();
|
|
for (String s: CommonPattern.NEWLINE.split(description)) descriptions.add(s);
|
|
if (allAttr || contains(YaCySchema.description)) add(doc, YaCySchema.description, description);
|
|
if (allAttr || contains(YaCySchema.description_count_i)) add(doc, YaCySchema.description_count_i, descriptions.size());
|
|
if (allAttr || contains(YaCySchema.description_chars_val)) {
|
|
ArrayList<Integer> cv = new ArrayList<Integer>(descriptions.size());
|
|
for (String s: descriptions) cv.add(new Integer(s.length()));
|
|
add(doc, YaCySchema.description_chars_val, cv);
|
|
}
|
|
if (allAttr || contains(YaCySchema.description_words_val)) {
|
|
ArrayList<Integer> cv = new ArrayList<Integer>(descriptions.size());
|
|
for (String s: descriptions) cv.add(new Integer(CommonPattern.SPACE.split(s).length));
|
|
add(doc, YaCySchema.description_words_val, cv);
|
|
}
|
|
|
|
if (allAttr || contains(YaCySchema.author)) {
|
|
String author = document.dc_creator();
|
|
if (author == null || author.length() == 0) author = document.dc_publisher();
|
|
add(doc, YaCySchema.author, author);
|
|
}
|
|
if (allAttr || contains(YaCySchema.content_type)) add(doc, YaCySchema.content_type, new String[]{document.dc_format()});
|
|
if (allAttr || contains(YaCySchema.last_modified)) add(doc, YaCySchema.last_modified, responseHeader == null ? new Date() : responseHeader.lastModified());
|
|
if (allAttr || contains(YaCySchema.keywords)) add(doc, YaCySchema.keywords, document.dc_subject(' '));
|
|
String content = document.getTextString();
|
|
if (content == null || content.length() == 0) {
|
|
content = digestURI.toTokens();
|
|
}
|
|
if (allAttr || contains(YaCySchema.text_t)) add(doc, YaCySchema.text_t, content);
|
|
if (allAttr || contains(YaCySchema.wordcount_i)) {
|
|
if (content.length() == 0) {
|
|
add(doc, YaCySchema.wordcount_i, 0);
|
|
} else {
|
|
int contentwc = 1;
|
|
for (int i = content.length() - 1; i >= 0; i--) if (content.charAt(i) == ' ') contentwc++;
|
|
add(doc, YaCySchema.wordcount_i, contentwc);
|
|
}
|
|
}
|
|
if (allAttr || contains(YaCySchema.synonyms_sxt)) {
|
|
List<String> synonyms = condenser.synonyms();
|
|
add(doc, YaCySchema.synonyms_sxt, synonyms);
|
|
}
|
|
add(doc, YaCySchema.exact_signature_l, condenser.exactSignature());
|
|
add(doc, YaCySchema.exact_signature_unique_b, true); // this must be corrected afterwards!
|
|
add(doc, YaCySchema.fuzzy_signature_l, condenser.fuzzySignature());
|
|
add(doc, YaCySchema.fuzzy_signature_text_t, condenser.fuzzySignatureText());
|
|
add(doc, YaCySchema.fuzzy_signature_unique_b, true); // this must be corrected afterwards!
|
|
|
|
// path elements of link
|
|
if (allAttr || contains(YaCySchema.url_paths_sxt)) add(doc, YaCySchema.url_paths_sxt, digestURI.getPaths());
|
|
if (allAttr || contains(YaCySchema.url_file_ext_s)) add(doc, YaCySchema.url_file_ext_s, digestURI.getFileExtension());
|
|
|
|
// get list of all links; they will be shrinked by urls that appear in other fields of the solr scheme
|
|
Set<MultiProtocolURI> inboundLinks = document.inboundLinks();
|
|
Set<MultiProtocolURI> outboundLinks = document.outboundLinks();
|
|
|
|
int c = 0;
|
|
final Object parser = document.getParserObject();
|
|
Map<MultiProtocolURI, ImageEntry> images = new HashMap<MultiProtocolURI, ImageEntry>();
|
|
if (parser instanceof ContentScraper) {
|
|
final ContentScraper html = (ContentScraper) parser;
|
|
images = html.getImages();
|
|
|
|
// header tags
|
|
int h = 0;
|
|
int f = 1;
|
|
String[] hs;
|
|
|
|
hs = html.getHeadlines(1); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, YaCySchema.h1_txt, hs); add(doc, YaCySchema.h1_i, hs.length);
|
|
hs = html.getHeadlines(2); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, YaCySchema.h2_txt, hs); add(doc, YaCySchema.h2_i, hs.length);
|
|
hs = html.getHeadlines(3); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, YaCySchema.h3_txt, hs); add(doc, YaCySchema.h3_i, hs.length);
|
|
hs = html.getHeadlines(4); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, YaCySchema.h4_txt, hs); add(doc, YaCySchema.h4_i, hs.length);
|
|
hs = html.getHeadlines(5); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, YaCySchema.h5_txt, hs); add(doc, YaCySchema.h5_i, hs.length);
|
|
hs = html.getHeadlines(6); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, YaCySchema.h6_txt, hs); add(doc, YaCySchema.h6_i, hs.length);
|
|
|
|
add(doc, YaCySchema.htags_i, h);
|
|
add(doc, YaCySchema.schema_org_breadcrumb_i, html.breadcrumbCount());
|
|
|
|
// meta tags: Open Graph properties
|
|
String og;
|
|
og = html.getMetas().get("og:title"); if (og != null) add(doc, YaCySchema.opengraph_title_t, og);
|
|
og = html.getMetas().get("og:type"); if (og != null) add(doc, YaCySchema.opengraph_type_s, og);
|
|
og = html.getMetas().get("og:url"); if (og != null) add(doc, YaCySchema.opengraph_url_s, og);
|
|
og = html.getMetas().get("og:image"); if (og != null) add(doc, YaCySchema.opengraph_image_s, og);
|
|
|
|
// noindex and nofollow attributes
|
|
// from HTML (meta-tag in HTML header: robots)
|
|
// and HTTP header (x-robots property)
|
|
// coded as binary value:
|
|
// bit 0: "all" contained in html header meta
|
|
// bit 1: "index" contained in html header meta
|
|
// bit 2: "noindex" contained in html header meta
|
|
// bit 3: "nofollow" contained in html header meta
|
|
// bit 8: "noarchive" contained in http header properties
|
|
// bit 9: "nosnippet" contained in http header properties
|
|
// bit 10: "noindex" contained in http header properties
|
|
// bit 11: "nofollow" contained in http header properties
|
|
// bit 12: "unavailable_after" contained in http header properties
|
|
int b = 0;
|
|
final String robots_meta = html.getMetas().get("robots");
|
|
// this tag may have values: all, index, noindex, nofollow
|
|
if (robots_meta != null) {
|
|
if (robots_meta.indexOf("all",0) >= 0) b += 1; // set bit 0
|
|
if (robots_meta.indexOf("index",0) == 0 || robots_meta.indexOf(" index",0) >= 0 || robots_meta.indexOf(",index",0) >= 0 ) b += 2; // set bit 1
|
|
if (robots_meta.indexOf("noindex",0) >= 0) b += 4; // set bit 2
|
|
if (robots_meta.indexOf("nofollow",0) >= 0) b += 8; // set bit 3
|
|
}
|
|
String x_robots_tag = "";
|
|
if (responseHeader != null) {
|
|
x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS_TAG, "");
|
|
if (x_robots_tag.isEmpty()) {
|
|
x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS, "");
|
|
}
|
|
}
|
|
if (!x_robots_tag.isEmpty()) {
|
|
// this tag may have values: noarchive, nosnippet, noindex, unavailable_after
|
|
if (x_robots_tag.indexOf("noarchive",0) >= 0) b += 256; // set bit 8
|
|
if (x_robots_tag.indexOf("nosnippet",0) >= 0) b += 512; // set bit 9
|
|
if (x_robots_tag.indexOf("noindex",0) >= 0) b += 1024; // set bit 10
|
|
if (x_robots_tag.indexOf("nofollow",0) >= 0) b += 2048; // set bit 11
|
|
if (x_robots_tag.indexOf("unavailable_after",0) >=0) b += 4096; // set bit 12
|
|
}
|
|
add(doc, YaCySchema.robots_i, b);
|
|
|
|
// meta tags: generator
|
|
final String generator = html.getMetas().get("generator");
|
|
if (generator != null) add(doc, YaCySchema.metagenerator_t, generator);
|
|
|
|
// bold, italic
|
|
final String[] bold = html.getBold();
|
|
add(doc, YaCySchema.boldcount_i, bold.length);
|
|
if (bold.length > 0) {
|
|
add(doc, YaCySchema.bold_txt, bold);
|
|
if (allAttr || contains(YaCySchema.bold_val)) {
|
|
add(doc, YaCySchema.bold_val, html.getBoldCount(bold));
|
|
}
|
|
}
|
|
final String[] italic = html.getItalic();
|
|
add(doc, YaCySchema.italiccount_i, italic.length);
|
|
if (italic.length > 0) {
|
|
add(doc, YaCySchema.italic_txt, italic);
|
|
if (allAttr || contains(YaCySchema.italic_val)) {
|
|
add(doc, YaCySchema.italic_val, html.getItalicCount(italic));
|
|
}
|
|
}
|
|
final String[] underline = html.getUnderline();
|
|
add(doc, YaCySchema.underlinecount_i, underline.length);
|
|
if (underline.length > 0) {
|
|
add(doc, YaCySchema.underline_txt, underline);
|
|
if (allAttr || contains(YaCySchema.underline_val)) {
|
|
add(doc, YaCySchema.underline_val, html.getUnderlineCount(underline));
|
|
}
|
|
}
|
|
final String[] li = html.getLi();
|
|
add(doc, YaCySchema.licount_i, li.length);
|
|
if (li.length > 0) add(doc, YaCySchema.li_txt, li);
|
|
|
|
// images
|
|
final Collection<ImageEntry> imagesc = images.values();
|
|
final List<String> imgtags = new ArrayList<String>(imagesc.size());
|
|
final List<String> imgprots = new ArrayList<String>(imagesc.size());
|
|
final List<String> imgstubs = new ArrayList<String>(imagesc.size());
|
|
final List<String> imgalts = new ArrayList<String>(imagesc.size());
|
|
int withalt = 0;
|
|
for (final ImageEntry ie: imagesc) {
|
|
final MultiProtocolURI uri = ie.url();
|
|
inboundLinks.remove(uri);
|
|
outboundLinks.remove(uri);
|
|
imgtags.add(ie.toString());
|
|
String protocol = uri.getProtocol();
|
|
imgprots.add(protocol);
|
|
imgstubs.add(uri.toString().substring(protocol.length() + 3));
|
|
imgalts.add(ie.alt());
|
|
if (ie.alt() != null && ie.alt().length() > 0) withalt++;
|
|
}
|
|
if (allAttr || contains(YaCySchema.imagescount_i)) add(doc, YaCySchema.imagescount_i, imgtags.size());
|
|
if (allAttr || contains(YaCySchema.images_tag_txt)) add(doc, YaCySchema.images_tag_txt, imgtags);
|
|
if (allAttr || contains(YaCySchema.images_protocol_sxt)) add(doc, YaCySchema.images_protocol_sxt, protocolList2indexedList(imgprots));
|
|
if (allAttr || contains(YaCySchema.images_urlstub_txt)) add(doc, YaCySchema.images_urlstub_txt, imgstubs);
|
|
if (allAttr || contains(YaCySchema.images_alt_txt)) add(doc, YaCySchema.images_alt_txt, imgalts);
|
|
if (allAttr || contains(YaCySchema.images_withalt_i)) add(doc, YaCySchema.images_withalt_i, withalt);
|
|
|
|
// style sheets
|
|
if (allAttr || contains(YaCySchema.css_tag_txt)) {
|
|
final Map<MultiProtocolURI, String> csss = html.getCSS();
|
|
final String[] css_tag = new String[csss.size()];
|
|
final String[] css_url = new String[csss.size()];
|
|
c = 0;
|
|
for (final Map.Entry<MultiProtocolURI, String> entry: csss.entrySet()) {
|
|
final String cssurl = entry.getKey().toNormalform(false);
|
|
inboundLinks.remove(cssurl);
|
|
outboundLinks.remove(cssurl);
|
|
css_tag[c] =
|
|
"<link rel=\"stylesheet\" type=\"text/css\" media=\"" + entry.getValue() + "\"" +
|
|
" href=\""+ cssurl + "\" />";
|
|
css_url[c] = cssurl;
|
|
c++;
|
|
}
|
|
add(doc, YaCySchema.csscount_i, css_tag.length);
|
|
if (css_tag.length > 0) add(doc, YaCySchema.css_tag_txt, css_tag);
|
|
if (css_url.length > 0) add(doc, YaCySchema.css_url_txt, css_url);
|
|
}
|
|
|
|
// Scripts
|
|
if (allAttr || contains(YaCySchema.scripts_txt)) {
|
|
final Set<MultiProtocolURI> scriptss = html.getScript();
|
|
final String[] scripts = new String[scriptss.size()];
|
|
c = 0;
|
|
for (final MultiProtocolURI u: scriptss) {
|
|
inboundLinks.remove(u);
|
|
outboundLinks.remove(u);
|
|
scripts[c++] = u.toNormalform(false);
|
|
}
|
|
add(doc, YaCySchema.scriptscount_i, scripts.length);
|
|
if (scripts.length > 0) add(doc, YaCySchema.scripts_txt, scripts);
|
|
}
|
|
|
|
// Frames
|
|
if (allAttr || contains(YaCySchema.frames_txt)) {
|
|
final Set<MultiProtocolURI> framess = html.getFrames();
|
|
final String[] frames = new String[framess.size()];
|
|
c = 0;
|
|
for (final MultiProtocolURI u: framess) {
|
|
inboundLinks.remove(u);
|
|
outboundLinks.remove(u);
|
|
frames[c++] = u.toNormalform(false);
|
|
}
|
|
add(doc, YaCySchema.framesscount_i, frames.length);
|
|
if (frames.length > 0) add(doc, YaCySchema.frames_txt, frames);
|
|
}
|
|
|
|
// IFrames
|
|
if (allAttr || contains(YaCySchema.iframes_txt)) {
|
|
final Set<MultiProtocolURI> iframess = html.getIFrames();
|
|
final String[] iframes = new String[iframess.size()];
|
|
c = 0;
|
|
for (final MultiProtocolURI u: iframess) {
|
|
inboundLinks.remove(u);
|
|
outboundLinks.remove(u);
|
|
iframes[c++] = u.toNormalform(false);
|
|
}
|
|
add(doc, YaCySchema.iframesscount_i, iframes.length);
|
|
if (iframes.length > 0) add(doc, YaCySchema.iframes_txt, iframes);
|
|
}
|
|
|
|
// canonical tag
|
|
if (allAttr || contains(YaCySchema.canonical_t)) {
|
|
final MultiProtocolURI canonical = html.getCanonical();
|
|
if (canonical != null) {
|
|
inboundLinks.remove(canonical);
|
|
outboundLinks.remove(canonical);
|
|
add(doc, YaCySchema.canonical_t, canonical.toNormalform(false));
|
|
// set a flag if this is equal to sku
|
|
if (contains(YaCySchema.canonical_equal_sku_b) && canonical.equals(docurl)) {
|
|
add(doc, YaCySchema.canonical_equal_sku_b, true);
|
|
}
|
|
}
|
|
}
|
|
|
|
// meta refresh tag
|
|
if (allAttr || contains(YaCySchema.refresh_s)) {
|
|
String refresh = html.getRefreshPath();
|
|
if (refresh != null && refresh.length() > 0) {
|
|
MultiProtocolURI refreshURL;
|
|
try {
|
|
refreshURL = refresh.startsWith("http") ? new MultiProtocolURI(html.getRefreshPath()) : new MultiProtocolURI(digestURI, html.getRefreshPath());
|
|
if (refreshURL != null) {
|
|
inboundLinks.remove(refreshURL);
|
|
outboundLinks.remove(refreshURL);
|
|
add(doc, YaCySchema.refresh_s, refreshURL.toNormalform(false));
|
|
}
|
|
} catch (MalformedURLException e) {
|
|
add(doc, YaCySchema.refresh_s, refresh);
|
|
}
|
|
}
|
|
}
|
|
|
|
// flash embedded
|
|
if (allAttr || contains(YaCySchema.flash_b)) {
|
|
MultiProtocolURI[] flashURLs = html.getFlash();
|
|
for (MultiProtocolURI u: flashURLs) {
|
|
// remove all flash links from inbound/outbound links
|
|
inboundLinks.remove(u);
|
|
outboundLinks.remove(u);
|
|
}
|
|
add(doc, YaCySchema.flash_b, flashURLs.length > 0);
|
|
}
|
|
|
|
// generic evaluation pattern
|
|
for (final String model: html.getEvaluationModelNames()) {
|
|
if (allAttr || contains("ext_" + model + "_txt")) {
|
|
final String[] scorenames = html.getEvaluationModelScoreNames(model);
|
|
if (scorenames.length > 0) {
|
|
add(doc, YaCySchema.valueOf("ext_" + model + "_txt"), scorenames);
|
|
add(doc, YaCySchema.valueOf("ext_" + model + "_val"), html.getEvaluationModelScoreCounts(model, scorenames));
|
|
}
|
|
}
|
|
}
|
|
|
|
// response time
|
|
add(doc, YaCySchema.responsetime_i, responseHeader == null ? 0 : Integer.parseInt(responseHeader.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0")));
|
|
}
|
|
|
|
// list all links
|
|
final Map<MultiProtocolURI, Properties> alllinks = document.getAnchors();
|
|
c = 0;
|
|
if (allAttr || contains(YaCySchema.inboundlinkscount_i)) add(doc, YaCySchema.inboundlinkscount_i, inboundLinks.size());
|
|
if (allAttr || contains(YaCySchema.inboundlinksnofollowcount_i)) add(doc, YaCySchema.inboundlinksnofollowcount_i, document.inboundLinkNofollowCount());
|
|
final List<String> inboundlinksTag = new ArrayList<String>(inboundLinks.size());
|
|
final List<String> inboundlinksURLProtocol = new ArrayList<String>(inboundLinks.size());
|
|
final List<String> inboundlinksURLStub = new ArrayList<String>(inboundLinks.size());
|
|
final List<String> inboundlinksName = new ArrayList<String>(inboundLinks.size());
|
|
final List<String> inboundlinksRel = new ArrayList<String>(inboundLinks.size());
|
|
final List<String> inboundlinksText = new ArrayList<String>(inboundLinks.size());
|
|
final List<Integer> inboundlinksTextChars = new ArrayList<Integer>(inboundLinks.size());
|
|
final List<Integer> inboundlinksTextWords = new ArrayList<Integer>(inboundLinks.size());
|
|
final List<String> inboundlinksAltTag = new ArrayList<String>(inboundLinks.size());
|
|
for (final MultiProtocolURI u: inboundLinks) {
|
|
final Properties p = alllinks.get(u);
|
|
if (p == null) continue;
|
|
final String name = p.getProperty("name", ""); // the name attribute
|
|
final String rel = p.getProperty("rel", ""); // the rel-attribute
|
|
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
|
|
final String urls = u.toNormalform(false);
|
|
final int pr = urls.indexOf("://",0);
|
|
inboundlinksURLProtocol.add(urls.substring(0, pr));
|
|
inboundlinksURLStub.add(urls.substring(pr + 3));
|
|
inboundlinksName.add(name.length() > 0 ? name : "");
|
|
inboundlinksRel.add(rel.length() > 0 ? rel : "");
|
|
inboundlinksText.add(text.length() > 0 ? text : "");
|
|
inboundlinksTextChars.add(text.length() > 0 ? text.length() : 0);
|
|
inboundlinksTextWords.add(text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0);
|
|
inboundlinksTag.add(
|
|
"<a href=\"" + u.toNormalform(false) + "\"" +
|
|
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
|
|
(name.length() > 0 ? " name=\"" + name + "\"" : "") +
|
|
">" +
|
|
((text.length() > 0) ? text : "") + "</a>");
|
|
ImageEntry ientry = images.get(u);
|
|
inboundlinksAltTag.add(ientry == null ? "" : ientry.alt());
|
|
c++;
|
|
}
|
|
if (allAttr || contains(YaCySchema.inboundlinks_tag_txt)) add(doc, YaCySchema.inboundlinks_tag_txt, inboundlinksTag);
|
|
if (allAttr || contains(YaCySchema.inboundlinks_protocol_sxt)) add(doc, YaCySchema.inboundlinks_protocol_sxt, protocolList2indexedList(inboundlinksURLProtocol));
|
|
if (allAttr || contains(YaCySchema.inboundlinks_urlstub_txt)) add(doc, YaCySchema.inboundlinks_urlstub_txt, inboundlinksURLStub);
|
|
if (allAttr || contains(YaCySchema.inboundlinks_name_txt)) add(doc, YaCySchema.inboundlinks_name_txt, inboundlinksName);
|
|
if (allAttr || contains(YaCySchema.inboundlinks_rel_sxt)) add(doc, YaCySchema.inboundlinks_rel_sxt, inboundlinksRel);
|
|
if (allAttr || contains(YaCySchema.inboundlinks_relflags_val)) add(doc, YaCySchema.inboundlinks_relflags_val, relEval(inboundlinksRel));
|
|
if (allAttr || contains(YaCySchema.inboundlinks_text_txt)) add(doc, YaCySchema.inboundlinks_text_txt, inboundlinksText);
|
|
if (allAttr || contains(YaCySchema.inboundlinks_text_chars_val)) add(doc, YaCySchema.inboundlinks_text_chars_val, inboundlinksTextChars);
|
|
if (allAttr || contains(YaCySchema.inboundlinks_text_words_val)) add(doc, YaCySchema.inboundlinks_text_words_val, inboundlinksTextWords);
|
|
if (allAttr || contains(YaCySchema.inboundlinks_alttag_txt)) add(doc, YaCySchema.inboundlinks_alttag_txt, inboundlinksAltTag);
|
|
|
|
c = 0;
|
|
if (allAttr || contains(YaCySchema.outboundlinkscount_i)) add(doc, YaCySchema.outboundlinkscount_i, outboundLinks.size());
|
|
if (allAttr || contains(YaCySchema.outboundlinksnofollowcount_i)) add(doc, YaCySchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount());
|
|
final List<String> outboundlinksTag = new ArrayList<String>(outboundLinks.size());
|
|
final List<String> outboundlinksURLProtocol = new ArrayList<String>(outboundLinks.size());
|
|
final List<String> outboundlinksURLStub = new ArrayList<String>(outboundLinks.size());
|
|
final List<String> outboundlinksName = new ArrayList<String>(outboundLinks.size());
|
|
final List<String> outboundlinksRel = new ArrayList<String>(outboundLinks.size());
|
|
final List<Integer> outboundlinksTextChars = new ArrayList<Integer>(outboundLinks.size());
|
|
final List<Integer> outboundlinksTextWords = new ArrayList<Integer>(outboundLinks.size());
|
|
final List<String> outboundlinksText = new ArrayList<String>(outboundLinks.size());
|
|
final List<String> outboundlinksAltTag = new ArrayList<String>(outboundLinks.size());
|
|
for (final MultiProtocolURI u: outboundLinks) {
|
|
final Properties p = alllinks.get(u);
|
|
if (p == null) continue;
|
|
final String name = p.getProperty("name", ""); // the name attribute
|
|
final String rel = p.getProperty("rel", ""); // the rel-attribute
|
|
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
|
|
final String urls = u.toNormalform(false);
|
|
final int pr = urls.indexOf("://",0);
|
|
outboundlinksURLProtocol.add(urls.substring(0, pr));
|
|
outboundlinksURLStub.add(urls.substring(pr + 3));
|
|
outboundlinksName.add(name.length() > 0 ? name : "");
|
|
outboundlinksRel.add(rel.length() > 0 ? rel : "");
|
|
outboundlinksText.add(text.length() > 0 ? text : "");
|
|
outboundlinksTextChars.add(text.length() > 0 ? text.length() : 0);
|
|
outboundlinksTextWords.add(text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0);
|
|
outboundlinksTag.add(
|
|
"<a href=\"" + u.toNormalform(false) + "\"" +
|
|
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
|
|
(name.length() > 0 ? " name=\"" + name + "\"" : "") +
|
|
">" +
|
|
((text.length() > 0) ? text : "") + "</a>");
|
|
ImageEntry ientry = images.get(u);
|
|
inboundlinksAltTag.add(ientry == null ? "" : ientry.alt());
|
|
c++;
|
|
}
|
|
if (allAttr || contains(YaCySchema.outboundlinks_tag_txt)) add(doc, YaCySchema.outboundlinks_tag_txt, outboundlinksTag);
|
|
if (allAttr || contains(YaCySchema.outboundlinks_protocol_sxt)) add(doc, YaCySchema.outboundlinks_protocol_sxt, protocolList2indexedList(outboundlinksURLProtocol));
|
|
if (allAttr || contains(YaCySchema.outboundlinks_urlstub_txt)) add(doc, YaCySchema.outboundlinks_urlstub_txt, outboundlinksURLStub);
|
|
if (allAttr || contains(YaCySchema.outboundlinks_name_txt)) add(doc, YaCySchema.outboundlinks_name_txt, outboundlinksName);
|
|
if (allAttr || contains(YaCySchema.outboundlinks_rel_sxt)) add(doc, YaCySchema.outboundlinks_rel_sxt, outboundlinksRel);
|
|
if (allAttr || contains(YaCySchema.outboundlinks_relflags_val)) add(doc, YaCySchema.outboundlinks_relflags_val, relEval(outboundlinksRel));
|
|
if (allAttr || contains(YaCySchema.outboundlinks_text_txt)) add(doc, YaCySchema.outboundlinks_text_txt, outboundlinksText);
|
|
if (allAttr || contains(YaCySchema.outboundlinks_text_chars_val)) add(doc, YaCySchema.outboundlinks_text_chars_val, outboundlinksTextChars);
|
|
if (allAttr || contains(YaCySchema.outboundlinks_text_words_val)) add(doc, YaCySchema.outboundlinks_text_words_val, outboundlinksTextWords);
|
|
if (allAttr || contains(YaCySchema.outboundlinks_alttag_txt)) add(doc, YaCySchema.outboundlinks_alttag_txt, outboundlinksAltTag);
|
|
|
|
// charset
|
|
if (allAttr || contains(YaCySchema.charset_s)) add(doc, YaCySchema.charset_s, document.getCharset());
|
|
|
|
// coordinates
|
|
if (document.lat() != 0.0 && document.lon() != 0.0) {
|
|
if (allAttr || contains(YaCySchema.coordinate_p)) add(doc, YaCySchema.coordinate_p, Double.toString(document.lat()) + "," + Double.toString(document.lon()));
|
|
}
|
|
if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, responseHeader == null ? 200 : responseHeader.getStatusCode());
|
|
|
|
// fields that were additionally in URIMetadataRow
|
|
Date loadDate = new Date();
|
|
Date modDate = responseHeader == null ? new Date() : responseHeader.lastModified();
|
|
if (modDate.getTime() > loadDate.getTime()) modDate = loadDate;
|
|
int size = (int) Math.max(document.dc_source().length(), responseHeader == null ? 0 : responseHeader.getContentLength());
|
|
if (allAttr || contains(YaCySchema.load_date_dt)) add(doc, YaCySchema.load_date_dt, loadDate);
|
|
if (allAttr || contains(YaCySchema.fresh_date_dt)) add(doc, YaCySchema.fresh_date_dt, new Date(loadDate.getTime() + Math.max(0, loadDate.getTime() - modDate.getTime()) / 2)); // freshdate, computed with Proxy-TTL formula
|
|
if (allAttr || contains(YaCySchema.host_id_s)) add(doc, YaCySchema.host_id_s, document.dc_source().hosthash());
|
|
if ((allAttr || contains(YaCySchema.referrer_id_txt)) && referrerURL != null) add(doc, YaCySchema.referrer_id_txt, new String[]{ASCII.String(referrerURL.hash())});
|
|
//if (allAttr || contains(SolrField.md5_s)) add(solrdoc, SolrField.md5_s, new byte[0]);
|
|
if (allAttr || contains(YaCySchema.publisher_t)) add(doc, YaCySchema.publisher_t, document.dc_publisher());
|
|
if ((allAttr || contains(YaCySchema.language_s)) && language != null) add(doc, YaCySchema.language_s, language);
|
|
if (allAttr || contains(YaCySchema.size_i)) add(doc, YaCySchema.size_i, size);
|
|
if (allAttr || contains(YaCySchema.audiolinkscount_i)) add(doc, YaCySchema.audiolinkscount_i, document.getAudiolinks().size());
|
|
if (allAttr || contains(YaCySchema.videolinkscount_i)) add(doc, YaCySchema.videolinkscount_i, document.getVideolinks().size());
|
|
if (allAttr || contains(YaCySchema.applinkscount_i)) add(doc, YaCySchema.applinkscount_i, document.getApplinks().size());
|
|
|
|
// write generic navigation
|
|
// there are no pre-defined solr fields for navigation because the vocabulary is generic
|
|
// we use dynamically allocated solr fields for this.
|
|
// It must be a multi-value string/token field, therefore we use _sxt extensions for the field names
|
|
for (Map.Entry<String, Set<String>> facet: document.getGenericFacets().entrySet()) {
|
|
String facetName = facet.getKey();
|
|
Set<String> facetValues = facet.getValue();
|
|
doc.setField(YaCySchema.VOCABULARY_PREFIX + facetName + YaCySchema.VOCABULARY_SUFFIX, facetValues.toArray(new String[facetValues.size()]));
|
|
}
|
|
|
|
if (allAttr || contains(YaCySchema.process_sxt)) {
|
|
List<String> p = new ArrayList<String>();
|
|
for (ProcessType t: processTypes) p.add(t.name());
|
|
add(doc, YaCySchema.process_sxt, p);
|
|
}
|
|
return doc;
|
|
}
|
|
|
|
/**
|
|
* compute the click level using the citation reference database
|
|
* @param citations the citation database
|
|
* @param searchhash the hash of the url to be checked
|
|
* @return the clickdepth level or -1 if the root url cannot be found or a recursion limit is reached
|
|
* @throws IOException
|
|
*/
|
|
public static int getClickDepth(final IndexCell<CitationReference> citations, final DigestURI url) throws IOException {
|
|
|
|
final byte[] searchhash = url.hash();
|
|
RowHandleSet rootCandidates = url.getPossibleRootHashes();
|
|
|
|
RowHandleSet ignore = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent enless loops
|
|
RowHandleSet levelhashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
|
|
try {levelhashes.put(searchhash);} catch (SpaceExceededException e) {throw new IOException(e);}
|
|
int leveldepth = 0; // the recursion depth and therefore the result depth-1. Shall be 0 for the first call
|
|
final byte[] hosthash = new byte[6]; // the host of the url to be checked
|
|
System.arraycopy(searchhash, 6, hosthash, 0, 6);
|
|
|
|
long timeout = System.currentTimeMillis() + 10000;
|
|
for (int maxdepth = 0; maxdepth < 10 && System.currentTimeMillis() < timeout; maxdepth++) {
|
|
|
|
RowHandleSet checknext = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100);
|
|
|
|
// loop over all hashes at this clickdepth; the first call to this loop should contain only one hash and a leveldepth = 0
|
|
checkloop: for (byte[] urlhash: levelhashes) {
|
|
|
|
// get all the citations for this url and iterate
|
|
ReferenceContainer<CitationReference> references = citations.get(urlhash, null);
|
|
if (references == null || references.size() == 0) continue checkloop; // don't know
|
|
Iterator<CitationReference> i = references.entries();
|
|
nextloop: while (i.hasNext()) {
|
|
CitationReference ref = i.next();
|
|
if (ref == null) continue nextloop;
|
|
byte[] u = ref.urlhash();
|
|
|
|
// check ignore
|
|
if (ignore.has(u)) continue nextloop;
|
|
|
|
// check if this is from the same host
|
|
if (!ByteBuffer.equals(u, 6, hosthash, 0, 6)) continue nextloop;
|
|
|
|
// check if the url is a root url
|
|
if (rootCandidates.has(u)) {
|
|
return leveldepth + 1;
|
|
}
|
|
|
|
// step to next depth level
|
|
try {checknext.put(u);} catch (SpaceExceededException e) {}
|
|
try {ignore.put(u);} catch (SpaceExceededException e) {}
|
|
}
|
|
}
|
|
leveldepth++;
|
|
levelhashes = checknext;
|
|
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
/**
|
|
* this method compresses a list of protocol names to an indexed list.
|
|
* To do this, all 'http' entries are removed and considered as default.
|
|
* The remaining entries are indexed as follows: a list of <i>-<p> entries is produced, where
|
|
* <i> is an index pointing to the original index of the protocol entry and <p> is the protocol entry itself.
|
|
* The <i> entry is formatted as a 3-digit decimal number with leading zero digits.
|
|
* @param protocol
|
|
* @return a list of indexed protocol entries
|
|
*/
|
|
private static List<String> protocolList2indexedList(List<String> protocol) {
|
|
List<String> a = new ArrayList<String>();
|
|
String p;
|
|
for (int i = 0; i < protocol.size(); i++) {
|
|
p = protocol.get(i);
|
|
if (!p.equals("http")) {
|
|
String c = Integer.toString(i);
|
|
while (c.length() < 3) c = "0" + c;
|
|
a.add(c + "-" + p);
|
|
}
|
|
}
|
|
return a;
|
|
}
|
|
|
|
/**
|
|
* encode a string containing attributes from anchor rel properties binary:
|
|
* bit 0: "me" contained in rel
|
|
* bit 1: "nofollow" contained in rel
|
|
* @param rel
|
|
* @return binary encoded information about rel
|
|
*/
|
|
private static List<Integer> relEval(final List<String> rel) {
|
|
List<Integer> il = new ArrayList<Integer>(rel.size());
|
|
for (final String s: rel) {
|
|
int i = 0;
|
|
final String s0 = s.toLowerCase().trim();
|
|
if ("me".equals(s0)) i += 1;
|
|
if ("nofollow".equals(s0)) i += 2;
|
|
il.add(i);
|
|
}
|
|
return il;
|
|
}
|
|
|
|
/**
|
|
* register an entry as error document
|
|
* @param digestURI
|
|
* @param failReason
|
|
* @param httpstatus
|
|
* @throws IOException
|
|
*/
|
|
public SolrInputDocument err(final DigestURI digestURI, final String failReason, final FailType failType, final int httpstatus) throws IOException {
|
|
final SolrInputDocument solrdoc = new SolrInputDocument();
|
|
add(solrdoc, YaCySchema.id, ASCII.String(digestURI.hash()));
|
|
add(solrdoc, YaCySchema.sku, digestURI.toNormalform(true));
|
|
final InetAddress address = digestURI.getInetAddress();
|
|
if (contains(YaCySchema.ip_s) && address != null) add(solrdoc, YaCySchema.ip_s, address.getHostAddress());
|
|
if (contains(YaCySchema.host_s) && digestURI.getHost() != null) add(solrdoc, YaCySchema.host_s, digestURI.getHost());
|
|
|
|
// path elements of link
|
|
if (contains(YaCySchema.url_paths_sxt)) add(solrdoc, YaCySchema.url_paths_sxt, digestURI.getPaths());
|
|
if (contains(YaCySchema.url_file_ext_s)) add(solrdoc, YaCySchema.url_file_ext_s, digestURI.getFileExtension());
|
|
|
|
// fail reason and status
|
|
if (contains(YaCySchema.failreason_t)) add(solrdoc, YaCySchema.failreason_t, failReason);
|
|
if (contains(YaCySchema.failtype_s)) add(solrdoc, YaCySchema.failtype_s, failType.name());
|
|
if (contains(YaCySchema.httpstatus_i)) add(solrdoc, YaCySchema.httpstatus_i, httpstatus);
|
|
return solrdoc;
|
|
}
|
|
|
|
|
|
/*
|
|
standard solr schema
|
|
|
|
<field name="name" type="textgen" indexed="true" stored="true"/>
|
|
<field name="cat" type="string" indexed="true" stored="true" multiValued="true"/>
|
|
<field name="features" type="text" indexed="true" stored="true" multiValued="true"/>
|
|
<field name="includes" type="text" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />
|
|
|
|
<field name="weight" type="float" indexed="true" stored="true"/>
|
|
<field name="price" type="float" indexed="true" stored="true"/>
|
|
<field name="popularity" type="int" indexed="true" stored="true" />
|
|
|
|
<!-- Common metadata fields, named specifically to match up with
|
|
SolrCell metadata when parsing rich documents such as Word, PDF.
|
|
Some fields are multiValued only because Tika currently may return
|
|
multiple values for them.
|
|
-->
|
|
<field name="title" type="text" indexed="true" stored="true" multiValued="true"/>
|
|
<field name="subject" type="text" indexed="true" stored="true"/>
|
|
<field name="description" type="text" indexed="true" stored="true"/>
|
|
<field name="comments" type="text" indexed="true" stored="true"/>
|
|
<field name="author" type="textgen" indexed="true" stored="true"/>
|
|
<field name="keywords" type="textgen" indexed="true" stored="true"/>
|
|
<field name="category" type="textgen" indexed="true" stored="true"/>
|
|
<field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/>
|
|
<field name="last_modified" type="date" indexed="true" stored="true"/>
|
|
<field name="links" type="string" indexed="true" stored="true" multiValued="true"/>
|
|
*/
|
|
}
|