Added more Solr fields to integrate values from URIMetadataRow. All

writes to the Metadata-DB are now also sent to Solr. This includes
metadata transfer during search and RWI transfer.

The new/added solr fields are:

## time when resource was loaded
load_date_dt

## date until resource shall be considered as fresh
fresh_date_dt

## id of the host, a 6-byte hash that is part of the document id
host_id_s

## ids of referrer to this document
referrer_id_ss

## the md5 of the raw source
md5_s

## the name of the publisher of the document
publisher_t

## the language used in the document; starts with primary language
language_ss

## an external ranking value
ranking_i

## the size of the raw source
size_i

## number of links to audio resources
audiolinkscount_i

## number of links to video resources
videolinkscount_i

## number of links to application resources
applinkscount_i
pull/1/head
orbiter 13 years ago
parent e432bb9cd9
commit d9173ba7ed

@ -267,3 +267,45 @@ failreason_t
## response time of target server in milliseconds, int
responsetime_i
### values used additionally by URIMetadataRow, part of the index transfer process
## time when resource was loaded
load_date_dt
## date until resource shall be considered as fresh
fresh_date_dt
## id of the host, a 6-byte hash that is part of the document id
host_id_s
## ids of referrer to this document
referrer_id_ss
## the md5 of the raw source
md5_s
## the name of the publisher of the document
publisher_t
## the language used in the document; starts with primary language
language_ss
## an external ranking value
ranking_i
## the size of the raw source
size_i
## number of links to audio resources
audiolinkscount_i
## number of links to video resources
videolinkscount_i
## number of links to application resources
applinkscount_i
## index creation comment
process_s

@ -197,17 +197,6 @@ public class IndexFederated_p {
if (field.getComment() != null) prop.putHTML("scheme_" + c + "_comment",field.getComment());
c++;
}
/* final Iterator<ConfigurationSet.Entry> i = sb.solrScheme.entryIterator();
ConfigurationSet.Entry entry;
while (i.hasNext()) {
entry = i.next();
prop.put("scheme_" + c + "_dark", dark ? 1 : 0); dark = !dark;
prop.put("scheme_" + c + "_checked", entry.enabled() ? 1 : 0);
prop.putHTML("scheme_" + c + "_key", entry.key());
prop.putHTML("scheme_" + c + "_solrfieldname",entry.getValue() == null ? "" : entry.getValue());
if (entry.getComment() != null) prop.putHTML("scheme_" + c + "_comment",entry.getComment());
c++;
}*/
prop.put("scheme", c);
// fill attribute fields

@ -147,6 +147,9 @@ public final class crawlReceipt {
if ("fill".equals(result)) try {
// put new entry into database
sb.index.urlMetadata().store(entry);
if (!sb.index.urlMetadata().getSolr().exists(ASCII.String(entry.url().hash()))) {
sb.index.urlMetadata().getSolr().add(sb.index.urlMetadata().getSolrScheme().metadata2solr(entry));
}
ResultURLs.stack(entry, youare.getBytes(), iam.getBytes(), EventOrigin.REMOTE_RECEIPTS);
sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work has been done
if (log.isInfo()) log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + ASCII.String(entry.hash()) + ":" + entry.url().toNormalform(false, true));

@ -30,6 +30,7 @@ import java.io.IOException;
import java.text.ParseException;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.data.meta.URIMetadataRow;
@ -141,6 +142,9 @@ public final class transferURL {
if (Network.log.isFine()) Network.log.logFine("Accepting URL " + i + "/" + urlc + " from peer " + otherPeerName + ": " + lEntry.url().toNormalform(true, false));
try {
sb.index.urlMetadata().store(lEntry);
if (!sb.index.urlMetadata().getSolr().exists(ASCII.String(lEntry.url().hash()))) {
sb.index.urlMetadata().getSolr().add(sb.index.urlMetadata().getSolrScheme().metadata2solr(lEntry));
}
ResultURLs.stack(lEntry, iam.getBytes(), iam.getBytes(), EventOrigin.DHT_TRANSFER);
if (Network.log.isFine()) Network.log.logFine("transferURL: received URL '" + lEntry.url().toNormalform(false, true) + "' from peer " + otherPeerName);
received++;

@ -37,6 +37,7 @@ import net.yacy.cora.document.UTF8;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Bitfield;
@ -96,7 +97,7 @@ public final class ResultURLs {
}
public static void stack(
final URIMetadataRow e,
final URIMetadata e,
final byte[] initiatorHash,
final byte[] executorHash,
final EventOrigin stackType) {

@ -70,29 +70,28 @@ public class Response {
// doctype calculation
public static char docType(final MultiProtocolURI url) {
final String path = url.getPath().toLowerCase();
// serverLog.logFinest("PLASMA", "docType URL=" + path);
char doctype = DT_UNKNOWN;
if (path.endsWith(".gif")) { doctype = DT_IMAGE; }
else if (path.endsWith(".ico")) { doctype = DT_IMAGE; }
else if (path.endsWith(".bmp")) { doctype = DT_IMAGE; }
else if (path.endsWith(".jpg")) { doctype = DT_IMAGE; }
else if (path.endsWith(".jpeg")) { doctype = DT_IMAGE; }
else if (path.endsWith(".png")) { doctype = DT_IMAGE; }
else if (path.endsWith(".html")) { doctype = DT_HTML; }
else if (path.endsWith(".txt")) { doctype = DT_TEXT; }
else if (path.endsWith(".doc")) { doctype = DT_DOC; }
else if (path.endsWith(".rtf")) { doctype = DT_DOC; }
else if (path.endsWith(".pdf")) { doctype = DT_PDFPS; }
else if (path.endsWith(".ps")) { doctype = DT_PDFPS; }
else if (path.endsWith(".avi")) { doctype = DT_MOVIE; }
else if (path.endsWith(".mov")) { doctype = DT_MOVIE; }
else if (path.endsWith(".qt")) { doctype = DT_MOVIE; }
else if (path.endsWith(".mpg")) { doctype = DT_MOVIE; }
else if (path.endsWith(".md5")) { doctype = DT_SHARE; }
else if (path.endsWith(".mpeg")) { doctype = DT_MOVIE; }
else if (path.endsWith(".asf")) { doctype = DT_FLASH; }
return doctype;
String ext = url.getFileExtension();
if (ext == null) return DT_UNKNOWN;
if (ext.equals(".gif")) return DT_IMAGE;
if (ext.equals(".ico")) return DT_IMAGE;
if (ext.equals(".bmp")) return DT_IMAGE;
if (ext.equals(".jpg")) return DT_IMAGE;
if (ext.equals(".jpeg")) return DT_IMAGE;
if (ext.equals(".png")) return DT_IMAGE;
if (ext.equals(".html")) return DT_HTML;
if (ext.equals(".txt")) return DT_TEXT;
if (ext.equals(".doc")) return DT_DOC;
if (ext.equals(".rtf")) return DT_DOC;
if (ext.equals(".pdf")) return DT_PDFPS;
if (ext.equals(".ps")) return DT_PDFPS;
if (ext.equals(".avi")) return DT_MOVIE;
if (ext.equals(".mov")) return DT_MOVIE;
if (ext.equals(".qt")) return DT_MOVIE;
if (ext.equals(".mpg")) return DT_MOVIE;
if (ext.equals(".md5")) return DT_SHARE;
if (ext.equals(".mpeg")) return DT_MOVIE;
if (ext.equals(".asf")) return DT_FLASH;
return DT_UNKNOWN;
}
public static char docType(final String mime) {
@ -115,30 +114,20 @@ public class Response {
else if (mime.startsWith("image/")) doctype = DT_IMAGE;
else if (mime.startsWith("audio/")) doctype = DT_AUDIO;
else if (mime.startsWith("video/")) doctype = DT_MOVIE;
//bz2 = application/x-bzip2
//dvi = application/x-dvi
//gz = application/gzip
//hqx = application/mac-binhex40
//lha = application/x-lzh
//lzh = application/x-lzh
//pac = application/x-ns-proxy-autoconfig
//php = application/x-httpd-php
//phtml = application/x-httpd-php
//rss = application/xml
//tar = application/tar
//tex = application/x-tex
//tgz = application/tar
//torrent = application/x-bittorrent
//xhtml = application/xhtml+xml
//xla = application/msexcel
//xls = application/msexcel
//xsl = application/xml
//xml = application/xml
//Z = application/x-compress
//zip = application/zip
return doctype;
}
/**
 * Derives a mime type from a file extension and a doctype character.
 * <p>
 * The extension is first mapped with {@code Classification.ext2mime}. If the
 * result contains a '/', and the doctype is one of {@code DT_TEXT},
 * {@code DT_IMAGE}, {@code DT_AUDIO} or {@code DT_MOVIE}, the part in front of
 * the '/' is replaced by "text", "image", "audio" or "video" respectively.
 * In every other case the mapped mime type is returned unchanged.
 *
 * @param ext the file extension handed to {@code Classification.ext2mime}
 * @param doctype one of the {@code DT_*} doctype characters
 * @return the (possibly adjusted) mime type
 */
public static String doctype2mime(String ext, char doctype) {
    final String mapped = Classification.ext2mime(ext);
    final int slash = mapped.indexOf('/');
    if (slash < 0) {
        // no major/minor structure: nothing that could be replaced
        return mapped;
    }
    final String majorType;
    if (doctype == DT_TEXT) {
        majorType = "text";
    } else if (doctype == DT_IMAGE) {
        majorType = "image";
    } else if (doctype == DT_AUDIO) {
        majorType = "audio";
    } else if (doctype == DT_MOVIE) {
        majorType = "video";
    } else {
        // doctype carries no information about the major type
        return mapped;
    }
    // keep the '/' and the minor type of the mapped value
    return majorType + mapped.substring(slash);
}
public static final int QUEUE_STATE_FRESH = 0;
public static final int QUEUE_STATE_PARSING = 1;
public static final int QUEUE_STATE_CONDENSING = 2;

@ -52,6 +52,10 @@ public class SolrDoc extends SolrInputDocument {
this.setField(key.getSolrFieldName(), value);
}
/**
 * Sets a long value for the Solr field that belongs to the given key.
 *
 * @param key the field definition; its Solr field name is used as target
 * @param value the value to set
 */
public final void addSolr(final SolrField key, final long value) {
    final String fieldName = key.getSolrFieldName();
    this.setField(fieldName, value);
}
/**
 * Sets a string array as value for the Solr field that belongs to the given key.
 *
 * @param key the field definition; its Solr field name is used as target
 * @param value the values to set
 */
public final void addSolr(final SolrField key, final String[] value) {
    final String fieldName = key.getSolrFieldName();
    this.setField(fieldName, value);
}

@ -123,7 +123,7 @@ public class ConfigurationSet extends TreeMap<String,Entry> implements Serializa
/**
* override the abstract implementation because that is not stable in concurrent requests
*/
public boolean contains (String key) {
public boolean contains(String key) {
if (key == null) return false;
Entry e = this.get(key);
return e == null ? false : e.enabled();
@ -260,11 +260,7 @@ public class ConfigurationSet extends TreeMap<String,Entry> implements Serializa
}
writer.close();
}
/*
public Iterator<String> iterator() {
return this.keySet().iterator();
}
*/
/**
 * Returns an iterator over all entries stored in this configuration set,
 * i.e. over the values of the underlying map.
 *
 * @return an iterator over the entries
 */
public Iterator<Entry> entryIterator() {
return this.values().iterator();
}

@ -56,54 +56,22 @@ public final class Condenser {
// category flags that show how the page can be distinguished in different interest groups
public static final int flag_cat_indexof = 0; // a directory listing page (i.e. containing 'index of')
public static final int flag_cat_opencontent = 1; // open source, any free stuff
public static final int flag_cat_business = 2; // web shops, marketing, trade
public static final int flag_cat_stockfinance = 3; // stock exchange (quotes), finance, economy
public static final int flag_cat_health = 4; // health
public static final int flag_cat_sport = 5; // any sport, cars etc.
public static final int flag_cat_lifestyle = 6; // travel, lifestyle
public static final int flag_cat_politics = 7; // politics
public static final int flag_cat_news = 8; // blogs, news pages
public static final int flag_cat_children = 9; // toys, childrens education, help for parents
public static final int flag_cat_entertainment = 10; // boulevard, entertainment, cultural content
public static final int flag_cat_knowledge = 11; // science, school stuff, help for homework
public static final int flag_cat_computer = 12; // any computer related stuff, networks, operation systems
public static final int flag_cat_p2p = 13; // p2p support, file-sharing archives etc.
public static final int flag_cat_sex = 14; // sexual content
public static final int flag_cat_spam = 15; // pages that anybody would consider as not interesting
public static final int flag_cat_linux = 16; // pages about linux software
public static final int flag_cat_macos = 17; // pages about macintosh, apple computers and the mac os
public static final int flag_cat_windows = 18; // pages about windows os and software
public static final int flag_cat_haslocation = 19; // the page has a location metadata attached
public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images
public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file
public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos
public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file
//private Properties analysis;
private final Map<String, Word> words; // a string (the words) to (indexWord) - relation
private final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging
//public int RESULT_NUMB_TEXT_BYTES = -1;
public int RESULT_NUMB_WORDS = -1;
public int RESULT_DIFF_WORDS = -1;
public int RESULT_NUMB_SENTENCES = -1;
public int RESULT_DIFF_SENTENCES = -1;
public Bitfield RESULT_FLAGS = new Bitfield(4);
private final Identificator languageIdentificator;
/*
private final static int numlength = 5;
private static final ThreadLocal <NumberFormat> intStringFormatter =
new ThreadLocal <NumberFormat>() {
@Override protected NumberFormat initialValue() {
NumberFormat n = NumberFormat.getIntegerInstance();
n.setMinimumIntegerDigits(numlength);
n.setMaximumIntegerDigits(numlength);
return n;
}
};
*/
public Condenser(
final Document document,

@ -153,20 +153,13 @@ public class pdfParser extends AbstractParser implements Parser {
if (t.isAlive()) t.interrupt();
pdfDoc.close();
contentBytes = writer.getBytes(); // get final text before closing writer
} catch (final IOException e) {
// close the writer
if (writer != null) try { writer.close(); } catch (final Exception ex) {}
try {pdfDoc.close();} catch (final IOException ee) {}
//throw new Parser.Failure(e.getMessage(), location);
} catch (final NullPointerException e) {
// this exception appeared after the insertion of the jempbox-1.5.0.jar library
Log.logException(e);
} catch (final Throwable e) {
// close the writer
if (writer != null) try { writer.close(); } catch (final Exception ex) {}
try {pdfDoc.close();} catch (final IOException ee) {}
try {pdfDoc.close();} catch (final Throwable ee) {}
//throw new Parser.Failure(e.getMessage(), location);
} finally {
try {pdfDoc.close();} catch (final IOException e) {}
try {pdfDoc.close();} catch (final Throwable e) {}
writer.close();
}

@ -357,7 +357,9 @@ public final class Protocol
if ( p < 0 ) {
return -1;
}
final String host = Domains.dnsResolve(address.substring(0, p)).getHostAddress();
InetAddress ia = Domains.dnsResolve(address.substring(0, p));
if (ia == null) continue;
final String host = ia.getHostAddress();
s = Seed.genRemoteSeed(seedStr, false, host);
} else {
s = Seed.genRemoteSeed(seedStr, false, null);
@ -752,6 +754,9 @@ public final class Protocol
// passed all checks, store url
try {
indexSegment.urlMetadata().store(urlEntry);
if (!indexSegment.urlMetadata().getSolr().exists(ASCII.String(urlEntry.url().hash()))) {
indexSegment.urlMetadata().getSolr().add(indexSegment.urlMetadata().getSolrScheme().metadata2solr(urlEntry));
}
ResultURLs.stack(
urlEntry,
mySeed.hash.getBytes(),
@ -1081,7 +1086,7 @@ public final class Protocol
final String process,
final String result,
final String reason,
final URIMetadataRow entry,
final URIMetadata entry,
final String wordhashes) {
assert (target != null);
assert (mySeed != null);

@ -111,7 +111,6 @@ import net.yacy.gui.Tray;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
@ -2514,7 +2513,7 @@ public final class Switchboard extends serverSwitch
this.log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + url);
// STORE WORD INDEX
URIMetadataRow newEntry = null;
URIMetadata newEntry = null;
try {
newEntry =
this.index.storeDocument(
@ -2761,9 +2760,9 @@ public final class Switchboard extends serverSwitch
public class receiptSending implements Runnable
{
private final Seed initiatorPeer;
private final URIMetadataRow reference;
private final URIMetadata reference;
public receiptSending(final Seed initiatorPeer, final URIMetadataRow reference) {
public receiptSending(final Seed initiatorPeer, final URIMetadata reference) {
this.initiatorPeer = initiatorPeer;
this.reference = reference;
}

@ -42,7 +42,6 @@ import net.yacy.document.LibraryProvider;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.query.QueryParams;
import net.yacy.search.query.RWIProcess;
@ -54,8 +53,7 @@ import net.yacy.search.ranking.ReferenceOrder;
*
* @author Michael Christen
*/
public class DocumentIndex extends Segment
{
public class DocumentIndex extends Segment {
private static final RankingProfile textRankingDefault = new RankingProfile(Classification.ContentDomain.TEXT);
//private Bitfield zeroConstraint = new Bitfield(4);
@ -102,12 +100,12 @@ public class DocumentIndex extends Segment
@Override
public void run() {
DigestURI f;
URIMetadataRow[] resultRows;
URIMetadata[] resultRows;
try {
while ( (f = DocumentIndex.this.queue.take()) != poison ) {
try {
resultRows = add(f);
for ( final URIMetadataRow resultRow : resultRows ) {
for ( final URIMetadata resultRow : resultRows ) {
if ( DocumentIndex.this.callback != null ) {
if ( resultRow == null ) {
DocumentIndex.this.callback.fail(f, "result is null");
@ -139,7 +137,7 @@ public class DocumentIndex extends Segment
this.queue.clear();
}
private URIMetadataRow[] add(final DigestURI url) throws IOException {
private URIMetadata[] add(final DigestURI url) throws IOException {
if ( url == null ) {
throw new IOException("file = null");
}
@ -162,7 +160,7 @@ public class DocumentIndex extends Segment
throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
}
//Document document = Document.mergeDocuments(url, null, documents);
final URIMetadataRow[] rows = new URIMetadataRow[documents.length];
final URIMetadata[] rows = new URIMetadata[documents.length];
int c = 0;
for ( final Document document : documents ) {
if (document == null) continue;
@ -274,7 +272,7 @@ public class DocumentIndex extends Segment
public interface CallbackListener
{
public void commit(DigestURI f, URIMetadataRow resultRow);
public void commit(DigestURI f, URIMetadata resultRow);
public void fail(DigestURI f, String failReason);
}
@ -295,7 +293,7 @@ public class DocumentIndex extends Segment
System.out.println("using index files at " + segmentPath.getAbsolutePath());
final CallbackListener callback = new CallbackListener() {
@Override
public void commit(final DigestURI f, final URIMetadataRow resultRow) {
public void commit(final DigestURI f, final URIMetadata resultRow) {
System.out.println("indexed: " + f.toString());
}

@ -71,14 +71,16 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
private String tablename;
private ArrayList<HostStat> statsDump;
private final DoubleSolrConnector solr;
private final SolrConfiguration solrScheme;
public MetadataRepository(final File path) {
public MetadataRepository(final File path, final SolrConfiguration solrScheme) {
this.location = path;
this.tablename = null;
this.urlIndexFile = null;
this.exportthread = null; // will have a export thread assigned if exporter is running
this.statsDump = null;
this.solr = new DoubleSolrConnector();
this.solrScheme = solrScheme;
}
public boolean connectedUrlDb() {
@ -97,6 +99,10 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
this.urlIndexFile = null;
}
/**
 * Returns the Solr scheme that was handed to this repository in its constructor.
 *
 * @return the Solr configuration of this repository
 */
public SolrConfiguration getSolrScheme() {
return this.solrScheme;
}
/**
 * Tells whether the local Solr is connected, as reported by
 * {@code isConnected0()} of the Solr connector.
 * NOTE(review): presumably slot 0 of the double connector is the local Solr -
 * confirm against DoubleSolrConnector.
 *
 * @return the result of {@code this.solr.isConnected0()}
 */
public boolean connectedLocalSolr() {
return this.solr.isConnected0();
}

@ -99,7 +99,6 @@ public class Segment {
private final Log log;
private final File segmentPath;
private final SolrConfiguration solrScheme;
protected final MetadataRepository urlMetadata;
protected IndexCell<WordReference> termIndex;
protected IndexCell<CitationReference> urlCitationIndex;
@ -108,10 +107,9 @@ public class Segment {
log.logInfo("Initializing Segment '" + segmentPath + ".");
this.log = log;
this.segmentPath = segmentPath;
this.solrScheme = solrScheme;
// create LURL-db
this.urlMetadata = new MetadataRepository(segmentPath);
this.urlMetadata = new MetadataRepository(segmentPath, solrScheme);
}
public boolean connectedRWI() {
@ -203,7 +201,7 @@ public class Segment {
}
public SolrConfiguration getSolrScheme() {
return this.solrScheme;
return this.urlMetadata.getSolrScheme();
}
public SolrConnector getRemoteSolr() {
@ -398,7 +396,7 @@ public class Segment {
return language;
}
public URIMetadataRow storeDocument(
public URIMetadata storeDocument(
final DigestURI url,
final DigestURI referrerURL,
Date modDate,
@ -420,22 +418,10 @@ public class Segment {
final String urlNormalform = url.toNormalform(true, false);
final String language = votedLanguage(url, urlNormalform, document, condenser); // identification of the language
// STORE TO SOLR
boolean localSolr = this.connectedLocalSolr();
boolean remoteSolr = this.connectedRemoteSolr();
if (localSolr || remoteSolr) {
try {
SolrDoc solrDoc = this.solrScheme.yacy2solr(id, responseHeader, document);
this.getSolr().add(solrDoc);
} catch ( final IOException e ) {
Log.logWarning("SOLR", "failed to send " + urlNormalform + " to solr: " + e.getMessage());
}
}
// STORE URL TO LOADED-URL-DB
if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; // TODO: compare with modTime from responseHeader
char docType = Response.docType(document.dc_format());
final URIMetadataRow newEntry = new URIMetadataRow(
final URIMetadata metadata = new URIMetadataRow(
url, // URL
dc_title, // document description
document.dc_creator(), // author
@ -460,9 +446,21 @@ public class Segment {
document.getVideolinks().size(), // lvideo
document.getApplinks().size() // lapp
);
this.urlMetadata.store(newEntry);
this.urlMetadata.store(metadata);
final long storageEndTime = System.currentTimeMillis();
// STORE TO SOLR
boolean localSolr = this.connectedLocalSolr();
boolean remoteSolr = this.connectedRemoteSolr();
if (localSolr || remoteSolr) {
try {
SolrDoc solrDoc = this.urlMetadata.getSolrScheme().yacy2solr(id, responseHeader, document, metadata);
this.getSolr().add(solrDoc);
} catch ( final IOException e ) {
Log.logWarning("SOLR", "failed to send " + urlNormalform + " to solr: " + e.getMessage());
}
}
// STORE PAGE INDEX INTO WORD INDEX DB
int outlinksSame = document.inboundLinks().size();
int outlinksOther = document.outboundLinks().size();
@ -545,7 +543,7 @@ public class Segment {
}
// finished
return newEntry;
return metadata;
}
public void removeAllUrlReferences(final HandleSet urls, final LoaderDispatcher loader, final CacheStrategy cacheStrategy) {

@ -24,7 +24,6 @@
package net.yacy.search.index;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
@ -41,18 +40,24 @@ import java.util.Set;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.services.federated.solr.SolrDoc;
import net.yacy.cora.storage.ConfigurationSet;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Bitfield;
import org.apache.solr.common.SolrDocument;
import de.anomic.crawler.retrieval.Response;
public class SolrConfiguration extends ConfigurationSet implements Serializable {
private static final long serialVersionUID=-499100932212840385L;
@ -88,46 +93,63 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
it.remove();
}
}
// check consistency the other way: look if all enum constants in SolrField appear in the configuration file
for (SolrField field: SolrField.values()) {
if (this.get(field.name()) == null) {
Log.logWarning("SolrScheme", " solr scheme file " + configurationFile.getAbsolutePath() + " is missing declaration for '" + field.name() + "'");
}
}
this.lazy = lazy;
}
/**
 * Convenience variant of {@code contains(String)}: looks the field up by the
 * name of its enum constant.
 *
 * @param field the Solr field to look up
 * @return the result of {@code contains(field.name())}
 */
private boolean contains(SolrField field) {
return this.contains(field.name());
}
/**
 * Adds a byte array, converted with {@code UTF8.String}, to the Solr document.
 * <p>
 * Nothing is added if the configuration is not empty and does not contain
 * the key, or if lazy mode is on and the value is null or has length zero.
 *
 * @param solrdoc the document to write to
 * @param key the target field
 * @param value the raw bytes of the value
 */
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final byte[] value) {
    // field filter: an empty configuration accepts every field
    if (!isEmpty() && !contains(key)) {
        return;
    }
    // lazy mode: do not write missing or empty values
    if (this.lazy && (value == null || value.length == 0)) {
        return;
    }
    solrdoc.addSolr(key, UTF8.String(value));
}
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final String value) {
if ((isEmpty() || contains(key.name())) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value);
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value);
}
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final String value, final float boost) {
if ((isEmpty() || contains(key.name())) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value, boost);
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value, boost);
}
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final Date value) {
if ((isEmpty() || contains(key.name())) && (!this.lazy || (value != null && value.getTime() > 0))) solrdoc.addSolr(key, value);
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.getTime() > 0))) solrdoc.addSolr(key, value);
}
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final String[] value) {
if ((isEmpty() || contains(key.name())) && (!this.lazy || (value != null && value.length > 0))) solrdoc.addSolr(key, value);
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.length > 0))) solrdoc.addSolr(key, value);
}
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final List<String> value) {
if ((isEmpty() || contains(key.name())) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value);
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value);
}
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final int value) {
if ((isEmpty() || contains(key.name())) && (!this.lazy || value > 0)) solrdoc.addSolr(key, value);
if ((isEmpty() || contains(key)) && (!this.lazy || value != 0)) solrdoc.addSolr(key, value);
}
/**
 * Adds a long value to the Solr document.
 * <p>
 * Nothing is added if the configuration is not empty and does not contain
 * the key, or if lazy mode is on and the value is zero.
 *
 * @param solrdoc the document to write to
 * @param key the target field
 * @param value the value to add
 */
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final long value) {
    // field filter: an empty configuration accepts every field
    if (!isEmpty() && !contains(key)) {
        return;
    }
    // lazy mode: a zero is treated as "no value"
    if (this.lazy && value == 0) {
        return;
    }
    solrdoc.addSolr(key, value);
}
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final float value) {
if (isEmpty() || contains(key.name())) solrdoc.addSolr(key, value);
if ((isEmpty() || contains(key)) && (!this.lazy || value != 0.0f)) solrdoc.addSolr(key, value);
}
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final double value) {
if (isEmpty() || contains(key.name())) solrdoc.addSolr(key, value);
if ((isEmpty() || contains(key)) && (!this.lazy || value != 0.0d)) solrdoc.addSolr(key, value);
}
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final boolean value) {
if (isEmpty() || contains(key.name())) solrdoc.addSolr(key, value);
if (isEmpty() || contains(key)) solrdoc.addSolr(key, value);
}
/**
* save configuration to file and update enum SolrFields
* @throws IOException
@ -148,33 +170,103 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
}
} catch (final IOException e) {}
}
/**
 * Creates a Solr document from a URL metadata entry only, i.e. without the
 * parsed document and without a response header.
 * <p>
 * Values that the metadata does not deliver are filled with fixed values:
 * the fail reason and the text are written as empty strings, the charset as
 * "UTF8" and the http status as 200. The content type is derived from the
 * file extension of the url and the doctype of the entry. If the
 * "index of" flag is set in the entry's flags, the word "indexof" is
 * appended to the keywords. Coordinates are only written when both
 * latitude and longitude are different from zero.
 * <p>
 * Each value is passed through the {@code addSolr} helpers of this class.
 *
 * @param md the metadata entry to convert
 * @return the Solr document built from the entry
 */
public SolrDoc metadata2solr(final URIMetadata md) {
    final SolrDoc doc = new SolrDoc();
    final DigestURI uri = new DigestURI(md.url());
    // an empty configuration means: write every attribute
    final boolean all = this.isEmpty();

    if (all || contains(SolrField.failreason_t)) {
        addSolr(doc, SolrField.failreason_t, "");
    }
    addSolr(doc, SolrField.id, ASCII.String(md.hash()));
    addSolr(doc, SolrField.sku, uri.toNormalform(true, false));
    if (all || contains(SolrField.ip_s)) {
        final InetAddress inet = uri.getInetAddress();
        if (inet != null) {
            addSolr(doc, SolrField.ip_s, inet.getHostAddress());
        }
    }
    if (uri.getHost() != null) {
        addSolr(doc, SolrField.host_s, uri.getHost());
    }
    if (all || contains(SolrField.title)) {
        addSolr(doc, SolrField.title, md.dc_title());
    }
    if (all || contains(SolrField.author)) {
        addSolr(doc, SolrField.author, md.dc_creator());
    }
    if (all || contains(SolrField.description)) {
        addSolr(doc, SolrField.description, md.snippet());
    }
    if (all || contains(SolrField.content_type)) {
        addSolr(doc, SolrField.content_type, Response.doctype2mime(uri.getFileExtension(), md.doctype()));
    }
    if (all || contains(SolrField.last_modified)) {
        addSolr(doc, SolrField.last_modified, md.moddate());
    }
    if (all || contains(SolrField.text_t)) {
        // the text is not delivered in metadata
        addSolr(doc, SolrField.text_t, "");
    }
    if (all || contains(SolrField.wordcount_i)) {
        addSolr(doc, SolrField.wordcount_i, md.wordCount());
    }
    if (all || contains(SolrField.keywords)) {
        String subject = md.dc_subject();
        final Bitfield categoryFlags = md.flags();
        if (categoryFlags.get(Condenser.flag_cat_indexof)) {
            if (subject == null || subject.isEmpty()) {
                subject = "indexof";
            } else {
                // follow the separator style that the keywords already use
                subject += (subject.indexOf(',') > 0) ? ", indexof" : " indexof";
            }
        }
        addSolr(doc, SolrField.keywords, subject);
    }

    // path elements of link
    final String urlPath = uri.getPath();
    if (urlPath != null && (all || contains(SolrField.paths_txt))) {
        final String[] segments = urlPath.split("/");
        if (segments.length > 0) {
            addSolr(doc, SolrField.paths_txt, segments);
        }
    }

    if (all || contains(SolrField.imagescount_i)) {
        addSolr(doc, SolrField.imagescount_i, md.limage());
    }
    if (all || contains(SolrField.inboundlinkscount_i)) {
        addSolr(doc, SolrField.inboundlinkscount_i, md.llocal());
    }
    if (all || contains(SolrField.outboundlinkscount_i)) {
        addSolr(doc, SolrField.outboundlinkscount_i, md.lother());
    }
    if (all || contains(SolrField.charset_s)) {
        addSolr(doc, SolrField.charset_s, "UTF8");
    }

    // coordinates
    if (md.lat() != 0.0f && md.lon() != 0.0f) {
        if (all || contains(SolrField.lon_coordinate)) {
            addSolr(doc, SolrField.lon_coordinate, md.lon());
        }
        if (all || contains(SolrField.lat_coordinate)) {
            addSolr(doc, SolrField.lat_coordinate, md.lat());
        }
    }
    if (all || contains(SolrField.httpstatus_i)) {
        addSolr(doc, SolrField.httpstatus_i, 200);
    }

    // fields that are in URIMetadataRow additional to yacy2solr basic requirement
    if (all || contains(SolrField.load_date_dt)) {
        addSolr(doc, SolrField.load_date_dt, md.loaddate());
    }
    if (all || contains(SolrField.fresh_date_dt)) {
        addSolr(doc, SolrField.fresh_date_dt, md.freshdate());
    }
    if (all || contains(SolrField.host_id_s)) {
        addSolr(doc, SolrField.host_id_s, md.hosthash());
    }
    if ((all || contains(SolrField.referrer_id_ss)) && md.referrerHash() != null) {
        addSolr(doc, SolrField.referrer_id_ss, new String[]{ASCII.String(md.referrerHash())});
    }
    if (all || contains(SolrField.md5_s)) {
        addSolr(doc, SolrField.md5_s, md.md5());
    }
    if (all || contains(SolrField.publisher_t)) {
        addSolr(doc, SolrField.publisher_t, md.dc_publisher());
    }
    if ((all || contains(SolrField.language_ss)) && md.language() != null) {
        addSolr(doc, SolrField.language_ss, new String[]{UTF8.String(md.language())});
    }
    if (all || contains(SolrField.ranking_i)) {
        addSolr(doc, SolrField.ranking_i, md.ranking());
    }
    if (all || contains(SolrField.size_i)) {
        addSolr(doc, SolrField.size_i, md.size());
    }
    if (all || contains(SolrField.audiolinkscount_i)) {
        addSolr(doc, SolrField.audiolinkscount_i, md.laudio());
    }
    if (all || contains(SolrField.videolinkscount_i)) {
        addSolr(doc, SolrField.videolinkscount_i, md.lvideo());
    }
    if (all || contains(SolrField.applinkscount_i)) {
        addSolr(doc, SolrField.applinkscount_i, md.lapp());
    }

    return doc;
}
public SolrDoc yacy2solr(final String id, final ResponseHeader header, final Document yacydoc) {
public SolrDoc yacy2solr(final String id, final ResponseHeader header, final Document yacydoc, final URIMetadata metadata) {
// we use the SolrCell design as index scheme
final SolrDoc solrdoc = new SolrDoc();
final DigestURI digestURI = new DigestURI(yacydoc.dc_source());
addSolr(solrdoc, SolrField.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before)
boolean allAttr = this.isEmpty();
addSolr(solrdoc, SolrField.id, id);
addSolr(solrdoc, SolrField.sku, digestURI.toNormalform(true, false));
final InetAddress address = digestURI.getInetAddress();
if (address != null) addSolr(solrdoc, SolrField.ip_s, address.getHostAddress());
if (allAttr || contains(SolrField.failreason_t)) addSolr(solrdoc, SolrField.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before)
if (allAttr || contains(SolrField.ip_s)) {
final InetAddress address = digestURI.getInetAddress();
if (address != null) addSolr(solrdoc, SolrField.ip_s, address.getHostAddress());
}
if (digestURI.getHost() != null) addSolr(solrdoc, SolrField.host_s, digestURI.getHost());
addSolr(solrdoc, SolrField.title, yacydoc.dc_title());
addSolr(solrdoc, SolrField.author, yacydoc.dc_creator());
addSolr(solrdoc, SolrField.description, yacydoc.dc_description());
addSolr(solrdoc, SolrField.content_type, yacydoc.dc_format());
addSolr(solrdoc, SolrField.last_modified, header == null ? new Date() : header.lastModified());
addSolr(solrdoc, SolrField.keywords, yacydoc.dc_subject(' '));
if (allAttr || contains(SolrField.title)) addSolr(solrdoc, SolrField.title, yacydoc.dc_title());
if (allAttr || contains(SolrField.author)) addSolr(solrdoc, SolrField.author, yacydoc.dc_creator());
if (allAttr || contains(SolrField.description)) addSolr(solrdoc, SolrField.description, yacydoc.dc_description());
if (allAttr || contains(SolrField.content_type)) addSolr(solrdoc, SolrField.content_type, yacydoc.dc_format());
if (allAttr || contains(SolrField.last_modified)) addSolr(solrdoc, SolrField.last_modified, header == null ? new Date() : header.lastModified());
if (allAttr || contains(SolrField.keywords)) addSolr(solrdoc, SolrField.keywords, yacydoc.dc_subject(' '));
final String content = yacydoc.getTextString();
addSolr(solrdoc, SolrField.text_t, content);
if (isEmpty() || contains(SolrField.wordcount_i.name())) {
if (allAttr || contains(SolrField.text_t)) addSolr(solrdoc, SolrField.text_t, content);
if (allAttr || contains(SolrField.wordcount_i)) {
final int contentwc = content.split(" ").length;
addSolr(solrdoc, SolrField.wordcount_i, contentwc);
}
// path elements of link
final String path = digestURI.getPath();
if (path != null && (isEmpty() || contains(SolrField.paths_txt.name()))) {
if (path != null && (allAttr || contains(SolrField.paths_txt))) {
final String[] paths = path.split("/");
if (paths.length > 0) addSolr(solrdoc, SolrField.paths_txt, paths);
}
@ -250,7 +342,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
addSolr(solrdoc, SolrField.boldcount_i, bold.length);
if (bold.length > 0) {
addSolr(solrdoc, SolrField.bold_txt, bold);
if (isEmpty() || contains(SolrField.bold_val.name())) {
if (allAttr || contains(SolrField.bold_val)) {
addSolr(solrdoc, SolrField.bold_val, html.getBoldCount(bold));
}
}
@ -258,7 +350,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
addSolr(solrdoc, SolrField.italiccount_i, italic.length);
if (italic.length > 0) {
addSolr(solrdoc, SolrField.italic_txt, italic);
if (isEmpty() || contains(SolrField.italic_val.name())) {
if (allAttr || contains(SolrField.italic_val)) {
addSolr(solrdoc, SolrField.italic_val, html.getItalicCount(italic));
}
}
@ -282,14 +374,14 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
imgstubs.add(uri.toString().substring(protocol.length() + 3));
imgalts.add(ie.alt());
}
addSolr(solrdoc, SolrField.imagescount_i, imgtags.size());
if (isEmpty() || contains(SolrField.images_tag_txt.name())) addSolr(solrdoc, SolrField.images_tag_txt, imgtags);
if (isEmpty() || contains(SolrField.images_protocol_txt.name())) addSolr(solrdoc, SolrField.images_protocol_txt, protocolList2indexedList(imgprots));
if (isEmpty() || contains(SolrField.images_urlstub_txt.name())) addSolr(solrdoc, SolrField.images_urlstub_txt, imgstubs);
if (isEmpty() || contains(SolrField.images_alt_txt.name())) addSolr(solrdoc, SolrField.images_alt_txt, imgalts);
if (allAttr || contains(SolrField.imagescount_i)) addSolr(solrdoc, SolrField.imagescount_i, imgtags.size());
if (allAttr || contains(SolrField.images_tag_txt)) addSolr(solrdoc, SolrField.images_tag_txt, imgtags);
if (allAttr || contains(SolrField.images_protocol_txt)) addSolr(solrdoc, SolrField.images_protocol_txt, protocolList2indexedList(imgprots));
if (allAttr || contains(SolrField.images_urlstub_txt)) addSolr(solrdoc, SolrField.images_urlstub_txt, imgstubs);
if (allAttr || contains(SolrField.images_alt_txt)) addSolr(solrdoc, SolrField.images_alt_txt, imgalts);
// style sheets
if (isEmpty() || contains(SolrField.css_tag_txt.name())) {
if (allAttr || contains(SolrField.css_tag_txt)) {
final Map<MultiProtocolURI, String> csss = html.getCSS();
final String[] css_tag = new String[csss.size()];
final String[] css_url = new String[csss.size()];
@ -310,7 +402,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
}
// Scripts
if (isEmpty() || contains(SolrField.scripts_txt.name())) {
if (allAttr || contains(SolrField.scripts_txt)) {
final Set<MultiProtocolURI> scriptss = html.getScript();
final String[] scripts = new String[scriptss.size()];
c = 0;
@ -324,7 +416,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
}
// Frames
if (isEmpty() || contains(SolrField.frames_txt.name())) {
if (allAttr || contains(SolrField.frames_txt)) {
final Set<MultiProtocolURI> framess = html.getFrames();
final String[] frames = new String[framess.size()];
c = 0;
@ -338,7 +430,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
}
// IFrames
if (isEmpty() || contains(SolrField.iframes_txt.name())) {
if (allAttr || contains(SolrField.iframes_txt)) {
final Set<MultiProtocolURI> iframess = html.getIFrames();
final String[] iframes = new String[iframess.size()];
c = 0;
@ -352,7 +444,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
}
// canonical tag
if (isEmpty() || contains(SolrField.canonical_s.name())) {
if (allAttr || contains(SolrField.canonical_s)) {
final MultiProtocolURI canonical = html.getCanonical();
if (canonical != null) {
inboundLinks.remove(canonical);
@ -362,7 +454,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
}
// meta refresh tag
if (isEmpty() || contains(SolrField.refresh_s.name())) {
if (allAttr || contains(SolrField.refresh_s)) {
String refresh = html.getRefreshPath();
if (refresh != null && refresh.length() > 0) {
MultiProtocolURI refreshURL;
@ -380,7 +472,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
}
// flash embedded
if (isEmpty() || contains(SolrField.flash_b.name())) {
if (allAttr || contains(SolrField.flash_b)) {
MultiProtocolURI[] flashURLs = html.getFlash();
for (MultiProtocolURI u: flashURLs) {
// remove all flash links from ibound/outbound links
@ -392,7 +484,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
// generic evaluation pattern
for (final String model: html.getEvaluationModelNames()) {
if (isEmpty() || contains("ext_" + model + "_txt")) {
if (allAttr || contains("ext_" + model + "_txt")) {
final String[] scorenames = html.getEvaluationModelScoreNames(model);
if (scorenames.length > 0) {
addSolr(solrdoc, SolrField.valueOf("ext_" + model + "_txt"), scorenames);
@ -408,8 +500,8 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
// list all links
final Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors();
c = 0;
if (isEmpty() || contains(SolrField.inboundlinkscount_i.name())) addSolr(solrdoc, SolrField.inboundlinkscount_i, inboundLinks.size());
if (isEmpty() || contains(SolrField.inboundlinksnofollowcount_i.name())) addSolr(solrdoc, SolrField.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount());
if (allAttr || contains(SolrField.inboundlinkscount_i)) addSolr(solrdoc, SolrField.inboundlinkscount_i, inboundLinks.size());
if (allAttr || contains(SolrField.inboundlinksnofollowcount_i)) addSolr(solrdoc, SolrField.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount());
final List<String> inboundlinksTag = new ArrayList<String>(inboundLinks.size());
final List<String> inboundlinksURLProtocol = new ArrayList<String>(inboundLinks.size());
final List<String> inboundlinksURLStub = new ArrayList<String>(inboundLinks.size());
@ -437,17 +529,17 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
((text.length() > 0) ? text : "") + "</a>");
c++;
}
if (isEmpty() || contains(SolrField.inboundlinks_tag_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_tag_txt, inboundlinksTag);
if (isEmpty() || contains(SolrField.inboundlinks_protocol_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_protocol_txt, protocolList2indexedList(inboundlinksURLProtocol));
if (isEmpty() || contains(SolrField.inboundlinks_urlstub_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_urlstub_txt, inboundlinksURLStub);
if (isEmpty() || contains(SolrField.inboundlinks_name_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_name_txt, inboundlinksName);
if (isEmpty() || contains(SolrField.inboundlinks_rel_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_rel_txt, inboundlinksRel);
if (isEmpty() || contains(SolrField.inboundlinks_relflags_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_relflags_txt, relEval(inboundlinksRel));
if (isEmpty() || contains(SolrField.inboundlinks_text_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_text_txt, inboundlinksText);
if (allAttr || contains(SolrField.inboundlinks_tag_txt)) addSolr(solrdoc, SolrField.inboundlinks_tag_txt, inboundlinksTag);
if (allAttr || contains(SolrField.inboundlinks_protocol_txt)) addSolr(solrdoc, SolrField.inboundlinks_protocol_txt, protocolList2indexedList(inboundlinksURLProtocol));
if (allAttr || contains(SolrField.inboundlinks_urlstub_txt)) addSolr(solrdoc, SolrField.inboundlinks_urlstub_txt, inboundlinksURLStub);
if (allAttr || contains(SolrField.inboundlinks_name_txt)) addSolr(solrdoc, SolrField.inboundlinks_name_txt, inboundlinksName);
if (allAttr || contains(SolrField.inboundlinks_rel_txt)) addSolr(solrdoc, SolrField.inboundlinks_rel_txt, inboundlinksRel);
if (allAttr || contains(SolrField.inboundlinks_relflags_txt)) addSolr(solrdoc, SolrField.inboundlinks_relflags_txt, relEval(inboundlinksRel));
if (allAttr || contains(SolrField.inboundlinks_text_txt)) addSolr(solrdoc, SolrField.inboundlinks_text_txt, inboundlinksText);
c = 0;
if (isEmpty() || contains(SolrField.outboundlinkscount_i.name())) addSolr(solrdoc, SolrField.outboundlinkscount_i, ouboundLinks.size());
if (isEmpty() || contains(SolrField.outboundlinksnofollowcount_i.name())) addSolr(solrdoc, SolrField.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount());
if (allAttr || contains(SolrField.outboundlinkscount_i)) addSolr(solrdoc, SolrField.outboundlinkscount_i, ouboundLinks.size());
if (allAttr || contains(SolrField.outboundlinksnofollowcount_i)) addSolr(solrdoc, SolrField.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount());
final List<String> outboundlinksTag = new ArrayList<String>(ouboundLinks.size());
final List<String> outboundlinksURLProtocol = new ArrayList<String>(ouboundLinks.size());
final List<String> outboundlinksURLStub = new ArrayList<String>(ouboundLinks.size());
@ -475,24 +567,38 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
((text.length() > 0) ? text : "") + "</a>");
c++;
}
if (isEmpty() || contains(SolrField.outboundlinks_tag_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_tag_txt, outboundlinksTag);
if (isEmpty() || contains(SolrField.outboundlinks_protocol_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_protocol_txt, protocolList2indexedList(outboundlinksURLProtocol));
if (isEmpty() || contains(SolrField.outboundlinks_urlstub_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_urlstub_txt, outboundlinksURLStub);
if (isEmpty() || contains(SolrField.outboundlinks_name_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_name_txt, outboundlinksName);
if (isEmpty() || contains(SolrField.outboundlinks_rel_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_rel_txt, outboundlinksRel);
if (isEmpty() || contains(SolrField.outboundlinks_relflags_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_relflags_txt, relEval(inboundlinksRel));
if (isEmpty() || contains(SolrField.outboundlinks_text_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_text_txt, outboundlinksText);
// write the collected outbound link attributes; every field is only added when either
// no field selection is configured (allAttr) or the field is part of the selection
if (allAttr || contains(SolrField.outboundlinks_tag_txt)) addSolr(solrdoc, SolrField.outboundlinks_tag_txt, outboundlinksTag);
if (allAttr || contains(SolrField.outboundlinks_protocol_txt)) addSolr(solrdoc, SolrField.outboundlinks_protocol_txt, protocolList2indexedList(outboundlinksURLProtocol));
if (allAttr || contains(SolrField.outboundlinks_urlstub_txt)) addSolr(solrdoc, SolrField.outboundlinks_urlstub_txt, outboundlinksURLStub);
if (allAttr || contains(SolrField.outboundlinks_name_txt)) addSolr(solrdoc, SolrField.outboundlinks_name_txt, outboundlinksName);
if (allAttr || contains(SolrField.outboundlinks_rel_txt)) addSolr(solrdoc, SolrField.outboundlinks_rel_txt, outboundlinksRel);
// the rel flags of outbound links must be evaluated from the outbound rel values;
// evaluating inboundlinksRel here would store the inbound flags a second time
if (allAttr || contains(SolrField.outboundlinks_relflags_txt)) addSolr(solrdoc, SolrField.outboundlinks_relflags_txt, relEval(outboundlinksRel));
if (allAttr || contains(SolrField.outboundlinks_text_txt)) addSolr(solrdoc, SolrField.outboundlinks_text_txt, outboundlinksText);
// charset
addSolr(solrdoc, SolrField.charset_s, yacydoc.getCharset());
if (allAttr || contains(SolrField.charset_s)) addSolr(solrdoc, SolrField.charset_s, yacydoc.getCharset());
// coordinates
if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
addSolr(solrdoc, SolrField.lon_coordinate, yacydoc.lon());
addSolr(solrdoc, SolrField.lat_coordinate, yacydoc.lat());
if (allAttr || contains(SolrField.lon_coordinate)) addSolr(solrdoc, SolrField.lon_coordinate, yacydoc.lon());
if (allAttr || contains(SolrField.lat_coordinate)) addSolr(solrdoc, SolrField.lat_coordinate, yacydoc.lat());
}
addSolr(solrdoc, SolrField.httpstatus_i, header == null ? 200 : header.getStatusCode());
if (allAttr || contains(SolrField.httpstatus_i)) addSolr(solrdoc, SolrField.httpstatus_i, header == null ? 200 : header.getStatusCode());
// fields that are additionally in URIMetadataRow
if (allAttr || contains(SolrField.load_date_dt)) addSolr(solrdoc, SolrField.load_date_dt, metadata.loaddate());
if (allAttr || contains(SolrField.fresh_date_dt)) addSolr(solrdoc, SolrField.fresh_date_dt, metadata.freshdate());
if (allAttr || contains(SolrField.host_id_s)) addSolr(solrdoc, SolrField.host_id_s, metadata.hosthash());
if ((allAttr || contains(SolrField.referrer_id_ss)) && metadata.referrerHash() != null) addSolr(solrdoc, SolrField.referrer_id_ss, new String[]{ASCII.String(metadata.referrerHash())});
//if (allAttr || contains(SolrField.md5_s)) addSolr(solrdoc, SolrField.md5_s, new byte[0]);
if (allAttr || contains(SolrField.publisher_t)) addSolr(solrdoc, SolrField.publisher_t, yacydoc.dc_publisher());
if ((allAttr || contains(SolrField.language_ss)) && metadata.language() != null) addSolr(solrdoc, SolrField.language_ss,new String[]{UTF8.String(metadata.language())});
if (allAttr || contains(SolrField.ranking_i)) addSolr(solrdoc, SolrField.ranking_i, metadata.ranking());
if (allAttr || contains(SolrField.size_i)) addSolr(solrdoc, SolrField.size_i, metadata.size());
if (allAttr || contains(SolrField.audiolinkscount_i)) addSolr(solrdoc, SolrField.audiolinkscount_i, yacydoc.getAudiolinks().size());
if (allAttr || contains(SolrField.videolinkscount_i)) addSolr(solrdoc, SolrField.videolinkscount_i, yacydoc.getVideolinks().size());
if (allAttr || contains(SolrField.applinkscount_i)) addSolr(solrdoc, SolrField.applinkscount_i, yacydoc.getApplinks().size());
return solrdoc;
}

@ -120,7 +120,21 @@ public enum SolrField implements net.yacy.cora.services.federated.solr.SolrField
ext_tracker_val(SolrType.integer, true, true, true, "number of attribute counts in ext_tracker_txt"),
ext_title_txt(SolrType.text_general, true, true, true, "names matching title expressions"),
ext_title_val(SolrType.integer, true, true, true, "number of matching title expressions"),
failreason_t(SolrType.text_general, true, true, "fail reason if a page was not loaded. if the page was loaded then this field is empty");
failreason_t(SolrType.text_general, true, true, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
// values used additionally by URIMetadataRow; the trailing comment of each entry names the
// URIMetadataRow accessor that the value is taken from
load_date_dt(SolrType.date, true, true, "time when resource was loaded"),
fresh_date_dt(SolrType.date, true, true, "date until resource shall be considered as fresh"),
host_id_s(SolrType.string, true, true, "id of the host, a 6-byte hash that is part of the document id"),// String hosthash();
referrer_id_ss(SolrType.string, true, true, true, "ids of referrer to this document"),// byte[] referrerHash();
md5_s(SolrType.string, true, true, "the md5 of the raw source"),// String md5();
publisher_t(SolrType.text_general, true, true, "the name of the publisher of the document"),// String dc_publisher();
// multi-valued like referrer_id_ss: the value is written as a String[] and may list several languages
language_ss(SolrType.string, true, true, true, "the language used in the document; starts with primary language"),// byte[] language();
ranking_i(SolrType.integer, true, true, "an external ranking value"),// long ranking();
size_i(SolrType.integer, true, true, "the size of the raw source"),// int size();
audiolinkscount_i(SolrType.integer, true, true, "number of links to audio resources"),// int laudio();
videolinkscount_i(SolrType.integer, true, true, "number of links to video resources"),// int lvideo();
applinkscount_i(SolrType.integer, true, true, "number of links to application resources");// int lapp();
private String solrFieldName = null; // solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() )
private final SolrType type;

Loading…
Cancel
Save