Added more Solr fields to integrate values from URIMetadataRow.

All writes to the Metadata-DB are now also made to Solr. This includes
metadata transfer during search and RWI transfer.

The new/added solr fields are:

## time when resource was loaded
load_date_dt

## date until resource shall be considered as fresh
fresh_date_dt

## id of the host, a 6-byte hash that is part of the document id
host_id_s

## ids of referrer to this document
referrer_id_ss

## the md5 of the raw source
md5_s

## the name of the publisher of the document
publisher_t

## the language used in the document; starts with primary language
language_ss

## an external ranking value
ranking_i

## the size of the raw source
size_i

## number of links to audio resources
audiolinkscount_i

## number of links to video resources
videolinkscount_i

## number of links to application resources
applinkscount_i
pull/1/head
orbiter 13 years ago
parent e432bb9cd9
commit d9173ba7ed

@ -267,3 +267,45 @@ failreason_t
## response time of target server in milliseconds, int ## response time of target server in milliseconds, int
responsetime_i responsetime_i
### values used additionally by URIMetadataRow, part of the index transfer process
## time when resource was loaded
load_date_dt
## date until resource shall be considered as fresh
fresh_date_dt
## id of the host, a 6-byte hash that is part of the document id
host_id_s
## ids of referrer to this document
referrer_id_ss
## the md5 of the raw source
md5_s
## the name of the publisher of the document
publisher_t
## the language used in the document; starts with primary language
language_ss
## an external ranking value
ranking_i
## the size of the raw source
size_i
## number of links to audio resources
audiolinkscount_i
## number of links to video resources
videolinkscount_i
## number of links to application resources
applinkscount_i
## index creation comment
process_s

@ -197,17 +197,6 @@ public class IndexFederated_p {
if (field.getComment() != null) prop.putHTML("scheme_" + c + "_comment",field.getComment()); if (field.getComment() != null) prop.putHTML("scheme_" + c + "_comment",field.getComment());
c++; c++;
} }
/* final Iterator<ConfigurationSet.Entry> i = sb.solrScheme.entryIterator();
ConfigurationSet.Entry entry;
while (i.hasNext()) {
entry = i.next();
prop.put("scheme_" + c + "_dark", dark ? 1 : 0); dark = !dark;
prop.put("scheme_" + c + "_checked", entry.enabled() ? 1 : 0);
prop.putHTML("scheme_" + c + "_key", entry.key());
prop.putHTML("scheme_" + c + "_solrfieldname",entry.getValue() == null ? "" : entry.getValue());
if (entry.getComment() != null) prop.putHTML("scheme_" + c + "_comment",entry.getComment());
c++;
}*/
prop.put("scheme", c); prop.put("scheme", c);
// fill attribute fields // fill attribute fields

@ -147,6 +147,9 @@ public final class crawlReceipt {
if ("fill".equals(result)) try { if ("fill".equals(result)) try {
// put new entry into database // put new entry into database
sb.index.urlMetadata().store(entry); sb.index.urlMetadata().store(entry);
if (!sb.index.urlMetadata().getSolr().exists(ASCII.String(entry.url().hash()))) {
sb.index.urlMetadata().getSolr().add(sb.index.urlMetadata().getSolrScheme().metadata2solr(entry));
}
ResultURLs.stack(entry, youare.getBytes(), iam.getBytes(), EventOrigin.REMOTE_RECEIPTS); ResultURLs.stack(entry, youare.getBytes(), iam.getBytes(), EventOrigin.REMOTE_RECEIPTS);
sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work has been done sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work has been done
if (log.isInfo()) log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + ASCII.String(entry.hash()) + ":" + entry.url().toNormalform(false, true)); if (log.isInfo()) log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + ASCII.String(entry.hash()) + ":" + entry.url().toNormalform(false, true));

@ -30,6 +30,7 @@ import java.io.IOException;
import java.text.ParseException; import java.text.ParseException;
import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.RSSMessage; import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadataRow;
@ -141,6 +142,9 @@ public final class transferURL {
if (Network.log.isFine()) Network.log.logFine("Accepting URL " + i + "/" + urlc + " from peer " + otherPeerName + ": " + lEntry.url().toNormalform(true, false)); if (Network.log.isFine()) Network.log.logFine("Accepting URL " + i + "/" + urlc + " from peer " + otherPeerName + ": " + lEntry.url().toNormalform(true, false));
try { try {
sb.index.urlMetadata().store(lEntry); sb.index.urlMetadata().store(lEntry);
if (!sb.index.urlMetadata().getSolr().exists(ASCII.String(lEntry.url().hash()))) {
sb.index.urlMetadata().getSolr().add(sb.index.urlMetadata().getSolrScheme().metadata2solr(lEntry));
}
ResultURLs.stack(lEntry, iam.getBytes(), iam.getBytes(), EventOrigin.DHT_TRANSFER); ResultURLs.stack(lEntry, iam.getBytes(), iam.getBytes(), EventOrigin.DHT_TRANSFER);
if (Network.log.isFine()) Network.log.logFine("transferURL: received URL '" + lEntry.url().toNormalform(false, true) + "' from peer " + otherPeerName); if (Network.log.isFine()) Network.log.logFine("transferURL: received URL '" + lEntry.url().toNormalform(false, true) + "' from peer " + otherPeerName);
received++; received++;

@ -37,6 +37,7 @@ import net.yacy.cora.document.UTF8;
import net.yacy.cora.sorting.ClusteredScoreMap; import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ScoreMap; import net.yacy.cora.sorting.ScoreMap;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Bitfield; import net.yacy.kelondro.order.Bitfield;
@ -96,7 +97,7 @@ public final class ResultURLs {
} }
public static void stack( public static void stack(
final URIMetadataRow e, final URIMetadata e,
final byte[] initiatorHash, final byte[] initiatorHash,
final byte[] executorHash, final byte[] executorHash,
final EventOrigin stackType) { final EventOrigin stackType) {

@ -70,29 +70,28 @@ public class Response {
// doctype calculation // doctype calculation
public static char docType(final MultiProtocolURI url) { public static char docType(final MultiProtocolURI url) {
final String path = url.getPath().toLowerCase(); String ext = url.getFileExtension();
// serverLog.logFinest("PLASMA", "docType URL=" + path); if (ext == null) return DT_UNKNOWN;
char doctype = DT_UNKNOWN; if (ext.equals(".gif")) return DT_IMAGE;
if (path.endsWith(".gif")) { doctype = DT_IMAGE; } if (ext.equals(".ico")) return DT_IMAGE;
else if (path.endsWith(".ico")) { doctype = DT_IMAGE; } if (ext.equals(".bmp")) return DT_IMAGE;
else if (path.endsWith(".bmp")) { doctype = DT_IMAGE; } if (ext.equals(".jpg")) return DT_IMAGE;
else if (path.endsWith(".jpg")) { doctype = DT_IMAGE; } if (ext.equals(".jpeg")) return DT_IMAGE;
else if (path.endsWith(".jpeg")) { doctype = DT_IMAGE; } if (ext.equals(".png")) return DT_IMAGE;
else if (path.endsWith(".png")) { doctype = DT_IMAGE; } if (ext.equals(".html")) return DT_HTML;
else if (path.endsWith(".html")) { doctype = DT_HTML; } if (ext.equals(".txt")) return DT_TEXT;
else if (path.endsWith(".txt")) { doctype = DT_TEXT; } if (ext.equals(".doc")) return DT_DOC;
else if (path.endsWith(".doc")) { doctype = DT_DOC; } if (ext.equals(".rtf")) return DT_DOC;
else if (path.endsWith(".rtf")) { doctype = DT_DOC; } if (ext.equals(".pdf")) return DT_PDFPS;
else if (path.endsWith(".pdf")) { doctype = DT_PDFPS; } if (ext.equals(".ps")) return DT_PDFPS;
else if (path.endsWith(".ps")) { doctype = DT_PDFPS; } if (ext.equals(".avi")) return DT_MOVIE;
else if (path.endsWith(".avi")) { doctype = DT_MOVIE; } if (ext.equals(".mov")) return DT_MOVIE;
else if (path.endsWith(".mov")) { doctype = DT_MOVIE; } if (ext.equals(".qt")) return DT_MOVIE;
else if (path.endsWith(".qt")) { doctype = DT_MOVIE; } if (ext.equals(".mpg")) return DT_MOVIE;
else if (path.endsWith(".mpg")) { doctype = DT_MOVIE; } if (ext.equals(".md5")) return DT_SHARE;
else if (path.endsWith(".md5")) { doctype = DT_SHARE; } if (ext.equals(".mpeg")) return DT_MOVIE;
else if (path.endsWith(".mpeg")) { doctype = DT_MOVIE; } if (ext.equals(".asf")) return DT_FLASH;
else if (path.endsWith(".asf")) { doctype = DT_FLASH; } return DT_UNKNOWN;
return doctype;
} }
public static char docType(final String mime) { public static char docType(final String mime) {
@ -115,30 +114,20 @@ public class Response {
else if (mime.startsWith("image/")) doctype = DT_IMAGE; else if (mime.startsWith("image/")) doctype = DT_IMAGE;
else if (mime.startsWith("audio/")) doctype = DT_AUDIO; else if (mime.startsWith("audio/")) doctype = DT_AUDIO;
else if (mime.startsWith("video/")) doctype = DT_MOVIE; else if (mime.startsWith("video/")) doctype = DT_MOVIE;
//bz2 = application/x-bzip2
//dvi = application/x-dvi
//gz = application/gzip
//hqx = application/mac-binhex40
//lha = application/x-lzh
//lzh = application/x-lzh
//pac = application/x-ns-proxy-autoconfig
//php = application/x-httpd-php
//phtml = application/x-httpd-php
//rss = application/xml
//tar = application/tar
//tex = application/x-tex
//tgz = application/tar
//torrent = application/x-bittorrent
//xhtml = application/xhtml+xml
//xla = application/msexcel
//xls = application/msexcel
//xsl = application/xml
//xml = application/xml
//Z = application/x-compress
//zip = application/zip
return doctype; return doctype;
} }
/**
 * Builds a MIME type string for a file extension, adjusted by the doctype.
 * <p>
 * The MIME type is first looked up with {@code Classification.ext2mime(ext)}.
 * If it contains a '/', the part before the slash is replaced by
 * "text", "image", "audio" or "video" when the doctype is DT_TEXT, DT_IMAGE,
 * DT_AUDIO or DT_MOVIE respectively. For any other doctype, or if the MIME
 * type has no '/', the looked-up MIME type is returned unchanged.
 *
 * @param ext the file extension handed to the MIME lookup
 * @param doctype one of the DT_* doctype characters
 * @return the (possibly adjusted) MIME type
 */
public static String doctype2mime(String ext, char doctype) {
    final String looked = Classification.ext2mime(ext);
    final int slash = looked.indexOf('/');
    if (slash < 0) {
        return looked;
    }

    // pick the replacement for the top-level type; unknown doctypes keep the lookup result
    final String major;
    if (doctype == DT_TEXT) {
        major = "text";
    } else if (doctype == DT_IMAGE) {
        major = "image";
    } else if (doctype == DT_AUDIO) {
        major = "audio";
    } else if (doctype == DT_MOVIE) {
        major = "video";
    } else {
        return looked;
    }

    // substring(slash) keeps the '/' so the result is "<major>/<subtype>"
    return major + looked.substring(slash);
}
public static final int QUEUE_STATE_FRESH = 0; public static final int QUEUE_STATE_FRESH = 0;
public static final int QUEUE_STATE_PARSING = 1; public static final int QUEUE_STATE_PARSING = 1;
public static final int QUEUE_STATE_CONDENSING = 2; public static final int QUEUE_STATE_CONDENSING = 2;

@ -52,6 +52,10 @@ public class SolrDoc extends SolrInputDocument {
this.setField(key.getSolrFieldName(), value); this.setField(key.getSolrFieldName(), value);
} }
/**
 * Sets the Solr field addressed by {@code key} to a long value.
 *
 * @param key the field whose Solr field name is used
 * @param value the value to store in that field
 */
public final void addSolr(final SolrField key, final long value) {
    final String fieldName = key.getSolrFieldName();
    this.setField(fieldName, value);
}
public final void addSolr(final SolrField key, final String[] value) { public final void addSolr(final SolrField key, final String[] value) {
this.setField(key.getSolrFieldName(), value); this.setField(key.getSolrFieldName(), value);
} }

@ -260,11 +260,7 @@ public class ConfigurationSet extends TreeMap<String,Entry> implements Serializa
} }
writer.close(); writer.close();
} }
/*
public Iterator<String> iterator() {
return this.keySet().iterator();
}
*/
public Iterator<Entry> entryIterator() { public Iterator<Entry> entryIterator() {
return this.values().iterator(); return this.values().iterator();
} }

@ -56,54 +56,22 @@ public final class Condenser {
// category flags that show how the page can be distinguished in different interest groups // category flags that show how the page can be distinguished in different interest groups
public static final int flag_cat_indexof = 0; // a directory listing page (i.e. containing 'index of') public static final int flag_cat_indexof = 0; // a directory listing page (i.e. containing 'index of')
public static final int flag_cat_opencontent = 1; // open source, any free stuff
public static final int flag_cat_business = 2; // web shops, marketing, trade
public static final int flag_cat_stockfinance = 3; // stock exchange (quotes), finance, economy
public static final int flag_cat_health = 4; // health
public static final int flag_cat_sport = 5; // any sport, cars etc.
public static final int flag_cat_lifestyle = 6; // travel, lifestyle
public static final int flag_cat_politics = 7; // politics
public static final int flag_cat_news = 8; // blogs, news pages
public static final int flag_cat_children = 9; // toys, childrens education, help for parents
public static final int flag_cat_entertainment = 10; // boulevard, entertainment, cultural content
public static final int flag_cat_knowledge = 11; // science, school stuff, help for homework
public static final int flag_cat_computer = 12; // any computer related stuff, networks, operation systems
public static final int flag_cat_p2p = 13; // p2p support, file-sharing archives etc.
public static final int flag_cat_sex = 14; // sexual content
public static final int flag_cat_spam = 15; // pages that anybody would consider as not interesting
public static final int flag_cat_linux = 16; // pages about linux software
public static final int flag_cat_macos = 17; // pages about macintosh, apple computers and the mac os
public static final int flag_cat_windows = 18; // pages about windows os and software
public static final int flag_cat_haslocation = 19; // the page has a location metadata attached public static final int flag_cat_haslocation = 19; // the page has a location metadata attached
public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images
public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file
public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos
public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file
//private Properties analysis; //private Properties analysis;
private final Map<String, Word> words; // a string (the words) to (indexWord) - relation private final Map<String, Word> words; // a string (the words) to (indexWord) - relation
private final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging private final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging
//public int RESULT_NUMB_TEXT_BYTES = -1;
public int RESULT_NUMB_WORDS = -1; public int RESULT_NUMB_WORDS = -1;
public int RESULT_DIFF_WORDS = -1; public int RESULT_DIFF_WORDS = -1;
public int RESULT_NUMB_SENTENCES = -1; public int RESULT_NUMB_SENTENCES = -1;
public int RESULT_DIFF_SENTENCES = -1; public int RESULT_DIFF_SENTENCES = -1;
public Bitfield RESULT_FLAGS = new Bitfield(4); public Bitfield RESULT_FLAGS = new Bitfield(4);
private final Identificator languageIdentificator; private final Identificator languageIdentificator;
/*
private final static int numlength = 5;
private static final ThreadLocal <NumberFormat> intStringFormatter =
new ThreadLocal <NumberFormat>() {
@Override protected NumberFormat initialValue() {
NumberFormat n = NumberFormat.getIntegerInstance();
n.setMinimumIntegerDigits(numlength);
n.setMaximumIntegerDigits(numlength);
return n;
}
};
*/
public Condenser( public Condenser(
final Document document, final Document document,

@ -153,20 +153,13 @@ public class pdfParser extends AbstractParser implements Parser {
if (t.isAlive()) t.interrupt(); if (t.isAlive()) t.interrupt();
pdfDoc.close(); pdfDoc.close();
contentBytes = writer.getBytes(); // get final text before closing writer contentBytes = writer.getBytes(); // get final text before closing writer
} catch (final IOException e) { } catch (final Throwable e) {
// close the writer // close the writer
if (writer != null) try { writer.close(); } catch (final Exception ex) {} if (writer != null) try { writer.close(); } catch (final Exception ex) {}
try {pdfDoc.close();} catch (final IOException ee) {} try {pdfDoc.close();} catch (final Throwable ee) {}
//throw new Parser.Failure(e.getMessage(), location);
} catch (final NullPointerException e) {
// this exception appeared after the insertion of the jempbox-1.5.0.jar library
Log.logException(e);
// close the writer
if (writer != null) try { writer.close(); } catch (final Exception ex) {}
try {pdfDoc.close();} catch (final IOException ee) {}
//throw new Parser.Failure(e.getMessage(), location); //throw new Parser.Failure(e.getMessage(), location);
} finally { } finally {
try {pdfDoc.close();} catch (final IOException e) {} try {pdfDoc.close();} catch (final Throwable e) {}
writer.close(); writer.close();
} }

@ -357,7 +357,9 @@ public final class Protocol
if ( p < 0 ) { if ( p < 0 ) {
return -1; return -1;
} }
final String host = Domains.dnsResolve(address.substring(0, p)).getHostAddress(); InetAddress ia = Domains.dnsResolve(address.substring(0, p));
if (ia == null) continue;
final String host = ia.getHostAddress();
s = Seed.genRemoteSeed(seedStr, false, host); s = Seed.genRemoteSeed(seedStr, false, host);
} else { } else {
s = Seed.genRemoteSeed(seedStr, false, null); s = Seed.genRemoteSeed(seedStr, false, null);
@ -752,6 +754,9 @@ public final class Protocol
// passed all checks, store url // passed all checks, store url
try { try {
indexSegment.urlMetadata().store(urlEntry); indexSegment.urlMetadata().store(urlEntry);
if (!indexSegment.urlMetadata().getSolr().exists(ASCII.String(urlEntry.url().hash()))) {
indexSegment.urlMetadata().getSolr().add(indexSegment.urlMetadata().getSolrScheme().metadata2solr(urlEntry));
}
ResultURLs.stack( ResultURLs.stack(
urlEntry, urlEntry,
mySeed.hash.getBytes(), mySeed.hash.getBytes(),
@ -1081,7 +1086,7 @@ public final class Protocol
final String process, final String process,
final String result, final String result,
final String reason, final String reason,
final URIMetadataRow entry, final URIMetadata entry,
final String wordhashes) { final String wordhashes) {
assert (target != null); assert (target != null);
assert (mySeed != null); assert (mySeed != null);

@ -111,7 +111,6 @@ import net.yacy.gui.Tray;
import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.index.RowSpaceExceededException;
@ -2514,7 +2513,7 @@ public final class Switchboard extends serverSwitch
this.log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + url); this.log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + url);
// STORE WORD INDEX // STORE WORD INDEX
URIMetadataRow newEntry = null; URIMetadata newEntry = null;
try { try {
newEntry = newEntry =
this.index.storeDocument( this.index.storeDocument(
@ -2761,9 +2760,9 @@ public final class Switchboard extends serverSwitch
public class receiptSending implements Runnable public class receiptSending implements Runnable
{ {
private final Seed initiatorPeer; private final Seed initiatorPeer;
private final URIMetadataRow reference; private final URIMetadata reference;
public receiptSending(final Seed initiatorPeer, final URIMetadataRow reference) { public receiptSending(final Seed initiatorPeer, final URIMetadata reference) {
this.initiatorPeer = initiatorPeer; this.initiatorPeer = initiatorPeer;
this.reference = reference; this.reference = reference;
} }

@ -42,7 +42,6 @@ import net.yacy.document.LibraryProvider;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.search.query.QueryParams; import net.yacy.search.query.QueryParams;
import net.yacy.search.query.RWIProcess; import net.yacy.search.query.RWIProcess;
@ -54,8 +53,7 @@ import net.yacy.search.ranking.ReferenceOrder;
* *
* @author Michael Christen * @author Michael Christen
*/ */
public class DocumentIndex extends Segment public class DocumentIndex extends Segment {
{
private static final RankingProfile textRankingDefault = new RankingProfile(Classification.ContentDomain.TEXT); private static final RankingProfile textRankingDefault = new RankingProfile(Classification.ContentDomain.TEXT);
//private Bitfield zeroConstraint = new Bitfield(4); //private Bitfield zeroConstraint = new Bitfield(4);
@ -102,12 +100,12 @@ public class DocumentIndex extends Segment
@Override @Override
public void run() { public void run() {
DigestURI f; DigestURI f;
URIMetadataRow[] resultRows; URIMetadata[] resultRows;
try { try {
while ( (f = DocumentIndex.this.queue.take()) != poison ) { while ( (f = DocumentIndex.this.queue.take()) != poison ) {
try { try {
resultRows = add(f); resultRows = add(f);
for ( final URIMetadataRow resultRow : resultRows ) { for ( final URIMetadata resultRow : resultRows ) {
if ( DocumentIndex.this.callback != null ) { if ( DocumentIndex.this.callback != null ) {
if ( resultRow == null ) { if ( resultRow == null ) {
DocumentIndex.this.callback.fail(f, "result is null"); DocumentIndex.this.callback.fail(f, "result is null");
@ -139,7 +137,7 @@ public class DocumentIndex extends Segment
this.queue.clear(); this.queue.clear();
} }
private URIMetadataRow[] add(final DigestURI url) throws IOException { private URIMetadata[] add(final DigestURI url) throws IOException {
if ( url == null ) { if ( url == null ) {
throw new IOException("file = null"); throw new IOException("file = null");
} }
@ -162,7 +160,7 @@ public class DocumentIndex extends Segment
throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage()); throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
} }
//Document document = Document.mergeDocuments(url, null, documents); //Document document = Document.mergeDocuments(url, null, documents);
final URIMetadataRow[] rows = new URIMetadataRow[documents.length]; final URIMetadata[] rows = new URIMetadata[documents.length];
int c = 0; int c = 0;
for ( final Document document : documents ) { for ( final Document document : documents ) {
if (document == null) continue; if (document == null) continue;
@ -274,7 +272,7 @@ public class DocumentIndex extends Segment
public interface CallbackListener public interface CallbackListener
{ {
public void commit(DigestURI f, URIMetadataRow resultRow); public void commit(DigestURI f, URIMetadata resultRow);
public void fail(DigestURI f, String failReason); public void fail(DigestURI f, String failReason);
} }
@ -295,7 +293,7 @@ public class DocumentIndex extends Segment
System.out.println("using index files at " + segmentPath.getAbsolutePath()); System.out.println("using index files at " + segmentPath.getAbsolutePath());
final CallbackListener callback = new CallbackListener() { final CallbackListener callback = new CallbackListener() {
@Override @Override
public void commit(final DigestURI f, final URIMetadataRow resultRow) { public void commit(final DigestURI f, final URIMetadata resultRow) {
System.out.println("indexed: " + f.toString()); System.out.println("indexed: " + f.toString());
} }

@ -71,14 +71,16 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
private String tablename; private String tablename;
private ArrayList<HostStat> statsDump; private ArrayList<HostStat> statsDump;
private final DoubleSolrConnector solr; private final DoubleSolrConnector solr;
private final SolrConfiguration solrScheme;
public MetadataRepository(final File path) { public MetadataRepository(final File path, final SolrConfiguration solrScheme) {
this.location = path; this.location = path;
this.tablename = null; this.tablename = null;
this.urlIndexFile = null; this.urlIndexFile = null;
this.exportthread = null; // will have a export thread assigned if exporter is running this.exportthread = null; // will have a export thread assigned if exporter is running
this.statsDump = null; this.statsDump = null;
this.solr = new DoubleSolrConnector(); this.solr = new DoubleSolrConnector();
this.solrScheme = solrScheme;
} }
public boolean connectedUrlDb() { public boolean connectedUrlDb() {
@ -97,6 +99,10 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
this.urlIndexFile = null; this.urlIndexFile = null;
} }
/**
 * Returns the Solr scheme configuration that was handed to this
 * repository in its constructor.
 *
 * @return the solr scheme of this repository
 */
public SolrConfiguration getSolrScheme() {
return this.solrScheme;
}
public boolean connectedLocalSolr() { public boolean connectedLocalSolr() {
return this.solr.isConnected0(); return this.solr.isConnected0();
} }

@ -99,7 +99,6 @@ public class Segment {
private final Log log; private final Log log;
private final File segmentPath; private final File segmentPath;
private final SolrConfiguration solrScheme;
protected final MetadataRepository urlMetadata; protected final MetadataRepository urlMetadata;
protected IndexCell<WordReference> termIndex; protected IndexCell<WordReference> termIndex;
protected IndexCell<CitationReference> urlCitationIndex; protected IndexCell<CitationReference> urlCitationIndex;
@ -108,10 +107,9 @@ public class Segment {
log.logInfo("Initializing Segment '" + segmentPath + "."); log.logInfo("Initializing Segment '" + segmentPath + ".");
this.log = log; this.log = log;
this.segmentPath = segmentPath; this.segmentPath = segmentPath;
this.solrScheme = solrScheme;
// create LURL-db // create LURL-db
this.urlMetadata = new MetadataRepository(segmentPath); this.urlMetadata = new MetadataRepository(segmentPath, solrScheme);
} }
public boolean connectedRWI() { public boolean connectedRWI() {
@ -203,7 +201,7 @@ public class Segment {
} }
public SolrConfiguration getSolrScheme() { public SolrConfiguration getSolrScheme() {
return this.solrScheme; return this.urlMetadata.getSolrScheme();
} }
public SolrConnector getRemoteSolr() { public SolrConnector getRemoteSolr() {
@ -398,7 +396,7 @@ public class Segment {
return language; return language;
} }
public URIMetadataRow storeDocument( public URIMetadata storeDocument(
final DigestURI url, final DigestURI url,
final DigestURI referrerURL, final DigestURI referrerURL,
Date modDate, Date modDate,
@ -420,22 +418,10 @@ public class Segment {
final String urlNormalform = url.toNormalform(true, false); final String urlNormalform = url.toNormalform(true, false);
final String language = votedLanguage(url, urlNormalform, document, condenser); // identification of the language final String language = votedLanguage(url, urlNormalform, document, condenser); // identification of the language
// STORE TO SOLR
boolean localSolr = this.connectedLocalSolr();
boolean remoteSolr = this.connectedRemoteSolr();
if (localSolr || remoteSolr) {
try {
SolrDoc solrDoc = this.solrScheme.yacy2solr(id, responseHeader, document);
this.getSolr().add(solrDoc);
} catch ( final IOException e ) {
Log.logWarning("SOLR", "failed to send " + urlNormalform + " to solr: " + e.getMessage());
}
}
// STORE URL TO LOADED-URL-DB // STORE URL TO LOADED-URL-DB
if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; // TODO: compare with modTime from responseHeader if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; // TODO: compare with modTime from responseHeader
char docType = Response.docType(document.dc_format()); char docType = Response.docType(document.dc_format());
final URIMetadataRow newEntry = new URIMetadataRow( final URIMetadata metadata = new URIMetadataRow(
url, // URL url, // URL
dc_title, // document description dc_title, // document description
document.dc_creator(), // author document.dc_creator(), // author
@ -460,9 +446,21 @@ public class Segment {
document.getVideolinks().size(), // lvideo document.getVideolinks().size(), // lvideo
document.getApplinks().size() // lapp document.getApplinks().size() // lapp
); );
this.urlMetadata.store(newEntry); this.urlMetadata.store(metadata);
final long storageEndTime = System.currentTimeMillis(); final long storageEndTime = System.currentTimeMillis();
// STORE TO SOLR
boolean localSolr = this.connectedLocalSolr();
boolean remoteSolr = this.connectedRemoteSolr();
if (localSolr || remoteSolr) {
try {
SolrDoc solrDoc = this.urlMetadata.getSolrScheme().yacy2solr(id, responseHeader, document, metadata);
this.getSolr().add(solrDoc);
} catch ( final IOException e ) {
Log.logWarning("SOLR", "failed to send " + urlNormalform + " to solr: " + e.getMessage());
}
}
// STORE PAGE INDEX INTO WORD INDEX DB // STORE PAGE INDEX INTO WORD INDEX DB
int outlinksSame = document.inboundLinks().size(); int outlinksSame = document.inboundLinks().size();
int outlinksOther = document.outboundLinks().size(); int outlinksOther = document.outboundLinks().size();
@ -545,7 +543,7 @@ public class Segment {
} }
// finished // finished
return newEntry; return metadata;
} }
public void removeAllUrlReferences(final HandleSet urls, final LoaderDispatcher loader, final CacheStrategy cacheStrategy) { public void removeAllUrlReferences(final HandleSet urls, final LoaderDispatcher loader, final CacheStrategy cacheStrategy) {

@ -24,7 +24,6 @@
package net.yacy.search.index; package net.yacy.search.index;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
@ -41,18 +40,24 @@ import java.util.Set;
import net.yacy.cora.document.ASCII; import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.services.federated.solr.SolrDoc; import net.yacy.cora.services.federated.solr.SolrDoc;
import net.yacy.cora.storage.ConfigurationSet; import net.yacy.cora.storage.ConfigurationSet;
import net.yacy.document.Condenser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Bitfield;
import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocument;
import de.anomic.crawler.retrieval.Response;
public class SolrConfiguration extends ConfigurationSet implements Serializable { public class SolrConfiguration extends ConfigurationSet implements Serializable {
private static final long serialVersionUID=-499100932212840385L; private static final long serialVersionUID=-499100932212840385L;
@ -88,46 +93,63 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
it.remove(); it.remove();
} }
} }
// check consistency the other way: look if all enum constants in SolrField appear in the configuration file
for (SolrField field: SolrField.values()) {
if (this.get(field.name()) == null) {
Log.logWarning("SolrScheme", " solr scheme file " + configurationFile.getAbsolutePath() + " is missing declaration for '" + field.name() + "'");
}
}
this.lazy = lazy; this.lazy = lazy;
} }
/**
 * Tests whether the given field is listed in this configuration.
 * The lookup is done by the name of the enum constant.
 *
 * @param field the solr field to look up
 * @return the result of the name-based {@code contains} lookup
 */
private boolean contains(SolrField field) {
    final String fieldName = field.name();
    return this.contains(fieldName);
}
/**
 * Adds a byte-array value to the solr document; the bytes are converted with
 * {@code UTF8.String} and stored as a string.
 * The value is written only if the configuration is empty or contains the field.
 * In lazy mode, null and zero-length arrays are skipped.
 *
 * @param solrdoc the document that receives the field
 * @param key the field to write
 * @param value the raw bytes; may be null
 */
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final byte[] value) {
    if (!isEmpty() && !contains(key)) {
        return;
    }
    final boolean hasContent = value != null && value.length != 0;
    if (this.lazy && !hasContent) {
        return;
    }
    // In non-lazy mode a null array can reach this point. Do not hand it to the
    // decoder; pass a null string on instead, as the String overload does.
    final String decoded = (value == null) ? null : UTF8.String(value);
    solrdoc.addSolr(key, decoded);
}
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final String value) { protected void addSolr(final SolrDoc solrdoc, final SolrField key, final String value) {
if ((isEmpty() || contains(key.name())) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value); if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value);
} }
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final String value, final float boost) { protected void addSolr(final SolrDoc solrdoc, final SolrField key, final String value, final float boost) {
if ((isEmpty() || contains(key.name())) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value, boost); if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value, boost);
} }
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final Date value) { protected void addSolr(final SolrDoc solrdoc, final SolrField key, final Date value) {
if ((isEmpty() || contains(key.name())) && (!this.lazy || (value != null && value.getTime() > 0))) solrdoc.addSolr(key, value); if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.getTime() > 0))) solrdoc.addSolr(key, value);
} }
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final String[] value) { protected void addSolr(final SolrDoc solrdoc, final SolrField key, final String[] value) {
if ((isEmpty() || contains(key.name())) && (!this.lazy || (value != null && value.length > 0))) solrdoc.addSolr(key, value); if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.length > 0))) solrdoc.addSolr(key, value);
} }
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final List<String> value) { protected void addSolr(final SolrDoc solrdoc, final SolrField key, final List<String> value) {
if ((isEmpty() || contains(key.name())) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value); if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value);
} }
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final int value) { protected void addSolr(final SolrDoc solrdoc, final SolrField key, final int value) {
if ((isEmpty() || contains(key.name())) && (!this.lazy || value > 0)) solrdoc.addSolr(key, value); if ((isEmpty() || contains(key)) && (!this.lazy || value != 0)) solrdoc.addSolr(key, value);
}
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final long value) {
if ((isEmpty() || contains(key)) && (!this.lazy || value != 0)) solrdoc.addSolr(key, value);
} }
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final float value) { protected void addSolr(final SolrDoc solrdoc, final SolrField key, final float value) {
if (isEmpty() || contains(key.name())) solrdoc.addSolr(key, value); if ((isEmpty() || contains(key)) && (!this.lazy || value != 0.0f)) solrdoc.addSolr(key, value);
} }
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final double value) { protected void addSolr(final SolrDoc solrdoc, final SolrField key, final double value) {
if (isEmpty() || contains(key.name())) solrdoc.addSolr(key, value); if ((isEmpty() || contains(key)) && (!this.lazy || value != 0.0d)) solrdoc.addSolr(key, value);
} }
protected void addSolr(final SolrDoc solrdoc, final SolrField key, final boolean value) { protected void addSolr(final SolrDoc solrdoc, final SolrField key, final boolean value) {
if (isEmpty() || contains(key.name())) solrdoc.addSolr(key, value); if (isEmpty() || contains(key)) solrdoc.addSolr(key, value);
} }
/** /**
* save configuration to file and update enum SolrFields * save configuration to file and update enum SolrFields
* @throws IOException * @throws IOException
@ -149,32 +171,102 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
} catch (final IOException e) {} } catch (final IOException e) {}
} }
public SolrDoc yacy2solr(final String id, final ResponseHeader header, final Document yacydoc) { public SolrDoc metadata2solr(final URIMetadata md) {
final SolrDoc solrdoc = new SolrDoc();
final DigestURI digestURI = new DigestURI(md.url());
boolean allAttr = this.isEmpty();
if (allAttr || contains(SolrField.failreason_t)) addSolr(solrdoc, SolrField.failreason_t, "");
addSolr(solrdoc, SolrField.id, ASCII.String(md.hash()));
addSolr(solrdoc, SolrField.sku, digestURI.toNormalform(true, false));
if (allAttr || contains(SolrField.ip_s)) {
final InetAddress address = digestURI.getInetAddress();
if (address != null) addSolr(solrdoc, SolrField.ip_s, address.getHostAddress());
}
if (digestURI.getHost() != null) addSolr(solrdoc, SolrField.host_s, digestURI.getHost());
if (allAttr || contains(SolrField.title)) addSolr(solrdoc, SolrField.title, md.dc_title());
if (allAttr || contains(SolrField.author)) addSolr(solrdoc, SolrField.author, md.dc_creator());
if (allAttr || contains(SolrField.description)) addSolr(solrdoc, SolrField.description, md.snippet());
if (allAttr || contains(SolrField.content_type)) addSolr(solrdoc, SolrField.content_type, Response.doctype2mime(digestURI.getFileExtension(), md.doctype()));
if (allAttr || contains(SolrField.last_modified)) addSolr(solrdoc, SolrField.last_modified, md.moddate());
if (allAttr || contains(SolrField.text_t)) addSolr(solrdoc, SolrField.text_t, ""); // not delivered in metadata
if (allAttr || contains(SolrField.wordcount_i)) addSolr(solrdoc, SolrField.wordcount_i, md.wordCount());
if (allAttr || contains(SolrField.keywords)) {
String keywords = md.dc_subject();
Bitfield flags = md.flags();
if (flags.get(Condenser.flag_cat_indexof)) {
if (keywords == null || keywords.isEmpty()) keywords = "indexof"; else {
if (keywords.indexOf(',') > 0) keywords += ", indexof"; else keywords += " indexof";
}
}
addSolr(solrdoc, SolrField.keywords, keywords);
}
// path elements of link
final String path = digestURI.getPath();
if (path != null && (allAttr || contains(SolrField.paths_txt))) {
final String[] paths = path.split("/");
if (paths.length > 0) addSolr(solrdoc, SolrField.paths_txt, paths);
}
if (allAttr || contains(SolrField.imagescount_i)) addSolr(solrdoc, SolrField.imagescount_i, md.limage());
if (allAttr || contains(SolrField.inboundlinkscount_i)) addSolr(solrdoc, SolrField.inboundlinkscount_i, md.llocal());
if (allAttr || contains(SolrField.outboundlinkscount_i)) addSolr(solrdoc, SolrField.outboundlinkscount_i, md.lother());
if (allAttr || contains(SolrField.charset_s)) addSolr(solrdoc, SolrField.charset_s, "UTF8");
// coordinates
if (md.lat() != 0.0f && md.lon() != 0.0f) {
if (allAttr || contains(SolrField.lon_coordinate)) addSolr(solrdoc, SolrField.lon_coordinate, md.lon());
if (allAttr || contains(SolrField.lat_coordinate)) addSolr(solrdoc, SolrField.lat_coordinate, md.lat());
}
if (allAttr || contains(SolrField.httpstatus_i)) addSolr(solrdoc, SolrField.httpstatus_i, 200);
// fields that are in URIMetadataRow additional to yacy2solr basic requirement
if (allAttr || contains(SolrField.load_date_dt)) addSolr(solrdoc, SolrField.load_date_dt, md.loaddate());
if (allAttr || contains(SolrField.fresh_date_dt)) addSolr(solrdoc, SolrField.fresh_date_dt, md.freshdate());
if (allAttr || contains(SolrField.host_id_s)) addSolr(solrdoc, SolrField.host_id_s, md.hosthash());
if ((allAttr || contains(SolrField.referrer_id_ss)) && md.referrerHash() != null) addSolr(solrdoc, SolrField.referrer_id_ss, new String[]{ASCII.String(md.referrerHash())});
if (allAttr || contains(SolrField.md5_s)) addSolr(solrdoc, SolrField.md5_s, md.md5());
if (allAttr || contains(SolrField.publisher_t)) addSolr(solrdoc, SolrField.publisher_t, md.dc_publisher());
if ((allAttr || contains(SolrField.language_ss)) && md.language() != null) addSolr(solrdoc, SolrField.language_ss,new String[]{UTF8.String(md.language())});
if (allAttr || contains(SolrField.ranking_i)) addSolr(solrdoc, SolrField.ranking_i, md.ranking());
if (allAttr || contains(SolrField.size_i)) addSolr(solrdoc, SolrField.size_i, md.size());
if (allAttr || contains(SolrField.audiolinkscount_i)) addSolr(solrdoc, SolrField.audiolinkscount_i, md.laudio());
if (allAttr || contains(SolrField.videolinkscount_i)) addSolr(solrdoc, SolrField.videolinkscount_i, md.lvideo());
if (allAttr || contains(SolrField.applinkscount_i)) addSolr(solrdoc, SolrField.applinkscount_i, md.lapp());
return solrdoc;
}
public SolrDoc yacy2solr(final String id, final ResponseHeader header, final Document yacydoc, final URIMetadata metadata) {
// we use the SolrCell design as index scheme // we use the SolrCell design as index scheme
final SolrDoc solrdoc = new SolrDoc(); final SolrDoc solrdoc = new SolrDoc();
final DigestURI digestURI = new DigestURI(yacydoc.dc_source()); final DigestURI digestURI = new DigestURI(yacydoc.dc_source());
addSolr(solrdoc, SolrField.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before) boolean allAttr = this.isEmpty();
addSolr(solrdoc, SolrField.id, id); addSolr(solrdoc, SolrField.id, id);
addSolr(solrdoc, SolrField.sku, digestURI.toNormalform(true, false)); addSolr(solrdoc, SolrField.sku, digestURI.toNormalform(true, false));
if (allAttr || contains(SolrField.failreason_t)) addSolr(solrdoc, SolrField.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before)
if (allAttr || contains(SolrField.ip_s)) {
final InetAddress address = digestURI.getInetAddress(); final InetAddress address = digestURI.getInetAddress();
if (address != null) addSolr(solrdoc, SolrField.ip_s, address.getHostAddress()); if (address != null) addSolr(solrdoc, SolrField.ip_s, address.getHostAddress());
}
if (digestURI.getHost() != null) addSolr(solrdoc, SolrField.host_s, digestURI.getHost()); if (digestURI.getHost() != null) addSolr(solrdoc, SolrField.host_s, digestURI.getHost());
addSolr(solrdoc, SolrField.title, yacydoc.dc_title()); if (allAttr || contains(SolrField.title)) addSolr(solrdoc, SolrField.title, yacydoc.dc_title());
addSolr(solrdoc, SolrField.author, yacydoc.dc_creator()); if (allAttr || contains(SolrField.author)) addSolr(solrdoc, SolrField.author, yacydoc.dc_creator());
addSolr(solrdoc, SolrField.description, yacydoc.dc_description()); if (allAttr || contains(SolrField.description)) addSolr(solrdoc, SolrField.description, yacydoc.dc_description());
addSolr(solrdoc, SolrField.content_type, yacydoc.dc_format()); if (allAttr || contains(SolrField.content_type)) addSolr(solrdoc, SolrField.content_type, yacydoc.dc_format());
addSolr(solrdoc, SolrField.last_modified, header == null ? new Date() : header.lastModified()); if (allAttr || contains(SolrField.last_modified)) addSolr(solrdoc, SolrField.last_modified, header == null ? new Date() : header.lastModified());
addSolr(solrdoc, SolrField.keywords, yacydoc.dc_subject(' ')); if (allAttr || contains(SolrField.keywords)) addSolr(solrdoc, SolrField.keywords, yacydoc.dc_subject(' '));
final String content = yacydoc.getTextString(); final String content = yacydoc.getTextString();
addSolr(solrdoc, SolrField.text_t, content); if (allAttr || contains(SolrField.text_t)) addSolr(solrdoc, SolrField.text_t, content);
if (isEmpty() || contains(SolrField.wordcount_i.name())) { if (allAttr || contains(SolrField.wordcount_i)) {
final int contentwc = content.split(" ").length; final int contentwc = content.split(" ").length;
addSolr(solrdoc, SolrField.wordcount_i, contentwc); addSolr(solrdoc, SolrField.wordcount_i, contentwc);
} }
// path elements of link // path elements of link
final String path = digestURI.getPath(); final String path = digestURI.getPath();
if (path != null && (isEmpty() || contains(SolrField.paths_txt.name()))) { if (path != null && (allAttr || contains(SolrField.paths_txt))) {
final String[] paths = path.split("/"); final String[] paths = path.split("/");
if (paths.length > 0) addSolr(solrdoc, SolrField.paths_txt, paths); if (paths.length > 0) addSolr(solrdoc, SolrField.paths_txt, paths);
} }
@ -250,7 +342,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
addSolr(solrdoc, SolrField.boldcount_i, bold.length); addSolr(solrdoc, SolrField.boldcount_i, bold.length);
if (bold.length > 0) { if (bold.length > 0) {
addSolr(solrdoc, SolrField.bold_txt, bold); addSolr(solrdoc, SolrField.bold_txt, bold);
if (isEmpty() || contains(SolrField.bold_val.name())) { if (allAttr || contains(SolrField.bold_val)) {
addSolr(solrdoc, SolrField.bold_val, html.getBoldCount(bold)); addSolr(solrdoc, SolrField.bold_val, html.getBoldCount(bold));
} }
} }
@ -258,7 +350,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
addSolr(solrdoc, SolrField.italiccount_i, italic.length); addSolr(solrdoc, SolrField.italiccount_i, italic.length);
if (italic.length > 0) { if (italic.length > 0) {
addSolr(solrdoc, SolrField.italic_txt, italic); addSolr(solrdoc, SolrField.italic_txt, italic);
if (isEmpty() || contains(SolrField.italic_val.name())) { if (allAttr || contains(SolrField.italic_val)) {
addSolr(solrdoc, SolrField.italic_val, html.getItalicCount(italic)); addSolr(solrdoc, SolrField.italic_val, html.getItalicCount(italic));
} }
} }
@ -282,14 +374,14 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
imgstubs.add(uri.toString().substring(protocol.length() + 3)); imgstubs.add(uri.toString().substring(protocol.length() + 3));
imgalts.add(ie.alt()); imgalts.add(ie.alt());
} }
addSolr(solrdoc, SolrField.imagescount_i, imgtags.size()); if (allAttr || contains(SolrField.imagescount_i)) addSolr(solrdoc, SolrField.imagescount_i, imgtags.size());
if (isEmpty() || contains(SolrField.images_tag_txt.name())) addSolr(solrdoc, SolrField.images_tag_txt, imgtags); if (allAttr || contains(SolrField.images_tag_txt)) addSolr(solrdoc, SolrField.images_tag_txt, imgtags);
if (isEmpty() || contains(SolrField.images_protocol_txt.name())) addSolr(solrdoc, SolrField.images_protocol_txt, protocolList2indexedList(imgprots)); if (allAttr || contains(SolrField.images_protocol_txt)) addSolr(solrdoc, SolrField.images_protocol_txt, protocolList2indexedList(imgprots));
if (isEmpty() || contains(SolrField.images_urlstub_txt.name())) addSolr(solrdoc, SolrField.images_urlstub_txt, imgstubs); if (allAttr || contains(SolrField.images_urlstub_txt)) addSolr(solrdoc, SolrField.images_urlstub_txt, imgstubs);
if (isEmpty() || contains(SolrField.images_alt_txt.name())) addSolr(solrdoc, SolrField.images_alt_txt, imgalts); if (allAttr || contains(SolrField.images_alt_txt)) addSolr(solrdoc, SolrField.images_alt_txt, imgalts);
// style sheets // style sheets
if (isEmpty() || contains(SolrField.css_tag_txt.name())) { if (allAttr || contains(SolrField.css_tag_txt)) {
final Map<MultiProtocolURI, String> csss = html.getCSS(); final Map<MultiProtocolURI, String> csss = html.getCSS();
final String[] css_tag = new String[csss.size()]; final String[] css_tag = new String[csss.size()];
final String[] css_url = new String[csss.size()]; final String[] css_url = new String[csss.size()];
@ -310,7 +402,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
} }
// Scripts // Scripts
if (isEmpty() || contains(SolrField.scripts_txt.name())) { if (allAttr || contains(SolrField.scripts_txt)) {
final Set<MultiProtocolURI> scriptss = html.getScript(); final Set<MultiProtocolURI> scriptss = html.getScript();
final String[] scripts = new String[scriptss.size()]; final String[] scripts = new String[scriptss.size()];
c = 0; c = 0;
@ -324,7 +416,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
} }
// Frames // Frames
if (isEmpty() || contains(SolrField.frames_txt.name())) { if (allAttr || contains(SolrField.frames_txt)) {
final Set<MultiProtocolURI> framess = html.getFrames(); final Set<MultiProtocolURI> framess = html.getFrames();
final String[] frames = new String[framess.size()]; final String[] frames = new String[framess.size()];
c = 0; c = 0;
@ -338,7 +430,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
} }
// IFrames // IFrames
if (isEmpty() || contains(SolrField.iframes_txt.name())) { if (allAttr || contains(SolrField.iframes_txt)) {
final Set<MultiProtocolURI> iframess = html.getIFrames(); final Set<MultiProtocolURI> iframess = html.getIFrames();
final String[] iframes = new String[iframess.size()]; final String[] iframes = new String[iframess.size()];
c = 0; c = 0;
@ -352,7 +444,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
} }
// canonical tag // canonical tag
if (isEmpty() || contains(SolrField.canonical_s.name())) { if (allAttr || contains(SolrField.canonical_s)) {
final MultiProtocolURI canonical = html.getCanonical(); final MultiProtocolURI canonical = html.getCanonical();
if (canonical != null) { if (canonical != null) {
inboundLinks.remove(canonical); inboundLinks.remove(canonical);
@ -362,7 +454,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
} }
// meta refresh tag // meta refresh tag
if (isEmpty() || contains(SolrField.refresh_s.name())) { if (allAttr || contains(SolrField.refresh_s)) {
String refresh = html.getRefreshPath(); String refresh = html.getRefreshPath();
if (refresh != null && refresh.length() > 0) { if (refresh != null && refresh.length() > 0) {
MultiProtocolURI refreshURL; MultiProtocolURI refreshURL;
@ -380,7 +472,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
} }
// flash embedded // flash embedded
if (isEmpty() || contains(SolrField.flash_b.name())) { if (allAttr || contains(SolrField.flash_b)) {
MultiProtocolURI[] flashURLs = html.getFlash(); MultiProtocolURI[] flashURLs = html.getFlash();
for (MultiProtocolURI u: flashURLs) { for (MultiProtocolURI u: flashURLs) {
// remove all flash links from ibound/outbound links // remove all flash links from ibound/outbound links
@ -392,7 +484,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
// generic evaluation pattern // generic evaluation pattern
for (final String model: html.getEvaluationModelNames()) { for (final String model: html.getEvaluationModelNames()) {
if (isEmpty() || contains("ext_" + model + "_txt")) { if (allAttr || contains("ext_" + model + "_txt")) {
final String[] scorenames = html.getEvaluationModelScoreNames(model); final String[] scorenames = html.getEvaluationModelScoreNames(model);
if (scorenames.length > 0) { if (scorenames.length > 0) {
addSolr(solrdoc, SolrField.valueOf("ext_" + model + "_txt"), scorenames); addSolr(solrdoc, SolrField.valueOf("ext_" + model + "_txt"), scorenames);
@ -408,8 +500,8 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
// list all links // list all links
final Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors(); final Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors();
c = 0; c = 0;
if (isEmpty() || contains(SolrField.inboundlinkscount_i.name())) addSolr(solrdoc, SolrField.inboundlinkscount_i, inboundLinks.size()); if (allAttr || contains(SolrField.inboundlinkscount_i)) addSolr(solrdoc, SolrField.inboundlinkscount_i, inboundLinks.size());
if (isEmpty() || contains(SolrField.inboundlinksnofollowcount_i.name())) addSolr(solrdoc, SolrField.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount()); if (allAttr || contains(SolrField.inboundlinksnofollowcount_i)) addSolr(solrdoc, SolrField.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount());
final List<String> inboundlinksTag = new ArrayList<String>(inboundLinks.size()); final List<String> inboundlinksTag = new ArrayList<String>(inboundLinks.size());
final List<String> inboundlinksURLProtocol = new ArrayList<String>(inboundLinks.size()); final List<String> inboundlinksURLProtocol = new ArrayList<String>(inboundLinks.size());
final List<String> inboundlinksURLStub = new ArrayList<String>(inboundLinks.size()); final List<String> inboundlinksURLStub = new ArrayList<String>(inboundLinks.size());
@ -437,17 +529,17 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
((text.length() > 0) ? text : "") + "</a>"); ((text.length() > 0) ? text : "") + "</a>");
c++; c++;
} }
if (isEmpty() || contains(SolrField.inboundlinks_tag_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_tag_txt, inboundlinksTag); if (allAttr || contains(SolrField.inboundlinks_tag_txt)) addSolr(solrdoc, SolrField.inboundlinks_tag_txt, inboundlinksTag);
if (isEmpty() || contains(SolrField.inboundlinks_protocol_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_protocol_txt, protocolList2indexedList(inboundlinksURLProtocol)); if (allAttr || contains(SolrField.inboundlinks_protocol_txt)) addSolr(solrdoc, SolrField.inboundlinks_protocol_txt, protocolList2indexedList(inboundlinksURLProtocol));
if (isEmpty() || contains(SolrField.inboundlinks_urlstub_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_urlstub_txt, inboundlinksURLStub); if (allAttr || contains(SolrField.inboundlinks_urlstub_txt)) addSolr(solrdoc, SolrField.inboundlinks_urlstub_txt, inboundlinksURLStub);
if (isEmpty() || contains(SolrField.inboundlinks_name_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_name_txt, inboundlinksName); if (allAttr || contains(SolrField.inboundlinks_name_txt)) addSolr(solrdoc, SolrField.inboundlinks_name_txt, inboundlinksName);
if (isEmpty() || contains(SolrField.inboundlinks_rel_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_rel_txt, inboundlinksRel); if (allAttr || contains(SolrField.inboundlinks_rel_txt)) addSolr(solrdoc, SolrField.inboundlinks_rel_txt, inboundlinksRel);
if (isEmpty() || contains(SolrField.inboundlinks_relflags_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_relflags_txt, relEval(inboundlinksRel)); if (allAttr || contains(SolrField.inboundlinks_relflags_txt)) addSolr(solrdoc, SolrField.inboundlinks_relflags_txt, relEval(inboundlinksRel));
if (isEmpty() || contains(SolrField.inboundlinks_text_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_text_txt, inboundlinksText); if (allAttr || contains(SolrField.inboundlinks_text_txt)) addSolr(solrdoc, SolrField.inboundlinks_text_txt, inboundlinksText);
c = 0; c = 0;
if (isEmpty() || contains(SolrField.outboundlinkscount_i.name())) addSolr(solrdoc, SolrField.outboundlinkscount_i, ouboundLinks.size()); if (allAttr || contains(SolrField.outboundlinkscount_i)) addSolr(solrdoc, SolrField.outboundlinkscount_i, ouboundLinks.size());
if (isEmpty() || contains(SolrField.outboundlinksnofollowcount_i.name())) addSolr(solrdoc, SolrField.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount()); if (allAttr || contains(SolrField.outboundlinksnofollowcount_i)) addSolr(solrdoc, SolrField.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount());
final List<String> outboundlinksTag = new ArrayList<String>(ouboundLinks.size()); final List<String> outboundlinksTag = new ArrayList<String>(ouboundLinks.size());
final List<String> outboundlinksURLProtocol = new ArrayList<String>(ouboundLinks.size()); final List<String> outboundlinksURLProtocol = new ArrayList<String>(ouboundLinks.size());
final List<String> outboundlinksURLStub = new ArrayList<String>(ouboundLinks.size()); final List<String> outboundlinksURLStub = new ArrayList<String>(ouboundLinks.size());
@ -475,23 +567,37 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
((text.length() > 0) ? text : "") + "</a>"); ((text.length() > 0) ? text : "") + "</a>");
c++; c++;
} }
if (isEmpty() || contains(SolrField.outboundlinks_tag_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_tag_txt, outboundlinksTag); if (allAttr || contains(SolrField.outboundlinks_tag_txt)) addSolr(solrdoc, SolrField.outboundlinks_tag_txt, outboundlinksTag);
if (isEmpty() || contains(SolrField.outboundlinks_protocol_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_protocol_txt, protocolList2indexedList(outboundlinksURLProtocol)); if (allAttr || contains(SolrField.outboundlinks_protocol_txt)) addSolr(solrdoc, SolrField.outboundlinks_protocol_txt, protocolList2indexedList(outboundlinksURLProtocol));
if (isEmpty() || contains(SolrField.outboundlinks_urlstub_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_urlstub_txt, outboundlinksURLStub); if (allAttr || contains(SolrField.outboundlinks_urlstub_txt)) addSolr(solrdoc, SolrField.outboundlinks_urlstub_txt, outboundlinksURLStub);
if (isEmpty() || contains(SolrField.outboundlinks_name_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_name_txt, outboundlinksName); if (allAttr || contains(SolrField.outboundlinks_name_txt)) addSolr(solrdoc, SolrField.outboundlinks_name_txt, outboundlinksName);
if (isEmpty() || contains(SolrField.outboundlinks_rel_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_rel_txt, outboundlinksRel); if (allAttr || contains(SolrField.outboundlinks_rel_txt)) addSolr(solrdoc, SolrField.outboundlinks_rel_txt, outboundlinksRel);
if (isEmpty() || contains(SolrField.outboundlinks_relflags_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_relflags_txt, relEval(inboundlinksRel)); if (allAttr || contains(SolrField.outboundlinks_relflags_txt)) addSolr(solrdoc, SolrField.outboundlinks_relflags_txt, relEval(inboundlinksRel));
if (isEmpty() || contains(SolrField.outboundlinks_text_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_text_txt, outboundlinksText); if (allAttr || contains(SolrField.outboundlinks_text_txt)) addSolr(solrdoc, SolrField.outboundlinks_text_txt, outboundlinksText);
// charset // charset
addSolr(solrdoc, SolrField.charset_s, yacydoc.getCharset()); if (allAttr || contains(SolrField.charset_s)) addSolr(solrdoc, SolrField.charset_s, yacydoc.getCharset());
// coordinates // coordinates
if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) { if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
addSolr(solrdoc, SolrField.lon_coordinate, yacydoc.lon()); if (allAttr || contains(SolrField.lon_coordinate)) addSolr(solrdoc, SolrField.lon_coordinate, yacydoc.lon());
addSolr(solrdoc, SolrField.lat_coordinate, yacydoc.lat()); if (allAttr || contains(SolrField.lat_coordinate)) addSolr(solrdoc, SolrField.lat_coordinate, yacydoc.lat());
} }
addSolr(solrdoc, SolrField.httpstatus_i, header == null ? 200 : header.getStatusCode()); if (allAttr || contains(SolrField.httpstatus_i)) addSolr(solrdoc, SolrField.httpstatus_i, header == null ? 200 : header.getStatusCode());
// fields that are additionally in URIMetadataRow
if (allAttr || contains(SolrField.load_date_dt)) addSolr(solrdoc, SolrField.load_date_dt, metadata.loaddate());
if (allAttr || contains(SolrField.fresh_date_dt)) addSolr(solrdoc, SolrField.fresh_date_dt, metadata.freshdate());
if (allAttr || contains(SolrField.host_id_s)) addSolr(solrdoc, SolrField.host_id_s, metadata.hosthash());
if ((allAttr || contains(SolrField.referrer_id_ss)) && metadata.referrerHash() != null) addSolr(solrdoc, SolrField.referrer_id_ss, new String[]{ASCII.String(metadata.referrerHash())});
//if (allAttr || contains(SolrField.md5_s)) addSolr(solrdoc, SolrField.md5_s, new byte[0]);
if (allAttr || contains(SolrField.publisher_t)) addSolr(solrdoc, SolrField.publisher_t, yacydoc.dc_publisher());
if ((allAttr || contains(SolrField.language_ss)) && metadata.language() != null) addSolr(solrdoc, SolrField.language_ss,new String[]{UTF8.String(metadata.language())});
if (allAttr || contains(SolrField.ranking_i)) addSolr(solrdoc, SolrField.ranking_i, metadata.ranking());
if (allAttr || contains(SolrField.size_i)) addSolr(solrdoc, SolrField.size_i, metadata.size());
if (allAttr || contains(SolrField.audiolinkscount_i)) addSolr(solrdoc, SolrField.audiolinkscount_i, yacydoc.getAudiolinks().size());
if (allAttr || contains(SolrField.videolinkscount_i)) addSolr(solrdoc, SolrField.videolinkscount_i, yacydoc.getVideolinks().size());
if (allAttr || contains(SolrField.applinkscount_i)) addSolr(solrdoc, SolrField.applinkscount_i, yacydoc.getApplinks().size());
return solrdoc; return solrdoc;
} }

@ -120,7 +120,21 @@ public enum SolrField implements net.yacy.cora.services.federated.solr.SolrField
ext_tracker_val(SolrType.integer, true, true, true, "number of attribute counts in ext_tracker_txt"), ext_tracker_val(SolrType.integer, true, true, true, "number of attribute counts in ext_tracker_txt"),
ext_title_txt(SolrType.text_general, true, true, true, "names matching title expressions"), ext_title_txt(SolrType.text_general, true, true, true, "names matching title expressions"),
ext_title_val(SolrType.integer, true, true, true, "number of matching title expressions"), ext_title_val(SolrType.integer, true, true, true, "number of matching title expressions"),
failreason_t(SolrType.text_general, true, true, "fail reason if a page was not loaded. if the page was loaded then this field is empty"); failreason_t(SolrType.text_general, true, true, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
// values used additionally by URIMetadataRow
load_date_dt(SolrType.date, true, true, "time when resource was loaded"),
fresh_date_dt(SolrType.date, true, true, "date until resource shall be considered as fresh"),
host_id_s(SolrType.string, true, true, "id of the host, a 6-byte hash that is part of the document id"),// String hosthash();
referrer_id_ss(SolrType.string, true, true, true, "ids of referrer to this document"),// byte[] referrerHash();
md5_s(SolrType.string, true, true, "the md5 of the raw source"),// String md5();
publisher_t(SolrType.text_general, true, true, "the name of the publisher of the document"),// String dc_publisher();
language_ss(SolrType.string, true, true, "the language used in the document; starts with primary language"),// byte[] language();
ranking_i(SolrType.integer, true, true, "an external ranking value"),// long ranking();
size_i(SolrType.integer, true, true, "the size of the raw source"),// int size();
audiolinkscount_i(SolrType.integer, true, true, "number of links to audio resources"),// int laudio();
videolinkscount_i(SolrType.integer, true, true, "number of links to video resources"),// int lvideo();
applinkscount_i(SolrType.integer, true, true, "number of links to application resources");// int lapp();
private String solrFieldName = null; // solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() ) private String solrFieldName = null; // solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() )
private final SolrType type; private final SolrType type;

Loading…
Cancel
Save