From d9173ba7edf3c971ae1fbc49505adb20cc3a44c7 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 5 Aug 2012 15:49:27 +0200 Subject: [PATCH] added more solr fields to integrate values from URIMetadataRow. All writings to the Metadata-DB are now also done to solr. This includes metadata transfer during search and rwi transfer. The new/added solr fields are: ## time when resource was loaded load_date_dt ## date until resource shall be considered as fresh fresh_date_dt ## id of the host, a 6-byte hash that is part of the document id host_id_s ## ids of referrer to this document referrer_id_ss ## the md5 of the raw source md5_s ## the name of the publisher of the document publisher_t ## the language used in the document; starts with primary language language_ss ## an external ranking value ranking_i ## the size of the raw source size_i ## number of links to audio resources audiolinkscount_i ## number of links to video resources videolinkscount_i ## number of links to application resources applinkscount_i --- defaults/solr.keys.list | 42 ++++ htroot/IndexFederated_p.java | 11 - htroot/yacy/crawlReceipt.java | 3 + htroot/yacy/transferURL.java | 4 + source/de/anomic/crawler/ResultURLs.java | 3 +- .../de/anomic/crawler/retrieval/Response.java | 77 +++--- .../cora/services/federated/solr/SolrDoc.java | 4 + .../yacy/cora/storage/ConfigurationSet.java | 8 +- source/net/yacy/document/Condenser.java | 32 --- .../net/yacy/document/parser/pdfParser.java | 13 +- source/net/yacy/peers/Protocol.java | 9 +- source/net/yacy/search/Switchboard.java | 7 +- .../net/yacy/search/index/DocumentIndex.java | 16 +- .../yacy/search/index/MetadataRepository.java | 8 +- source/net/yacy/search/index/Segment.java | 38 ++- .../yacy/search/index/SolrConfiguration.java | 232 +++++++++++++----- source/net/yacy/search/index/SolrField.java | 16 +- 17 files changed, 319 insertions(+), 204 deletions(-) diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index 05fde7e4a..f0ed6743d 100644 --- 
a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -267,3 +267,45 @@ failreason_t ## response time of target server in milliseconds, int responsetime_i + + +### values used additionally by URIMetadataRow, part of the index transfer process + +## time when resource was loaded +load_date_dt + +## date until resource shall be considered as fresh +fresh_date_dt + +## id of the host, a 6-byte hash that is part of the document id +host_id_s + +## ids of referrer to this document +referrer_id_ss + +## the md5 of the raw source +md5_s + +## the name of the publisher of the document +publisher_t + +## the language used in the document; starts with primary language +language_ss + +## an external ranking value +ranking_i + +## the size of the raw source +size_i + +## number of links to audio resources +audiolinkscount_i + +## number of links to video resources +videolinkscount_i + +## number of links to application resources +applinkscount_i + +## index creation comment +process_s \ No newline at end of file diff --git a/htroot/IndexFederated_p.java b/htroot/IndexFederated_p.java index 08b842bd4..f2afcec22 100644 --- a/htroot/IndexFederated_p.java +++ b/htroot/IndexFederated_p.java @@ -197,17 +197,6 @@ public class IndexFederated_p { if (field.getComment() != null) prop.putHTML("scheme_" + c + "_comment",field.getComment()); c++; } - /* final Iterator i = sb.solrScheme.entryIterator(); - ConfigurationSet.Entry entry; - while (i.hasNext()) { - entry = i.next(); - prop.put("scheme_" + c + "_dark", dark ? 1 : 0); dark = !dark; - prop.put("scheme_" + c + "_checked", entry.enabled() ? 1 : 0); - prop.putHTML("scheme_" + c + "_key", entry.key()); - prop.putHTML("scheme_" + c + "_solrfieldname",entry.getValue() == null ? 
"" : entry.getValue()); - if (entry.getComment() != null) prop.putHTML("scheme_" + c + "_comment",entry.getComment()); - c++; - }*/ prop.put("scheme", c); // fill attribute fields diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index cfa385b19..06fe81c0a 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -147,6 +147,9 @@ public final class crawlReceipt { if ("fill".equals(result)) try { // put new entry into database sb.index.urlMetadata().store(entry); + if (!sb.index.urlMetadata().getSolr().exists(ASCII.String(entry.url().hash()))) { + sb.index.urlMetadata().getSolr().add(sb.index.urlMetadata().getSolrScheme().metadata2solr(entry)); + } ResultURLs.stack(entry, youare.getBytes(), iam.getBytes(), EventOrigin.REMOTE_RECEIPTS); sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work has been done if (log.isInfo()) log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + ASCII.String(entry.hash()) + ":" + entry.url().toNormalform(false, true)); diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index 9279c5467..cf50c88f3 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -30,6 +30,7 @@ import java.io.IOException; import java.text.ParseException; import net.yacy.cora.date.GenericFormatter; +import net.yacy.cora.document.ASCII; import net.yacy.cora.document.RSSMessage; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.data.meta.URIMetadataRow; @@ -141,6 +142,9 @@ public final class transferURL { if (Network.log.isFine()) Network.log.logFine("Accepting URL " + i + "/" + urlc + " from peer " + otherPeerName + ": " + lEntry.url().toNormalform(true, false)); try { sb.index.urlMetadata().store(lEntry); + if (!sb.index.urlMetadata().getSolr().exists(ASCII.String(lEntry.url().hash()))) { + sb.index.urlMetadata().getSolr().add(sb.index.urlMetadata().getSolrScheme().metadata2solr(lEntry)); + } 
ResultURLs.stack(lEntry, iam.getBytes(), iam.getBytes(), EventOrigin.DHT_TRANSFER); if (Network.log.isFine()) Network.log.logFine("transferURL: received URL '" + lEntry.url().toNormalform(false, true) + "' from peer " + otherPeerName); received++; diff --git a/source/de/anomic/crawler/ResultURLs.java b/source/de/anomic/crawler/ResultURLs.java index 14d818e30..460107638 100644 --- a/source/de/anomic/crawler/ResultURLs.java +++ b/source/de/anomic/crawler/ResultURLs.java @@ -37,6 +37,7 @@ import net.yacy.cora.document.UTF8; import net.yacy.cora.sorting.ClusteredScoreMap; import net.yacy.cora.sorting.ScoreMap; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Bitfield; @@ -96,7 +97,7 @@ public final class ResultURLs { } public static void stack( - final URIMetadataRow e, + final URIMetadata e, final byte[] initiatorHash, final byte[] executorHash, final EventOrigin stackType) { diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java index b588e9e14..b9c75c845 100644 --- a/source/de/anomic/crawler/retrieval/Response.java +++ b/source/de/anomic/crawler/retrieval/Response.java @@ -70,29 +70,28 @@ public class Response { // doctype calculation public static char docType(final MultiProtocolURI url) { - final String path = url.getPath().toLowerCase(); - // serverLog.logFinest("PLASMA", "docType URL=" + path); - char doctype = DT_UNKNOWN; - if (path.endsWith(".gif")) { doctype = DT_IMAGE; } - else if (path.endsWith(".ico")) { doctype = DT_IMAGE; } - else if (path.endsWith(".bmp")) { doctype = DT_IMAGE; } - else if (path.endsWith(".jpg")) { doctype = DT_IMAGE; } - else if (path.endsWith(".jpeg")) { doctype = DT_IMAGE; } - else if (path.endsWith(".png")) { doctype = DT_IMAGE; } - else if (path.endsWith(".html")) { doctype = DT_HTML; } - else if 
(path.endsWith(".txt")) { doctype = DT_TEXT; } - else if (path.endsWith(".doc")) { doctype = DT_DOC; } - else if (path.endsWith(".rtf")) { doctype = DT_DOC; } - else if (path.endsWith(".pdf")) { doctype = DT_PDFPS; } - else if (path.endsWith(".ps")) { doctype = DT_PDFPS; } - else if (path.endsWith(".avi")) { doctype = DT_MOVIE; } - else if (path.endsWith(".mov")) { doctype = DT_MOVIE; } - else if (path.endsWith(".qt")) { doctype = DT_MOVIE; } - else if (path.endsWith(".mpg")) { doctype = DT_MOVIE; } - else if (path.endsWith(".md5")) { doctype = DT_SHARE; } - else if (path.endsWith(".mpeg")) { doctype = DT_MOVIE; } - else if (path.endsWith(".asf")) { doctype = DT_FLASH; } - return doctype; + String ext = url.getFileExtension(); + if (ext == null) return DT_UNKNOWN; + if (ext.equals(".gif")) return DT_IMAGE; + if (ext.equals(".ico")) return DT_IMAGE; + if (ext.equals(".bmp")) return DT_IMAGE; + if (ext.equals(".jpg")) return DT_IMAGE; + if (ext.equals(".jpeg")) return DT_IMAGE; + if (ext.equals(".png")) return DT_IMAGE; + if (ext.equals(".html")) return DT_HTML; + if (ext.equals(".txt")) return DT_TEXT; + if (ext.equals(".doc")) return DT_DOC; + if (ext.equals(".rtf")) return DT_DOC; + if (ext.equals(".pdf")) return DT_PDFPS; + if (ext.equals(".ps")) return DT_PDFPS; + if (ext.equals(".avi")) return DT_MOVIE; + if (ext.equals(".mov")) return DT_MOVIE; + if (ext.equals(".qt")) return DT_MOVIE; + if (ext.equals(".mpg")) return DT_MOVIE; + if (ext.equals(".md5")) return DT_SHARE; + if (ext.equals(".mpeg")) return DT_MOVIE; + if (ext.equals(".asf")) return DT_FLASH; + return DT_UNKNOWN; } public static char docType(final String mime) { @@ -115,30 +114,20 @@ public class Response { else if (mime.startsWith("image/")) doctype = DT_IMAGE; else if (mime.startsWith("audio/")) doctype = DT_AUDIO; else if (mime.startsWith("video/")) doctype = DT_MOVIE; - //bz2 = application/x-bzip2 - //dvi = application/x-dvi - //gz = application/gzip - //hqx = application/mac-binhex40 - //lha 
= application/x-lzh - //lzh = application/x-lzh - //pac = application/x-ns-proxy-autoconfig - //php = application/x-httpd-php - //phtml = application/x-httpd-php - //rss = application/xml - //tar = application/tar - //tex = application/x-tex - //tgz = application/tar - //torrent = application/x-bittorrent - //xhtml = application/xhtml+xml - //xla = application/msexcel - //xls = application/msexcel - //xsl = application/xml - //xml = application/xml - //Z = application/x-compress - //zip = application/zip return doctype; } + public static String doctype2mime(String ext, char doctype) { + String mime = Classification.ext2mime(ext); + int p = mime.indexOf('/'); + if (p < 0) return mime; + if (doctype == DT_TEXT) return "text" + mime.substring(p); + if (doctype == DT_IMAGE) return "image" + mime.substring(p); + if (doctype == DT_AUDIO) return "audio" + mime.substring(p); + if (doctype == DT_MOVIE) return "video" + mime.substring(p); + return mime; + } + public static final int QUEUE_STATE_FRESH = 0; public static final int QUEUE_STATE_PARSING = 1; public static final int QUEUE_STATE_CONDENSING = 2; diff --git a/source/net/yacy/cora/services/federated/solr/SolrDoc.java b/source/net/yacy/cora/services/federated/solr/SolrDoc.java index ee43f61ca..36237527e 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrDoc.java +++ b/source/net/yacy/cora/services/federated/solr/SolrDoc.java @@ -52,6 +52,10 @@ public class SolrDoc extends SolrInputDocument { this.setField(key.getSolrFieldName(), value); } + public final void addSolr(final SolrField key, final long value) { + this.setField(key.getSolrFieldName(), value); + } + public final void addSolr(final SolrField key, final String[] value) { this.setField(key.getSolrFieldName(), value); } diff --git a/source/net/yacy/cora/storage/ConfigurationSet.java b/source/net/yacy/cora/storage/ConfigurationSet.java index a079e4855..d490a5e9f 100644 --- a/source/net/yacy/cora/storage/ConfigurationSet.java +++ 
b/source/net/yacy/cora/storage/ConfigurationSet.java @@ -123,7 +123,7 @@ public class ConfigurationSet extends TreeMap implements Serializa /** * override the abstract implementation because that is not stable in concurrent requests */ - public boolean contains (String key) { + public boolean contains(String key) { if (key == null) return false; Entry e = this.get(key); return e == null ? false : e.enabled(); @@ -260,11 +260,7 @@ public class ConfigurationSet extends TreeMap implements Serializa } writer.close(); } - /* - public Iterator iterator() { - return this.keySet().iterator(); - } -*/ + public Iterator entryIterator() { return this.values().iterator(); } diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index e6091018e..2ebe25eb2 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -56,54 +56,22 @@ public final class Condenser { // category flags that show how the page can be distinguished in different interest groups public static final int flag_cat_indexof = 0; // a directory listing page (i.e. containing 'index of') - public static final int flag_cat_opencontent = 1; // open source, any free stuff - public static final int flag_cat_business = 2; // web shops, marketing, trade - public static final int flag_cat_stockfinance = 3; // stock exchange (quotes), finance, economy - public static final int flag_cat_health = 4; // health - public static final int flag_cat_sport = 5; // any sport, cars etc. 
- public static final int flag_cat_lifestyle = 6; // travel, lifestyle - public static final int flag_cat_politics = 7; // politics - public static final int flag_cat_news = 8; // blogs, news pages - public static final int flag_cat_children = 9; // toys, childrens education, help for parents - public static final int flag_cat_entertainment = 10; // boulevard, entertainment, cultural content - public static final int flag_cat_knowledge = 11; // science, school stuff, help for homework - public static final int flag_cat_computer = 12; // any computer related stuff, networks, operation systems - public static final int flag_cat_p2p = 13; // p2p support, file-sharing archives etc. - public static final int flag_cat_sex = 14; // sexual content - public static final int flag_cat_spam = 15; // pages that anybody would consider as not interesting - public static final int flag_cat_linux = 16; // pages about linux software - public static final int flag_cat_macos = 17; // pages about macintosh, apple computers and the mac os - public static final int flag_cat_windows = 18; // pages about windows os and software public static final int flag_cat_haslocation = 19; // the page has a location metadata attached public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file - //private Properties analysis; private final Map words; // a string (the words) to (indexWord) - relation private final Map> tags = new HashMap>(); // a set of tags, discovered from Autotagging - //public int RESULT_NUMB_TEXT_BYTES = -1; public int RESULT_NUMB_WORDS = -1; public int RESULT_DIFF_WORDS = -1; public int RESULT_NUMB_SENTENCES = -1; public int RESULT_DIFF_SENTENCES = -1; public 
Bitfield RESULT_FLAGS = new Bitfield(4); private final Identificator languageIdentificator; - /* - private final static int numlength = 5; - private static final ThreadLocal intStringFormatter = - new ThreadLocal () { - @Override protected NumberFormat initialValue() { - NumberFormat n = NumberFormat.getIntegerInstance(); - n.setMinimumIntegerDigits(numlength); - n.setMaximumIntegerDigits(numlength); - return n; - } - }; - */ public Condenser( final Document document, diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index b6daef6d7..874f328e5 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -153,20 +153,13 @@ public class pdfParser extends AbstractParser implements Parser { if (t.isAlive()) t.interrupt(); pdfDoc.close(); contentBytes = writer.getBytes(); // get final text before closing writer - } catch (final IOException e) { - // close the writer - if (writer != null) try { writer.close(); } catch (final Exception ex) {} - try {pdfDoc.close();} catch (final IOException ee) {} - //throw new Parser.Failure(e.getMessage(), location); - } catch (final NullPointerException e) { - // this exception appeared after the insertion of the jempbox-1.5.0.jar library - Log.logException(e); + } catch (final Throwable e) { // close the writer if (writer != null) try { writer.close(); } catch (final Exception ex) {} - try {pdfDoc.close();} catch (final IOException ee) {} + try {pdfDoc.close();} catch (final Throwable ee) {} //throw new Parser.Failure(e.getMessage(), location); } finally { - try {pdfDoc.close();} catch (final IOException e) {} + try {pdfDoc.close();} catch (final Throwable e) {} writer.close(); } diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index b5902dc54..b5b883ccf 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -357,7 +357,9 @@ public final class 
Protocol if ( p < 0 ) { return -1; } - final String host = Domains.dnsResolve(address.substring(0, p)).getHostAddress(); + InetAddress ia = Domains.dnsResolve(address.substring(0, p)); + if (ia == null) continue; + final String host = ia.getHostAddress(); s = Seed.genRemoteSeed(seedStr, false, host); } else { s = Seed.genRemoteSeed(seedStr, false, null); @@ -752,6 +754,9 @@ public final class Protocol // passed all checks, store url try { indexSegment.urlMetadata().store(urlEntry); + if (!indexSegment.urlMetadata().getSolr().exists(ASCII.String(urlEntry.url().hash()))) { + indexSegment.urlMetadata().getSolr().add(indexSegment.urlMetadata().getSolrScheme().metadata2solr(urlEntry)); + } ResultURLs.stack( urlEntry, mySeed.hash.getBytes(), @@ -1081,7 +1086,7 @@ public final class Protocol final String process, final String result, final String reason, - final URIMetadataRow entry, + final URIMetadata entry, final String wordhashes) { assert (target != null); assert (mySeed != null); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 8a29de350..5301117fd 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -111,7 +111,6 @@ import net.yacy.gui.Tray; import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadata; -import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.RowSpaceExceededException; @@ -2514,7 +2513,7 @@ public final class Switchboard extends serverSwitch this.log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + url); // STORE WORD INDEX - URIMetadataRow newEntry = null; + URIMetadata newEntry = null; try { newEntry = this.index.storeDocument( @@ -2761,9 +2760,9 @@ public final class Switchboard extends serverSwitch public class receiptSending implements 
Runnable { private final Seed initiatorPeer; - private final URIMetadataRow reference; + private final URIMetadata reference; - public receiptSending(final Seed initiatorPeer, final URIMetadataRow reference) { + public receiptSending(final Seed initiatorPeer, final URIMetadata reference) { this.initiatorPeer = initiatorPeer; this.reference = reference; } diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java index 518d2c08f..79aebaf56 100644 --- a/source/net/yacy/search/index/DocumentIndex.java +++ b/source/net/yacy/search/index/DocumentIndex.java @@ -42,7 +42,6 @@ import net.yacy.document.LibraryProvider; import net.yacy.document.TextParser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadata; -import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; import net.yacy.search.query.QueryParams; import net.yacy.search.query.RWIProcess; @@ -54,8 +53,7 @@ import net.yacy.search.ranking.ReferenceOrder; * * @author Michael Christen */ -public class DocumentIndex extends Segment -{ +public class DocumentIndex extends Segment { private static final RankingProfile textRankingDefault = new RankingProfile(Classification.ContentDomain.TEXT); //private Bitfield zeroConstraint = new Bitfield(4); @@ -102,12 +100,12 @@ public class DocumentIndex extends Segment @Override public void run() { DigestURI f; - URIMetadataRow[] resultRows; + URIMetadata[] resultRows; try { while ( (f = DocumentIndex.this.queue.take()) != poison ) { try { resultRows = add(f); - for ( final URIMetadataRow resultRow : resultRows ) { + for ( final URIMetadata resultRow : resultRows ) { if ( DocumentIndex.this.callback != null ) { if ( resultRow == null ) { DocumentIndex.this.callback.fail(f, "result is null"); @@ -139,7 +137,7 @@ public class DocumentIndex extends Segment this.queue.clear(); } - private URIMetadataRow[] add(final DigestURI url) throws IOException { + private 
URIMetadata[] add(final DigestURI url) throws IOException { if ( url == null ) { throw new IOException("file = null"); } @@ -162,7 +160,7 @@ public class DocumentIndex extends Segment throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage()); } //Document document = Document.mergeDocuments(url, null, documents); - final URIMetadataRow[] rows = new URIMetadataRow[documents.length]; + final URIMetadata[] rows = new URIMetadata[documents.length]; int c = 0; for ( final Document document : documents ) { if (document == null) continue; @@ -274,7 +272,7 @@ public class DocumentIndex extends Segment public interface CallbackListener { - public void commit(DigestURI f, URIMetadataRow resultRow); + public void commit(DigestURI f, URIMetadata resultRow); public void fail(DigestURI f, String failReason); } @@ -295,7 +293,7 @@ public class DocumentIndex extends Segment System.out.println("using index files at " + segmentPath.getAbsolutePath()); final CallbackListener callback = new CallbackListener() { @Override - public void commit(final DigestURI f, final URIMetadataRow resultRow) { + public void commit(final DigestURI f, final URIMetadata resultRow) { System.out.println("indexed: " + f.toString()); } diff --git a/source/net/yacy/search/index/MetadataRepository.java b/source/net/yacy/search/index/MetadataRepository.java index fd458f54c..b63d852d5 100644 --- a/source/net/yacy/search/index/MetadataRepository.java +++ b/source/net/yacy/search/index/MetadataRepository.java @@ -71,14 +71,16 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable private String tablename; private ArrayList statsDump; private final DoubleSolrConnector solr; + private final SolrConfiguration solrScheme; - public MetadataRepository(final File path) { + public MetadataRepository(final File path, final SolrConfiguration solrScheme) { this.location = path; this.tablename = null; this.urlIndexFile = null; this.exportthread = null; // will have a export thread 
assigned if exporter is running this.statsDump = null; this.solr = new DoubleSolrConnector(); + this.solrScheme = solrScheme; } public boolean connectedUrlDb() { @@ -97,6 +99,10 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable this.urlIndexFile = null; } + public SolrConfiguration getSolrScheme() { + return this.solrScheme; + } + public boolean connectedLocalSolr() { return this.solr.isConnected0(); } diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 51b7a3ee4..cef155213 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -99,7 +99,6 @@ public class Segment { private final Log log; private final File segmentPath; - private final SolrConfiguration solrScheme; protected final MetadataRepository urlMetadata; protected IndexCell termIndex; protected IndexCell urlCitationIndex; @@ -108,10 +107,9 @@ public class Segment { log.logInfo("Initializing Segment '" + segmentPath + "."); this.log = log; this.segmentPath = segmentPath; - this.solrScheme = solrScheme; // create LURL-db - this.urlMetadata = new MetadataRepository(segmentPath); + this.urlMetadata = new MetadataRepository(segmentPath, solrScheme); } public boolean connectedRWI() { @@ -203,7 +201,7 @@ public class Segment { } public SolrConfiguration getSolrScheme() { - return this.solrScheme; + return this.urlMetadata.getSolrScheme(); } public SolrConnector getRemoteSolr() { @@ -398,7 +396,7 @@ public class Segment { return language; } - public URIMetadataRow storeDocument( + public URIMetadata storeDocument( final DigestURI url, final DigestURI referrerURL, Date modDate, @@ -420,22 +418,10 @@ public class Segment { final String urlNormalform = url.toNormalform(true, false); final String language = votedLanguage(url, urlNormalform, document, condenser); // identification of the language - // STORE TO SOLR - boolean localSolr = this.connectedLocalSolr(); - boolean remoteSolr = 
this.connectedRemoteSolr(); - if (localSolr || remoteSolr) { - try { - SolrDoc solrDoc = this.solrScheme.yacy2solr(id, responseHeader, document); - this.getSolr().add(solrDoc); - } catch ( final IOException e ) { - Log.logWarning("SOLR", "failed to send " + urlNormalform + " to solr: " + e.getMessage()); - } - } - // STORE URL TO LOADED-URL-DB if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; // TODO: compare with modTime from responseHeader char docType = Response.docType(document.dc_format()); - final URIMetadataRow newEntry = new URIMetadataRow( + final URIMetadata metadata = new URIMetadataRow( url, // URL dc_title, // document description document.dc_creator(), // author @@ -460,9 +446,21 @@ public class Segment { document.getVideolinks().size(), // lvideo document.getApplinks().size() // lapp ); - this.urlMetadata.store(newEntry); + this.urlMetadata.store(metadata); final long storageEndTime = System.currentTimeMillis(); + // STORE TO SOLR + boolean localSolr = this.connectedLocalSolr(); + boolean remoteSolr = this.connectedRemoteSolr(); + if (localSolr || remoteSolr) { + try { + SolrDoc solrDoc = this.urlMetadata.getSolrScheme().yacy2solr(id, responseHeader, document, metadata); + this.getSolr().add(solrDoc); + } catch ( final IOException e ) { + Log.logWarning("SOLR", "failed to send " + urlNormalform + " to solr: " + e.getMessage()); + } + } + // STORE PAGE INDEX INTO WORD INDEX DB int outlinksSame = document.inboundLinks().size(); int outlinksOther = document.outboundLinks().size(); @@ -545,7 +543,7 @@ public class Segment { } // finished - return newEntry; + return metadata; } public void removeAllUrlReferences(final HandleSet urls, final LoaderDispatcher loader, final CacheStrategy cacheStrategy) { diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index ef7a3b093..decce547c 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ 
b/source/net/yacy/search/index/SolrConfiguration.java @@ -24,7 +24,6 @@ package net.yacy.search.index; - import java.io.File; import java.io.IOException; import java.io.Serializable; @@ -41,18 +40,24 @@ import java.util.Set; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.UTF8; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.services.federated.solr.SolrDoc; import net.yacy.cora.storage.ConfigurationSet; +import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.order.Bitfield; import org.apache.solr.common.SolrDocument; +import de.anomic.crawler.retrieval.Response; + public class SolrConfiguration extends ConfigurationSet implements Serializable { private static final long serialVersionUID=-499100932212840385L; @@ -88,46 +93,63 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable it.remove(); } } + // check consistency the other way: look if all enum constants in SolrField appear in the configuration file + for (SolrField field: SolrField.values()) { + if (this.get(field.name()) == null) { + Log.logWarning("SolrScheme", " solr scheme file " + configurationFile.getAbsolutePath() + " is missing declaration for '" + field.name() + "'"); + } + } this.lazy = lazy; } + private boolean contains(SolrField field) { + return this.contains(field.name()); + } + + protected void addSolr(final SolrDoc solrdoc, final SolrField key, final byte[] value) { + if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.length != 0))) solrdoc.addSolr(key, UTF8.String(value)); + } + protected void addSolr(final SolrDoc solrdoc, final 
SolrField key, final String value) { - if ((isEmpty() || contains(key.name())) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value); + if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value); } protected void addSolr(final SolrDoc solrdoc, final SolrField key, final String value, final float boost) { - if ((isEmpty() || contains(key.name())) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value, boost); + if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value, boost); } protected void addSolr(final SolrDoc solrdoc, final SolrField key, final Date value) { - if ((isEmpty() || contains(key.name())) && (!this.lazy || (value != null && value.getTime() > 0))) solrdoc.addSolr(key, value); + if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.getTime() > 0))) solrdoc.addSolr(key, value); } protected void addSolr(final SolrDoc solrdoc, final SolrField key, final String[] value) { - if ((isEmpty() || contains(key.name())) && (!this.lazy || (value != null && value.length > 0))) solrdoc.addSolr(key, value); + if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.length > 0))) solrdoc.addSolr(key, value); } protected void addSolr(final SolrDoc solrdoc, final SolrField key, final List value) { - if ((isEmpty() || contains(key.name())) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value); + if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value); } protected void addSolr(final SolrDoc solrdoc, final SolrField key, final int value) { - if ((isEmpty() || contains(key.name())) && (!this.lazy || value > 0)) solrdoc.addSolr(key, value); + if ((isEmpty() || contains(key)) && (!this.lazy || value != 0)) solrdoc.addSolr(key, value); + } + + protected void addSolr(final 
SolrDoc solrdoc, final SolrField key, final long value) { + if ((isEmpty() || contains(key)) && (!this.lazy || value != 0)) solrdoc.addSolr(key, value); } protected void addSolr(final SolrDoc solrdoc, final SolrField key, final float value) { - if (isEmpty() || contains(key.name())) solrdoc.addSolr(key, value); + if ((isEmpty() || contains(key)) && (!this.lazy || value != 0.0f)) solrdoc.addSolr(key, value); } protected void addSolr(final SolrDoc solrdoc, final SolrField key, final double value) { - if (isEmpty() || contains(key.name())) solrdoc.addSolr(key, value); + if ((isEmpty() || contains(key)) && (!this.lazy || value != 0.0d)) solrdoc.addSolr(key, value); } protected void addSolr(final SolrDoc solrdoc, final SolrField key, final boolean value) { - if (isEmpty() || contains(key.name())) solrdoc.addSolr(key, value); + if (isEmpty() || contains(key)) solrdoc.addSolr(key, value); } - - + /** * save configuration to file and update enum SolrFields * @throws IOException @@ -148,33 +170,103 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable } } catch (final IOException e) {} } + + public SolrDoc metadata2solr(final URIMetadata md) { + final SolrDoc solrdoc = new SolrDoc(); + final DigestURI digestURI = new DigestURI(md.url()); + boolean allAttr = this.isEmpty(); + + if (allAttr || contains(SolrField.failreason_t)) addSolr(solrdoc, SolrField.failreason_t, ""); + addSolr(solrdoc, SolrField.id, ASCII.String(md.hash())); + addSolr(solrdoc, SolrField.sku, digestURI.toNormalform(true, false)); + if (allAttr || contains(SolrField.ip_s)) { + final InetAddress address = digestURI.getInetAddress(); + if (address != null) addSolr(solrdoc, SolrField.ip_s, address.getHostAddress()); + } + if (digestURI.getHost() != null) addSolr(solrdoc, SolrField.host_s, digestURI.getHost()); + if (allAttr || contains(SolrField.title)) addSolr(solrdoc, SolrField.title, md.dc_title()); + if (allAttr || contains(SolrField.author)) addSolr(solrdoc, SolrField.author, 
md.dc_creator()); + if (allAttr || contains(SolrField.description)) addSolr(solrdoc, SolrField.description, md.snippet()); + if (allAttr || contains(SolrField.content_type)) addSolr(solrdoc, SolrField.content_type, Response.doctype2mime(digestURI.getFileExtension(), md.doctype())); + if (allAttr || contains(SolrField.last_modified)) addSolr(solrdoc, SolrField.last_modified, md.moddate()); + if (allAttr || contains(SolrField.text_t)) addSolr(solrdoc, SolrField.text_t, ""); // not delivered in metadata + if (allAttr || contains(SolrField.wordcount_i)) addSolr(solrdoc, SolrField.wordcount_i, md.wordCount()); + if (allAttr || contains(SolrField.keywords)) { + String keywords = md.dc_subject(); + Bitfield flags = md.flags(); + if (flags.get(Condenser.flag_cat_indexof)) { + if (keywords == null || keywords.isEmpty()) keywords = "indexof"; else { + if (keywords.indexOf(',') > 0) keywords += ", indexof"; else keywords += " indexof"; + } + } + addSolr(solrdoc, SolrField.keywords, keywords); + } + + // path elements of link + final String path = digestURI.getPath(); + if (path != null && (allAttr || contains(SolrField.paths_txt))) { + final String[] paths = path.split("/"); + if (paths.length > 0) addSolr(solrdoc, SolrField.paths_txt, paths); + } + + if (allAttr || contains(SolrField.imagescount_i)) addSolr(solrdoc, SolrField.imagescount_i, md.limage()); + if (allAttr || contains(SolrField.inboundlinkscount_i)) addSolr(solrdoc, SolrField.inboundlinkscount_i, md.llocal()); + if (allAttr || contains(SolrField.outboundlinkscount_i)) addSolr(solrdoc, SolrField.outboundlinkscount_i, md.lother()); + if (allAttr || contains(SolrField.charset_s)) addSolr(solrdoc, SolrField.charset_s, "UTF8"); + + // coordinates + if (md.lat() != 0.0f && md.lon() != 0.0f) { + if (allAttr || contains(SolrField.lon_coordinate)) addSolr(solrdoc, SolrField.lon_coordinate, md.lon()); + if (allAttr || contains(SolrField.lat_coordinate)) addSolr(solrdoc, SolrField.lat_coordinate, md.lat()); + } + if 
(allAttr || contains(SolrField.httpstatus_i)) addSolr(solrdoc, SolrField.httpstatus_i, 200); + + // fields that are in URIMetadataRow additional to yacy2solr basic requirement + if (allAttr || contains(SolrField.load_date_dt)) addSolr(solrdoc, SolrField.load_date_dt, md.loaddate()); + if (allAttr || contains(SolrField.fresh_date_dt)) addSolr(solrdoc, SolrField.fresh_date_dt, md.freshdate()); + if (allAttr || contains(SolrField.host_id_s)) addSolr(solrdoc, SolrField.host_id_s, md.hosthash()); + if ((allAttr || contains(SolrField.referrer_id_ss)) && md.referrerHash() != null) addSolr(solrdoc, SolrField.referrer_id_ss, new String[]{ASCII.String(md.referrerHash())}); + if (allAttr || contains(SolrField.md5_s)) addSolr(solrdoc, SolrField.md5_s, md.md5()); + if (allAttr || contains(SolrField.publisher_t)) addSolr(solrdoc, SolrField.publisher_t, md.dc_publisher()); + if ((allAttr || contains(SolrField.language_ss)) && md.language() != null) addSolr(solrdoc, SolrField.language_ss,new String[]{UTF8.String(md.language())}); + if (allAttr || contains(SolrField.ranking_i)) addSolr(solrdoc, SolrField.ranking_i, md.ranking()); + if (allAttr || contains(SolrField.size_i)) addSolr(solrdoc, SolrField.size_i, md.size()); + if (allAttr || contains(SolrField.audiolinkscount_i)) addSolr(solrdoc, SolrField.audiolinkscount_i, md.laudio()); + if (allAttr || contains(SolrField.videolinkscount_i)) addSolr(solrdoc, SolrField.videolinkscount_i, md.lvideo()); + if (allAttr || contains(SolrField.applinkscount_i)) addSolr(solrdoc, SolrField.applinkscount_i, md.lapp()); + + return solrdoc; + } - public SolrDoc yacy2solr(final String id, final ResponseHeader header, final Document yacydoc) { + public SolrDoc yacy2solr(final String id, final ResponseHeader header, final Document yacydoc, final URIMetadata metadata) { // we use the SolrCell design as index scheme final SolrDoc solrdoc = new SolrDoc(); final DigestURI digestURI = new DigestURI(yacydoc.dc_source()); - addSolr(solrdoc, 
SolrField.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before) + boolean allAttr = this.isEmpty(); addSolr(solrdoc, SolrField.id, id); addSolr(solrdoc, SolrField.sku, digestURI.toNormalform(true, false)); - final InetAddress address = digestURI.getInetAddress(); - if (address != null) addSolr(solrdoc, SolrField.ip_s, address.getHostAddress()); + if (allAttr || contains(SolrField.failreason_t)) addSolr(solrdoc, SolrField.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before) + if (allAttr || contains(SolrField.ip_s)) { + final InetAddress address = digestURI.getInetAddress(); + if (address != null) addSolr(solrdoc, SolrField.ip_s, address.getHostAddress()); + } if (digestURI.getHost() != null) addSolr(solrdoc, SolrField.host_s, digestURI.getHost()); - addSolr(solrdoc, SolrField.title, yacydoc.dc_title()); - addSolr(solrdoc, SolrField.author, yacydoc.dc_creator()); - addSolr(solrdoc, SolrField.description, yacydoc.dc_description()); - addSolr(solrdoc, SolrField.content_type, yacydoc.dc_format()); - addSolr(solrdoc, SolrField.last_modified, header == null ? new Date() : header.lastModified()); - addSolr(solrdoc, SolrField.keywords, yacydoc.dc_subject(' ')); + if (allAttr || contains(SolrField.title)) addSolr(solrdoc, SolrField.title, yacydoc.dc_title()); + if (allAttr || contains(SolrField.author)) addSolr(solrdoc, SolrField.author, yacydoc.dc_creator()); + if (allAttr || contains(SolrField.description)) addSolr(solrdoc, SolrField.description, yacydoc.dc_description()); + if (allAttr || contains(SolrField.content_type)) addSolr(solrdoc, SolrField.content_type, yacydoc.dc_format()); + if (allAttr || contains(SolrField.last_modified)) addSolr(solrdoc, SolrField.last_modified, header == null ? 
new Date() : header.lastModified()); + if (allAttr || contains(SolrField.keywords)) addSolr(solrdoc, SolrField.keywords, yacydoc.dc_subject(' ')); final String content = yacydoc.getTextString(); - addSolr(solrdoc, SolrField.text_t, content); - if (isEmpty() || contains(SolrField.wordcount_i.name())) { + if (allAttr || contains(SolrField.text_t)) addSolr(solrdoc, SolrField.text_t, content); + if (allAttr || contains(SolrField.wordcount_i)) { final int contentwc = content.split(" ").length; addSolr(solrdoc, SolrField.wordcount_i, contentwc); } // path elements of link final String path = digestURI.getPath(); - if (path != null && (isEmpty() || contains(SolrField.paths_txt.name()))) { + if (path != null && (allAttr || contains(SolrField.paths_txt))) { final String[] paths = path.split("/"); if (paths.length > 0) addSolr(solrdoc, SolrField.paths_txt, paths); } @@ -250,7 +342,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable addSolr(solrdoc, SolrField.boldcount_i, bold.length); if (bold.length > 0) { addSolr(solrdoc, SolrField.bold_txt, bold); - if (isEmpty() || contains(SolrField.bold_val.name())) { + if (allAttr || contains(SolrField.bold_val)) { addSolr(solrdoc, SolrField.bold_val, html.getBoldCount(bold)); } } @@ -258,7 +350,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable addSolr(solrdoc, SolrField.italiccount_i, italic.length); if (italic.length > 0) { addSolr(solrdoc, SolrField.italic_txt, italic); - if (isEmpty() || contains(SolrField.italic_val.name())) { + if (allAttr || contains(SolrField.italic_val)) { addSolr(solrdoc, SolrField.italic_val, html.getItalicCount(italic)); } } @@ -282,14 +374,14 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable imgstubs.add(uri.toString().substring(protocol.length() + 3)); imgalts.add(ie.alt()); } - addSolr(solrdoc, SolrField.imagescount_i, imgtags.size()); - if (isEmpty() || contains(SolrField.images_tag_txt.name())) 
addSolr(solrdoc, SolrField.images_tag_txt, imgtags); - if (isEmpty() || contains(SolrField.images_protocol_txt.name())) addSolr(solrdoc, SolrField.images_protocol_txt, protocolList2indexedList(imgprots)); - if (isEmpty() || contains(SolrField.images_urlstub_txt.name())) addSolr(solrdoc, SolrField.images_urlstub_txt, imgstubs); - if (isEmpty() || contains(SolrField.images_alt_txt.name())) addSolr(solrdoc, SolrField.images_alt_txt, imgalts); + if (allAttr || contains(SolrField.imagescount_i)) addSolr(solrdoc, SolrField.imagescount_i, imgtags.size()); + if (allAttr || contains(SolrField.images_tag_txt)) addSolr(solrdoc, SolrField.images_tag_txt, imgtags); + if (allAttr || contains(SolrField.images_protocol_txt)) addSolr(solrdoc, SolrField.images_protocol_txt, protocolList2indexedList(imgprots)); + if (allAttr || contains(SolrField.images_urlstub_txt)) addSolr(solrdoc, SolrField.images_urlstub_txt, imgstubs); + if (allAttr || contains(SolrField.images_alt_txt)) addSolr(solrdoc, SolrField.images_alt_txt, imgalts); // style sheets - if (isEmpty() || contains(SolrField.css_tag_txt.name())) { + if (allAttr || contains(SolrField.css_tag_txt)) { final Map csss = html.getCSS(); final String[] css_tag = new String[csss.size()]; final String[] css_url = new String[csss.size()]; @@ -310,7 +402,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable } // Scripts - if (isEmpty() || contains(SolrField.scripts_txt.name())) { + if (allAttr || contains(SolrField.scripts_txt)) { final Set scriptss = html.getScript(); final String[] scripts = new String[scriptss.size()]; c = 0; @@ -324,7 +416,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable } // Frames - if (isEmpty() || contains(SolrField.frames_txt.name())) { + if (allAttr || contains(SolrField.frames_txt)) { final Set framess = html.getFrames(); final String[] frames = new String[framess.size()]; c = 0; @@ -338,7 +430,7 @@ public class SolrConfiguration extends 
ConfigurationSet implements Serializable } // IFrames - if (isEmpty() || contains(SolrField.iframes_txt.name())) { + if (allAttr || contains(SolrField.iframes_txt)) { final Set iframess = html.getIFrames(); final String[] iframes = new String[iframess.size()]; c = 0; @@ -352,7 +444,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable } // canonical tag - if (isEmpty() || contains(SolrField.canonical_s.name())) { + if (allAttr || contains(SolrField.canonical_s)) { final MultiProtocolURI canonical = html.getCanonical(); if (canonical != null) { inboundLinks.remove(canonical); @@ -362,7 +454,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable } // meta refresh tag - if (isEmpty() || contains(SolrField.refresh_s.name())) { + if (allAttr || contains(SolrField.refresh_s)) { String refresh = html.getRefreshPath(); if (refresh != null && refresh.length() > 0) { MultiProtocolURI refreshURL; @@ -380,7 +472,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable } // flash embedded - if (isEmpty() || contains(SolrField.flash_b.name())) { + if (allAttr || contains(SolrField.flash_b)) { MultiProtocolURI[] flashURLs = html.getFlash(); for (MultiProtocolURI u: flashURLs) { // remove all flash links from ibound/outbound links @@ -392,7 +484,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable // generic evaluation pattern for (final String model: html.getEvaluationModelNames()) { - if (isEmpty() || contains("ext_" + model + "_txt")) { + if (allAttr || contains("ext_" + model + "_txt")) { final String[] scorenames = html.getEvaluationModelScoreNames(model); if (scorenames.length > 0) { addSolr(solrdoc, SolrField.valueOf("ext_" + model + "_txt"), scorenames); @@ -408,8 +500,8 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable // list all links final Map alllinks = yacydoc.getAnchors(); c = 0; - if (isEmpty() || 
contains(SolrField.inboundlinkscount_i.name())) addSolr(solrdoc, SolrField.inboundlinkscount_i, inboundLinks.size()); - if (isEmpty() || contains(SolrField.inboundlinksnofollowcount_i.name())) addSolr(solrdoc, SolrField.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount()); + if (allAttr || contains(SolrField.inboundlinkscount_i)) addSolr(solrdoc, SolrField.inboundlinkscount_i, inboundLinks.size()); + if (allAttr || contains(SolrField.inboundlinksnofollowcount_i)) addSolr(solrdoc, SolrField.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount()); final List inboundlinksTag = new ArrayList(inboundLinks.size()); final List inboundlinksURLProtocol = new ArrayList(inboundLinks.size()); final List inboundlinksURLStub = new ArrayList(inboundLinks.size()); @@ -437,17 +529,17 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable ((text.length() > 0) ? text : "") + ""); c++; } - if (isEmpty() || contains(SolrField.inboundlinks_tag_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_tag_txt, inboundlinksTag); - if (isEmpty() || contains(SolrField.inboundlinks_protocol_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_protocol_txt, protocolList2indexedList(inboundlinksURLProtocol)); - if (isEmpty() || contains(SolrField.inboundlinks_urlstub_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_urlstub_txt, inboundlinksURLStub); - if (isEmpty() || contains(SolrField.inboundlinks_name_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_name_txt, inboundlinksName); - if (isEmpty() || contains(SolrField.inboundlinks_rel_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_rel_txt, inboundlinksRel); - if (isEmpty() || contains(SolrField.inboundlinks_relflags_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_relflags_txt, relEval(inboundlinksRel)); - if (isEmpty() || contains(SolrField.inboundlinks_text_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_text_txt, inboundlinksText); + if (allAttr || 
contains(SolrField.inboundlinks_tag_txt)) addSolr(solrdoc, SolrField.inboundlinks_tag_txt, inboundlinksTag); + if (allAttr || contains(SolrField.inboundlinks_protocol_txt)) addSolr(solrdoc, SolrField.inboundlinks_protocol_txt, protocolList2indexedList(inboundlinksURLProtocol)); + if (allAttr || contains(SolrField.inboundlinks_urlstub_txt)) addSolr(solrdoc, SolrField.inboundlinks_urlstub_txt, inboundlinksURLStub); + if (allAttr || contains(SolrField.inboundlinks_name_txt)) addSolr(solrdoc, SolrField.inboundlinks_name_txt, inboundlinksName); + if (allAttr || contains(SolrField.inboundlinks_rel_txt)) addSolr(solrdoc, SolrField.inboundlinks_rel_txt, inboundlinksRel); + if (allAttr || contains(SolrField.inboundlinks_relflags_txt)) addSolr(solrdoc, SolrField.inboundlinks_relflags_txt, relEval(inboundlinksRel)); + if (allAttr || contains(SolrField.inboundlinks_text_txt)) addSolr(solrdoc, SolrField.inboundlinks_text_txt, inboundlinksText); c = 0; - if (isEmpty() || contains(SolrField.outboundlinkscount_i.name())) addSolr(solrdoc, SolrField.outboundlinkscount_i, ouboundLinks.size()); - if (isEmpty() || contains(SolrField.outboundlinksnofollowcount_i.name())) addSolr(solrdoc, SolrField.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount()); + if (allAttr || contains(SolrField.outboundlinkscount_i)) addSolr(solrdoc, SolrField.outboundlinkscount_i, ouboundLinks.size()); + if (allAttr || contains(SolrField.outboundlinksnofollowcount_i)) addSolr(solrdoc, SolrField.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount()); final List outboundlinksTag = new ArrayList(ouboundLinks.size()); final List outboundlinksURLProtocol = new ArrayList(ouboundLinks.size()); final List outboundlinksURLStub = new ArrayList(ouboundLinks.size()); @@ -475,24 +567,38 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable ((text.length() > 0) ? 
text : "") + ""); c++; } - if (isEmpty() || contains(SolrField.outboundlinks_tag_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_tag_txt, outboundlinksTag); - if (isEmpty() || contains(SolrField.outboundlinks_protocol_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_protocol_txt, protocolList2indexedList(outboundlinksURLProtocol)); - if (isEmpty() || contains(SolrField.outboundlinks_urlstub_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_urlstub_txt, outboundlinksURLStub); - if (isEmpty() || contains(SolrField.outboundlinks_name_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_name_txt, outboundlinksName); - if (isEmpty() || contains(SolrField.outboundlinks_rel_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_rel_txt, outboundlinksRel); - if (isEmpty() || contains(SolrField.outboundlinks_relflags_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_relflags_txt, relEval(inboundlinksRel)); - if (isEmpty() || contains(SolrField.outboundlinks_text_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_text_txt, outboundlinksText); + if (allAttr || contains(SolrField.outboundlinks_tag_txt)) addSolr(solrdoc, SolrField.outboundlinks_tag_txt, outboundlinksTag); + if (allAttr || contains(SolrField.outboundlinks_protocol_txt)) addSolr(solrdoc, SolrField.outboundlinks_protocol_txt, protocolList2indexedList(outboundlinksURLProtocol)); + if (allAttr || contains(SolrField.outboundlinks_urlstub_txt)) addSolr(solrdoc, SolrField.outboundlinks_urlstub_txt, outboundlinksURLStub); + if (allAttr || contains(SolrField.outboundlinks_name_txt)) addSolr(solrdoc, SolrField.outboundlinks_name_txt, outboundlinksName); + if (allAttr || contains(SolrField.outboundlinks_rel_txt)) addSolr(solrdoc, SolrField.outboundlinks_rel_txt, outboundlinksRel); + if (allAttr || contains(SolrField.outboundlinks_relflags_txt)) addSolr(solrdoc, SolrField.outboundlinks_relflags_txt, relEval(outboundlinksRel)); // rel flags must be evaluated from the outbound rel list, not the inbound one + if (allAttr || contains(SolrField.outboundlinks_text_txt)) addSolr(solrdoc, 
SolrField.outboundlinks_text_txt, outboundlinksText); // charset - addSolr(solrdoc, SolrField.charset_s, yacydoc.getCharset()); + if (allAttr || contains(SolrField.charset_s)) addSolr(solrdoc, SolrField.charset_s, yacydoc.getCharset()); // coordinates if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) { - addSolr(solrdoc, SolrField.lon_coordinate, yacydoc.lon()); - addSolr(solrdoc, SolrField.lat_coordinate, yacydoc.lat()); + if (allAttr || contains(SolrField.lon_coordinate)) addSolr(solrdoc, SolrField.lon_coordinate, yacydoc.lon()); + if (allAttr || contains(SolrField.lat_coordinate)) addSolr(solrdoc, SolrField.lat_coordinate, yacydoc.lat()); } - addSolr(solrdoc, SolrField.httpstatus_i, header == null ? 200 : header.getStatusCode()); - + if (allAttr || contains(SolrField.httpstatus_i)) addSolr(solrdoc, SolrField.httpstatus_i, header == null ? 200 : header.getStatusCode()); + + // fields that are additionally in URIMetadataRow + if (allAttr || contains(SolrField.load_date_dt)) addSolr(solrdoc, SolrField.load_date_dt, metadata.loaddate()); + if (allAttr || contains(SolrField.fresh_date_dt)) addSolr(solrdoc, SolrField.fresh_date_dt, metadata.freshdate()); + if (allAttr || contains(SolrField.host_id_s)) addSolr(solrdoc, SolrField.host_id_s, metadata.hosthash()); + if ((allAttr || contains(SolrField.referrer_id_ss)) && metadata.referrerHash() != null) addSolr(solrdoc, SolrField.referrer_id_ss, new String[]{ASCII.String(metadata.referrerHash())}); + //if (allAttr || contains(SolrField.md5_s)) addSolr(solrdoc, SolrField.md5_s, new byte[0]); + if (allAttr || contains(SolrField.publisher_t)) addSolr(solrdoc, SolrField.publisher_t, yacydoc.dc_publisher()); + if ((allAttr || contains(SolrField.language_ss)) && metadata.language() != null) addSolr(solrdoc, SolrField.language_ss,new String[]{UTF8.String(metadata.language())}); + if (allAttr || contains(SolrField.ranking_i)) addSolr(solrdoc, SolrField.ranking_i, metadata.ranking()); + if (allAttr || contains(SolrField.size_i)) 
addSolr(solrdoc, SolrField.size_i, metadata.size()); + if (allAttr || contains(SolrField.audiolinkscount_i)) addSolr(solrdoc, SolrField.audiolinkscount_i, yacydoc.getAudiolinks().size()); + if (allAttr || contains(SolrField.videolinkscount_i)) addSolr(solrdoc, SolrField.videolinkscount_i, yacydoc.getVideolinks().size()); + if (allAttr || contains(SolrField.applinkscount_i)) addSolr(solrdoc, SolrField.applinkscount_i, yacydoc.getApplinks().size()); + return solrdoc; } diff --git a/source/net/yacy/search/index/SolrField.java b/source/net/yacy/search/index/SolrField.java index 3fcbfd721..d3a1bdfc3 100644 --- a/source/net/yacy/search/index/SolrField.java +++ b/source/net/yacy/search/index/SolrField.java @@ -120,7 +120,21 @@ public enum SolrField implements net.yacy.cora.services.federated.solr.SolrField ext_tracker_val(SolrType.integer, true, true, true, "number of attribute counts in ext_tracker_txt"), ext_title_txt(SolrType.text_general, true, true, true, "names matching title expressions"), ext_title_val(SolrType.integer, true, true, true, "number of matching title expressions"), - failreason_t(SolrType.text_general, true, true, "fail reason if a page was not loaded. if the page was loaded then this field is empty"); + failreason_t(SolrType.text_general, true, true, "fail reason if a page was not loaded. 
if the page was loaded then this field is empty"), + + // values used additionally by URIMetadataRow + load_date_dt(SolrType.date, true, true, "time when resource was loaded"), + fresh_date_dt(SolrType.date, true, true, "date until resource shall be considered as fresh"), + host_id_s(SolrType.string, true, true, "id of the host, a 6-byte hash that is part of the document id"),// String hosthash(); + referrer_id_ss(SolrType.string, true, true, true, "ids of referrer to this document"),// byte[] referrerHash(); + md5_s(SolrType.string, true, true, "the md5 of the raw source"),// String md5(); + publisher_t(SolrType.text_general, true, true, "the name of the publisher of the document"),// String dc_publisher(); + language_ss(SolrType.string, true, true, "the language used in the document; starts with primary language"),// byte[] language(); + ranking_i(SolrType.integer, true, true, "an external ranking value"),// long ranking(); + size_i(SolrType.integer, true, true, "the size of the raw source"),// int size(); + audiolinkscount_i(SolrType.integer, true, true, "number of links to audio resources"),// int laudio(); + videolinkscount_i(SolrType.integer, true, true, "number of links to video resources"),// int lvideo(); + applinkscount_i(SolrType.integer, true, true, "number of links to application resources");// int lapp(); private String solrFieldName = null; // solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() ) private final SolrType type;