From ccc3760a479ec89362c2eb347ccf172f5eae9dfc Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 18 Oct 2012 14:29:11 +0200 Subject: [PATCH] Refactoring and redesign of data architecture to make URIMetadataRow superfluous. The target is to make a solr document the core of YaCy documents, which would allow many conversions to be removed. On the way to this target the equivalence of URIMetadataRow and URIMetadataNode had to be removed to expose the usage of the old URIMetadataRow data structure. This refactoring already removes unnecessary conversions and should make memory usage during indexing lower. --- htroot/IndexControlRWIs_p.java | 11 +- htroot/IndexControlURLs_p.java | 9 +- htroot/api/yacydoc.java | 4 +- htroot/yacy/crawlReceipt.java | 5 +- htroot/yacy/transferURL.java | 3 +- .../yacy/cora/document/MultiProtocolURI.java | 16 +- source/net/yacy/crawler/data/ResultURLs.java | 37 +-- source/net/yacy/document/Document.java | 2 +- .../yacy/kelondro/data/meta/DigestURI.java | 4 + .../yacy/kelondro/data/meta/URIMetadata.java | 132 --------- .../kelondro/data/meta/URIMetadataNode.java | 186 +++++------- .../kelondro/data/meta/URIMetadataRow.java | 266 ++++++------------ .../kelondro/data/word/WordReferenceVars.java | 4 +- source/net/yacy/peers/Protocol.java | 20 +- source/net/yacy/peers/Transmission.java | 7 +- source/net/yacy/repository/Blacklist.java | 16 +- source/net/yacy/search/Switchboard.java | 20 +- .../net/yacy/search/index/DocumentIndex.java | 19 +- source/net/yacy/search/index/Fulltext.java | 23 +- source/net/yacy/search/index/Segment.java | 55 +--- .../yacy/search/index/SolrConfiguration.java | 147 +++++----- .../net/yacy/search/query/SnippetProcess.java | 1 - .../net/yacy/search/snippet/TextSnippet.java | 1 - 23 files changed, 327 insertions(+), 661 deletions(-) delete mode 100644 source/net/yacy/kelondro/data/meta/URIMetadata.java diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index 
c7e1f6af2..dd584ec11 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -45,7 +45,6 @@ import net.yacy.cora.util.SpaceExceededException; import net.yacy.data.ListManager; import net.yacy.document.Condenser; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; @@ -270,8 +269,8 @@ public class IndexControlRWIs_p { index = segment.termIndex().get(keyhash, null); // built urlCache final Iterator urlIter = index.entries(); - final TreeMap knownURLs = - new TreeMap(Base64Order.enhancedCoder); + final TreeMap knownURLs = + new TreeMap(Base64Order.enhancedCoder); final HandleSet unknownURLEntries = new RowHandleSet( WordReferenceRow.urlEntryRow.primaryKeyLength, @@ -290,7 +289,7 @@ public class IndexControlRWIs_p { } urlIter.remove(); } else { - knownURLs.put(iEntry.urlhash(), lurl.toRow()); + knownURLs.put(iEntry.urlhash(), lurl); } } @@ -376,7 +375,7 @@ public class IndexControlRWIs_p { } catch ( final SpaceExceededException e ) { Log.logException(e); } - final URIMetadata e = segment.fulltext().getMetadata(b); + final URIMetadataNode e = segment.fulltext().getMetadata(b); segment.fulltext().remove(b); if ( e != null ) { url = e.url(); @@ -411,7 +410,7 @@ public class IndexControlRWIs_p { } catch ( final SpaceExceededException e ) { Log.logException(e); } - final URIMetadata e = segment.fulltext().getMetadata(b); + final URIMetadataNode e = segment.fulltext().getMetadata(b); segment.fulltext().remove(b); if ( e != null ) { url = e.url(); diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index f1f2c9fc8..1384d81c8 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -41,7 +41,6 @@ import net.yacy.crawler.data.Cache; import net.yacy.crawler.data.ResultURLs; import net.yacy.data.WorkTables; import 
net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.logging.Log; @@ -206,7 +205,7 @@ public class IndexControlURLs_p { final DigestURI url = new DigestURI(urlstring); urlhash = ASCII.String(url.hash()); prop.put("urlhash", urlhash); - final URIMetadata entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash)); + final URIMetadataNode entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash)); if (entry == null) { prop.putHTML("result", "No Entry for URL " + url.toNormalform(true)); prop.putHTML("urlstring", urlstring); @@ -222,7 +221,7 @@ public class IndexControlURLs_p { } if (post.containsKey("urlhashsearch")) { - final URIMetadata entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash)); + final URIMetadataNode entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash)); if (entry == null) { prop.putHTML("result", "No Entry for URL hash " + urlhash); } else { @@ -236,7 +235,7 @@ public class IndexControlURLs_p { if (post.containsKey("urlhashsimilar")) { final Iterator entryIt = new RotateIterator(segment.fulltext().entries(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), (int) segment.RWICount()); final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:
"); - URIMetadata entry; + URIMetadataNode entry; int i = 0, rows = 0, cols = 0; prop.put("urlhashsimilar", "1"); while (entryIt.hasNext() && i < 256) { @@ -341,7 +340,7 @@ public class IndexControlURLs_p { return prop; } - private static serverObjects genUrlProfile(final Segment segment, final URIMetadata entry, final String urlhash) { + private static serverObjects genUrlProfile(final Segment segment, final URIMetadataNode entry, final String urlhash) { final serverObjects prop = new serverObjects(); if (entry == null) { prop.put("genUrlProfile", "1"); diff --git a/htroot/api/yacydoc.java b/htroot/api/yacydoc.java index 54777795e..76281b190 100644 --- a/htroot/api/yacydoc.java +++ b/htroot/api/yacydoc.java @@ -36,7 +36,6 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader.FileType; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.logging.Log; @@ -48,7 +47,6 @@ import net.yacy.server.serverSwitch; import com.hp.hpl.jena.rdf.model.Model; import com.hp.hpl.jena.rdf.model.RDFNode; - public class yacydoc { public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { @@ -106,7 +104,7 @@ public class yacydoc { if (entry.url() == null) { return prop; } - final URIMetadata le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.fulltext().getMetadata(entry.referrerHash()); + final URIMetadataNode le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? 
null : segment.fulltext().getMetadata(entry.referrerHash()); prop.putXML("dc_title", entry.dc_title()); prop.putXML("dc_creator", entry.dc_creator()); diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index 662bb2941..fdf952781 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -34,7 +34,6 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.crawler.data.ResultURLs; import net.yacy.crawler.data.ResultURLs.EventOrigin; import net.yacy.crawler.data.ZURL.FailCategory; -import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; import net.yacy.peers.Protocol; @@ -116,7 +115,7 @@ public final class crawlReceipt { } // generating a new loaded URL entry - final URIMetadata entry = URIMetadataRow.importEntry(propStr); + final URIMetadataRow entry = URIMetadataRow.importEntry(propStr); if (entry == null) { if (log.isWarning()) log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) from peer " + iam + "\n\tURL properties: "+ propStr); prop.put("delay", "3600"); @@ -148,7 +147,7 @@ public final class crawlReceipt { if ("fill".equals(result)) try { // put new entry into database sb.index.fulltext().putMetadata(entry); - ResultURLs.stack(entry, youare.getBytes(), iam.getBytes(), EventOrigin.REMOTE_RECEIPTS); + ResultURLs.stack(ASCII.String(entry.url().hash()), entry.url().getHost(), youare.getBytes(), iam.getBytes(), EventOrigin.REMOTE_RECEIPTS); sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work has been done if (log.isInfo()) log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + ASCII.String(entry.hash()) + ":" + entry.url().toNormalform(false)); diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index 227d139c4..acfa99138 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -30,6 +30,7 @@ import java.io.IOException; import 
java.text.ParseException; import net.yacy.cora.date.GenericFormatter; +import net.yacy.cora.document.ASCII; import net.yacy.cora.document.RSSMessage; import net.yacy.cora.protocol.RequestHeader; import net.yacy.crawler.data.ResultURLs; @@ -149,7 +150,7 @@ public final class transferURL { if (Network.log.isFine()) Network.log.logFine("Accepting URL " + i + "/" + urlc + " from peer " + otherPeerName + ": " + lEntry.url().toNormalform(true)); try { sb.index.fulltext().putMetadata(lEntry); - ResultURLs.stack(lEntry, iam.getBytes(), iam.getBytes(), EventOrigin.DHT_TRANSFER); + ResultURLs.stack(ASCII.String(lEntry.url().hash()), lEntry.url().getHost(), iam.getBytes(), iam.getBytes(), EventOrigin.DHT_TRANSFER); if (Network.log.isFine()) Network.log.logFine("transferURL: received URL '" + lEntry.url().toNormalform(false) + "' from peer " + otherPeerName); received++; } catch (final IOException e) { diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index ebfab47b9..3f422f927 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -1976,16 +1976,18 @@ public class MultiProtocolURI implements Serializable, Comparable resultStack = getStack(stackType); if (resultStack != null) { - resultStack.put(ASCII.String(urlEntry.hash()), new InitExecEntry(initiatorHash, executorHash)); + resultStack.put(urlhash, new InitExecEntry(initiatorHash, executorHash)); } } catch (final Exception ex) { System.out.println("INTERNAL ERROR in newEntry/2: " + ex.toString()); @@ -116,7 +108,7 @@ public final class ResultURLs { try { final ScoreMap domains = getDomains(stackType); if (domains != null) { - domains.inc(urlEntry.url().getHost()); + domains.inc(hostname); } } catch (final Exception ex) { System.out.println("INTERNAL ERROR in newEntry/3: " + ex.toString()); @@ -216,23 +208,4 @@ public final class ResultURLs { return true; } - /** - * test and benchmark - 
* @param args - */ - public static void main(final String[] args) { - try { - final DigestURI url = new DigestURI("http", "www.yacy.net", 80, "/"); - final URIMetadata urlRef = new URIMetadataRow(url, "YaCy Homepage", "", "", "", 0.0d, 0.0d, new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), UTF8.getBytes("de"), 0, 0, 0, 0, 0, 0, new String[0]); - final EventOrigin stackNo = EventOrigin.LOCAL_CRAWLING; - System.out.println("valid test:\n======="); - // add - stack(urlRef, urlRef.hash(), url.hash(), stackNo); - // size - System.out.println("size of stack:\t"+ getStackSize(stackNo)); - } catch (final MalformedURLException e) { - Log.logException(e); - } - } - } diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 788adbb44..a64e3915f 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -293,7 +293,7 @@ dc_rights return this.source.toNormalform(true); } - public MultiProtocolURI dc_source() { + public DigestURI dc_source() { return this.source; } diff --git a/source/net/yacy/kelondro/data/meta/DigestURI.java b/source/net/yacy/kelondro/data/meta/DigestURI.java index fb65695f6..332b2948a 100644 --- a/source/net/yacy/kelondro/data/meta/DigestURI.java +++ b/source/net/yacy/kelondro/data/meta/DigestURI.java @@ -197,6 +197,10 @@ public class DigestURI extends MultiProtocolURI implements Serializable { return this.hash; } + public String hosthash() { + return ASCII.String(this.hash(), 6, 6); + } + /** * calculated YaCy-Hash of this URI * diff --git a/source/net/yacy/kelondro/data/meta/URIMetadata.java b/source/net/yacy/kelondro/data/meta/URIMetadata.java deleted file mode 100644 index 17f6c6e6a..000000000 --- a/source/net/yacy/kelondro/data/meta/URIMetadata.java +++ /dev/null @@ -1,132 +0,0 @@ -/** - * URIMetadata - * Copyright 2012 by Michael Peter Christen - * First released 3.4.2012 at http://yacy.net - * - * This file is part of YaCy Content 
Integration - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program in the file lgpl21.txt - * If not, see . - */ - -package net.yacy.kelondro.data.meta; - -import java.util.Date; -import java.util.regex.Pattern; - -import net.yacy.crawler.retrieval.Request; -import net.yacy.kelondro.data.word.WordReference; -import net.yacy.kelondro.util.Bitfield; - - -public interface URIMetadata { - - /** - * The hash of a URIReference is a unique key for the stored URL. - * It is in fact equal to url().hash() - * @return the hash of the stored url - */ - public byte[] hash(); - - /** - * the second half of a uri hash is the host hash - * @return - */ - public String hosthash(); - - /** - * The modification date of the URIReference is given if - * the record was created first and is defined with the - * creation date. If the record is modified later, the date shall change. 
- * @return the modification date of this record - */ - public Date moddate(); - - /** - * The DigestURI is the payload of the URIReference - * @return the url as DigestURI with assigned URL hash according to the record hash - */ - public DigestURI url(); - - /** - * check if the url matches agains a given matcher - * @param matcher - * @return true if the url() matches - */ - public boolean matches(final Pattern matcher); - - /** - * produce a visible representation of the record - * @return a string for the url() - */ - @Override - public String toString(); - - public String dc_title(); - - public String dc_creator(); - - public String dc_publisher(); - - public String dc_subject(); - - public double lat(); - - public double lon(); - - public long ranking(); - - public Date loaddate(); - - public Date freshdate(); - - public String md5(); - - public char doctype(); - - public byte[] language(); - - public int size(); - - public Bitfield flags(); - - public int wordCount(); - - public int llocal(); - - public int lother(); - - public int limage(); - - public int laudio(); - - public int lvideo(); - - public int lapp(); - - public String snippet(); - - public String[] collections(); - - public WordReference word(); - - public boolean isOlder(final URIMetadata other); - - public String toString(final String snippet); - - public byte[] referrerHash(); - - public Request toBalancerEntry(final String initiatorHash); - -} diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 6bc410f80..0c6ae6a9f 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -35,7 +35,6 @@ import net.yacy.cora.federate.solr.SolrType; import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.order.Base64Order; -import net.yacy.crawler.retrieval.Request; import 
net.yacy.crawler.retrieval.Response; import net.yacy.document.Condenser; import net.yacy.kelondro.data.word.WordReference; @@ -43,7 +42,9 @@ import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.Bitfield; import net.yacy.utils.crypt; +import org.apache.solr.client.solrj.util.ClientUtils; import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrInputDocument; /** @@ -51,7 +52,7 @@ import org.apache.solr.common.SolrDocument; * The purpose of this object is the migration from the old metadata structure to solr document. * Future implementations should try to replace URIMetadata objects completely by SolrDocument objects */ -public class URIMetadataNode implements URIMetadata { +public class URIMetadataNode { private byte[] hash = null; private String urlRaw = null, keywords = null; @@ -64,6 +65,10 @@ public class URIMetadataNode implements URIMetadata { private String snippet = null; private WordReference word = null; // this is only used if the url is transported via remote search requests + public URIMetadataNode(final SolrInputDocument doc) { + this(ClientUtils.toSolrDocument(doc)); + } + public URIMetadataNode(final SolrDocument doc) { this.doc = doc; this.snippet = ""; @@ -79,123 +84,58 @@ public class URIMetadataNode implements URIMetadata { } } - public URIMetadataNode(final SolrDocument doc, final WordReference searchedWord, final long ranking) { - this(doc); + public URIMetadataNode(final SolrInputDocument doc, final WordReference searchedWord, final long ranking) { + this(ClientUtils.toSolrDocument(doc)); this.word = searchedWord; this.ranking = ranking; } - public URIMetadataRow toRow() { - return URIMetadataRow.importEntry(this.toString()); + public URIMetadataNode(final SolrDocument doc, final WordReference searchedWord, final long ranking) { + this(doc); + this.word = searchedWord; + this.ranking = ranking; } public SolrDocument getDocument() { return this.doc; } - private int getInt(YaCySchema field) { - assert 
!field.isMultiValued(); - assert field.getType() == SolrType.integer; - Integer x = (Integer) this.doc.getFieldValue(field.name()); - if (x == null) return 0; - return x.intValue(); - } - - private Date getDate(YaCySchema field) { - assert !field.isMultiValued(); - assert field.getType() == SolrType.date; - Date x = (Date) this.doc.getFieldValue(field.name()); - if (x == null) return new Date(0); - Date now = new Date(); - return x.after(now) ? now : x; - } - - private String getString(YaCySchema field) { - assert !field.isMultiValued(); - assert field.getType() == SolrType.string || field.getType() == SolrType.text_general || field.getType() == SolrType.text_en_splitting_tight; - Object x = this.doc.getFieldValue(field.name()); - if (x == null) return ""; - if (x instanceof ArrayList) { - @SuppressWarnings("unchecked") - ArrayList xa = (ArrayList) x; - return xa.size() == 0 ? "" : xa.get(0); - } - return (String) x; - } - - @SuppressWarnings("unchecked") - private ArrayList getStringList(YaCySchema field) { - assert field.isMultiValued(); - assert field.getType() == SolrType.string || field.getType() == SolrType.text_general; - Object r = this.doc.getFieldValue(field.name()); - if (r == null) return new ArrayList(0); - if (r instanceof ArrayList) { - return (ArrayList) r; - } - ArrayList a = new ArrayList(1); - a.add((String) r); - return a; - } - - @SuppressWarnings("unchecked") - private ArrayList getIntList(YaCySchema field) { - assert field.isMultiValued(); - assert field.getType() == SolrType.integer; - Object r = this.doc.getFieldValue(field.name()); - if (r == null) return new ArrayList(0); - if (r instanceof ArrayList) { - return (ArrayList) r; - } - ArrayList a = new ArrayList(1); - a.add((Integer) r); - return a; - } - - @Override public byte[] hash() { return this.hash; } - @Override public String hosthash() { String hosthash = (String) this.doc.getFieldValue(YaCySchema.host_id_s.name()); if (hosthash == null) hosthash = ASCII.String(this.hash, 6, 6); 
return hosthash; } - @Override public Date moddate() { return getDate(YaCySchema.last_modified); } - @Override public DigestURI url() { return this.url; } - @Override public boolean matches(Pattern matcher) { return matcher.matcher(this.urlRaw.toLowerCase()).matches(); } - @Override public String dc_title() { ArrayList a = getStringList(YaCySchema.title); if (a == null || a.size() == 0) return ""; return a.get(0); } - @Override public String dc_creator() { return getString(YaCySchema.author); } - @Override public String dc_publisher() { return getString(YaCySchema.publisher_t); } - @Override public String dc_subject() { if (this.keywords == null) { this.keywords = getString(YaCySchema.keywords); @@ -203,7 +143,6 @@ public class URIMetadataNode implements URIMetadata { return this.keywords; } - @Override public double lat() { if (this.lat == Double.NaN) { this.lon = 0.0d; @@ -220,60 +159,49 @@ public class URIMetadataNode implements URIMetadata { return this.lat; } - @Override public double lon() { if (this.lon == Double.NaN) lat(); return this.lon; } - @Override public long ranking() { return this.ranking; } - @Override public Date loaddate() { return getDate(YaCySchema.load_date_dt); } - @Override public Date freshdate() { return getDate(YaCySchema.fresh_date_dt); } - @Override public String md5() { return getString(YaCySchema.md5_s); } - @Override public char doctype() { ArrayList a = getStringList(YaCySchema.content_type); if (a == null || a.size() == 0) return Response.docType(url()); return Response.docType(a.get(0)); } - @Override public byte[] language() { String language = getString(YaCySchema.language_s); if (language == null || language.length() == 0) return ASCII.getBytes("en"); return UTF8.getBytes(language); } - - @Override public byte[] referrerHash() { ArrayList referrer = getStringList(YaCySchema.referrer_id_txt); if (referrer == null || referrer.size() == 0) return null; return ASCII.getBytes(referrer.get(0)); } - @Override public int size() { 
return getInt(YaCySchema.size_i); } - @Override public Bitfield flags() { if (flags == null) { this.flags = new Bitfield(); @@ -287,22 +215,18 @@ public class URIMetadataNode implements URIMetadata { return this.flags; } - @Override public int wordCount() { return getInt(YaCySchema.wordcount_i); } - @Override public int llocal() { return getInt(YaCySchema.inboundlinkscount_i); } - @Override public int lother() { return getInt(YaCySchema.outboundlinkscount_i); } - @Override public int limage() { if (this.imagec == -1) { this.imagec = getInt(YaCySchema.imagescount_i); @@ -310,7 +234,6 @@ public class URIMetadataNode implements URIMetadata { return this.imagec; } - @Override public int laudio() { if (this.audioc == -1) { this.audioc = getInt(YaCySchema.audiolinkscount_i); @@ -318,7 +241,6 @@ public class URIMetadataNode implements URIMetadata { return this.audioc; } - @Override public int lvideo() { if (this.videoc == -1) { this.videoc = getInt(YaCySchema.videolinkscount_i); @@ -326,7 +248,6 @@ public class URIMetadataNode implements URIMetadata { return this.videoc; } - @Override public int lapp() { if (this.appc == -1) { this.appc = getInt(YaCySchema.videolinkscount_i); @@ -348,24 +269,20 @@ public class URIMetadataNode implements URIMetadata { return getInt(YaCySchema.url_chars_i); } - @Override public String snippet() { return this.snippet; } - @Override public String[] collections() { ArrayList a = getStringList(YaCySchema.collection_sxt); return a.toArray(new String[a.size()]); } - @Override public WordReference word() { return this.word; } - @Override - public boolean isOlder(URIMetadata other) { + public boolean isOlder(URIMetadataRow other) { if (other == null) return false; final Date tmoddate = moddate(); final Date omoddate = other.moddate(); @@ -379,7 +296,7 @@ public class URIMetadataNode implements URIMetadata { return false; } - protected static StringBuilder corePropList(URIMetadata md) { + private static StringBuilder corePropList(URIMetadataNode md) 
{ // generate a parseable string; this is a simple property-list final StringBuilder s = new StringBuilder(300); @@ -427,7 +344,6 @@ public class URIMetadataNode implements URIMetadata { * the toString format must be completely identical to URIMetadataRow because that is used * to transport the data over p2p connections. */ - @Override public String toString(String snippet) { // add information needed for remote transport final StringBuilder core = corePropList(this); @@ -457,19 +373,65 @@ public class URIMetadataNode implements URIMetadata { core.append('}'); return core.toString(); } + + private int getInt(YaCySchema field) { + assert !field.isMultiValued(); + assert field.getType() == SolrType.integer; + Object x = this.doc.getFieldValue(field.name()); + if (x == null) return 0; + if (x instanceof Integer) return ((Integer) x).intValue(); + if (x instanceof Long) return ((Long) x).intValue(); + return 0; + } - @Override - public Request toBalancerEntry(final String initiatorHash) { - return new Request( - ASCII.getBytes(initiatorHash), - url(), - referrerHash(), - dc_title(), - moddate(), - null, - 0, - 0, - 0, - 0); + private Date getDate(YaCySchema field) { + assert !field.isMultiValued(); + assert field.getType() == SolrType.date; + Date x = (Date) this.doc.getFieldValue(field.name()); + if (x == null) return new Date(0); + Date now = new Date(); + return x.after(now) ? now : x; } + + private String getString(YaCySchema field) { + assert !field.isMultiValued(); + assert field.getType() == SolrType.string || field.getType() == SolrType.text_general || field.getType() == SolrType.text_en_splitting_tight; + Object x = this.doc.getFieldValue(field.name()); + if (x == null) return ""; + if (x instanceof ArrayList) { + @SuppressWarnings("unchecked") + ArrayList xa = (ArrayList) x; + return xa.size() == 0 ? 
"" : xa.get(0); + } + return (String) x; + } + + @SuppressWarnings("unchecked") + private ArrayList getStringList(YaCySchema field) { + assert field.isMultiValued(); + assert field.getType() == SolrType.string || field.getType() == SolrType.text_general; + Object r = this.doc.getFieldValue(field.name()); + if (r == null) return new ArrayList(0); + if (r instanceof ArrayList) { + return (ArrayList) r; + } + ArrayList a = new ArrayList(1); + a.add((String) r); + return a; + } + + @SuppressWarnings("unchecked") + private ArrayList getIntList(YaCySchema field) { + assert field.isMultiValued(); + assert field.getType() == SolrType.integer; + Object r = this.doc.getFieldValue(field.name()); + if (r == null) return new ArrayList(0); + if (r instanceof ArrayList) { + return (ArrayList) r; + } + ArrayList a = new ArrayList(1); + a.add((Integer) r); + return a; + } + } \ No newline at end of file diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java index d191b2dff..c771ded1b 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java @@ -54,7 +54,7 @@ import net.yacy.kelondro.util.kelondroException; import net.yacy.search.query.QueryParams; import net.yacy.utils.crypt; -public class URIMetadataRow implements URIMetadata { +public class URIMetadataRow { // this object stores attributes for URL entries @@ -104,120 +104,14 @@ public class URIMetadataRow implements URIMetadata { private final Row.Entry entry; private final String snippet; - private final String[] collections; private WordReference word; // this is only used if the url is transported via remote search requests - private final long ranking; // during generation of a search result this value is set private Components comp; - public URIMetadataRow() { - // create a dummy entry, good to produce poison objects - this.entry = rowdef.newEntry(); - this.snippet = ""; - 
this.collections = new String[0]; - this.word = null; - this.ranking = 0; - this.comp = null; - } - - public URIMetadataRow( - final DigestURI url, - final String dc_title, - final String dc_creator, - final String dc_subject, - final String dc_publisher, - final double lon, final double lat, // decimal degrees as in WGS84; if unknown both values may be 0.0d; - final Date mod, - final Date load, - final Date fresh, - final String referrer, - final byte[] md5, - final long size, - final int wc, - final char dt, - final Bitfield flags, - final byte[] lang, - final int llocal, - final int lother, - final int laudio, - final int limage, - final int lvideo, - final int lapp, - final String[] collections) { - // create new entry - this.entry = rowdef.newEntry(); - this.entry.setCol(col_hash, url.hash()); - this.entry.setCol(col_comp, encodeComp(url, dc_title, dc_creator, dc_subject, dc_publisher, lat, lon)); - encodeDate(col_mod, mod); - encodeDate(col_load, load); - encodeDate(col_fresh, fresh); - this.entry.setCol(col_referrer, (referrer == null) ? 
null : UTF8.getBytes(referrer)); - this.entry.setCol(col_md5, md5); - this.entry.setCol(col_size, size); - this.entry.setCol(col_wc, wc); - this.entry.setCol(col_dt, new byte[]{(byte) dt}); - this.entry.setCol(col_flags, flags.bytes()); - this.entry.setCol(col_lang, lang); - this.entry.setCol(col_llocal, llocal); - this.entry.setCol(col_lother, lother); - this.entry.setCol(col_limage, limage); - this.entry.setCol(col_laudio, laudio); - this.entry.setCol(col_lvideo, lvideo); - this.entry.setCol(col_lapp, lapp); - //System.out.println("===DEBUG=== " + load.toString() + ", " + decodeDate(col_load).toString()); - this.snippet = ""; - this.collections = collections; - this.word = null; - this.ranking = 0; - this.comp = null; - } - - private void encodeDate(final int col, final Date d) { - // calculates the number of days since 1.1.1970 and returns this as 4-byte array - // 86400000 is the number of milliseconds in one day - long time = d.getTime(); - long now = System.currentTimeMillis(); - this.entry.setCol(col, NaturalOrder.encodeLong((time > now ? 
now : time) / 86400000L, 4)); - } - - private Date decodeDate(final int col) { - final long t = this.entry.getColLong(col); - /*if (t < 14600) */return new Date(86400000L * t); // time was stored as number of days since epoch - /* - if (t < 350400) return new Date(3600000L * t); // hours since epoch - if (t < 21024000) return new Date(60000L * t); // minutes since epoch - */ - } - - private static byte[] encodeComp( - final DigestURI url, - final String dc_title, - final String dc_creator, - final String dc_subject, - final String dc_publisher, - final double lat, - final double lon) { - final CharBuffer s = new CharBuffer(3600, 360); - s.append(url.toNormalform(true)).appendLF(); - s.append(dc_title).appendLF(); - if (dc_creator.length() > 80) s.append(dc_creator, 0, 80); else s.append(dc_creator); - s.appendLF(); - if (dc_subject.length() > 120) s.append(dc_subject, 0, 120); else s.append(dc_subject); - s.appendLF(); - if (dc_publisher.length() > 80) s.append(dc_publisher, 0, 80); else s.append(dc_publisher); - s.appendLF(); - if (lon == 0.0f && lat == 0.0f) s.appendLF(); else s.append(Double.toString(lat)).append(',').append(Double.toString(lon)).appendLF(); - String s0 = s.toString(); - s.close(); - return UTF8.getBytes(s0); - } - - public URIMetadataRow(final Row.Entry entry, final WordReference searchedWord, final long ranking) { + public URIMetadataRow(final Row.Entry entry, final WordReference searchedWord) { this.entry = entry; this.snippet = ""; this.word = searchedWord; - this.ranking = ranking; this.comp = null; - this.collections = new String[0]; } private URIMetadataRow(final Properties prop) throws kelondroException { @@ -278,17 +172,15 @@ public class URIMetadataRow implements URIMetadata { this.entry.setCol(col_lapp, Integer.parseInt(prop.getProperty("lapp", "0"))); this.snippet = crypt.simpleDecode(prop.getProperty("snippet", "")); this.word = null; - if (prop.containsKey("word")) throw new kelondroException("old database structure is not 
supported"); if (prop.containsKey("wi")) { this.word = new WordReferenceVars(new WordReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", "")))); } - this.ranking = 0; this.comp = null; - this.collections = new String[0]; } - + public static URIMetadataRow importEntry(final String propStr) { - if (propStr == null || (!propStr.isEmpty() && propStr.charAt(0) != '{') || !propStr.endsWith("}")) { + if (propStr == null || propStr.isEmpty() || propStr.charAt(0) != '{' || !propStr.endsWith("}")) { + Log.logSevere("URIMetadataRow", "importEntry: propStr is not proper: " + propStr); return null; } try { @@ -300,7 +192,46 @@ public class URIMetadataRow implements URIMetadata { } } - @Override + private void encodeDate(final int col, final Date d) { + // calculates the number of days since 1.1.1970 and returns this as 4-byte array + // 86400000 is the number of milliseconds in one day + long time = d.getTime(); + long now = System.currentTimeMillis(); + this.entry.setCol(col, NaturalOrder.encodeLong((time > now ? 
now : time) / 86400000L, 4)); + } + + private Date decodeDate(final int col) { + final long t = this.entry.getColLong(col); + /*if (t < 14600) */return new Date(86400000L * t); // time was stored as number of days since epoch + /* + if (t < 350400) return new Date(3600000L * t); // hours since epoch + if (t < 21024000) return new Date(60000L * t); // minutes since epoch + */ + } + + private static byte[] encodeComp( + final DigestURI url, + final String dc_title, + final String dc_creator, + final String dc_subject, + final String dc_publisher, + final double lat, + final double lon) { + final CharBuffer s = new CharBuffer(3600, 360); + s.append(url.toNormalform(true)).appendLF(); + s.append(dc_title).appendLF(); + if (dc_creator.length() > 80) s.append(dc_creator, 0, 80); else s.append(dc_creator); + s.appendLF(); + if (dc_subject.length() > 120) s.append(dc_subject, 0, 120); else s.append(dc_subject); + s.appendLF(); + if (dc_publisher.length() > 80) s.append(dc_publisher, 0, 80); else s.append(dc_publisher); + s.appendLF(); + if (lon == 0.0f && lat == 0.0f) s.appendLF(); else s.append(Double.toString(lat)).append(',').append(Double.toString(lon)).appendLF(); + String s0 = s.toString(); + s.close(); + return UTF8.getBytes(s0); + } + public byte[] hash() { // return a url-hash, based on the md5 algorithm // the result is a String of 12 bytes within a 72-bit space @@ -310,54 +241,40 @@ public class URIMetadataRow implements URIMetadata { } private String hostHash = null; - @Override public String hosthash() { if (this.hostHash != null) return this.hostHash; this.hostHash = ASCII.String(this.entry.getPrimaryKeyBytes(), 6, 6); return this.hostHash; } - @Override - public long ranking() { - return this.ranking; - } - - @Override public boolean matches(final Pattern matcher) { return this.metadata().matches(matcher); } - @Override public DigestURI url() { return this.metadata().url(); } - @Override public String dc_title() { return this.metadata().dc_title(); } - 
@Override public String dc_creator() { return this.metadata().dc_creator(); } - @Override public String dc_publisher() { return this.metadata().dc_publisher(); } - @Override public String dc_subject() { return this.metadata().dc_subject(); } - @Override public double lat() { return this.metadata().lat(); } - @Override public double lon() { return this.metadata().lon(); } @@ -379,22 +296,18 @@ public class URIMetadataRow implements URIMetadata { return this.comp; } - @Override public Date moddate() { return decodeDate(col_mod); } - @Override public Date loaddate() { return decodeDate(col_load); } - @Override public Date freshdate() { return decodeDate(col_fresh); } - @Override public byte[] referrerHash() { // return the creator's hash or null if there is none // FIXME: There seem to be some malformed entries in the databasees like "null\0\0\0\0\0\0\0\0" @@ -408,18 +321,15 @@ public class URIMetadataRow implements URIMetadata { return r; } - @Override public String md5() { // returns the md5 in hex representation return Digest.encodeHex(this.entry.getColBytes(col_md5, true)); } - @Override public char doctype() { return (char) this.entry.getColByte(col_dt); } - @Override public byte[] language() { byte[] b = this.entry.getColBytes(col_lang, true); if ((b == null || b[0] == (byte)'[') && this.metadata().url != null) { @@ -430,100 +340,98 @@ public class URIMetadataRow implements URIMetadata { return b; } - @Override public int size() { return (int) this.entry.getColLong(col_size); } - @Override public Bitfield flags() { return new Bitfield(this.entry.getColBytes(col_flags, true)); } - @Override public int wordCount() { return (int) this.entry.getColLong(col_wc); } - @Override public int llocal() { return (int) this.entry.getColLong(col_llocal); } - @Override public int lother() { return (int) this.entry.getColLong(col_lother); } - @Override public int limage() { return (int) this.entry.getColLong(col_limage); } - @Override public int laudio() { return (int) 
this.entry.getColLong(col_laudio); } - @Override public int lvideo() { return (int) this.entry.getColLong(col_lvideo); } - @Override public int lapp() { return (int) this.entry.getColLong(col_lapp); } - @Override public String snippet() { // the snippet may appear here if the url was transported in a remote search // it will not be saved anywhere, but can only be requested here return this.snippet; } - @Override - public String[] collections() { - return this.collections; - } + - @Override public WordReference word() { return this.word; } - @Override - public boolean isOlder(final URIMetadata other) { - if (other == null) return false; - final Date tmoddate = moddate(); - final Date omoddate = other.moddate(); - if (tmoddate.before(omoddate)) return true; - if (tmoddate.equals(omoddate)) { - final Date tloaddate = loaddate(); - final Date oloaddate = other.loaddate(); - if (tloaddate.before(oloaddate)) return true; - if (tloaddate.equals(oloaddate)) return true; - } - return false; - } + private static StringBuilder corePropList(URIMetadataRow md) { + // generate a parseable string; this is a simple property-list + final StringBuilder s = new StringBuilder(300); - @Override - public String toString(final String snippet) { - // add information needed for remote transport - final StringBuilder core = URIMetadataNode.corePropList(this); - if (core == null) - return null; - - core.ensureCapacity(core.length() + snippet.length() * 2); - core.insert(0, "{"); - core.append(",snippet=").append(crypt.simpleEncode(snippet)); - core.append("}"); + // create new formatters to make concurrency possible + final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute); - return core.toString(); - //return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}"; + try { + s.append("hash=").append(ASCII.String(md.hash())); + s.append(",url=").append(crypt.simpleEncode(md.url().toNormalform(true))); + 
s.append(",descr=").append(crypt.simpleEncode(md.dc_title())); + s.append(",author=").append(crypt.simpleEncode(md.dc_creator())); + s.append(",tags=").append(crypt.simpleEncode(Tagging.cleanTagFromAutotagging(md.dc_subject()))); + s.append(",publisher=").append(crypt.simpleEncode(md.dc_publisher())); + s.append(",lat=").append(md.lat()); + s.append(",lon=").append(md.lon()); + s.append(",mod=").append(formatter.format(md.moddate())); + s.append(",load=").append(formatter.format(md.loaddate())); + s.append(",fresh=").append(formatter.format(md.freshdate())); + s.append(",referrer=").append(md.referrerHash() == null ? "" : ASCII.String(md.referrerHash())); + s.append(",md5=").append(md.md5()); + s.append(",size=").append(md.size()); + s.append(",wc=").append(md.wordCount()); + s.append(",dt=").append(md.doctype()); + s.append(",flags=").append(md.flags().exportB64()); + s.append(",lang=").append(md.language() == null ? "EN" : UTF8.String(md.language())); + s.append(",llocal=").append(md.llocal()); + s.append(",lother=").append(md.lother()); + s.append(",limage=").append(md.limage()); + s.append(",laudio=").append(md.laudio()); + s.append(",lvideo=").append(md.lvideo()); + s.append(",lapp=").append(md.lapp()); + if (md.word() != null) { + // append also word properties + final String wprop = md.word().toPropertyForm(); + s.append(",wi=").append(Base64Order.enhancedCoder.encodeString(wprop)); + } + return s; + } catch (final Throwable e) { + Log.logException(e); + return null; + } } - @Override public Request toBalancerEntry(final String initiatorHash) { return new Request( ASCII.getBytes(initiatorHash), @@ -545,7 +453,7 @@ public class URIMetadataRow implements URIMetadata { */ @Override public String toString() { - final StringBuilder core = URIMetadataNode.corePropList(this); + final StringBuilder core = corePropList(this); if (core == null) return null; core.insert(0, "{"); diff --git a/source/net/yacy/kelondro/data/word/WordReferenceVars.java 
b/source/net/yacy/kelondro/data/word/WordReferenceVars.java index ff12169f4..01299beed 100644 --- a/source/net/yacy/kelondro/data/word/WordReferenceVars.java +++ b/source/net/yacy/kelondro/data/word/WordReferenceVars.java @@ -37,7 +37,7 @@ import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.order.Base64Order; -import net.yacy.kelondro.data.meta.URIMetadata; +import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.index.Row; import net.yacy.kelondro.index.Row.Entry; import net.yacy.kelondro.logging.Log; @@ -71,7 +71,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc private final Queue positions; private double termFrequency; - public WordReferenceVars(final URIMetadata md) { + public WordReferenceVars(final URIMetadataRow md) { this.language = md.language(); this.flags = md.flags(); this.lastModified = md.moddate().getTime(); diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index 4471edd45..613c127f1 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -81,7 +81,6 @@ import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.util.SpaceExceededException; import net.yacy.crawler.data.ResultURLs; import net.yacy.crawler.data.ResultURLs.EventOrigin; -import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; @@ -812,7 +811,8 @@ public final class Protocol try { event.getQuery().getSegment().fulltext().putMetadata(urlEntry); ResultURLs.stack( - urlEntry, + ASCII.String(urlEntry.url().hash()), + urlEntry.url().getHost(), event.peers.mySeed().hash.getBytes(), UTF8.getBytes(target.hash), EventOrigin.QUERIES); @@ -1103,7 +1103,8 @@ public final class Protocol try { 
event.getQuery().getSegment().fulltext().putDocument(ClientUtils.toSolrInputDocument(doc)); ResultURLs.stack( - urlEntry, + ASCII.String(urlEntry.url().hash()), + urlEntry.url().getHost(), event.peers.mySeed().hash.getBytes(), UTF8.getBytes(target.hash), EventOrigin.QUERIES); @@ -1187,7 +1188,7 @@ public final class Protocol final String process, final String result, final String reason, - final URIMetadata entry, + final URIMetadataNode entry, final String wordhashes) { assert (target != null); assert (mySeed != null); @@ -1225,8 +1226,7 @@ public final class Protocol // send request try { // prepare request - final Map parts = - basicRequestParts(Switchboard.getSwitchboard(), target.hash, salt); + final Map parts = basicRequestParts(Switchboard.getSwitchboard(), target.hash, salt); parts.put("process", UTF8.StringBody(process)); parts.put("urlhash", UTF8.StringBody(((entry == null) ? "" : ASCII.String(entry.hash())))); parts.put("result", UTF8.StringBody(result)); @@ -1266,7 +1266,7 @@ public final class Protocol public static String transferIndex( final Seed targetSeed, final ReferenceContainerCache indexes, - final SortedMap urlCache, + final SortedMap urlCache, final boolean gzipBody, final int timeout) { @@ -1327,7 +1327,7 @@ public final class Protocol } // all url's known // extract the urlCache from the result - final URIMetadata[] urls = new URIMetadata[uhs.length]; + final URIMetadataNode[] urls = new URIMetadataNode[uhs.length]; for ( int i = 0; i < uhs.length; i++ ) { urls[i] = urlCache.get(ASCII.getBytes(uhs[i])); if ( urls[i] == null ) { @@ -1435,7 +1435,7 @@ public final class Protocol private static Map transferURL( final Seed targetSeed, - final URIMetadata[] urls, + final URIMetadataNode[] urls, boolean gzipBody, final int timeout) { // this post a message to the remote message board @@ -1457,7 +1457,7 @@ public final class Protocol String resource; int urlc = 0; int urlPayloadSize = 0; - for ( final URIMetadata url : urls ) { + for ( final 
URIMetadataNode url : urls ) { if ( url != null ) { resource = url.toString(); //System.out.println("*** DEBUG resource = " + resource); diff --git a/source/net/yacy/peers/Transmission.java b/source/net/yacy/peers/Transmission.java index 9928ab7db..61e27900d 100644 --- a/source/net/yacy/peers/Transmission.java +++ b/source/net/yacy/peers/Transmission.java @@ -36,7 +36,6 @@ import net.yacy.cora.order.Base64Order; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.SpaceExceededException; import net.yacy.kelondro.data.meta.URIMetadataNode; -import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceRow; @@ -89,7 +88,7 @@ public class Transmission { */ private final byte[] primaryTarget; private final ReferenceContainerCache containers; - private final SortedMap references; + private final SortedMap references; private final HandleSet badReferences; private final List targets; private int hit, miss; @@ -105,7 +104,7 @@ public class Transmission { super(); this.primaryTarget = primaryTarget; this.containers = new ReferenceContainerCache(Segment.wordReferenceFactory, Segment.wordOrder, Word.commonHashLength); - this.references = new TreeMap(Base64Order.enhancedCoder); + this.references = new TreeMap(Base64Order.enhancedCoder); this.badReferences = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0); this.targets = targets; this.hit = 0; @@ -179,7 +178,7 @@ public class Transmission { notFoundx.add(e.urlhash()); this.badReferences.put(e.urlhash()); } else { - this.references.put(e.urlhash(), r.toRow()); + this.references.put(e.urlhash(), r); } } // now delete all references that were not found diff --git a/source/net/yacy/repository/Blacklist.java b/source/net/yacy/repository/Blacklist.java index 0546e9b14..19652e757 100644 --- a/source/net/yacy/repository/Blacklist.java 
+++ b/source/net/yacy/repository/Blacklist.java @@ -46,7 +46,7 @@ import java.util.regex.PatternSyntaxException; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.SpaceExceededException; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; +import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.logging.Log; @@ -328,17 +328,19 @@ public class Blacklist { return ret; } + public boolean isListed(final BlacklistType blacklistType, final URIMetadataNode entry) { + return isListed(blacklistType, entry.url()); + } + public boolean isListed(final BlacklistType blacklistType, final URIMetadataRow entry) { + return isListed(blacklistType, entry.url()); + } + /** * Checks whether the given entry is listed in given blacklist type * @param blacklistType The used blacklist * @param entry Entry to be checked - * @return Whether the given entry is blacklisted + * @return Whether the given entry is blacklisted */ - public boolean isListed(final BlacklistType blacklistType, final URIMetadata entry) { - // Call inner method - return isListed(blacklistType, entry.url()); - } - public boolean isListed(final BlacklistType blacklistType, final DigestURI url) { if (url == null) { throw new IllegalArgumentException("url may not be null"); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index b008eb975..bbc3aba32 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -76,6 +76,8 @@ import java.util.zip.GZIPOutputStream; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; +import org.apache.solr.common.SolrInputDocument; + import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.Classification; @@ -146,9 +148,7 @@ import 
net.yacy.interaction.contentcontrol.ContentControlFilterUpdateThread; import net.yacy.interaction.contentcontrol.ContentControlImportThread; import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataNode; -import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.rwi.ReferenceContainer; @@ -2589,13 +2589,10 @@ public final class Switchboard extends serverSwitch this.log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + url); // STORE WORD INDEX - URIMetadataRow newEntry = + SolrInputDocument newEntry = this.index.storeDocument( url, referrerURL, - queueEntry.lastModified(), - new Date(), - queueEntry.size(), queueEntry.profile(), queueEntry.getResponseHeader(), document, @@ -2628,7 +2625,9 @@ public final class Switchboard extends serverSwitch } // update url result list statistics - ResultURLs.stack(newEntry, // loaded url db entry + ResultURLs.stack( + ASCII.String(url.hash()), // loaded url db entry + url.getHost(), queueEntry.initiator(), // initiator peer hash UTF8.getBytes(this.peers.mySeed().hash), // executor peer hash processCase // process case @@ -2654,8 +2653,7 @@ public final class Switchboard extends serverSwitch initiatorPeer.setAlternativeAddress(this.clusterhashes.get(queueEntry.initiator())); } // start a thread for receipt sending to avoid a blocking here - new Thread(new receiptSending(initiatorPeer, newEntry), "sending receipt to " - + ASCII.String(queueEntry.initiator())).start(); + new Thread(new receiptSending(initiatorPeer, new URIMetadataNode(newEntry)), "sending receipt to " + ASCII.String(queueEntry.initiator())).start(); } } } @@ -2820,9 +2818,9 @@ public final class Switchboard extends serverSwitch public class receiptSending implements Runnable { private final Seed initiatorPeer; - private 
final URIMetadata reference; + private final URIMetadataNode reference; - public receiptSending(final Seed initiatorPeer, final URIMetadata reference) { + public receiptSending(final Seed initiatorPeer, final URIMetadataNode reference) { this.initiatorPeer = initiatorPeer; this.reference = reference; } diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java index c7539e59e..8f9601486 100644 --- a/source/net/yacy/search/index/DocumentIndex.java +++ b/source/net/yacy/search/index/DocumentIndex.java @@ -30,10 +30,11 @@ import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.util.ArrayList; -import java.util.Date; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; +import org.apache.solr.common.SolrInputDocument; + import net.yacy.cora.document.Classification; import net.yacy.cora.document.UTF8; import net.yacy.document.Condenser; @@ -41,7 +42,6 @@ import net.yacy.document.Document; import net.yacy.document.LibraryProvider; import net.yacy.document.TextParser; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.logging.Log; import net.yacy.search.query.QueryParams; @@ -101,12 +101,12 @@ public class DocumentIndex extends Segment { @Override public void run() { DigestURI f; - URIMetadata[] resultRows; + SolrInputDocument[] resultRows; try { while ( (f = DocumentIndex.this.queue.take()) != poison ) { try { resultRows = add(f); - for ( final URIMetadata resultRow : resultRows ) { + for ( final SolrInputDocument resultRow : resultRows ) { if ( DocumentIndex.this.callback != null ) { if ( resultRow == null ) { DocumentIndex.this.callback.fail(f, "result is null"); @@ -138,7 +138,7 @@ public class DocumentIndex extends Segment { this.queue.clear(); } - private URIMetadata[] add(final DigestURI url) throws IOException { 
+ private SolrInputDocument[] add(final DigestURI url) throws IOException { if ( url == null ) { throw new IOException("file = null"); } @@ -161,7 +161,7 @@ public class DocumentIndex extends Segment { throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage()); } //Document document = Document.mergeDocuments(url, null, documents); - final URIMetadata[] rows = new URIMetadata[documents.length]; + final SolrInputDocument[] rows = new SolrInputDocument[documents.length]; int c = 0; for ( final Document document : documents ) { if (document == null) continue; @@ -170,9 +170,6 @@ public class DocumentIndex extends Segment { super.storeDocument( url, null, - new Date(url.lastModified()), - new Date(), - url.length(), null, null, document, @@ -275,7 +272,7 @@ public class DocumentIndex extends Segment { public interface CallbackListener { - public void commit(DigestURI f, URIMetadata resultRow); + public void commit(DigestURI f, SolrInputDocument resultRow); public void fail(DigestURI f, String failReason); } @@ -296,7 +293,7 @@ public class DocumentIndex extends Segment { System.out.println("using index files at " + segmentPath.getAbsolutePath()); final CallbackListener callback = new CallbackListener() { @Override - public void commit(final DigestURI f, final URIMetadata resultRow) { + public void commit(final DigestURI f, final SolrInputDocument resultRow) { System.out.println("indexed: " + f.toString()); } diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 036aff09a..dcab10d5b 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -50,7 +50,6 @@ import net.yacy.cora.storage.ZIPReader; import net.yacy.cora.storage.ZIPWriter; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataNode; import 
net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.WordReference; @@ -64,7 +63,6 @@ import net.yacy.kelondro.util.MergeIterator; import net.yacy.search.Switchboard; import org.apache.lucene.util.Version; -import org.apache.solr.client.solrj.util.ClientUtils; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; @@ -226,10 +224,10 @@ public final class Fulltext implements Iterable { // slow migration to solr final Row.Entry entry = this.urlIndexFile.remove(urlHash); if (entry == null) return null; - URIMetadataRow row = new URIMetadataRow(entry, wre, weight); + URIMetadataRow row = new URIMetadataRow(entry, wre); SolrInputDocument solrInput = this.solrScheme.metadata2solr(row); this.putDocument(solrInput); - return new URIMetadataNode(ClientUtils.toSolrDocument(solrInput), wre, weight); + return new URIMetadataNode(solrInput, wre, weight); } catch (final IOException e) { Log.logException(e); } @@ -244,9 +242,9 @@ public final class Fulltext implements Iterable { if (this.urlIndexFile != null) this.urlIndexFile.remove(idb); SolrDocument sd = this.solr.get(id); Date now = new Date(); - Date sdDate = this.solrScheme.getDate(sd, YaCySchema.last_modified); - if (sdDate.after(now)) sdDate = now; - Date docDate = this.solrScheme.getDate(doc, YaCySchema.last_modified); + Date sdDate = sd == null ? 
null : SolrConfiguration.getDate(sd, YaCySchema.last_modified); + if (sdDate == null || sdDate.after(now)) sdDate = now; + Date docDate = SolrConfiguration.getDate(doc, YaCySchema.last_modified); if (docDate.after(now)) docDate = now; if (sd == null || sdDate.before(docDate)) { if (this.solrScheme.contains(YaCySchema.ip_s)) { @@ -263,13 +261,8 @@ public final class Fulltext implements Iterable { if (MemoryControl.shortStatus()) clearCache(); } - public void putMetadata(final URIMetadata entry) throws IOException { - if (entry instanceof URIMetadataNode) { - putDocument(ClientUtils.toSolrInputDocument(((URIMetadataNode) entry).getDocument())); - return; - } - assert entry instanceof URIMetadataRow; - URIMetadataRow row = (URIMetadataRow) entry; + public void putMetadata(final URIMetadataRow entry) throws IOException { + URIMetadataRow row = entry; byte[] idb = row.hash(); String id = ASCII.String(idb); @@ -516,7 +509,7 @@ public final class Fulltext implements Iterable { } } else { final Iterator i = entries(); // iterates indexURLEntry objects - URIMetadata entry; + URIMetadataNode entry; String url; while (i.hasNext()) { entry = i.next(); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 3c6bab11b..e68931ff9 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -35,6 +35,8 @@ import java.util.Properties; import java.util.Set; import java.util.concurrent.BlockingQueue; +import org.apache.solr.common.SolrInputDocument; + import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; @@ -57,7 +59,6 @@ import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.citation.CitationReferenceFactory; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataNode; -import net.yacy.kelondro.data.meta.URIMetadataRow; import 
net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceFactory; @@ -284,7 +285,7 @@ public class Segment { if (this.urlCitationIndex != null) this.urlCitationIndex.close(); } - private String votedLanguage( + private static String votedLanguage( final DigestURI url, final String urlNormalform, final Document document, @@ -295,23 +296,17 @@ public class Segment { if (language == null) { // no statistics available, we take either the metadata (if given) or the TLD language = (bymetadata == null) ? url.language() : bymetadata; - if (this.log.isFine()) this.log.logFine("LANGUAGE-BY-STATISTICS: " + url + " FAILED, taking " + ((bymetadata == null) ? "TLD" : "metadata") + ": " + language); } else { if (bymetadata == null) { // two possible results: compare and report conflicts - if (language.equals(url.language())) - if (this.log.isFine()) this.log.logFine("LANGUAGE-BY-STATISTICS: " + url + " CONFIRMED - TLD IDENTICAL: " + language); - else { - final String error = "LANGUAGE-BY-STATISTICS: " + url + " CONFLICTING: " + language + " (the language given by the TLD is " + url.language() + ")"; + if (!language.equals(url.language())) { // see if we have a hint in the url that the statistic was right final String u = urlNormalform.toLowerCase(); if (!u.contains("/" + language + "/") && !u.contains("/" + ISO639.country(language).toLowerCase() + "/")) { // no confirmation using the url, use the TLD language = url.language(); - if (this.log.isFine()) this.log.logFine(error + ", corrected using the TLD"); } else { // this is a strong hint that the statistics was in fact correct - if (this.log.isFine()) this.log.logFine(error + ", but the url proves that the statistic is correct"); } } } else { @@ -340,12 +335,9 @@ public class Segment { if (this.termIndex != null) this.termIndex.add(termHash, entry); } - public URIMetadataRow storeDocument( + public SolrInputDocument storeDocument( final DigestURI url, 
final DigestURI referrerURL, - Date modDate, - final Date loadDate, - final long sourcesize, final CrawlProfile profile, final ResponseHeader responseHeader, final Document document, @@ -359,44 +351,21 @@ public class Segment { // CREATE INDEX // load some document metadata + final Date loadDate = new Date(); final String id = ASCII.String(url.hash()); final String dc_title = document.dc_title(); final String urlNormalform = url.toNormalform(true); final String language = votedLanguage(url, urlNormalform, document, condenser); // identification of the language // STORE URL TO LOADED-URL-DB - if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; // TODO: compare with modTime from responseHeader + Date modDate = responseHeader.lastModified(); + if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; char docType = Response.docType(document.dc_format()); - final URIMetadataRow metadata = new URIMetadataRow( - url, // URL - dc_title, // document description - document.dc_creator(), // author - document.dc_subject(' '), // tags - document.dc_publisher(), // publisher (may be important to get location data) - document.lon(), // decimal degrees as in WGS84; - document.lat(), // if unknown both values may be 0.0d; - modDate, // modification date - loadDate, // loaded date - new Date(loadDate.getTime() + Math.max(0, loadDate.getTime() - modDate.getTime()) / 2), // freshdate, computed with Proxy-TTL formula - (referrerURL == null) ? 
null : ASCII.String(referrerURL.hash()), // referer hash - new byte[0], // md5 - (int) sourcesize, // size - condenser.RESULT_NUMB_WORDS, // word count - docType, // doctype - condenser.RESULT_FLAGS, // flags - UTF8.getBytes(language), // language - document.inboundLinks().size(), // inbound links - document.outboundLinks().size(), // outbound links - document.getAudiolinks().size(), // laudio - document.getImages().size(), // limage - document.getVideolinks().size(), // lvideo - document.getApplinks().size(), // lapp - profile.collections() // collections - ); - + // STORE TO SOLR + final SolrInputDocument solrInputDoc = this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, condenser, referrerURL, language); try { - this.fulltext.putDocument(this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, condenser, metadata)); + this.fulltext.putDocument(solrInputDoc); } catch ( final IOException e ) { Log.logWarning("SOLR", "failed to send " + urlNormalform + " to solr: " + e.getMessage()); } @@ -487,7 +456,7 @@ public class Segment { } // finished - return metadata; + return solrInputDoc; } public void removeAllUrlReferences(final HandleSet urls, final LoaderDispatcher loader, final CacheStrategy cacheStrategy) { diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index ca3527695..460df3c58 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ b/source/net/yacy/search/index/SolrConfiguration.java @@ -55,13 +55,10 @@ import net.yacy.document.Document; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; -import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.Bitfield; -import 
org.apache.solr.client.solrj.util.ClientUtils; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrInputDocument; @@ -174,18 +171,12 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (isEmpty() || contains(key)) key.add(doc, value); } - public Date getDate(SolrInputDocument doc, final YaCySchema key) { + public static Date getDate(SolrInputDocument doc, final YaCySchema key) { Date x = (Date) doc.getFieldValue(key.name()); Date now = new Date(); return (x == null) ? new Date(0) : x.after(now) ? now : x; } - public Date getDate(SolrDocument doc, final YaCySchema key) { - Date x = doc == null ? null : (Date) doc.getFieldValue(key.name()); - Date now = new Date(); - return (x == null) ? new Date(0) : x.after(now) ? now : x; - } - /** * save configuration to file and update enum SolrFields * @throws IOException @@ -207,11 +198,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable } catch (final IOException e) {} } - public SolrInputDocument metadata2solr(final URIMetadata md) { - assert md instanceof URIMetadataRow; - if (md instanceof URIMetadataNode) { - return ClientUtils.toSolrInputDocument(((URIMetadataNode) md).getDocument()); - } + public SolrInputDocument metadata2solr(final URIMetadataRow md) { final SolrInputDocument doc = new SolrInputDocument(); final DigestURI digestURI = DigestURI.toDigestURI(md.url()); @@ -339,10 +326,10 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.'); } - public SolrInputDocument yacy2solr(final String id, final CrawlProfile profile, final ResponseHeader header, final Document yacydoc, Condenser condenser, final URIMetadata metadata) { + public SolrInputDocument yacy2solr(final String id, final CrawlProfile profile, final ResponseHeader responseHeader, final Document document, Condenser condenser, 
DigestURI referrerURL, String language) { // we use the SolrCell design as index scheme final SolrInputDocument doc = new SolrInputDocument(); - final DigestURI digestURI = DigestURI.toDigestURI(yacydoc.dc_source()); + final DigestURI digestURI = DigestURI.toDigestURI(document.dc_source()); boolean allAttr = this.isEmpty(); add(doc, YaCySchema.id, id); if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before) @@ -377,7 +364,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (allAttr || contains(YaCySchema.host_subdomain_s)) add(doc, YaCySchema.host_subdomain_s, subdom); } - List titles = yacydoc.titles(); + List titles = document.titles(); if (allAttr || contains(YaCySchema.title)) add(doc, YaCySchema.title, titles); if (allAttr || contains(YaCySchema.title_count_i)) add(doc, YaCySchema.title_count_i, titles.size()); if (allAttr || contains(YaCySchema.title_chars_val)) { @@ -391,7 +378,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable add(doc, YaCySchema.title_words_val, cv); } - String description = yacydoc.dc_description(); + String description = document.dc_description(); List descriptions = new ArrayList(); for (String s: description.split("\n")) descriptions.add(s); if (allAttr || contains(YaCySchema.description)) add(doc, YaCySchema.description, description); @@ -407,11 +394,11 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable add(doc, YaCySchema.description_words_val, cv); } - if (allAttr || contains(YaCySchema.author)) add(doc, YaCySchema.author, yacydoc.dc_creator()); - if (allAttr || contains(YaCySchema.content_type)) add(doc, YaCySchema.content_type, new String[]{yacydoc.dc_format()}); - if (allAttr || contains(YaCySchema.last_modified)) add(doc, YaCySchema.last_modified, header == null ? 
new Date() : header.lastModified()); - if (allAttr || contains(YaCySchema.keywords)) add(doc, YaCySchema.keywords, yacydoc.dc_subject(' ')); - final String content = yacydoc.getTextString(); + if (allAttr || contains(YaCySchema.author)) add(doc, YaCySchema.author, document.dc_creator()); + if (allAttr || contains(YaCySchema.content_type)) add(doc, YaCySchema.content_type, new String[]{document.dc_format()}); + if (allAttr || contains(YaCySchema.last_modified)) add(doc, YaCySchema.last_modified, responseHeader == null ? new Date() : responseHeader.lastModified()); + if (allAttr || contains(YaCySchema.keywords)) add(doc, YaCySchema.keywords, document.dc_subject(' ')); + final String content = document.getTextString(); if (allAttr || contains(YaCySchema.text_t)) add(doc, YaCySchema.text_t, content); if (allAttr || contains(YaCySchema.wordcount_i)) { final int contentwc = content.split(" ").length; @@ -427,11 +414,11 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (allAttr || contains(YaCySchema.url_file_ext_s)) add(doc, YaCySchema.url_file_ext_s, digestURI.getFileExtension()); // get list of all links; they will be shrinked by urls that appear in other fields of the solr scheme - Set inboundLinks = yacydoc.inboundLinks(); - Set outboundLinks = yacydoc.outboundLinks(); + Set inboundLinks = document.inboundLinks(); + Set outboundLinks = document.outboundLinks(); int c = 0; - final Object parser = yacydoc.getParserObject(); + final Object parser = document.getParserObject(); Map images = new HashMap(); if (parser instanceof ContentScraper) { final ContentScraper html = (ContentScraper) parser; @@ -482,10 +469,10 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (robots_meta.indexOf("nofollow",0) >= 0) b += 8; // set bit 3 } String x_robots_tag = ""; - if (header != null) { - x_robots_tag = header.get(HeaderFramework.X_ROBOTS_TAG, ""); + if (responseHeader != null) { + x_robots_tag = 
responseHeader.get(HeaderFramework.X_ROBOTS_TAG, ""); if (x_robots_tag.isEmpty()) { - x_robots_tag = header.get(HeaderFramework.X_ROBOTS, ""); + x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS, ""); } } if (!x_robots_tag.isEmpty()) { @@ -670,14 +657,14 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable } // response time - add(doc, YaCySchema.responsetime_i, header == null ? 0 : Integer.parseInt(header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"))); + add(doc, YaCySchema.responsetime_i, responseHeader == null ? 0 : Integer.parseInt(responseHeader.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"))); } // list all links - final Map alllinks = yacydoc.getAnchors(); + final Map alllinks = document.getAnchors(); c = 0; if (allAttr || contains(YaCySchema.inboundlinkscount_i)) add(doc, YaCySchema.inboundlinkscount_i, inboundLinks.size()); - if (allAttr || contains(YaCySchema.inboundlinksnofollowcount_i)) add(doc, YaCySchema.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount()); + if (allAttr || contains(YaCySchema.inboundlinksnofollowcount_i)) add(doc, YaCySchema.inboundlinksnofollowcount_i, document.inboundLinkNofollowCount()); final List inboundlinksTag = new ArrayList(inboundLinks.size()); final List inboundlinksURLProtocol = new ArrayList(inboundLinks.size()); final List inboundlinksURLStub = new ArrayList(inboundLinks.size()); @@ -725,7 +712,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable c = 0; if (allAttr || contains(YaCySchema.outboundlinkscount_i)) add(doc, YaCySchema.outboundlinkscount_i, outboundLinks.size()); - if (allAttr || contains(YaCySchema.outboundlinksnofollowcount_i)) add(doc, YaCySchema.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount()); + if (allAttr || contains(YaCySchema.outboundlinksnofollowcount_i)) add(doc, YaCySchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount()); final List outboundlinksTag = new 
ArrayList(outboundLinks.size()); final List outboundlinksURLProtocol = new ArrayList(outboundLinks.size()); final List outboundlinksURLStub = new ArrayList(outboundLinks.size()); @@ -772,26 +759,30 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (allAttr || contains(YaCySchema.outboundlinks_alttag_txt)) add(doc, YaCySchema.outboundlinks_alttag_txt, outboundlinksAltTag); // charset - if (allAttr || contains(YaCySchema.charset_s)) add(doc, YaCySchema.charset_s, yacydoc.getCharset()); + if (allAttr || contains(YaCySchema.charset_s)) add(doc, YaCySchema.charset_s, document.getCharset()); // coordinates - if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) { - if (allAttr || contains(YaCySchema.coordinate_p)) add(doc, YaCySchema.coordinate_p, Double.toString(yacydoc.lat()) + "," + Double.toString(yacydoc.lon())); + if (document.lat() != 0.0f && document.lon() != 0.0f) { + if (allAttr || contains(YaCySchema.coordinate_p)) add(doc, YaCySchema.coordinate_p, Double.toString(document.lat()) + "," + Double.toString(document.lon())); } - if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, header == null ? 200 : header.getStatusCode()); - - // fields that are additionally in URIMetadataRow - if (allAttr || contains(YaCySchema.load_date_dt)) add(doc, YaCySchema.load_date_dt, metadata.loaddate()); - if (allAttr || contains(YaCySchema.fresh_date_dt)) add(doc, YaCySchema.fresh_date_dt, metadata.freshdate()); - if (allAttr || contains(YaCySchema.host_id_s)) add(doc, YaCySchema.host_id_s, metadata.hosthash()); - if ((allAttr || contains(YaCySchema.referrer_id_txt)) && metadata.referrerHash() != null) add(doc, YaCySchema.referrer_id_txt, new String[]{ASCII.String(metadata.referrerHash())}); + if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, responseHeader == null ? 
200 : responseHeader.getStatusCode()); + + // fields that were additionally in URIMetadataRow + Date loadDate = new Date(); + Date modDate = responseHeader.lastModified(); + if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; + int size = (int) Math.max(document.dc_source().length(), responseHeader.getContentLength()); + if (allAttr || contains(YaCySchema.load_date_dt)) add(doc, YaCySchema.load_date_dt, loadDate); + if (allAttr || contains(YaCySchema.fresh_date_dt)) add(doc, YaCySchema.fresh_date_dt, new Date(loadDate.getTime() + Math.max(0, loadDate.getTime() - modDate.getTime()) / 2)); // freshdate, computed with Proxy-TTL formula + if (allAttr || contains(YaCySchema.host_id_s)) add(doc, YaCySchema.host_id_s, document.dc_source().hosthash()); + if ((allAttr || contains(YaCySchema.referrer_id_txt)) && referrerURL != null) add(doc, YaCySchema.referrer_id_txt, new String[]{ASCII.String(referrerURL.hash())}); //if (allAttr || contains(SolrField.md5_s)) add(solrdoc, SolrField.md5_s, new byte[0]); - if (allAttr || contains(YaCySchema.publisher_t)) add(doc, YaCySchema.publisher_t, yacydoc.dc_publisher()); - if ((allAttr || contains(YaCySchema.language_s)) && metadata.language() != null) add(doc, YaCySchema.language_s, UTF8.String(metadata.language())); - if (allAttr || contains(YaCySchema.size_i)) add(doc, YaCySchema.size_i, metadata.size()); - if (allAttr || contains(YaCySchema.audiolinkscount_i)) add(doc, YaCySchema.audiolinkscount_i, yacydoc.getAudiolinks().size()); - if (allAttr || contains(YaCySchema.videolinkscount_i)) add(doc, YaCySchema.videolinkscount_i, yacydoc.getVideolinks().size()); - if (allAttr || contains(YaCySchema.applinkscount_i)) add(doc, YaCySchema.applinkscount_i, yacydoc.getApplinks().size()); + if (allAttr || contains(YaCySchema.publisher_t)) add(doc, YaCySchema.publisher_t, document.dc_publisher()); + if ((allAttr || contains(YaCySchema.language_s)) && language != null) add(doc, YaCySchema.language_s, language); + if (allAttr || 
contains(YaCySchema.size_i)) add(doc, YaCySchema.size_i, size); + if (allAttr || contains(YaCySchema.audiolinkscount_i)) add(doc, YaCySchema.audiolinkscount_i, document.getAudiolinks().size()); + if (allAttr || contains(YaCySchema.videolinkscount_i)) add(doc, YaCySchema.videolinkscount_i, document.getVideolinks().size()); + if (allAttr || contains(YaCySchema.applinkscount_i)) add(doc, YaCySchema.applinkscount_i, document.getApplinks().size()); return doc; } @@ -827,6 +818,25 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable return a; } + /** + * encode a string containing attributes from anchor rel properties binary: + * bit 0: "me" contained in rel + * bit 1: "nofollow" contained in rel + * @param rel + * @return binary encoded information about rel + */ + private static List relEval(final List rel) { + List il = new ArrayList(rel.size()); + for (final String s: rel) { + int i = 0; + final String s0 = s.toLowerCase().trim(); + if ("me".equals(s0)) i += 1; + if ("nofollow".equals(s0)) i += 2; + il.add(i); + } + return il; + } + public static Iterator getLinks(SolrDocument doc, boolean inbound) { Collection urlstub = doc.getFieldValues((inbound ? YaCySchema.inboundlinks_urlstub_txt : YaCySchema.outboundlinks_urlstub_txt).name()); Collection urlprot = urlstub == null ? null : indexedList2protocolList(doc.getFieldValues((inbound ? 
YaCySchema.inboundlinks_protocol_sxt : YaCySchema.outboundlinks_protocol_sxt).name()), urlstub.size()); @@ -846,30 +856,17 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable return list.iterator(); } - /** - * encode a string containing attributes from anchor rel properties binary: - * bit 0: "me" contained in rel - * bit 1: "nofollow" contained in rel - * @param rel - * @return binary encoded information about rel - */ - private static List relEval(final List rel) { - List il = new ArrayList(rel.size()); - for (final String s: rel) { - int i = 0; - final String s0 = s.toLowerCase().trim(); - if ("me".equals(s0)) i += 1; - if ("nofollow".equals(s0)) i += 2; - il.add(i); - } - return il; + public static Date getDate(SolrDocument doc, final YaCySchema key) { + Date x = doc == null ? null : (Date) doc.getFieldValue(key.name()); + Date now = new Date(); + return (x == null) ? new Date(0) : x.after(now) ? now : x; } - public String solrGetID(final SolrDocument solr) { + public static String solrGetID(final SolrDocument solr) { return (String) solr.getFieldValue(YaCySchema.id.getSolrFieldName()); } - public DigestURI solrGetURL(final SolrDocument solr) { + public static DigestURI solrGetURL(final SolrDocument solr) { try { return new DigestURI((String) solr.getFieldValue(YaCySchema.sku.getSolrFieldName())); } catch (final MalformedURLException e) { @@ -877,29 +874,29 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable } } - public String solrGetTitle(final SolrDocument solr) { + public static String solrGetTitle(final SolrDocument solr) { return (String) solr.getFieldValue(YaCySchema.title.getSolrFieldName()); } - public String solrGetText(final SolrDocument solr) { + public static String solrGetText(final SolrDocument solr) { return (String) solr.getFieldValue(YaCySchema.text_t.getSolrFieldName()); } - public String solrGetAuthor(final SolrDocument solr) { + public static String solrGetAuthor(final 
SolrDocument solr) { return (String) solr.getFieldValue(YaCySchema.author.getSolrFieldName()); } - public String solrGetDescription(final SolrDocument solr) { + public static String solrGetDescription(final SolrDocument solr) { return (String) solr.getFieldValue(YaCySchema.description.getSolrFieldName()); } - public Date solrGetDate(final SolrDocument solr) { + public static Date solrGetDate(final SolrDocument solr) { Date date = (Date) solr.getFieldValue(YaCySchema.last_modified.getSolrFieldName()); Date now = new Date(); return date.after(now) ? now : date; } - public Collection solrGetKeywords(final SolrDocument solr) { + public static Collection solrGetKeywords(final SolrDocument solr) { final Collection c = solr.getFieldValues(YaCySchema.keywords.getSolrFieldName()); final ArrayList a = new ArrayList(); for (final Object s: c) { diff --git a/source/net/yacy/search/query/SnippetProcess.java b/source/net/yacy/search/query/SnippetProcess.java index f1f425581..ebc06b3da 100644 --- a/source/net/yacy/search/query/SnippetProcess.java +++ b/source/net/yacy/search/query/SnippetProcess.java @@ -44,7 +44,6 @@ import net.yacy.cora.util.SpaceExceededException; import net.yacy.crawler.data.Cache; import net.yacy.data.WorkTables; import net.yacy.document.Condenser; -import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.RowHandleSet; diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java index 3fb382b46..5e119f06b 100644 --- a/source/net/yacy/search/snippet/TextSnippet.java +++ b/source/net/yacy/search/snippet/TextSnippet.java @@ -48,7 +48,6 @@ import net.yacy.document.SnippetExtractor; import net.yacy.document.WordTokenizer; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; import 
net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.util.ByteArray;