diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index c7e1f6af2..dd584ec11 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -45,7 +45,6 @@ import net.yacy.cora.util.SpaceExceededException; import net.yacy.data.ListManager; import net.yacy.document.Condenser; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; @@ -270,8 +269,8 @@ public class IndexControlRWIs_p { index = segment.termIndex().get(keyhash, null); // built urlCache final Iterator urlIter = index.entries(); - final TreeMap knownURLs = - new TreeMap(Base64Order.enhancedCoder); + final TreeMap knownURLs = + new TreeMap(Base64Order.enhancedCoder); final HandleSet unknownURLEntries = new RowHandleSet( WordReferenceRow.urlEntryRow.primaryKeyLength, @@ -290,7 +289,7 @@ public class IndexControlRWIs_p { } urlIter.remove(); } else { - knownURLs.put(iEntry.urlhash(), lurl.toRow()); + knownURLs.put(iEntry.urlhash(), lurl); } } @@ -376,7 +375,7 @@ public class IndexControlRWIs_p { } catch ( final SpaceExceededException e ) { Log.logException(e); } - final URIMetadata e = segment.fulltext().getMetadata(b); + final URIMetadataNode e = segment.fulltext().getMetadata(b); segment.fulltext().remove(b); if ( e != null ) { url = e.url(); @@ -411,7 +410,7 @@ public class IndexControlRWIs_p { } catch ( final SpaceExceededException e ) { Log.logException(e); } - final URIMetadata e = segment.fulltext().getMetadata(b); + final URIMetadataNode e = segment.fulltext().getMetadata(b); segment.fulltext().remove(b); if ( e != null ) { url = e.url(); diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index f1f2c9fc8..1384d81c8 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -41,7 +41,6 @@ import net.yacy.crawler.data.Cache; import net.yacy.crawler.data.ResultURLs; import net.yacy.data.WorkTables; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.logging.Log; @@ -206,7 +205,7 @@ public class IndexControlURLs_p { final DigestURI url = new DigestURI(urlstring); urlhash = ASCII.String(url.hash()); prop.put("urlhash", urlhash); - final URIMetadata entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash)); + final URIMetadataNode entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash)); if (entry == null) { prop.putHTML("result", "No Entry for URL " + url.toNormalform(true)); prop.putHTML("urlstring", urlstring); @@ -222,7 +221,7 @@ public class IndexControlURLs_p { } if (post.containsKey("urlhashsearch")) { - final URIMetadata entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash)); + final URIMetadataNode entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash)); if (entry == null) { prop.putHTML("result", "No Entry for URL hash " + urlhash); } else { @@ -236,7 +235,7 @@ public class IndexControlURLs_p { if (post.containsKey("urlhashsimilar")) { final Iterator entryIt = new RotateIterator(segment.fulltext().entries(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), (int) segment.RWICount()); final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:
"); - URIMetadata entry; + URIMetadataNode entry; int i = 0, rows = 0, cols = 0; prop.put("urlhashsimilar", "1"); while (entryIt.hasNext() && i < 256) { @@ -341,7 +340,7 @@ public class IndexControlURLs_p { return prop; } - private static serverObjects genUrlProfile(final Segment segment, final URIMetadata entry, final String urlhash) { + private static serverObjects genUrlProfile(final Segment segment, final URIMetadataNode entry, final String urlhash) { final serverObjects prop = new serverObjects(); if (entry == null) { prop.put("genUrlProfile", "1"); diff --git a/htroot/api/yacydoc.java b/htroot/api/yacydoc.java index 54777795e..76281b190 100644 --- a/htroot/api/yacydoc.java +++ b/htroot/api/yacydoc.java @@ -36,7 +36,6 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader.FileType; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.logging.Log; @@ -48,7 +47,6 @@ import net.yacy.server.serverSwitch; import com.hp.hpl.jena.rdf.model.Model; import com.hp.hpl.jena.rdf.model.RDFNode; - public class yacydoc { public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { @@ -106,7 +104,7 @@ public class yacydoc { if (entry.url() == null) { return prop; } - final URIMetadata le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.fulltext().getMetadata(entry.referrerHash()); + final URIMetadataNode le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.fulltext().getMetadata(entry.referrerHash()); prop.putXML("dc_title", entry.dc_title()); prop.putXML("dc_creator", entry.dc_creator()); diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index 662bb2941..fdf952781 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -34,7 +34,6 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.crawler.data.ResultURLs; import net.yacy.crawler.data.ResultURLs.EventOrigin; import net.yacy.crawler.data.ZURL.FailCategory; -import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; import net.yacy.peers.Protocol; @@ -116,7 +115,7 @@ public final class crawlReceipt { } // generating a new loaded URL entry - final URIMetadata entry = URIMetadataRow.importEntry(propStr); + final URIMetadataRow entry = URIMetadataRow.importEntry(propStr); if (entry == null) { if (log.isWarning()) log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) from peer " + iam + "\n\tURL properties: "+ propStr); prop.put("delay", "3600"); @@ -148,7 +147,7 @@ public final class crawlReceipt { if ("fill".equals(result)) try { // put new entry into database sb.index.fulltext().putMetadata(entry); - ResultURLs.stack(entry, youare.getBytes(), iam.getBytes(), EventOrigin.REMOTE_RECEIPTS); + ResultURLs.stack(ASCII.String(entry.url().hash()), entry.url().getHost(), youare.getBytes(), iam.getBytes(), EventOrigin.REMOTE_RECEIPTS); sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work has been done if (log.isInfo()) log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + ASCII.String(entry.hash()) + ":" + entry.url().toNormalform(false)); diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index 227d139c4..acfa99138 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -30,6 +30,7 @@ import java.io.IOException; import java.text.ParseException; import net.yacy.cora.date.GenericFormatter; +import net.yacy.cora.document.ASCII; import net.yacy.cora.document.RSSMessage; import net.yacy.cora.protocol.RequestHeader; import net.yacy.crawler.data.ResultURLs; @@ -149,7 +150,7 @@ public final class transferURL { if (Network.log.isFine()) Network.log.logFine("Accepting URL " + i + "/" + urlc + " from peer " + otherPeerName + ": " + lEntry.url().toNormalform(true)); try { sb.index.fulltext().putMetadata(lEntry); - ResultURLs.stack(lEntry, iam.getBytes(), iam.getBytes(), EventOrigin.DHT_TRANSFER); + ResultURLs.stack(ASCII.String(lEntry.url().hash()), lEntry.url().getHost(), iam.getBytes(), iam.getBytes(), EventOrigin.DHT_TRANSFER); if (Network.log.isFine()) Network.log.logFine("transferURL: received URL '" + lEntry.url().toNormalform(false) + "' from peer " + otherPeerName); received++; } catch (final IOException e) { diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index ebfab47b9..3f422f927 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -1976,16 +1976,18 @@ public class MultiProtocolURI implements Serializable, Comparable resultStack = getStack(stackType); if (resultStack != null) { - resultStack.put(ASCII.String(urlEntry.hash()), new InitExecEntry(initiatorHash, executorHash)); + resultStack.put(urlhash, new InitExecEntry(initiatorHash, executorHash)); } } catch (final Exception ex) { System.out.println("INTERNAL ERROR in newEntry/2: " + ex.toString()); @@ -116,7 +108,7 @@ public final class ResultURLs { try { final ScoreMap domains = getDomains(stackType); if (domains != null) { - domains.inc(urlEntry.url().getHost()); + domains.inc(hostname); } } catch (final Exception ex) { System.out.println("INTERNAL ERROR in newEntry/3: " + ex.toString()); @@ -216,23 +208,4 @@ public final class ResultURLs { return true; } - /** - * test and benchmark - * @param args - */ - public static void main(final String[] args) { - try { - final DigestURI url = new DigestURI("http", "www.yacy.net", 80, "/"); - final URIMetadata urlRef = new URIMetadataRow(url, "YaCy Homepage", "", "", "", 0.0d, 0.0d, new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), UTF8.getBytes("de"), 0, 0, 0, 0, 0, 0, new String[0]); - final EventOrigin stackNo = EventOrigin.LOCAL_CRAWLING; - System.out.println("valid test:\n======="); - // add - stack(urlRef, urlRef.hash(), url.hash(), stackNo); - // size - System.out.println("size of stack:\t"+ getStackSize(stackNo)); - } catch (final MalformedURLException e) { - Log.logException(e); - } - } - } diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 788adbb44..a64e3915f 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -293,7 +293,7 @@ dc_rights return this.source.toNormalform(true); } - public MultiProtocolURI dc_source() { + public DigestURI dc_source() { return this.source; } diff --git a/source/net/yacy/kelondro/data/meta/DigestURI.java b/source/net/yacy/kelondro/data/meta/DigestURI.java index fb65695f6..332b2948a 100644 --- a/source/net/yacy/kelondro/data/meta/DigestURI.java +++ b/source/net/yacy/kelondro/data/meta/DigestURI.java @@ -197,6 +197,10 @@ public class DigestURI extends MultiProtocolURI implements Serializable { return this.hash; } + public String hosthash() { + return ASCII.String(this.hash(), 6, 6); + } + /** * calculated YaCy-Hash of this URI * diff --git a/source/net/yacy/kelondro/data/meta/URIMetadata.java b/source/net/yacy/kelondro/data/meta/URIMetadata.java deleted file mode 100644 index 17f6c6e6a..000000000 --- a/source/net/yacy/kelondro/data/meta/URIMetadata.java +++ /dev/null @@ -1,132 +0,0 @@ -/** - * URIMetadata - * Copyright 2012 by Michael Peter Christen - * First released 3.4.2012 at http://yacy.net - * - * This file is part of YaCy Content Integration - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program in the file lgpl21.txt - * If not, see . - */ - -package net.yacy.kelondro.data.meta; - -import java.util.Date; -import java.util.regex.Pattern; - -import net.yacy.crawler.retrieval.Request; -import net.yacy.kelondro.data.word.WordReference; -import net.yacy.kelondro.util.Bitfield; - - -public interface URIMetadata { - - /** - * The hash of a URIReference is a unique key for the stored URL. - * It is in fact equal to url().hash() - * @return the hash of the stored url - */ - public byte[] hash(); - - /** - * the second half of a uri hash is the host hash - * @return - */ - public String hosthash(); - - /** - * The modification date of the URIReference is given if - * the record was created first and is defined with the - * creation date. If the record is modified later, the date shall change. - * @return the modification date of this record - */ - public Date moddate(); - - /** - * The DigestURI is the payload of the URIReference - * @return the url as DigestURI with assigned URL hash according to the record hash - */ - public DigestURI url(); - - /** - * check if the url matches agains a given matcher - * @param matcher - * @return true if the url() matches - */ - public boolean matches(final Pattern matcher); - - /** - * produce a visible representation of the record - * @return a string for the url() - */ - @Override - public String toString(); - - public String dc_title(); - - public String dc_creator(); - - public String dc_publisher(); - - public String dc_subject(); - - public double lat(); - - public double lon(); - - public long ranking(); - - public Date loaddate(); - - public Date freshdate(); - - public String md5(); - - public char doctype(); - - public byte[] language(); - - public int size(); - - public Bitfield flags(); - - public int wordCount(); - - public int llocal(); - - public int lother(); - - public int limage(); - - public int laudio(); - - public int lvideo(); - - public int lapp(); - - public String snippet(); - - public String[] collections(); - - public WordReference word(); - - public boolean isOlder(final URIMetadata other); - - public String toString(final String snippet); - - public byte[] referrerHash(); - - public Request toBalancerEntry(final String initiatorHash); - -} diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 6bc410f80..0c6ae6a9f 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -35,7 +35,6 @@ import net.yacy.cora.federate.solr.SolrType; import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.order.Base64Order; -import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.Response; import net.yacy.document.Condenser; import net.yacy.kelondro.data.word.WordReference; @@ -43,7 +42,9 @@ import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.Bitfield; import net.yacy.utils.crypt; +import org.apache.solr.client.solrj.util.ClientUtils; import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrInputDocument; /** @@ -51,7 +52,7 @@ import org.apache.solr.common.SolrDocument; * The purpose of this object is the migration from the old metadata structure to solr document. * Future implementations should try to replace URIMetadata objects completely by SolrDocument objects */ -public class URIMetadataNode implements URIMetadata { +public class URIMetadataNode { private byte[] hash = null; private String urlRaw = null, keywords = null; @@ -64,6 +65,10 @@ public class URIMetadataNode implements URIMetadata { private String snippet = null; private WordReference word = null; // this is only used if the url is transported via remote search requests + public URIMetadataNode(final SolrInputDocument doc) { + this(ClientUtils.toSolrDocument(doc)); + } + public URIMetadataNode(final SolrDocument doc) { this.doc = doc; this.snippet = ""; @@ -79,123 +84,58 @@ public class URIMetadataNode implements URIMetadata { } } - public URIMetadataNode(final SolrDocument doc, final WordReference searchedWord, final long ranking) { - this(doc); + public URIMetadataNode(final SolrInputDocument doc, final WordReference searchedWord, final long ranking) { + this(ClientUtils.toSolrDocument(doc)); this.word = searchedWord; this.ranking = ranking; } - public URIMetadataRow toRow() { - return URIMetadataRow.importEntry(this.toString()); + public URIMetadataNode(final SolrDocument doc, final WordReference searchedWord, final long ranking) { + this(doc); + this.word = searchedWord; + this.ranking = ranking; } public SolrDocument getDocument() { return this.doc; } - private int getInt(YaCySchema field) { - assert !field.isMultiValued(); - assert field.getType() == SolrType.integer; - Integer x = (Integer) this.doc.getFieldValue(field.name()); - if (x == null) return 0; - return x.intValue(); - } - - private Date getDate(YaCySchema field) { - assert !field.isMultiValued(); - assert field.getType() == SolrType.date; - Date x = (Date) this.doc.getFieldValue(field.name()); - if (x == null) return new Date(0); - Date now = new Date(); - return x.after(now) ? now : x; - } - - private String getString(YaCySchema field) { - assert !field.isMultiValued(); - assert field.getType() == SolrType.string || field.getType() == SolrType.text_general || field.getType() == SolrType.text_en_splitting_tight; - Object x = this.doc.getFieldValue(field.name()); - if (x == null) return ""; - if (x instanceof ArrayList) { - @SuppressWarnings("unchecked") - ArrayList xa = (ArrayList) x; - return xa.size() == 0 ? "" : xa.get(0); - } - return (String) x; - } - - @SuppressWarnings("unchecked") - private ArrayList getStringList(YaCySchema field) { - assert field.isMultiValued(); - assert field.getType() == SolrType.string || field.getType() == SolrType.text_general; - Object r = this.doc.getFieldValue(field.name()); - if (r == null) return new ArrayList(0); - if (r instanceof ArrayList) { - return (ArrayList) r; - } - ArrayList a = new ArrayList(1); - a.add((String) r); - return a; - } - - @SuppressWarnings("unchecked") - private ArrayList getIntList(YaCySchema field) { - assert field.isMultiValued(); - assert field.getType() == SolrType.integer; - Object r = this.doc.getFieldValue(field.name()); - if (r == null) return new ArrayList(0); - if (r instanceof ArrayList) { - return (ArrayList) r; - } - ArrayList a = new ArrayList(1); - a.add((Integer) r); - return a; - } - - @Override public byte[] hash() { return this.hash; } - @Override public String hosthash() { String hosthash = (String) this.doc.getFieldValue(YaCySchema.host_id_s.name()); if (hosthash == null) hosthash = ASCII.String(this.hash, 6, 6); return hosthash; } - @Override public Date moddate() { return getDate(YaCySchema.last_modified); } - @Override public DigestURI url() { return this.url; } - @Override public boolean matches(Pattern matcher) { return matcher.matcher(this.urlRaw.toLowerCase()).matches(); } - @Override public String dc_title() { ArrayList a = getStringList(YaCySchema.title); if (a == null || a.size() == 0) return ""; return a.get(0); } - @Override public String dc_creator() { return getString(YaCySchema.author); } - @Override public String dc_publisher() { return getString(YaCySchema.publisher_t); } - @Override public String dc_subject() { if (this.keywords == null) { this.keywords = getString(YaCySchema.keywords); @@ -203,7 +143,6 @@ public class URIMetadataNode implements URIMetadata { return this.keywords; } - @Override public double lat() { if (this.lat == Double.NaN) { this.lon = 0.0d; @@ -220,60 +159,49 @@ public class URIMetadataNode implements URIMetadata { return this.lat; } - @Override public double lon() { if (this.lon == Double.NaN) lat(); return this.lon; } - @Override public long ranking() { return this.ranking; } - @Override public Date loaddate() { return getDate(YaCySchema.load_date_dt); } - @Override public Date freshdate() { return getDate(YaCySchema.fresh_date_dt); } - @Override public String md5() { return getString(YaCySchema.md5_s); } - @Override public char doctype() { ArrayList a = getStringList(YaCySchema.content_type); if (a == null || a.size() == 0) return Response.docType(url()); return Response.docType(a.get(0)); } - @Override public byte[] language() { String language = getString(YaCySchema.language_s); if (language == null || language.length() == 0) return ASCII.getBytes("en"); return UTF8.getBytes(language); } - - @Override public byte[] referrerHash() { ArrayList referrer = getStringList(YaCySchema.referrer_id_txt); if (referrer == null || referrer.size() == 0) return null; return ASCII.getBytes(referrer.get(0)); } - @Override public int size() { return getInt(YaCySchema.size_i); } - @Override public Bitfield flags() { if (flags == null) { this.flags = new Bitfield(); @@ -287,22 +215,18 @@ public class URIMetadataNode implements URIMetadata { return this.flags; } - @Override public int wordCount() { return getInt(YaCySchema.wordcount_i); } - @Override public int llocal() { return getInt(YaCySchema.inboundlinkscount_i); } - @Override public int lother() { return getInt(YaCySchema.outboundlinkscount_i); } - @Override public int limage() { if (this.imagec == -1) { this.imagec = getInt(YaCySchema.imagescount_i); @@ -310,7 +234,6 @@ public class URIMetadataNode implements URIMetadata { return this.imagec; } - @Override public int laudio() { if (this.audioc == -1) { this.audioc = getInt(YaCySchema.audiolinkscount_i); @@ -318,7 +241,6 @@ public class URIMetadataNode implements URIMetadata { return this.audioc; } - @Override public int lvideo() { if (this.videoc == -1) { this.videoc = getInt(YaCySchema.videolinkscount_i); @@ -326,7 +248,6 @@ public class URIMetadataNode implements URIMetadata { return this.videoc; } - @Override public int lapp() { if (this.appc == -1) { this.appc = getInt(YaCySchema.videolinkscount_i); @@ -348,24 +269,20 @@ public class URIMetadataNode implements URIMetadata { return getInt(YaCySchema.url_chars_i); } - @Override public String snippet() { return this.snippet; } - @Override public String[] collections() { ArrayList a = getStringList(YaCySchema.collection_sxt); return a.toArray(new String[a.size()]); } - @Override public WordReference word() { return this.word; } - @Override - public boolean isOlder(URIMetadata other) { + public boolean isOlder(URIMetadataRow other) { if (other == null) return false; final Date tmoddate = moddate(); final Date omoddate = other.moddate(); @@ -379,7 +296,7 @@ public class URIMetadataNode implements URIMetadata { return false; } - protected static StringBuilder corePropList(URIMetadata md) { + private static StringBuilder corePropList(URIMetadataNode md) { // generate a parseable string; this is a simple property-list final StringBuilder s = new StringBuilder(300); @@ -427,7 +344,6 @@ public class URIMetadataNode implements URIMetadata { * the toString format must be completely identical to URIMetadataRow because that is used * to transport the data over p2p connections. */ - @Override public String toString(String snippet) { // add information needed for remote transport final StringBuilder core = corePropList(this); @@ -457,19 +373,65 @@ public class URIMetadataNode implements URIMetadata { core.append('}'); return core.toString(); } + + private int getInt(YaCySchema field) { + assert !field.isMultiValued(); + assert field.getType() == SolrType.integer; + Object x = this.doc.getFieldValue(field.name()); + if (x == null) return 0; + if (x instanceof Integer) return ((Integer) x).intValue(); + if (x instanceof Long) return ((Long) x).intValue(); + return 0; + } - @Override - public Request toBalancerEntry(final String initiatorHash) { - return new Request( - ASCII.getBytes(initiatorHash), - url(), - referrerHash(), - dc_title(), - moddate(), - null, - 0, - 0, - 0, - 0); + private Date getDate(YaCySchema field) { + assert !field.isMultiValued(); + assert field.getType() == SolrType.date; + Date x = (Date) this.doc.getFieldValue(field.name()); + if (x == null) return new Date(0); + Date now = new Date(); + return x.after(now) ? now : x; } + + private String getString(YaCySchema field) { + assert !field.isMultiValued(); + assert field.getType() == SolrType.string || field.getType() == SolrType.text_general || field.getType() == SolrType.text_en_splitting_tight; + Object x = this.doc.getFieldValue(field.name()); + if (x == null) return ""; + if (x instanceof ArrayList) { + @SuppressWarnings("unchecked") + ArrayList xa = (ArrayList) x; + return xa.size() == 0 ? "" : xa.get(0); + } + return (String) x; + } + + @SuppressWarnings("unchecked") + private ArrayList getStringList(YaCySchema field) { + assert field.isMultiValued(); + assert field.getType() == SolrType.string || field.getType() == SolrType.text_general; + Object r = this.doc.getFieldValue(field.name()); + if (r == null) return new ArrayList(0); + if (r instanceof ArrayList) { + return (ArrayList) r; + } + ArrayList a = new ArrayList(1); + a.add((String) r); + return a; + } + + @SuppressWarnings("unchecked") + private ArrayList getIntList(YaCySchema field) { + assert field.isMultiValued(); + assert field.getType() == SolrType.integer; + Object r = this.doc.getFieldValue(field.name()); + if (r == null) return new ArrayList(0); + if (r instanceof ArrayList) { + return (ArrayList) r; + } + ArrayList a = new ArrayList(1); + a.add((Integer) r); + return a; + } + } \ No newline at end of file diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java index d191b2dff..c771ded1b 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java @@ -54,7 +54,7 @@ import net.yacy.kelondro.util.kelondroException; import net.yacy.search.query.QueryParams; import net.yacy.utils.crypt; -public class URIMetadataRow implements URIMetadata { +public class URIMetadataRow { // this object stores attributes for URL entries @@ -104,120 +104,14 @@ public class URIMetadataRow implements URIMetadata { private final Row.Entry entry; private final String snippet; - private final String[] collections; private WordReference word; // this is only used if the url is transported via remote search requests - private final long ranking; // during generation of a search result this value is set private Components comp; - public URIMetadataRow() { - // create a dummy entry, good to produce poison objects - this.entry = rowdef.newEntry(); - this.snippet = ""; - this.collections = new String[0]; - this.word = null; - this.ranking = 0; - this.comp = null; - } - - public URIMetadataRow( - final DigestURI url, - final String dc_title, - final String dc_creator, - final String dc_subject, - final String dc_publisher, - final double lon, final double lat, // decimal degrees as in WGS84; if unknown both values may be 0.0d; - final Date mod, - final Date load, - final Date fresh, - final String referrer, - final byte[] md5, - final long size, - final int wc, - final char dt, - final Bitfield flags, - final byte[] lang, - final int llocal, - final int lother, - final int laudio, - final int limage, - final int lvideo, - final int lapp, - final String[] collections) { - // create new entry - this.entry = rowdef.newEntry(); - this.entry.setCol(col_hash, url.hash()); - this.entry.setCol(col_comp, encodeComp(url, dc_title, dc_creator, dc_subject, dc_publisher, lat, lon)); - encodeDate(col_mod, mod); - encodeDate(col_load, load); - encodeDate(col_fresh, fresh); - this.entry.setCol(col_referrer, (referrer == null) ? null : UTF8.getBytes(referrer)); - this.entry.setCol(col_md5, md5); - this.entry.setCol(col_size, size); - this.entry.setCol(col_wc, wc); - this.entry.setCol(col_dt, new byte[]{(byte) dt}); - this.entry.setCol(col_flags, flags.bytes()); - this.entry.setCol(col_lang, lang); - this.entry.setCol(col_llocal, llocal); - this.entry.setCol(col_lother, lother); - this.entry.setCol(col_limage, limage); - this.entry.setCol(col_laudio, laudio); - this.entry.setCol(col_lvideo, lvideo); - this.entry.setCol(col_lapp, lapp); - //System.out.println("===DEBUG=== " + load.toString() + ", " + decodeDate(col_load).toString()); - this.snippet = ""; - this.collections = collections; - this.word = null; - this.ranking = 0; - this.comp = null; - } - - private void encodeDate(final int col, final Date d) { - // calculates the number of days since 1.1.1970 and returns this as 4-byte array - // 86400000 is the number of milliseconds in one day - long time = d.getTime(); - long now = System.currentTimeMillis(); - this.entry.setCol(col, NaturalOrder.encodeLong((time > now ? now : time) / 86400000L, 4)); - } - - private Date decodeDate(final int col) { - final long t = this.entry.getColLong(col); - /*if (t < 14600) */return new Date(86400000L * t); // time was stored as number of days since epoch - /* - if (t < 350400) return new Date(3600000L * t); // hours since epoch - if (t < 21024000) return new Date(60000L * t); // minutes since epoch - */ - } - - private static byte[] encodeComp( - final DigestURI url, - final String dc_title, - final String dc_creator, - final String dc_subject, - final String dc_publisher, - final double lat, - final double lon) { - final CharBuffer s = new CharBuffer(3600, 360); - s.append(url.toNormalform(true)).appendLF(); - s.append(dc_title).appendLF(); - if (dc_creator.length() > 80) s.append(dc_creator, 0, 80); else s.append(dc_creator); - s.appendLF(); - if (dc_subject.length() > 120) s.append(dc_subject, 0, 120); else s.append(dc_subject); - s.appendLF(); - if (dc_publisher.length() > 80) s.append(dc_publisher, 0, 80); else s.append(dc_publisher); - s.appendLF(); - if (lon == 0.0f && lat == 0.0f) s.appendLF(); else s.append(Double.toString(lat)).append(',').append(Double.toString(lon)).appendLF(); - String s0 = s.toString(); - s.close(); - return UTF8.getBytes(s0); - } - - public URIMetadataRow(final Row.Entry entry, final WordReference searchedWord, final long ranking) { + public URIMetadataRow(final Row.Entry entry, final WordReference searchedWord) { this.entry = entry; this.snippet = ""; this.word = searchedWord; - this.ranking = ranking; this.comp = null; - this.collections = new String[0]; } private URIMetadataRow(final Properties prop) throws kelondroException { @@ -278,17 +172,15 @@ public class URIMetadataRow implements URIMetadata { this.entry.setCol(col_lapp, Integer.parseInt(prop.getProperty("lapp", "0"))); this.snippet = crypt.simpleDecode(prop.getProperty("snippet", "")); this.word = null; - if (prop.containsKey("word")) throw new kelondroException("old database structure is not supported"); if (prop.containsKey("wi")) { this.word = new WordReferenceVars(new WordReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", "")))); } - this.ranking = 0; this.comp = null; - this.collections = new String[0]; } - + public static URIMetadataRow importEntry(final String propStr) { - if (propStr == null || (!propStr.isEmpty() && propStr.charAt(0) != '{') || !propStr.endsWith("}")) { + if (propStr == null || propStr.isEmpty() || propStr.charAt(0) != '{' || !propStr.endsWith("}")) { + Log.logSevere("URIMetadataRow", "importEntry: propStr is not proper: " + propStr); return null; } try { @@ -300,7 +192,46 @@ public class URIMetadataRow implements URIMetadata { } } - @Override + private void encodeDate(final int col, final Date d) { + // calculates the number of days since 1.1.1970 and returns this as 4-byte array + // 86400000 is the number of milliseconds in one day + long time = d.getTime(); + long now = System.currentTimeMillis(); + this.entry.setCol(col, NaturalOrder.encodeLong((time > now ? now : time) / 86400000L, 4)); + } + + private Date decodeDate(final int col) { + final long t = this.entry.getColLong(col); + /*if (t < 14600) */return new Date(86400000L * t); // time was stored as number of days since epoch + /* + if (t < 350400) return new Date(3600000L * t); // hours since epoch + if (t < 21024000) return new Date(60000L * t); // minutes since epoch + */ + } + + private static byte[] encodeComp( + final DigestURI url, + final String dc_title, + final String dc_creator, + final String dc_subject, + final String dc_publisher, + final double lat, + final double lon) { + final CharBuffer s = new CharBuffer(3600, 360); + s.append(url.toNormalform(true)).appendLF(); + s.append(dc_title).appendLF(); + if (dc_creator.length() > 80) s.append(dc_creator, 0, 80); else s.append(dc_creator); + s.appendLF(); + if (dc_subject.length() > 120) s.append(dc_subject, 0, 120); else s.append(dc_subject); + s.appendLF(); + if (dc_publisher.length() > 80) s.append(dc_publisher, 0, 80); else s.append(dc_publisher); + s.appendLF(); + if (lon == 0.0f && lat == 0.0f) s.appendLF(); else s.append(Double.toString(lat)).append(',').append(Double.toString(lon)).appendLF(); + String s0 = s.toString(); + s.close(); + return UTF8.getBytes(s0); + } + public byte[] hash() { // return a url-hash, based on the md5 algorithm // the result is a String of 12 bytes within a 72-bit space @@ -310,54 +241,40 @@ public class URIMetadataRow implements URIMetadata { } private String hostHash = null; - @Override public String hosthash() { if (this.hostHash != null) return this.hostHash; this.hostHash = ASCII.String(this.entry.getPrimaryKeyBytes(), 6, 6); return this.hostHash; } - @Override - public long ranking() { - return this.ranking; - } - - @Override public boolean matches(final Pattern matcher) { return this.metadata().matches(matcher); } - @Override public DigestURI url() { return this.metadata().url(); } - @Override public String dc_title() { return this.metadata().dc_title(); } - @Override public String dc_creator() { return this.metadata().dc_creator(); } - @Override public String dc_publisher() { return this.metadata().dc_publisher(); } - @Override public String dc_subject() { return this.metadata().dc_subject(); } - @Override public double lat() { return this.metadata().lat(); } - @Override public double lon() { return this.metadata().lon(); } @@ -379,22 +296,18 @@ public class URIMetadataRow implements URIMetadata { return this.comp; } - @Override public Date moddate() { return decodeDate(col_mod); } - @Override public Date loaddate() { return decodeDate(col_load); } - @Override public Date freshdate() { return decodeDate(col_fresh); } - @Override public byte[] referrerHash() { // return the creator's hash or null if there is none // FIXME: There seem to be some malformed entries in the databasees like "null\0\0\0\0\0\0\0\0" @@ -408,18 +321,15 @@ public class URIMetadataRow implements URIMetadata { return r; } - @Override public String md5() { // returns the md5 in hex representation return Digest.encodeHex(this.entry.getColBytes(col_md5, true)); } - @Override public char doctype() { return (char) this.entry.getColByte(col_dt); } - @Override public byte[] language() { byte[] b = this.entry.getColBytes(col_lang, true); if ((b == null || b[0] == (byte)'[') && this.metadata().url != null) { @@ -430,100 +340,98 @@ public class URIMetadataRow implements URIMetadata { return b; } - @Override public int size() { return (int) this.entry.getColLong(col_size); } - @Override public Bitfield flags() { return new Bitfield(this.entry.getColBytes(col_flags, true)); } - @Override public int wordCount() { return (int) this.entry.getColLong(col_wc); } - @Override public int llocal() { return (int) this.entry.getColLong(col_llocal); } - @Override public int lother() { return (int) this.entry.getColLong(col_lother); } - @Override public int limage() { return (int) this.entry.getColLong(col_limage); } - @Override public int laudio() { return (int) this.entry.getColLong(col_laudio); } - @Override public int lvideo() { return (int) this.entry.getColLong(col_lvideo); } - @Override public int lapp() { return (int) this.entry.getColLong(col_lapp); } - @Override public String snippet() { // the snippet may appear here if the url was transported in a remote search // it will not be saved anywhere, but can only be requested here return this.snippet; } - @Override - public String[] collections() { - return this.collections; - } + - @Override public WordReference word() { return this.word; } - @Override - public boolean isOlder(final URIMetadata other) { - if (other == null) return false; - final Date tmoddate = moddate(); - final Date omoddate = other.moddate(); - if (tmoddate.before(omoddate)) return true; - if (tmoddate.equals(omoddate)) { - final Date tloaddate = loaddate(); - final Date oloaddate = other.loaddate(); - if (tloaddate.before(oloaddate)) return true; - if (tloaddate.equals(oloaddate)) return true; - } - return false; - } + private static StringBuilder corePropList(URIMetadataRow md) { + // generate a parseable string; this is a simple property-list + final StringBuilder s = new StringBuilder(300); - @Override - public String toString(final String snippet) { - // add information needed for remote transport - final StringBuilder core = URIMetadataNode.corePropList(this); - if (core == null) - return null; - - core.ensureCapacity(core.length() + snippet.length() * 2); - core.insert(0, "{"); - core.append(",snippet=").append(crypt.simpleEncode(snippet)); - core.append("}"); + // create new formatters to make concurrency possible + final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute); - return core.toString(); - //return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}"; + try { + s.append("hash=").append(ASCII.String(md.hash())); + s.append(",url=").append(crypt.simpleEncode(md.url().toNormalform(true))); + s.append(",descr=").append(crypt.simpleEncode(md.dc_title())); + s.append(",author=").append(crypt.simpleEncode(md.dc_creator())); + s.append(",tags=").append(crypt.simpleEncode(Tagging.cleanTagFromAutotagging(md.dc_subject()))); + s.append(",publisher=").append(crypt.simpleEncode(md.dc_publisher())); + s.append(",lat=").append(md.lat()); + s.append(",lon=").append(md.lon()); + s.append(",mod=").append(formatter.format(md.moddate())); + s.append(",load=").append(formatter.format(md.loaddate())); + s.append(",fresh=").append(formatter.format(md.freshdate())); + s.append(",referrer=").append(md.referrerHash() == null ? "" : ASCII.String(md.referrerHash())); + s.append(",md5=").append(md.md5()); + s.append(",size=").append(md.size()); + s.append(",wc=").append(md.wordCount()); + s.append(",dt=").append(md.doctype()); + s.append(",flags=").append(md.flags().exportB64()); + s.append(",lang=").append(md.language() == null ? "EN" : UTF8.String(md.language())); + s.append(",llocal=").append(md.llocal()); + s.append(",lother=").append(md.lother()); + s.append(",limage=").append(md.limage()); + s.append(",laudio=").append(md.laudio()); + s.append(",lvideo=").append(md.lvideo()); + s.append(",lapp=").append(md.lapp()); + if (md.word() != null) { + // append also word properties + final String wprop = md.word().toPropertyForm(); + s.append(",wi=").append(Base64Order.enhancedCoder.encodeString(wprop)); + } + return s; + } catch (final Throwable e) { + Log.logException(e); + return null; + } } - @Override public Request toBalancerEntry(final String initiatorHash) { return new Request( ASCII.getBytes(initiatorHash), @@ -545,7 +453,7 @@ public class URIMetadataRow implements URIMetadata { */ @Override public String toString() { - final StringBuilder core = URIMetadataNode.corePropList(this); + final StringBuilder core = corePropList(this); if (core == null) return null; core.insert(0, "{"); diff --git a/source/net/yacy/kelondro/data/word/WordReferenceVars.java b/source/net/yacy/kelondro/data/word/WordReferenceVars.java index ff12169f4..01299beed 100644 --- a/source/net/yacy/kelondro/data/word/WordReferenceVars.java +++ b/source/net/yacy/kelondro/data/word/WordReferenceVars.java @@ -37,7 +37,7 @@ import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.order.Base64Order; -import net.yacy.kelondro.data.meta.URIMetadata; +import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.index.Row; import net.yacy.kelondro.index.Row.Entry; import net.yacy.kelondro.logging.Log; @@ -71,7 +71,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc private final Queue positions; private double termFrequency; - public WordReferenceVars(final URIMetadata md) { + public WordReferenceVars(final URIMetadataRow md) { this.language = md.language(); this.flags = md.flags(); this.lastModified = md.moddate().getTime(); diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index 4471edd45..613c127f1 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -81,7 +81,6 @@ import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.util.SpaceExceededException; import net.yacy.crawler.data.ResultURLs; import net.yacy.crawler.data.ResultURLs.EventOrigin; -import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; @@ -812,7 +811,8 @@ public final class Protocol try { event.getQuery().getSegment().fulltext().putMetadata(urlEntry); ResultURLs.stack( - urlEntry, + ASCII.String(urlEntry.url().hash()), + urlEntry.url().getHost(), event.peers.mySeed().hash.getBytes(), UTF8.getBytes(target.hash), EventOrigin.QUERIES); @@ -1103,7 +1103,8 @@ public final class Protocol try { event.getQuery().getSegment().fulltext().putDocument(ClientUtils.toSolrInputDocument(doc)); ResultURLs.stack( - urlEntry, + ASCII.String(urlEntry.url().hash()), + urlEntry.url().getHost(), event.peers.mySeed().hash.getBytes(), UTF8.getBytes(target.hash), EventOrigin.QUERIES); @@ -1187,7 +1188,7 @@ public final class Protocol final String process, final String result, final String reason, - final URIMetadata entry, + final URIMetadataNode entry, final String wordhashes) { assert (target != null); assert (mySeed != null); @@ -1225,8 +1226,7 @@ public final class Protocol // send request try { // prepare request - final Map parts = - basicRequestParts(Switchboard.getSwitchboard(), target.hash, salt); + final Map parts = basicRequestParts(Switchboard.getSwitchboard(), target.hash, salt); parts.put("process", UTF8.StringBody(process)); parts.put("urlhash", UTF8.StringBody(((entry == null) ? "" : ASCII.String(entry.hash())))); parts.put("result", UTF8.StringBody(result)); @@ -1266,7 +1266,7 @@ public final class Protocol public static String transferIndex( final Seed targetSeed, final ReferenceContainerCache indexes, - final SortedMap urlCache, + final SortedMap urlCache, final boolean gzipBody, final int timeout) { @@ -1327,7 +1327,7 @@ public final class Protocol } // all url's known // extract the urlCache from the result - final URIMetadata[] urls = new URIMetadata[uhs.length]; + final URIMetadataNode[] urls = new URIMetadataNode[uhs.length]; for ( int i = 0; i < uhs.length; i++ ) { urls[i] = urlCache.get(ASCII.getBytes(uhs[i])); if ( urls[i] == null ) { @@ -1435,7 +1435,7 @@ public final class Protocol private static Map transferURL( final Seed targetSeed, - final URIMetadata[] urls, + final URIMetadataNode[] urls, boolean gzipBody, final int timeout) { // this post a message to the remote message board @@ -1457,7 +1457,7 @@ public final class Protocol String resource; int urlc = 0; int urlPayloadSize = 0; - for ( final URIMetadata url : urls ) { + for ( final URIMetadataNode url : urls ) { if ( url != null ) { resource = url.toString(); //System.out.println("*** DEBUG resource = " + resource); diff --git a/source/net/yacy/peers/Transmission.java b/source/net/yacy/peers/Transmission.java index 9928ab7db..61e27900d 100644 --- a/source/net/yacy/peers/Transmission.java +++ b/source/net/yacy/peers/Transmission.java @@ -36,7 +36,6 @@ import net.yacy.cora.order.Base64Order; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.SpaceExceededException; import net.yacy.kelondro.data.meta.URIMetadataNode; -import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceRow; @@ -89,7 +88,7 @@ public class Transmission { */ private final byte[] primaryTarget; private final ReferenceContainerCache containers; - private final SortedMap references; + private final SortedMap references; private final HandleSet badReferences; private final List targets; private int hit, miss; @@ -105,7 +104,7 @@ public class Transmission { super(); this.primaryTarget = primaryTarget; this.containers = new ReferenceContainerCache(Segment.wordReferenceFactory, Segment.wordOrder, Word.commonHashLength); - this.references = new TreeMap(Base64Order.enhancedCoder); + this.references = new TreeMap(Base64Order.enhancedCoder); this.badReferences = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0); this.targets = targets; this.hit = 0; @@ -179,7 +178,7 @@ public class Transmission { notFoundx.add(e.urlhash()); this.badReferences.put(e.urlhash()); } else { - this.references.put(e.urlhash(), r.toRow()); + this.references.put(e.urlhash(), r); } } // now delete all references that were not found diff --git a/source/net/yacy/repository/Blacklist.java b/source/net/yacy/repository/Blacklist.java index 0546e9b14..19652e757 100644 --- a/source/net/yacy/repository/Blacklist.java +++ b/source/net/yacy/repository/Blacklist.java @@ -46,7 +46,7 @@ import java.util.regex.PatternSyntaxException; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.SpaceExceededException; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; +import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.logging.Log; @@ -328,17 +328,19 @@ public class Blacklist { return ret; } + public boolean isListed(final BlacklistType blacklistType, final URIMetadataNode entry) { + return isListed(blacklistType, entry.url()); + } + public boolean isListed(final BlacklistType blacklistType, final URIMetadataRow entry) { + return isListed(blacklistType, entry.url()); + } + /** * Checks whether the given entry is listed in given blacklist type * @param blacklistType The used blacklist * @param entry Entry to be checked - * @return Whether the given entry is blacklisted + * @return Whether the given entry is blacklisted */ - public boolean isListed(final BlacklistType blacklistType, final URIMetadata entry) { - // Call inner method - return isListed(blacklistType, entry.url()); - } - public boolean isListed(final BlacklistType blacklistType, final DigestURI url) { if (url == null) { throw new IllegalArgumentException("url may not be null"); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index b008eb975..bbc3aba32 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -76,6 +76,8 @@ import java.util.zip.GZIPOutputStream; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; +import org.apache.solr.common.SolrInputDocument; + import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.Classification; @@ -146,9 +148,7 @@ import net.yacy.interaction.contentcontrol.ContentControlFilterUpdateThread; import net.yacy.interaction.contentcontrol.ContentControlImportThread; import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataNode; -import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.rwi.ReferenceContainer; @@ -2589,13 +2589,10 @@ public final class Switchboard extends serverSwitch this.log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + url); // STORE WORD INDEX - URIMetadataRow newEntry = + SolrInputDocument newEntry = this.index.storeDocument( url, referrerURL, - queueEntry.lastModified(), - new Date(), - queueEntry.size(), queueEntry.profile(), queueEntry.getResponseHeader(), document, @@ -2628,7 +2625,9 @@ public final class Switchboard extends serverSwitch } // update url result list statistics - ResultURLs.stack(newEntry, // loaded url db entry + ResultURLs.stack( + ASCII.String(url.hash()), // loaded url db entry + url.getHost(), queueEntry.initiator(), // initiator peer hash UTF8.getBytes(this.peers.mySeed().hash), // executor peer hash processCase // process case @@ -2654,8 +2653,7 @@ public final class Switchboard extends serverSwitch initiatorPeer.setAlternativeAddress(this.clusterhashes.get(queueEntry.initiator())); } // start a thread for receipt sending to avoid a blocking here - new Thread(new receiptSending(initiatorPeer, newEntry), "sending receipt to " - + ASCII.String(queueEntry.initiator())).start(); + new Thread(new receiptSending(initiatorPeer, new URIMetadataNode(newEntry)), "sending receipt to " + ASCII.String(queueEntry.initiator())).start(); } } } @@ -2820,9 +2818,9 @@ public final class Switchboard extends serverSwitch public class receiptSending implements Runnable { private final Seed initiatorPeer; - private final URIMetadata reference; + private final URIMetadataNode reference; - public receiptSending(final Seed initiatorPeer, final URIMetadata reference) { + public receiptSending(final Seed initiatorPeer, final URIMetadataNode reference) { this.initiatorPeer = initiatorPeer; this.reference = reference; } diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java index c7539e59e..8f9601486 100644 --- a/source/net/yacy/search/index/DocumentIndex.java +++ b/source/net/yacy/search/index/DocumentIndex.java @@ -30,10 +30,11 @@ import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.util.ArrayList; -import java.util.Date; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; +import org.apache.solr.common.SolrInputDocument; + import net.yacy.cora.document.Classification; import net.yacy.cora.document.UTF8; import net.yacy.document.Condenser; @@ -41,7 +42,6 @@ import net.yacy.document.Document; import net.yacy.document.LibraryProvider; import net.yacy.document.TextParser; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.logging.Log; import net.yacy.search.query.QueryParams; @@ -101,12 +101,12 @@ public class DocumentIndex extends Segment { @Override public void run() { DigestURI f; - URIMetadata[] resultRows; + SolrInputDocument[] resultRows; try { while ( (f = DocumentIndex.this.queue.take()) != poison ) { try { resultRows = add(f); - for ( final URIMetadata resultRow : resultRows ) { + for ( final SolrInputDocument resultRow : resultRows ) { if ( DocumentIndex.this.callback != null ) { if ( resultRow == null ) { DocumentIndex.this.callback.fail(f, "result is null"); @@ -138,7 +138,7 @@ public class DocumentIndex extends Segment { this.queue.clear(); } - private URIMetadata[] add(final DigestURI url) throws IOException { + private SolrInputDocument[] add(final DigestURI url) throws IOException { if ( url == null ) { throw new IOException("file = null"); } @@ -161,7 +161,7 @@ public class DocumentIndex extends Segment { throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage()); } //Document document = Document.mergeDocuments(url, null, documents); - final URIMetadata[] rows = new URIMetadata[documents.length]; + final SolrInputDocument[] rows = new SolrInputDocument[documents.length]; int c = 0; for ( final Document document : documents ) { if (document == null) continue; @@ -170,9 +170,6 @@ public class DocumentIndex extends Segment { super.storeDocument( url, null, - new Date(url.lastModified()), - new Date(), - url.length(), null, null, document, @@ -275,7 +272,7 @@ public class DocumentIndex extends Segment { public interface CallbackListener { - public void commit(DigestURI f, URIMetadata resultRow); + public void commit(DigestURI f, SolrInputDocument resultRow); public void fail(DigestURI f, String failReason); } @@ -296,7 +293,7 @@ public class DocumentIndex extends Segment { System.out.println("using index files at " + segmentPath.getAbsolutePath()); final CallbackListener callback = new CallbackListener() { @Override - public void commit(final DigestURI f, final URIMetadata resultRow) { + public void commit(final DigestURI f, final SolrInputDocument resultRow) { System.out.println("indexed: " + f.toString()); } diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 036aff09a..dcab10d5b 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -50,7 +50,6 @@ import net.yacy.cora.storage.ZIPReader; import net.yacy.cora.storage.ZIPWriter; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.WordReference; @@ -64,7 +63,6 @@ import net.yacy.kelondro.util.MergeIterator; import net.yacy.search.Switchboard; import org.apache.lucene.util.Version; -import org.apache.solr.client.solrj.util.ClientUtils; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; @@ -226,10 +224,10 @@ public final class Fulltext implements Iterable { // slow migration to solr final Row.Entry entry = this.urlIndexFile.remove(urlHash); if (entry == null) return null; - URIMetadataRow row = new URIMetadataRow(entry, wre, weight); + URIMetadataRow row = new URIMetadataRow(entry, wre); SolrInputDocument solrInput = this.solrScheme.metadata2solr(row); this.putDocument(solrInput); - return new URIMetadataNode(ClientUtils.toSolrDocument(solrInput), wre, weight); + return new URIMetadataNode(solrInput, wre, weight); } catch (final IOException e) { Log.logException(e); } @@ -244,9 +242,9 @@ public final class Fulltext implements Iterable { if (this.urlIndexFile != null) this.urlIndexFile.remove(idb); SolrDocument sd = this.solr.get(id); Date now = new Date(); - Date sdDate = this.solrScheme.getDate(sd, YaCySchema.last_modified); - if (sdDate.after(now)) sdDate = now; - Date docDate = this.solrScheme.getDate(doc, YaCySchema.last_modified); + Date sdDate = sd == null ? null : SolrConfiguration.getDate(sd, YaCySchema.last_modified); + if (sdDate == null || sdDate.after(now)) sdDate = now; + Date docDate = SolrConfiguration.getDate(doc, YaCySchema.last_modified); if (docDate.after(now)) docDate = now; if (sd == null || sdDate.before(docDate)) { if (this.solrScheme.contains(YaCySchema.ip_s)) { @@ -263,13 +261,8 @@ public final class Fulltext implements Iterable { if (MemoryControl.shortStatus()) clearCache(); } - public void putMetadata(final URIMetadata entry) throws IOException { - if (entry instanceof URIMetadataNode) { - putDocument(ClientUtils.toSolrInputDocument(((URIMetadataNode) entry).getDocument())); - return; - } - assert entry instanceof URIMetadataRow; - URIMetadataRow row = (URIMetadataRow) entry; + public void putMetadata(final URIMetadataRow entry) throws IOException { + URIMetadataRow row = entry; byte[] idb = row.hash(); String id = ASCII.String(idb); @@ -516,7 +509,7 @@ public final class Fulltext implements Iterable { } } else { final Iterator i = entries(); // iterates indexURLEntry objects - URIMetadata entry; + URIMetadataNode entry; String url; while (i.hasNext()) { entry = i.next(); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 3c6bab11b..e68931ff9 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -35,6 +35,8 @@ import java.util.Properties; import java.util.Set; import java.util.concurrent.BlockingQueue; +import org.apache.solr.common.SolrInputDocument; + import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; @@ -57,7 +59,6 @@ import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.citation.CitationReferenceFactory; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataNode; -import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceFactory; @@ -284,7 +285,7 @@ public class Segment { if (this.urlCitationIndex != null) this.urlCitationIndex.close(); } - private String votedLanguage( + private static String votedLanguage( final DigestURI url, final String urlNormalform, final Document document, @@ -295,23 +296,17 @@ public class Segment { if (language == null) { // no statistics available, we take either the metadata (if given) or the TLD language = (bymetadata == null) ? url.language() : bymetadata; - if (this.log.isFine()) this.log.logFine("LANGUAGE-BY-STATISTICS: " + url + " FAILED, taking " + ((bymetadata == null) ? "TLD" : "metadata") + ": " + language); } else { if (bymetadata == null) { // two possible results: compare and report conflicts - if (language.equals(url.language())) - if (this.log.isFine()) this.log.logFine("LANGUAGE-BY-STATISTICS: " + url + " CONFIRMED - TLD IDENTICAL: " + language); - else { - final String error = "LANGUAGE-BY-STATISTICS: " + url + " CONFLICTING: " + language + " (the language given by the TLD is " + url.language() + ")"; + if (!language.equals(url.language())) { // see if we have a hint in the url that the statistic was right final String u = urlNormalform.toLowerCase(); if (!u.contains("/" + language + "/") && !u.contains("/" + ISO639.country(language).toLowerCase() + "/")) { // no confirmation using the url, use the TLD language = url.language(); - if (this.log.isFine()) this.log.logFine(error + ", corrected using the TLD"); } else { // this is a strong hint that the statistics was in fact correct - if (this.log.isFine()) this.log.logFine(error + ", but the url proves that the statistic is correct"); } } } else { @@ -340,12 +335,9 @@ public class Segment { if (this.termIndex != null) this.termIndex.add(termHash, entry); } - public URIMetadataRow storeDocument( + public SolrInputDocument storeDocument( final DigestURI url, final DigestURI referrerURL, - Date modDate, - final Date loadDate, - final long sourcesize, final CrawlProfile profile, final ResponseHeader responseHeader, final Document document, @@ -359,44 +351,21 @@ public class Segment { // CREATE INDEX // load some document metadata + final Date loadDate = new Date(); final String id = ASCII.String(url.hash()); final String dc_title = document.dc_title(); final String urlNormalform = url.toNormalform(true); final String language = votedLanguage(url, urlNormalform, document, condenser); // identification of the language // STORE URL TO LOADED-URL-DB - if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; // TODO: compare with modTime from responseHeader + Date modDate = responseHeader.lastModified(); + if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; char docType = Response.docType(document.dc_format()); - final URIMetadataRow metadata = new URIMetadataRow( - url, // URL - dc_title, // document description - document.dc_creator(), // author - document.dc_subject(' '), // tags - document.dc_publisher(), // publisher (may be important to get location data) - document.lon(), // decimal degrees as in WGS84; - document.lat(), // if unknown both values may be 0.0d; - modDate, // modification date - loadDate, // loaded date - new Date(loadDate.getTime() + Math.max(0, loadDate.getTime() - modDate.getTime()) / 2), // freshdate, computed with Proxy-TTL formula - (referrerURL == null) ? null : ASCII.String(referrerURL.hash()), // referer hash - new byte[0], // md5 - (int) sourcesize, // size - condenser.RESULT_NUMB_WORDS, // word count - docType, // doctype - condenser.RESULT_FLAGS, // flags - UTF8.getBytes(language), // language - document.inboundLinks().size(), // inbound links - document.outboundLinks().size(), // outbound links - document.getAudiolinks().size(), // laudio - document.getImages().size(), // limage - document.getVideolinks().size(), // lvideo - document.getApplinks().size(), // lapp - profile.collections() // collections - ); - + // STORE TO SOLR + final SolrInputDocument solrInputDoc = this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, condenser, referrerURL, language); try { - this.fulltext.putDocument(this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, condenser, metadata)); + this.fulltext.putDocument(solrInputDoc); } catch ( final IOException e ) { Log.logWarning("SOLR", "failed to send " + urlNormalform + " to solr: " + e.getMessage()); } @@ -487,7 +456,7 @@ public class Segment { } // finished - return metadata; + return solrInputDoc; } public void removeAllUrlReferences(final HandleSet urls, final LoaderDispatcher loader, final CacheStrategy cacheStrategy) { diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index ca3527695..460df3c58 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ b/source/net/yacy/search/index/SolrConfiguration.java @@ -55,13 +55,10 @@ import net.yacy.document.Document; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; -import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.Bitfield; -import org.apache.solr.client.solrj.util.ClientUtils; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrInputDocument; @@ -174,18 +171,12 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (isEmpty() || contains(key)) key.add(doc, value); } - public Date getDate(SolrInputDocument doc, final YaCySchema key) { + public static Date getDate(SolrInputDocument doc, final YaCySchema key) { Date x = (Date) doc.getFieldValue(key.name()); Date now = new Date(); return (x == null) ? new Date(0) : x.after(now) ? now : x; } - public Date getDate(SolrDocument doc, final YaCySchema key) { - Date x = doc == null ? null : (Date) doc.getFieldValue(key.name()); - Date now = new Date(); - return (x == null) ? new Date(0) : x.after(now) ? now : x; - } - /** * save configuration to file and update enum SolrFields * @throws IOException @@ -207,11 +198,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable } catch (final IOException e) {} } - public SolrInputDocument metadata2solr(final URIMetadata md) { - assert md instanceof URIMetadataRow; - if (md instanceof URIMetadataNode) { - return ClientUtils.toSolrInputDocument(((URIMetadataNode) md).getDocument()); - } + public SolrInputDocument metadata2solr(final URIMetadataRow md) { final SolrInputDocument doc = new SolrInputDocument(); final DigestURI digestURI = DigestURI.toDigestURI(md.url()); @@ -339,10 +326,10 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.'); } - public SolrInputDocument yacy2solr(final String id, final CrawlProfile profile, final ResponseHeader header, final Document yacydoc, Condenser condenser, final URIMetadata metadata) { + public SolrInputDocument yacy2solr(final String id, final CrawlProfile profile, final ResponseHeader responseHeader, final Document document, Condenser condenser, DigestURI referrerURL, String language) { // we use the SolrCell design as index scheme final SolrInputDocument doc = new SolrInputDocument(); - final DigestURI digestURI = DigestURI.toDigestURI(yacydoc.dc_source()); + final DigestURI digestURI = DigestURI.toDigestURI(document.dc_source()); boolean allAttr = this.isEmpty(); add(doc, YaCySchema.id, id); if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before) @@ -377,7 +364,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (allAttr || contains(YaCySchema.host_subdomain_s)) add(doc, YaCySchema.host_subdomain_s, subdom); } - List titles = yacydoc.titles(); + List titles = document.titles(); if (allAttr || contains(YaCySchema.title)) add(doc, YaCySchema.title, titles); if (allAttr || contains(YaCySchema.title_count_i)) add(doc, YaCySchema.title_count_i, titles.size()); if (allAttr || contains(YaCySchema.title_chars_val)) { @@ -391,7 +378,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable add(doc, YaCySchema.title_words_val, cv); } - String description = yacydoc.dc_description(); + String description = document.dc_description(); List descriptions = new ArrayList(); for (String s: description.split("\n")) descriptions.add(s); if (allAttr || contains(YaCySchema.description)) add(doc, YaCySchema.description, description); @@ -407,11 +394,11 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable add(doc, YaCySchema.description_words_val, cv); } - if (allAttr || contains(YaCySchema.author)) add(doc, YaCySchema.author, yacydoc.dc_creator()); - if (allAttr || contains(YaCySchema.content_type)) add(doc, YaCySchema.content_type, new String[]{yacydoc.dc_format()}); - if (allAttr || contains(YaCySchema.last_modified)) add(doc, YaCySchema.last_modified, header == null ? new Date() : header.lastModified()); - if (allAttr || contains(YaCySchema.keywords)) add(doc, YaCySchema.keywords, yacydoc.dc_subject(' ')); - final String content = yacydoc.getTextString(); + if (allAttr || contains(YaCySchema.author)) add(doc, YaCySchema.author, document.dc_creator()); + if (allAttr || contains(YaCySchema.content_type)) add(doc, YaCySchema.content_type, new String[]{document.dc_format()}); + if (allAttr || contains(YaCySchema.last_modified)) add(doc, YaCySchema.last_modified, responseHeader == null ? new Date() : responseHeader.lastModified()); + if (allAttr || contains(YaCySchema.keywords)) add(doc, YaCySchema.keywords, document.dc_subject(' ')); + final String content = document.getTextString(); if (allAttr || contains(YaCySchema.text_t)) add(doc, YaCySchema.text_t, content); if (allAttr || contains(YaCySchema.wordcount_i)) { final int contentwc = content.split(" ").length; @@ -427,11 +414,11 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (allAttr || contains(YaCySchema.url_file_ext_s)) add(doc, YaCySchema.url_file_ext_s, digestURI.getFileExtension()); // get list of all links; they will be shrinked by urls that appear in other fields of the solr scheme - Set inboundLinks = yacydoc.inboundLinks(); - Set outboundLinks = yacydoc.outboundLinks(); + Set inboundLinks = document.inboundLinks(); + Set outboundLinks = document.outboundLinks(); int c = 0; - final Object parser = yacydoc.getParserObject(); + final Object parser = document.getParserObject(); Map images = new HashMap(); if (parser instanceof ContentScraper) { final ContentScraper html = (ContentScraper) parser; @@ -482,10 +469,10 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (robots_meta.indexOf("nofollow",0) >= 0) b += 8; // set bit 3 } String x_robots_tag = ""; - if (header != null) { - x_robots_tag = header.get(HeaderFramework.X_ROBOTS_TAG, ""); + if (responseHeader != null) { + x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS_TAG, ""); if (x_robots_tag.isEmpty()) { - x_robots_tag = header.get(HeaderFramework.X_ROBOTS, ""); + x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS, ""); } } if (!x_robots_tag.isEmpty()) { @@ -670,14 +657,14 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable } // response time - add(doc, YaCySchema.responsetime_i, header == null ? 0 : Integer.parseInt(header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"))); + add(doc, YaCySchema.responsetime_i, responseHeader == null ? 0 : Integer.parseInt(responseHeader.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"))); } // list all links - final Map alllinks = yacydoc.getAnchors(); + final Map alllinks = document.getAnchors(); c = 0; if (allAttr || contains(YaCySchema.inboundlinkscount_i)) add(doc, YaCySchema.inboundlinkscount_i, inboundLinks.size()); - if (allAttr || contains(YaCySchema.inboundlinksnofollowcount_i)) add(doc, YaCySchema.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount()); + if (allAttr || contains(YaCySchema.inboundlinksnofollowcount_i)) add(doc, YaCySchema.inboundlinksnofollowcount_i, document.inboundLinkNofollowCount()); final List inboundlinksTag = new ArrayList(inboundLinks.size()); final List inboundlinksURLProtocol = new ArrayList(inboundLinks.size()); final List inboundlinksURLStub = new ArrayList(inboundLinks.size()); @@ -725,7 +712,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable c = 0; if (allAttr || contains(YaCySchema.outboundlinkscount_i)) add(doc, YaCySchema.outboundlinkscount_i, outboundLinks.size()); - if (allAttr || contains(YaCySchema.outboundlinksnofollowcount_i)) add(doc, YaCySchema.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount()); + if (allAttr || contains(YaCySchema.outboundlinksnofollowcount_i)) add(doc, YaCySchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount()); final List outboundlinksTag = new ArrayList(outboundLinks.size()); final List outboundlinksURLProtocol = new ArrayList(outboundLinks.size()); final List outboundlinksURLStub = new ArrayList(outboundLinks.size()); @@ -772,26 +759,30 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (allAttr || contains(YaCySchema.outboundlinks_alttag_txt)) add(doc, YaCySchema.outboundlinks_alttag_txt, outboundlinksAltTag); // charset - if (allAttr || contains(YaCySchema.charset_s)) add(doc, YaCySchema.charset_s, yacydoc.getCharset()); + if (allAttr || contains(YaCySchema.charset_s)) add(doc, YaCySchema.charset_s, document.getCharset()); // coordinates - if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) { - if (allAttr || contains(YaCySchema.coordinate_p)) add(doc, YaCySchema.coordinate_p, Double.toString(yacydoc.lat()) + "," + Double.toString(yacydoc.lon())); + if (document.lat() != 0.0f && document.lon() != 0.0f) { + if (allAttr || contains(YaCySchema.coordinate_p)) add(doc, YaCySchema.coordinate_p, Double.toString(document.lat()) + "," + Double.toString(document.lon())); } - if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, header == null ? 200 : header.getStatusCode()); - - // fields that are additionally in URIMetadataRow - if (allAttr || contains(YaCySchema.load_date_dt)) add(doc, YaCySchema.load_date_dt, metadata.loaddate()); - if (allAttr || contains(YaCySchema.fresh_date_dt)) add(doc, YaCySchema.fresh_date_dt, metadata.freshdate()); - if (allAttr || contains(YaCySchema.host_id_s)) add(doc, YaCySchema.host_id_s, metadata.hosthash()); - if ((allAttr || contains(YaCySchema.referrer_id_txt)) && metadata.referrerHash() != null) add(doc, YaCySchema.referrer_id_txt, new String[]{ASCII.String(metadata.referrerHash())}); + if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, responseHeader == null ? 200 : responseHeader.getStatusCode()); + + // fields that were additionally in URIMetadataRow + Date loadDate = new Date(); + Date modDate = responseHeader.lastModified(); + if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; + int size = (int) Math.max(document.dc_source().length(), responseHeader.getContentLength()); + if (allAttr || contains(YaCySchema.load_date_dt)) add(doc, YaCySchema.load_date_dt, loadDate); + if (allAttr || contains(YaCySchema.fresh_date_dt)) add(doc, YaCySchema.fresh_date_dt, new Date(loadDate.getTime() + Math.max(0, loadDate.getTime() - modDate.getTime()) / 2)); // freshdate, computed with Proxy-TTL formula + if (allAttr || contains(YaCySchema.host_id_s)) add(doc, YaCySchema.host_id_s, document.dc_source().hosthash()); + if ((allAttr || contains(YaCySchema.referrer_id_txt)) && referrerURL != null) add(doc, YaCySchema.referrer_id_txt, new String[]{ASCII.String(referrerURL.hash())}); //if (allAttr || contains(SolrField.md5_s)) add(solrdoc, SolrField.md5_s, new byte[0]); - if (allAttr || contains(YaCySchema.publisher_t)) add(doc, YaCySchema.publisher_t, yacydoc.dc_publisher()); - if ((allAttr || contains(YaCySchema.language_s)) && metadata.language() != null) add(doc, YaCySchema.language_s, UTF8.String(metadata.language())); - if (allAttr || contains(YaCySchema.size_i)) add(doc, YaCySchema.size_i, metadata.size()); - if (allAttr || contains(YaCySchema.audiolinkscount_i)) add(doc, YaCySchema.audiolinkscount_i, yacydoc.getAudiolinks().size()); - if (allAttr || contains(YaCySchema.videolinkscount_i)) add(doc, YaCySchema.videolinkscount_i, yacydoc.getVideolinks().size()); - if (allAttr || contains(YaCySchema.applinkscount_i)) add(doc, YaCySchema.applinkscount_i, yacydoc.getApplinks().size()); + if (allAttr || contains(YaCySchema.publisher_t)) add(doc, YaCySchema.publisher_t, document.dc_publisher()); + if ((allAttr || contains(YaCySchema.language_s)) && language != null) add(doc, YaCySchema.language_s, language); + if (allAttr || contains(YaCySchema.size_i)) add(doc, YaCySchema.size_i, size); + if (allAttr || contains(YaCySchema.audiolinkscount_i)) add(doc, YaCySchema.audiolinkscount_i, document.getAudiolinks().size()); + if (allAttr || contains(YaCySchema.videolinkscount_i)) add(doc, YaCySchema.videolinkscount_i, document.getVideolinks().size()); + if (allAttr || contains(YaCySchema.applinkscount_i)) add(doc, YaCySchema.applinkscount_i, document.getApplinks().size()); return doc; } @@ -827,6 +818,25 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable return a; } + /** + * encode a string containing attributes from anchor rel properties binary: + * bit 0: "me" contained in rel + * bit 1: "nofollow" contained in rel + * @param rel + * @return binary encoded information about rel + */ + private static List relEval(final List rel) { + List il = new ArrayList(rel.size()); + for (final String s: rel) { + int i = 0; + final String s0 = s.toLowerCase().trim(); + if ("me".equals(s0)) i += 1; + if ("nofollow".equals(s0)) i += 2; + il.add(i); + } + return il; + } + public static Iterator getLinks(SolrDocument doc, boolean inbound) { Collection urlstub = doc.getFieldValues((inbound ? YaCySchema.inboundlinks_urlstub_txt : YaCySchema.outboundlinks_urlstub_txt).name()); Collection urlprot = urlstub == null ? null : indexedList2protocolList(doc.getFieldValues((inbound ? YaCySchema.inboundlinks_protocol_sxt : YaCySchema.outboundlinks_protocol_sxt).name()), urlstub.size()); @@ -846,30 +856,17 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable return list.iterator(); } - /** - * encode a string containing attributes from anchor rel properties binary: - * bit 0: "me" contained in rel - * bit 1: "nofollow" contained in rel - * @param rel - * @return binary encoded information about rel - */ - private static List relEval(final List rel) { - List il = new ArrayList(rel.size()); - for (final String s: rel) { - int i = 0; - final String s0 = s.toLowerCase().trim(); - if ("me".equals(s0)) i += 1; - if ("nofollow".equals(s0)) i += 2; - il.add(i); - } - return il; + public static Date getDate(SolrDocument doc, final YaCySchema key) { + Date x = doc == null ? null : (Date) doc.getFieldValue(key.name()); + Date now = new Date(); + return (x == null) ? new Date(0) : x.after(now) ? now : x; } - public String solrGetID(final SolrDocument solr) { + public static String solrGetID(final SolrDocument solr) { return (String) solr.getFieldValue(YaCySchema.id.getSolrFieldName()); } - public DigestURI solrGetURL(final SolrDocument solr) { + public static DigestURI solrGetURL(final SolrDocument solr) { try { return new DigestURI((String) solr.getFieldValue(YaCySchema.sku.getSolrFieldName())); } catch (final MalformedURLException e) { @@ -877,29 +874,29 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable } } - public String solrGetTitle(final SolrDocument solr) { + public static String solrGetTitle(final SolrDocument solr) { return (String) solr.getFieldValue(YaCySchema.title.getSolrFieldName()); } - public String solrGetText(final SolrDocument solr) { + public static String solrGetText(final SolrDocument solr) { return (String) solr.getFieldValue(YaCySchema.text_t.getSolrFieldName()); } - public String solrGetAuthor(final SolrDocument solr) { + public static String solrGetAuthor(final SolrDocument solr) { return (String) solr.getFieldValue(YaCySchema.author.getSolrFieldName()); } - public String solrGetDescription(final SolrDocument solr) { + public static String solrGetDescription(final SolrDocument solr) { return (String) solr.getFieldValue(YaCySchema.description.getSolrFieldName()); } - public Date solrGetDate(final SolrDocument solr) { + public static Date solrGetDate(final SolrDocument solr) { Date date = (Date) solr.getFieldValue(YaCySchema.last_modified.getSolrFieldName()); Date now = new Date(); return date.after(now) ? now : date; } - public Collection solrGetKeywords(final SolrDocument solr) { + public static Collection solrGetKeywords(final SolrDocument solr) { final Collection c = solr.getFieldValues(YaCySchema.keywords.getSolrFieldName()); final ArrayList a = new ArrayList(); for (final Object s: c) { diff --git a/source/net/yacy/search/query/SnippetProcess.java b/source/net/yacy/search/query/SnippetProcess.java index f1f425581..ebc06b3da 100644 --- a/source/net/yacy/search/query/SnippetProcess.java +++ b/source/net/yacy/search/query/SnippetProcess.java @@ -44,7 +44,6 @@ import net.yacy.cora.util.SpaceExceededException; import net.yacy.crawler.data.Cache; import net.yacy.data.WorkTables; import net.yacy.document.Condenser; -import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.RowHandleSet; diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java index 3fb382b46..5e119f06b 100644 --- a/source/net/yacy/search/snippet/TextSnippet.java +++ b/source/net/yacy/search/snippet/TextSnippet.java @@ -48,7 +48,6 @@ import net.yacy.document.SnippetExtractor; import net.yacy.document.WordTokenizer; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.util.ByteArray;