From f9c0e6e95029a22406e9498f23706d43f30fae81 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 10 Aug 2012 13:26:51 +0200 Subject: [PATCH] - Implemented and integrated the URIMetadataNode object which is a metadata representation from the solr index. This shall replace metadata from the built-in database in the future. - added the Solr-driven metadata into the search index of YaCy which makes it now possible to run YaCy without the old metadata index. This is a major step forward to a full migration to Solr. --- defaults/solr.keys.list | 3 - htroot/yacy/crawlReceipt.java | 3 +- htroot/yacy/transferURL.java | 3 +- source/de/anomic/crawler/ResultURLs.java | 2 +- .../yacy/kelondro/data/meta/URIMetadata.java | 3 + .../kelondro/data/meta/URIMetadataNode.java | 673 +++++++----------- .../kelondro/data/meta/URIMetadataRow.java | 16 +- .../yacy/kelondro/data/meta/URIReference.java | 19 +- .../kelondro/data/meta/URIReferenceNode.java | 16 +- .../kelondro/data/word/WordReference.java | 16 +- .../kelondro/data/word/WordReferenceRow.java | 6 + .../kelondro/data/word/WordReferenceVars.java | 1 + .../net/yacy/kelondro/table/SplitTable.java | 1 + source/net/yacy/peers/Protocol.java | 12 +- .../yacy/search/index/DocumentReference.java | 4 - .../yacy/search/index/MetadataRepository.java | 44 +- .../yacy/search/index/SolrConfiguration.java | 22 +- source/net/yacy/search/index/YaCySchema.java | 1 - .../yacy/search/ranking/ReferenceOrder.java | 4 +- .../net/yacy/search/snippet/ResultEntry.java | 10 +- 20 files changed, 346 insertions(+), 513 deletions(-) diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index 1093b9da8..6f336a8c3 100644 --- a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -292,9 +292,6 @@ publisher_t ## the language used in the document; starts with primary language language_txt -## an external ranking value -ranking_i - ## the size of the raw source size_i diff --git a/htroot/yacy/crawlReceipt.java 
b/htroot/yacy/crawlReceipt.java index 06fe81c0a..a3b639597 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -31,6 +31,7 @@ import java.io.IOException; import net.yacy.cora.document.ASCII; import net.yacy.cora.protocol.RequestHeader; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; import net.yacy.peers.Protocol; @@ -115,7 +116,7 @@ public final class crawlReceipt { } // generating a new loaded URL entry - final URIMetadataRow entry = URIMetadataRow.importEntry(propStr); + final URIMetadata entry = URIMetadataRow.importEntry(propStr); if (entry == null) { if (log.isWarning()) log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) from peer " + iam + "\n\tURL properties: "+ propStr); prop.put("delay", "3600"); diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index cf50c88f3..633c08dd4 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -33,6 +33,7 @@ import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.RSSMessage; import net.yacy.cora.protocol.RequestHeader; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; import net.yacy.peers.EventChannel; @@ -87,7 +88,7 @@ public final class transferURL { final int sizeBefore = sb.index.urlMetadata().size(); // read the urls from the other properties and store String urls; - URIMetadataRow lEntry; + URIMetadata lEntry; for (int i = 0; i < urlc; i++) { serverCore.checkInterruption(); diff --git a/source/de/anomic/crawler/ResultURLs.java b/source/de/anomic/crawler/ResultURLs.java index 460107638..2593d40b1 100644 --- a/source/de/anomic/crawler/ResultURLs.java +++ b/source/de/anomic/crawler/ResultURLs.java @@ -223,7 +223,7 @@ public final class ResultURLs { public static void main(final String[] args) { try 
{ final DigestURI url = new DigestURI("http", "www.yacy.net", 80, "/"); - final URIMetadataRow urlRef = new URIMetadataRow(url, "YaCy Homepage", "", "", "", 0.0d, 0.0d, new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), UTF8.getBytes("de"), 0, 0, 0, 0, 0, 0); + final URIMetadata urlRef = new URIMetadataRow(url, "YaCy Homepage", "", "", "", 0.0d, 0.0d, new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), UTF8.getBytes("de"), 0, 0, 0, 0, 0, 0); final EventOrigin stackNo = EventOrigin.LOCAL_CRAWLING; System.out.println("valid test:\n======="); // add diff --git a/source/net/yacy/kelondro/data/meta/URIMetadata.java b/source/net/yacy/kelondro/data/meta/URIMetadata.java index 49dc6a0e7..52a4b11a3 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadata.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadata.java @@ -26,6 +26,7 @@ import java.util.Date; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.order.Bitfield; +import de.anomic.crawler.retrieval.Request; public interface URIMetadata extends URIReference { @@ -82,4 +83,6 @@ public interface URIMetadata extends URIReference { public byte[] referrerHash(); + public Request toBalancerEntry(final String initiatorHash); + } diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 5593fd602..29e98cfa8 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -1,7 +1,7 @@ /** * URIMetadataNode * Copyright 2012 by Michael Peter Christen - * First released 3.4.2012 at http://yacy.net + * First released 10.8.2012 at http://yacy.net * * This file is part of YaCy Content Integration * @@ -9,12 +9,12 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later 
version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . @@ -22,432 +22,260 @@ package net.yacy.kelondro.data.meta; -import net.yacy.cora.lod.Node; -import net.yacy.cora.lod.vocabulary.Rdf; -import net.yacy.kelondro.data.word.WordReferenceVars; +import java.net.MalformedURLException; +import java.util.Date; +import java.util.List; +import java.util.regex.Pattern; +import net.yacy.cora.date.GenericFormatter; +import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.UTF8; +import net.yacy.cora.lod.vocabulary.Tagging; +import net.yacy.document.Condenser; +import net.yacy.kelondro.data.word.WordReference; +import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.order.Base64Order; +import net.yacy.kelondro.order.Bitfield; +import net.yacy.search.index.YaCySchema; -public class URIMetadataNode /*extends URIReferenceNode implements URIMetadata*/ { +import org.apache.solr.common.SolrDocument; - private final Node entry; +import de.anomic.crawler.retrieval.Request; +import de.anomic.crawler.retrieval.Response; +import de.anomic.tools.crypt; + +/** + * This is the URIMetadata object implementation for Solr documents. + * The purpose of this object is the migration from the old metadata structure to solr document. 
+ * Future implementations should try to replace URIMetadata objects completely by SolrDocument objects + */ +public class URIMetadataNode implements URIMetadata { + + private final byte[] hash; + private final String urlRaw, keywords; + private DigestURI url; + Bitfield flags; + private final int imagec, audioc, videoc, appc; + private final double lon, lat; + private long ranking; // during generation of a search result this value is set + private final SolrDocument doc; private final String snippet; - private final WordReferenceVars word; // this is only used if the url is transported via remote search requests - private final long ranking; // during generation of a search result this value is set - - public URIMetadataNode() { - // create a dummy entry, good to produce poison objects - this.entry = new Node(Rdf.Description); - this.snippet = null; - this.word = null; - this.ranking = 0; - } -/* - public URIMetadataNode( - final DigestURI url, - final String dc_title, - final String dc_creator, - final String dc_subject, - final String dc_publisher, - final float lon, final float lat, // decimal degrees as in WGS84; if unknown both values may be 0.0f; - final Date mod, - final Date load, - final Date fresh, - final String referrer, - final byte[] md5, - final long size, - final int wc, - final char dt, - final Bitfield flags, - final byte[] lang, - final int llocal, - final int lother, - final int laudio, - final int limage, - final int lvideo, - final int lapp) { - // create new entry - this.entry = new Node(); - this.entry.setSubject(UTF8.getBytes(url.toNormalform(true, false))); - this.entry.setObject(YaCyMetadata.hash, url.hash()); - this.entry.setObject(DublinCore.Title, UTF8.getBytes(dc_title)); - this.entry.setObject(DublinCore.Creator, UTF8.getBytes(dc_creator)); - this.entry.setObject(DublinCore.Subject, UTF8.getBytes(dc_subject)); - this.entry.setObject(DublinCore.Publisher, UTF8.getBytes(dc_publisher)); - this.entry.setObject(Geo.Lat, 
ASCII.getBytes(Float.toString(lat))); - this.entry.setObject(Geo.Long, ASCII.getBytes(Float.toString(lon))); - - - encodeDate(col_mod, mod); - encodeDate(col_load, load); - encodeDate(col_fresh, fresh); - this.entry.setCol(col_referrer, (referrer == null) ? null : UTF8.getBytes(referrer)); - this.entry.setCol(col_md5, md5); - this.entry.setCol(col_size, size); - this.entry.setCol(col_wc, wc); - this.entry.setCol(col_dt, new byte[]{(byte) dt}); - this.entry.setCol(col_flags, flags.bytes()); - this.entry.setCol(col_lang, lang); - this.entry.setCol(col_llocal, llocal); - this.entry.setCol(col_lother, lother); - this.entry.setCol(col_limage, limage); - this.entry.setCol(col_laudio, laudio); - this.entry.setCol(col_lvideo, lvideo); - this.entry.setCol(col_lapp, lapp); - //System.out.println("===DEBUG=== " + load.toString() + ", " + decodeDate(col_load).toString()); - this.snippet = null; - this.word = null; - this.ranking = 0; - this.comp = null; - } - - private byte[] encodeDate(final Date d) { - // calculates the number of days since 1.1.1970 and returns this as 4-byte array - // 86400000 is the number of milliseconds in one day - return NaturalOrder.encodeLong(d.getTime() / 86400000L, 4); - } - - private Date decodeDate(final int col) { - final long t = this.entry.getColLong(col); - } - - public static byte[] encodeComp( - final DigestURI url, - final String dc_title, - final String dc_creator, - final String dc_subject, - final String dc_publisher, - final float lat, - final float lon) { - final CharBuffer s = new CharBuffer(360); - s.append(url.toNormalform(false, true)).appendLF(); - s.append(dc_title).appendLF(); - if (dc_creator.length() > 80) s.append(dc_creator, 0, 80); else s.append(dc_creator); - s.appendLF(); - if (dc_subject.length() > 120) s.append(dc_subject, 0, 120); else s.append(dc_subject); - s.appendLF(); - if (dc_publisher.length() > 80) s.append(dc_publisher, 0, 80); else s.append(dc_publisher); - s.appendLF(); - if (lon == 0.0f && lat == 0.0f) 
s.appendLF(); else s.append(Float.toString(lat)).append(',').append(Float.toString(lon)).appendLF(); - return UTF8.getBytes(s.toString()); - } - - public URIMetadataRow(final Row.Entry entry, final WordReferenceVars searchedWord, final long ranking) { - this.entry = entry; - this.snippet = null; - this.word = searchedWord; - this.ranking = ranking; - this.comp = null; - } + private WordReference word; // this is only used if the url is transported via remote search requests - public URIMetadataRow(final Properties prop) { - // generates an plasmaLURLEntry using the properties from the argument - // the property names must correspond to the one from toString - //System.out.println("DEBUG-ENTRY: prop=" + prop.toString()); - DigestURI url; + public URIMetadataNode(final SolrDocument doc) { + this.doc = doc; + this.snippet = ""; + this.word = null; + this.ranking = Long.MIN_VALUE; + this.hash = ASCII.getBytes(getString(YaCySchema.id)); + this.urlRaw = getString(YaCySchema.sku); try { - url = new DigestURI(crypt.simpleDecode(prop.getProperty("url", ""), null), ASCII.getBytes(prop.getProperty("hash"))); - } catch (final MalformedURLException e) { - url = null; + this.url = new DigestURI(this.urlRaw, this.hash); + } catch (MalformedURLException e) { + Log.logException(e); + this.url = null; } - String descr = crypt.simpleDecode(prop.getProperty("descr", ""), null); if (descr == null) descr = ""; - String dc_creator = crypt.simpleDecode(prop.getProperty("author", ""), null); if (dc_creator == null) dc_creator = ""; - String tags = crypt.simpleDecode(prop.getProperty("tags", ""), null); if (tags == null) tags = ""; - String dc_publisher = crypt.simpleDecode(prop.getProperty("publisher", ""), null); if (dc_publisher == null) dc_publisher = ""; - String lons = crypt.simpleDecode(prop.getProperty("lon", "0.0"), null); if (lons == null) lons = "0.0"; - String lats = crypt.simpleDecode(prop.getProperty("lat", "0.0"), null); if (lats == null) lats = "0.0"; - - this.entry = 
rowdef.newEntry(); - this.entry.setCol(col_hash, url.hash()); // FIXME potential null pointer access - this.entry.setCol(col_comp, encodeComp(url, descr, dc_creator, tags, dc_publisher, Float.parseFloat(lats), Float.parseFloat(lons))); - // create new formatters to make concurrency possible - final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute); - - try { - encodeDate(col_mod, formatter.parse(prop.getProperty("mod", "20000101"))); - } catch (final ParseException e) { - encodeDate(col_mod, new Date()); - } - try { - encodeDate(col_load, formatter.parse(prop.getProperty("load", "20000101"))); - } catch (final ParseException e) { - encodeDate(col_load, new Date()); - } - try { - encodeDate(col_fresh, formatter.parse(prop.getProperty("fresh", "20000101"))); - } catch (final ParseException e) { - encodeDate(col_fresh, new Date()); - } - this.entry.setCol(col_referrer, UTF8.getBytes(prop.getProperty("referrer", ""))); - this.entry.setCol(col_md5, Digest.decodeHex(prop.getProperty("md5", ""))); - this.entry.setCol(col_size, Integer.parseInt(prop.getProperty("size", "0"))); - this.entry.setCol(col_wc, Integer.parseInt(prop.getProperty("wc", "0"))); - final String dt = prop.getProperty("dt", "t"); - this.entry.setCol(col_dt, dt.length() > 0 ? new byte[]{(byte) dt.charAt(0)} : new byte[]{(byte) 't'}); - final String flags = prop.getProperty("flags", "AAAAAA"); - this.entry.setCol(col_flags, (flags.length() > 6) ? 
QueryParams.empty_constraint.bytes() : (new Bitfield(4, flags)).bytes()); - this.entry.setCol(col_lang, UTF8.getBytes(prop.getProperty("lang", "uk"))); - this.entry.setCol(col_llocal, Integer.parseInt(prop.getProperty("llocal", "0"))); - this.entry.setCol(col_lother, Integer.parseInt(prop.getProperty("lother", "0"))); - this.entry.setCol(col_limage, Integer.parseInt(prop.getProperty("limage", "0"))); - this.entry.setCol(col_laudio, Integer.parseInt(prop.getProperty("laudio", "0"))); - this.entry.setCol(col_lvideo, Integer.parseInt(prop.getProperty("lvideo", "0"))); - this.entry.setCol(col_lapp, Integer.parseInt(prop.getProperty("lapp", "0"))); - this.snippet = crypt.simpleDecode(prop.getProperty("snippet", ""), null); - this.word = null; - if (prop.containsKey("word")) throw new kelondroException("old database structure is not supported"); - if (prop.containsKey("wi")) { - this.word = new WordReferenceVars(new WordReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", "")))); - } - this.ranking = 0; - this.comp = null; + // to set the flags bitfield we need to pre-load some values from the Solr document + this.keywords = getString(YaCySchema.keywords); + this.imagec = getInt(YaCySchema.imagescount_i); + this.audioc = getInt(YaCySchema.audiolinkscount_i); + this.videoc = getInt(YaCySchema.videolinkscount_i); + this.appc = getInt(YaCySchema.applinkscount_i); + this.lon = getDouble(YaCySchema.lon_coordinate); + this.lat = getDouble(YaCySchema.lat_coordinate); + this.flags = new Bitfield(); + if (this.keywords != null && this.keywords.indexOf("indexof") >= 0) this.flags.set(Condenser.flag_cat_indexof, true); + if (this.lon != 0.0d || this.lat != 0.0d) this.flags.set(Condenser.flag_cat_haslocation, true); + if (this.imagec > 0) this.flags.set(Condenser.flag_cat_hasimage, true); + if (this.audioc > 0) this.flags.set(Condenser.flag_cat_hasaudio, true); + if (this.videoc > 0) this.flags.set(Condenser.flag_cat_hasvideo, true); + if (this.appc > 0) 
this.flags.set(Condenser.flag_cat_hasapp, true); + } + + public URIMetadataNode(final SolrDocument doc, final WordReference searchedWord, final long ranking) { + this(doc); + this.word = searchedWord; + this.ranking = ranking; } - public static URIMetadataRow importEntry(final String propStr) { - if (propStr == null || (propStr.length() > 0 && propStr.charAt(0) != '{') || !propStr.endsWith("}")) { - return null; - } - try { - return new URIMetadataRow(MapTools.s2p(propStr.substring(1, propStr.length() - 1))); - } catch (final kelondroException e) { - // wrong format - return null; - } + private int getInt(YaCySchema field) { + Integer x = (Integer) this.doc.getFieldValue(field.name()); + if (x == null) return 0; + return x.intValue(); } - private StringBuilder corePropList() { - // generate a parseable string; this is a simple property-list - final Components metadata = metadata(); - final StringBuilder s = new StringBuilder(300); - if (metadata == null) return null; - //System.out.println("author=" + comp.author()); - - // create new formatters to make concurrency possible - final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute); - - try { - s.append("hash=").append(ASCII.String(hash())); - assert (s.toString().indexOf(0) < 0); - s.append(",url=").append(crypt.simpleEncode(metadata.url().toNormalform(false, true))); - assert (s.toString().indexOf(0) < 0); - s.append(",descr=").append(crypt.simpleEncode(metadata.dc_title())); - assert (s.toString().indexOf(0) < 0); - s.append(",author=").append(crypt.simpleEncode(metadata.dc_creator())); - assert (s.toString().indexOf(0) < 0); - s.append(",tags=").append(crypt.simpleEncode(metadata.dc_subject())); - assert (s.toString().indexOf(0) < 0); - s.append(",publisher=").append(crypt.simpleEncode(metadata.dc_publisher())); - assert (s.toString().indexOf(0) < 0); - s.append(",lat=").append(metadata.lat()); - assert (s.toString().indexOf(0) < 0); - 
s.append(",lon=").append(metadata.lon()); - assert (s.toString().indexOf(0) < 0); - s.append(",mod=").append(formatter.format(moddate())); - assert (s.toString().indexOf(0) < 0); - s.append(",load=").append(formatter.format(loaddate())); - assert (s.toString().indexOf(0) < 0); - s.append(",fresh=").append(formatter.format(freshdate())); - assert (s.toString().indexOf(0) < 0); - s.append(",referrer=").append(referrerHash() == null ? "" : ASCII.String(referrerHash())); - assert (s.toString().indexOf(0) < 0); - s.append(",md5=").append(md5()); - assert (s.toString().indexOf(0) < 0); - s.append(",size=").append(size()); - assert (s.toString().indexOf(0) < 0); - s.append(",wc=").append(wordCount()); - assert (s.toString().indexOf(0) < 0); - s.append(",dt=").append(doctype()); - assert (s.toString().indexOf(0) < 0); - s.append(",flags=").append(flags().exportB64()); - assert (s.toString().indexOf(0) < 0); - s.append(",lang=").append(language() == null ? "EN" : UTF8.String(language())); - assert (s.toString().indexOf(0) < 0); - s.append(",llocal=").append(llocal()); - assert (s.toString().indexOf(0) < 0); - s.append(",lother=").append(lother()); - assert (s.toString().indexOf(0) < 0); - s.append(",limage=").append(limage()); - assert (s.toString().indexOf(0) < 0); - s.append(",laudio=").append(laudio()); - assert (s.toString().indexOf(0) < 0); - s.append(",lvideo=").append(lvideo()); - assert (s.toString().indexOf(0) < 0); - s.append(",lapp=").append(lapp()); - assert (s.toString().indexOf(0) < 0); + private long getLong(YaCySchema field) { + Long x = (Long) this.doc.getFieldValue(field.name()); + if (x == null) return 0; + return x.longValue(); + } - if (this.word != null) { - // append also word properties - final String wprop = this.word.toPropertyForm(); - s.append(",wi=").append(Base64Order.enhancedCoder.encodeString(wprop)); - } - assert (s.toString().indexOf(0) < 0); - return s; + private double getDouble(YaCySchema field) { + Double x = (Double) 
this.doc.getFieldValue(field.name()); + if (x == null) return 0.0d; + return x.doubleValue(); + } - } catch (final Throwable e) { - // serverLog.logFailure("plasmaLURL.corePropList", e.getMessage()); - // if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null"); - // if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null"); - Log.logException(e); - return null; - } + private Date getDate(YaCySchema field) { + Date x = (Date) this.doc.getFieldValue(field.name()); + if (x == null) return new Date(0); + return x; } - public Row.Entry toRowEntry() { - return this.entry; + private String getString(YaCySchema field) { + String x = (String) this.doc.getFieldValue(field.name()); + if (x == null) return ""; + return x; } + @Override public byte[] hash() { - // return a url-hash, based on the md5 algorithm - // the result is a String of 12 bytes within a 72-bit space - // (each byte has an 6-bit range) - // that should be enough for all web pages on the world - return this.entry.getPrimaryKeyBytes(); + return this.hash; } - public long ranking() { - return this.ranking; + @Override + public String hosthash() { + return (String) this.doc.getFieldValue(YaCySchema.host_id_s.name()); } - public boolean matches(final Pattern matcher) { - return this.metadata().matches(matcher); + @Override + public Date moddate() { + return getDate(YaCySchema.last_modified); } - + + @Override public DigestURI url() { - return this.metadata().url(); + return this.url; } - - public String dc_title() { - return this.metadata().dc_title(); + + @Override + public boolean matches(Pattern matcher) { + return matcher.matcher(this.urlRaw.toLowerCase()).matches(); } - - public String dc_creator() { - return this.metadata().dc_creator(); + + @Override + public String dc_title() { + @SuppressWarnings("unchecked") + List titles = (List) this.doc.getFieldValue(YaCySchema.title.name()); + if (titles == null || titles.size() == 0) return ""; + return 
titles.get(0); } - - public String dc_publisher() { - return this.metadata().dc_publisher(); + + @Override + public String dc_creator() { + return getString(YaCySchema.author); } - - public String dc_subject() { - return this.metadata().dc_subject(); + + @Override + public String dc_publisher() { + return getString(YaCySchema.publisher_t); } - public float lat() { - return this.metadata().lat(); + @Override + public String dc_subject() { + return this.keywords; } - public float lon() { - return this.metadata().lon(); + @Override + public double lat() { + return this.lat; } - - private Components metadata() { - // avoid double computation of metadata elements - if (this.comp != null) return this.comp; - // parse elements from comp field; - final byte[] c = this.entry.getColBytes(col_comp, true); - final List cl = ByteBuffer.split(c, (byte) 10); - this.comp = new Components( - (cl.size() > 0) ? UTF8.String(cl.get(0)) : "", - hash(), - (cl.size() > 1) ? UTF8.String(cl.get(1)) : "", - (cl.size() > 2) ? UTF8.String(cl.get(2)) : "", - (cl.size() > 3) ? UTF8.String(cl.get(3)) : "", - (cl.size() > 4) ? UTF8.String(cl.get(4)) : "", - (cl.size() > 5) ? 
UTF8.String(cl.get(5)) : ""); - return this.comp; + + @Override + public double lon() { + return this.lon; } - public Date moddate() { - return decodeDate(col_mod); + @Override + public long ranking() { + return this.ranking; } + @Override public Date loaddate() { - return decodeDate(col_load); + return getDate(YaCySchema.load_date_dt); } + @Override public Date freshdate() { - return decodeDate(col_fresh); - } - - public byte[] referrerHash() { - // return the creator's hash or null if there is none - // FIXME: There seem to be some malformed entries in the databasees like "null\0\0\0\0\0\0\0\0" - final byte[] r = this.entry.getColBytes(col_referrer, true); - if (r != null) { - int i = r.length; - while (i > 0) { - if (r[--i] == 0) return null; - } - } - return r; + return getDate(YaCySchema.fresh_date_dt); } + @Override public String md5() { - // returns the md5 in hex representation - return Digest.encodeHex(this.entry.getColBytes(col_md5, true)); + return getString(YaCySchema.md5_s); } + @Override public char doctype() { - return (char) this.entry.getColByte(col_dt); + return Response.docType(getString(YaCySchema.content_type)); } + @Override public byte[] language() { - byte[] b = this.entry.getColBytes(col_lang, true); - if (b == null || b[0] == (byte)'[') { - String tld = this.metadata().url.getTLD(); - if (tld.length() < 2 || tld.length() > 2) return ASCII.getBytes("en"); - return ASCII.getBytes(tld); - } - return b; + String[] languages = (String[]) this.doc.getFieldValue(YaCySchema.language_txt.name()); + if (languages == null || languages.length == 0) return ASCII.getBytes("en"); + return UTF8.getBytes(languages[0]); } + @Override public int size() { - return (int) this.entry.getColLong(col_size); + return getInt(YaCySchema.size_i); } + @Override public Bitfield flags() { - return new Bitfield(this.entry.getColBytes(col_flags, true)); + return this.flags; } + @Override public int wordCount() { - return (int) this.entry.getColLong(col_wc); + return 
getInt(YaCySchema.wordcount_i); } + @Override public int llocal() { - return (int) this.entry.getColLong(col_llocal); + return getInt(YaCySchema.inboundlinkscount_i); } + @Override public int lother() { - return (int) this.entry.getColLong(col_lother); + return getInt(YaCySchema.outboundlinkscount_i); } + @Override public int limage() { - return (int) this.entry.getColLong(col_limage); + return this.imagec; } + @Override public int laudio() { - return (int) this.entry.getColLong(col_laudio); + return this.audioc; } + @Override public int lvideo() { - return (int) this.entry.getColLong(col_lvideo); + return this.videoc; } + @Override public int lapp() { - return (int) this.entry.getColLong(col_lapp); + return this.appc; } + @Override public String snippet() { - // the snippet may appear here if the url was transported in a remote search - // it will not be saved anywhere, but can only be requested here return this.snippet; } - public WordReferenceVars word() { + @Override + public WordReference word() { return this.word; } - public boolean isOlder(final URIMetadata other) { + @Override + public boolean isOlder(URIMetadata other) { if (other == null) return false; final Date tmoddate = moddate(); final Date omoddate = other.moddate(); @@ -461,7 +289,84 @@ public class URIMetadataNode /*extends URIReferenceNode implements URIMetadata*/ return false; } - public String toString(final String snippet) { + private StringBuilder corePropList() { + // generate a parseable string; this is a simple property-list + final StringBuilder s = new StringBuilder(300); + //System.out.println("author=" + comp.author()); + + // create new formatters to make concurrency possible + final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute); + + try { + s.append("hash=").append(ASCII.String(hash())); + assert (s.toString().indexOf(0) < 0); + s.append(",url=").append(crypt.simpleEncode(url().toNormalform(false, true))); + assert 
(s.toString().indexOf(0) < 0); + s.append(",descr=").append(crypt.simpleEncode(dc_title())); + assert (s.toString().indexOf(0) < 0); + s.append(",author=").append(crypt.simpleEncode(dc_creator())); + assert (s.toString().indexOf(0) < 0); + s.append(",tags=").append(crypt.simpleEncode(Tagging.cleanTagFromAutotagging(dc_subject()))); + assert (s.toString().indexOf(0) < 0); + s.append(",publisher=").append(crypt.simpleEncode(dc_publisher())); + assert (s.toString().indexOf(0) < 0); + s.append(",lat=").append(lat()); + assert (s.toString().indexOf(0) < 0); + s.append(",lon=").append(lon()); + assert (s.toString().indexOf(0) < 0); + s.append(",mod=").append(formatter.format(moddate())); + assert (s.toString().indexOf(0) < 0); + s.append(",load=").append(formatter.format(loaddate())); + assert (s.toString().indexOf(0) < 0); + s.append(",fresh=").append(formatter.format(freshdate())); + assert (s.toString().indexOf(0) < 0); + s.append(",referrer=").append(referrerHash() == null ? "" : ASCII.String(referrerHash())); + assert (s.toString().indexOf(0) < 0); + s.append(",md5=").append(md5()); + assert (s.toString().indexOf(0) < 0); + s.append(",size=").append(size()); + assert (s.toString().indexOf(0) < 0); + s.append(",wc=").append(wordCount()); + assert (s.toString().indexOf(0) < 0); + s.append(",dt=").append(doctype()); + assert (s.toString().indexOf(0) < 0); + s.append(",flags=").append(flags().exportB64()); + assert (s.toString().indexOf(0) < 0); + s.append(",lang=").append(language() == null ? 
"EN" : UTF8.String(language())); + assert (s.toString().indexOf(0) < 0); + s.append(",llocal=").append(llocal()); + assert (s.toString().indexOf(0) < 0); + s.append(",lother=").append(lother()); + assert (s.toString().indexOf(0) < 0); + s.append(",limage=").append(limage()); + assert (s.toString().indexOf(0) < 0); + s.append(",laudio=").append(laudio()); + assert (s.toString().indexOf(0) < 0); + s.append(",lvideo=").append(lvideo()); + assert (s.toString().indexOf(0) < 0); + s.append(",lapp=").append(lapp()); + assert (s.toString().indexOf(0) < 0); + + if (this.word != null) { + // append also word properties + final String wprop = this.word.toPropertyForm(); + s.append(",wi=").append(Base64Order.enhancedCoder.encodeString(wprop)); + } + assert (s.toString().indexOf(0) < 0); + return s; + + } catch (final Throwable e) { + Log.logException(e); + return null; + } + } + + /** + * the toString format must be completely identical to URIMetadataRow because that is used + * to transport the data over p2p connections. 
+ */ + @Override + public String toString(String snippet) { // add information needed for remote transport final StringBuilder core = corePropList(); if (core == null) @@ -476,12 +381,20 @@ public class URIMetadataNode /*extends URIReferenceNode implements URIMetadata*/ //return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}"; } + @Override + public byte[] referrerHash() { + String[] referrer = (String[]) this.doc.getFieldValue(YaCySchema.referrer_id_txt.name()); + if (referrer == null || referrer.length == 0) return null; + return ASCII.getBytes(referrer[0]); + } + + @Override public Request toBalancerEntry(final String initiatorHash) { return new Request( ASCII.getBytes(initiatorHash), - metadata().url(), + url(), referrerHash(), - metadata().dc_title(), + dc_title(), moddate(), null, 0, @@ -489,74 +402,4 @@ public class URIMetadataNode /*extends URIReferenceNode implements URIMetadata*/ 0, 0); } - - @Override - public String toString() { - final StringBuilder core = corePropList(); - if (core == null) return null; - - core.insert(0, "{"); - core.append("}"); - - return core.toString(); - //return "{" + core + "}"; - } - - private class Components { - private DigestURI url; - private String urlRaw; - private byte[] urlHash; - private final String dc_title, dc_creator, dc_subject, dc_publisher; - private final String latlon; // a comma-separated tuple as "," where the coordinates are given as WGS84 spatial coordinates in decimal degrees - - public Components( - final String urlRaw, - final byte[] urlhash, - final String title, - final String author, - final String tags, - final String publisher, - final String latlon) { - this.url = null; - this.urlRaw = urlRaw; - this.urlHash = urlhash; - this.dc_title = title; - this.dc_creator = author; - this.dc_subject = tags; - this.dc_publisher = publisher; - this.latlon = latlon; - } - public boolean matches(final Pattern matcher) { - if (this.urlRaw != null) return 
matcher.matcher(this.urlRaw.toLowerCase()).matches(); - if (this.url != null) return matcher.matcher(this.url.toNormalform(true, true).toLowerCase()).matches(); - return false; - } - public DigestURI url() { - if (this.url == null) { - try { - this.url = new DigestURI(this.urlRaw, this.urlHash); - } catch (final MalformedURLException e) { - this.url = null; - } - this.urlRaw = null; - this.urlHash = null; - } - return this.url; - } - public String dc_title() { return this.dc_title; } - public String dc_creator() { return this.dc_creator; } - public String dc_publisher() { return this.dc_publisher; } - public String dc_subject() { return this.dc_subject; } - public float lat() { - if (this.latlon == null || this.latlon.isEmpty()) return 0.0f; - final int p = this.latlon.indexOf(','); - return p < 0 ? 0.0f : Float.parseFloat(this.latlon.substring(0, p)); - } - public float lon() { - if (this.latlon == null || this.latlon.isEmpty()) return 0.0f; - final int p = this.latlon.indexOf(','); - return p < 0 ? 
0.0f : Float.parseFloat(this.latlon.substring(p + 1)); - } - } - */ } \ No newline at end of file diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java index 48fbaea3c..7374f4121 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java @@ -30,7 +30,6 @@ import java.net.MalformedURLException; import java.text.ParseException; import java.util.Date; import java.util.List; -import java.util.Map; import java.util.Properties; import java.util.regex.Pattern; @@ -38,6 +37,7 @@ import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.UTF8; import net.yacy.cora.lod.vocabulary.Tagging; +import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.data.word.WordReferenceVars; import net.yacy.kelondro.index.Row; @@ -104,7 +104,7 @@ public class URIMetadataRow implements URIMetadata { private final Row.Entry entry; private final String snippet; - private WordReferenceVars word; // this is only used if the url is transported via remote search requests + private WordReference word; // this is only used if the url is transported via remote search requests private final long ranking; // during generation of a search result this value is set private Components comp; @@ -167,12 +167,6 @@ public class URIMetadataRow implements URIMetadata { this.comp = null; } - @Override - public Map toMap() { - // TODO to be implemented - return null; - } - private void encodeDate(final int col, final Date d) { // calculates the number of days since 1.1.1970 and returns this as 4-byte array // 86400000 is the number of milliseconds in one day @@ -211,7 +205,7 @@ public class URIMetadataRow implements URIMetadata { return UTF8.getBytes(s0); } - public URIMetadataRow(final Row.Entry entry, final WordReferenceVars searchedWord, final long 
ranking) { + public URIMetadataRow(final Row.Entry entry, final WordReference searchedWord, final long ranking) { this.entry = entry; this.snippet = null; this.word = searchedWord; @@ -284,7 +278,7 @@ public class URIMetadataRow implements URIMetadata { this.comp = null; } - public static URIMetadataRow importEntry(final String propStr) { + public static URIMetadata importEntry(final String propStr) { if (propStr == null || (!propStr.isEmpty() && propStr.charAt(0) != '{') || !propStr.endsWith("}")) { return null; } @@ -560,7 +554,7 @@ public class URIMetadataRow implements URIMetadata { } @Override - public WordReferenceVars word() { + public WordReference word() { return this.word; } diff --git a/source/net/yacy/kelondro/data/meta/URIReference.java b/source/net/yacy/kelondro/data/meta/URIReference.java index 0616f2689..d701bb49a 100644 --- a/source/net/yacy/kelondro/data/meta/URIReference.java +++ b/source/net/yacy/kelondro/data/meta/URIReference.java @@ -9,12 +9,12 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . 
@@ -23,7 +23,6 @@ package net.yacy.kelondro.data.meta; import java.util.Date; -import java.util.Map; import java.util.regex.Pattern; public interface URIReference { @@ -40,7 +39,7 @@ public interface URIReference { * @return */ public String hosthash(); - + /** * The modification date of the URIReference is given if * the record was created first and is defined with the @@ -48,26 +47,20 @@ public interface URIReference { * @return the modification date of this record */ public Date moddate(); - + /** * The DigestURI is the payload of the URIReference * @return the url as DigestURI with assigned URL hash according to the record hash */ public DigestURI url(); - + /** * check if the url matches agains a given matcher * @param matcher * @return true if the url() matches */ public boolean matches(final Pattern matcher); - - /** - * transform the record into a map which can be stored - * @return - */ - public Map toMap(); - + /** * produce a visible representation of the record * @return a string for the url() diff --git a/source/net/yacy/kelondro/data/meta/URIReferenceNode.java b/source/net/yacy/kelondro/data/meta/URIReferenceNode.java index eab4cfdf4..cc840c2bc 100644 --- a/source/net/yacy/kelondro/data/meta/URIReferenceNode.java +++ b/source/net/yacy/kelondro/data/meta/URIReferenceNode.java @@ -9,12 +9,12 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . 
@@ -26,7 +26,6 @@ import java.net.MalformedURLException; import java.text.ParseException; import java.util.Date; import java.util.HashMap; -import java.util.Map; import java.util.regex.Pattern; import net.yacy.cora.date.ISO8601Formatter; @@ -36,14 +35,14 @@ public class URIReferenceNode extends HashMap implements URIRefe private static final long serialVersionUID = -1580155759116466570L; - private byte[] hash; + private final byte[] hash; public URIReferenceNode(DigestURI uri, Date date) { this.hash = uri.hash(); this.put(MetadataVocabulary.url.name(), ASCII.getBytes(uri.toNormalform(true, false))); this.put(MetadataVocabulary.moddate.name(), ASCII.getBytes(ISO8601Formatter.FORMATTER.format(date))); } - + @Override public byte[] hash() { return this.hash; @@ -56,7 +55,7 @@ public class URIReferenceNode extends HashMap implements URIRefe this.hostHash = ASCII.String(this.hash, 6, 6); return this.hostHash; } - + @Override public Date moddate() { byte[] x = this.get(MetadataVocabulary.moddate.name()); @@ -84,9 +83,4 @@ public class URIReferenceNode extends HashMap implements URIRefe return matcher.matcher(ASCII.String(x)).matches(); } - @Override - public Map toMap() { - return this; - } - } diff --git a/source/net/yacy/kelondro/data/word/WordReference.java b/source/net/yacy/kelondro/data/word/WordReference.java index f30812afe..fb3983c02 100644 --- a/source/net/yacy/kelondro/data/word/WordReference.java +++ b/source/net/yacy/kelondro/data/word/WordReference.java @@ -9,7 +9,7 @@ // $LastChangedBy$ // // LICENSE -// +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -48,17 +48,19 @@ public interface WordReference extends Reference { public char getType(); public int wordsintitle(); - + public int llocal(); - + public int lother(); - + public int urllength(); - + public int urlcomps(); - + public 
Bitfield flags(); - + public double termFrequency(); + public String hosthash(); + } diff --git a/source/net/yacy/kelondro/data/word/WordReferenceRow.java b/source/net/yacy/kelondro/data/word/WordReferenceRow.java index 7bc43b1ff..554d8f9a9 100644 --- a/source/net/yacy/kelondro/data/word/WordReferenceRow.java +++ b/source/net/yacy/kelondro/data/word/WordReferenceRow.java @@ -431,4 +431,10 @@ public final class WordReferenceRow extends AbstractReference implements WordRef } + @Override + public String hosthash() { + return ASCII.String(this.urlhash(), 6, 6); + } + + } diff --git a/source/net/yacy/kelondro/data/word/WordReferenceVars.java b/source/net/yacy/kelondro/data/word/WordReferenceVars.java index bdd623836..91910542d 100644 --- a/source/net/yacy/kelondro/data/word/WordReferenceVars.java +++ b/source/net/yacy/kelondro/data/word/WordReferenceVars.java @@ -285,6 +285,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc return this.urlHash; } + @Override public String hosthash() { if (this.hostHash != null) return this.hostHash; this.hostHash = ASCII.String(this.urlHash, 6, 6); diff --git a/source/net/yacy/kelondro/table/SplitTable.java b/source/net/yacy/kelondro/table/SplitTable.java index 4bfb9a867..c421b8d8e 100644 --- a/source/net/yacy/kelondro/table/SplitTable.java +++ b/source/net/yacy/kelondro/table/SplitTable.java @@ -265,6 +265,7 @@ public class SplitTable implements Index, Iterable { } public static void delete(final File path, final String tablename) { + if (path == null || tablename == null) return; final File tabledir = new File(path, tablename); if (!(tabledir.exists())) return; if ((!(tabledir.isDirectory()))) { diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index 34d6295f8..1480d00ec 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -690,7 +690,7 @@ public final class Protocol // insert results to containers int term = count; - for ( 
final URIMetadataRow urlEntry : result.links ) { + for ( final URIMetadata urlEntry : result.links ) { if ( term-- <= 0 ) { break; // do not process more that requested (in case that evil peers fill us up with rubbish) } @@ -890,7 +890,7 @@ public final class Protocol public Map indexcount; // public long searchtime; // time that the peer actually spent to create the result public String[] references; // search hints, the top-words - public List links; // LURLs of search + public List links; // LURLs of search public Map indexabstract; // index abstracts, a collection of url-hashes per word public SearchResult( @@ -1015,14 +1015,14 @@ public final class Protocol } } this.references = resultMap.get("references").split(","); - this.links = new ArrayList(this.urlcount); + this.links = new ArrayList(this.urlcount); for ( int n = 0; n < this.urlcount; n++ ) { // get one single search result final String resultLine = resultMap.get("resource" + n); if ( resultLine == null ) { continue; } - final URIMetadataRow urlEntry = URIMetadataRow.importEntry(resultLine); + final URIMetadata urlEntry = URIMetadataRow.importEntry(resultLine); if ( urlEntry == null ) { continue; } @@ -1226,7 +1226,7 @@ public final class Protocol } // all url's known // extract the urlCache from the result - final URIMetadata[] urls = new URIMetadataRow[uhs.length]; + final URIMetadata[] urls = new URIMetadata[uhs.length]; for ( int i = 0; i < uhs.length; i++ ) { urls[i] = urlCache.get(ASCII.getBytes(uhs[i])); if ( urls[i] == null ) { @@ -1540,7 +1540,7 @@ public final class Protocol new RankingProfile(Classification.ContentDomain.TEXT), // rankingProfile, null // constraint); ); - for ( final URIMetadataRow link : result.links ) { + for ( final URIMetadata link : result.links ) { System.out.println(link.url().toNormalform(true, false)); System.out.println(link.snippet()); } diff --git a/source/net/yacy/search/index/DocumentReference.java b/source/net/yacy/search/index/DocumentReference.java index 
43a3eb9f8..34501d6bf 100644 --- a/source/net/yacy/search/index/DocumentReference.java +++ b/source/net/yacy/search/index/DocumentReference.java @@ -49,10 +49,6 @@ public class DocumentReference { this.data = null; } - public void store(final URIReference entry) { - this.data.put(entry.hash(), entry.toMap()); - } - public URIReference load(final WeakPriorityBlockingQueue.Element obrwi) { return null; } diff --git a/source/net/yacy/search/index/MetadataRepository.java b/source/net/yacy/search/index/MetadataRepository.java index 122ad7ad5..c8ed8363b 100644 --- a/source/net/yacy/search/index/MetadataRepository.java +++ b/source/net/yacy/search/index/MetadataRepository.java @@ -49,8 +49,9 @@ import net.yacy.cora.util.SpaceExceededException; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadata; +import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; -import net.yacy.kelondro.data.word.WordReferenceVars; +import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.index.Cache; import net.yacy.kelondro.index.Index; import net.yacy.kelondro.index.Row; @@ -61,6 +62,7 @@ import net.yacy.search.Switchboard; import net.yacy.search.solr.EmbeddedSolrConnector; import org.apache.lucene.util.Version; +import org.apache.solr.common.SolrDocument; public final class MetadataRepository implements /*Metadata,*/ Iterable { @@ -190,36 +192,34 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable * @param obrwi * @return */ - public URIMetadata load(WordReferenceVars wre, long weight) { + public URIMetadata load(WordReference wre, long weight) { if (wre == null) return null; // all time was already wasted in takeRWI to get another element - final byte[] urlHash = wre.urlhash(); - if (urlHash == null) return null; - if (this.urlIndexFile != null) try { - final Row.Entry entry = this.urlIndexFile.get(urlHash, false); 
- if (entry == null) return null; - return new URIMetadataRow(entry, wre, weight); - } catch (final IOException e) { - Log.logException(e); - } - /* - try { - SolrDocument doc = this.solr.get(ASCII.String(urlHash)); - } catch (IOException e) { - Log.logException(e); - } - */ - return null; + return load(wre.urlhash(), wre, weight); } public URIMetadata load(final byte[] urlHash) { if (urlHash == null) return null; + return load(urlHash, null, 0); + } + + private URIMetadata load(final byte[] urlHash, WordReference wre, long weight) { + + // get the metadata from the old metadata index if (this.urlIndexFile != null) try { final Row.Entry entry = this.urlIndexFile.get(urlHash, false); - if (entry == null) return null; - return new URIMetadataRow(entry, null, 0); + if (entry != null) return new URIMetadataRow(entry, wre, weight); } catch (final IOException e) { - return null; + Log.logException(e); } + + // get the metadata from Solr + try { + SolrDocument doc = this.solr.get(ASCII.String(urlHash)); + if (doc != null) return new URIMetadataNode(doc, wre, weight); + } catch (IOException e) { + Log.logException(e); + } + return null; } diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index 6ebad68f6..c0eb9b6f7 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ b/source/net/yacy/search/index/SolrConfiguration.java @@ -105,11 +105,11 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable private boolean contains(YaCySchema field) { return this.contains(field.name()); } - + protected void addSolr(final SolrDoc solrdoc, final YaCySchema key, final byte[] value) { if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.length != 0))) solrdoc.addSolr(key, UTF8.String(value)); } - + protected void addSolr(final SolrDoc solrdoc, final YaCySchema key, final String value) { if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && 
!value.isEmpty()))) solrdoc.addSolr(key, value); } @@ -149,7 +149,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable protected void addSolr(final SolrDoc solrdoc, final YaCySchema key, final boolean value) { if (isEmpty() || contains(key)) solrdoc.addSolr(key, value); } - + /** * save configuration to file and update enum SolrFields * @throws IOException @@ -170,7 +170,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable } } catch (final IOException e) {} } - + public SolrDoc metadata2solr(final URIMetadata md) { final SolrDoc solrdoc = new SolrDoc(); final DigestURI digestURI = new DigestURI(md.url()); @@ -190,18 +190,18 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (allAttr || contains(YaCySchema.content_type)) addSolr(solrdoc, YaCySchema.content_type, Response.doctype2mime(digestURI.getFileExtension(), md.doctype())); if (allAttr || contains(YaCySchema.last_modified)) addSolr(solrdoc, YaCySchema.last_modified, md.moddate()); if (allAttr || contains(YaCySchema.text_t)) addSolr(solrdoc, YaCySchema.text_t, ""); // not delivered in metadata - if (allAttr || contains(YaCySchema.wordcount_i)) addSolr(solrdoc, YaCySchema.wordcount_i, md.wordCount()); + if (allAttr || contains(YaCySchema.wordcount_i)) addSolr(solrdoc, YaCySchema.wordcount_i, md.wordCount()); if (allAttr || contains(YaCySchema.keywords)) { String keywords = md.dc_subject(); Bitfield flags = md.flags(); if (flags.get(Condenser.flag_cat_indexof)) { if (keywords == null || keywords.isEmpty()) keywords = "indexof"; else { - if (keywords.indexOf(',') > 0) keywords += ", indexof"; else keywords += " indexof"; + if (keywords.indexOf(',') > 0) keywords += ", indexof"; else keywords += " indexof"; } } addSolr(solrdoc, YaCySchema.keywords, keywords); } - + // path elements of link final String path = digestURI.getPath(); if (path != null && (allAttr || contains(YaCySchema.paths_txt))) { @@ -229,12 +229,11 @@ 
public class SolrConfiguration extends ConfigurationSet implements Serializable if (allAttr || contains(YaCySchema.md5_s)) addSolr(solrdoc, YaCySchema.md5_s, md.md5()); if (allAttr || contains(YaCySchema.publisher_t)) addSolr(solrdoc, YaCySchema.publisher_t, md.dc_publisher()); if ((allAttr || contains(YaCySchema.language_txt)) && md.language() != null) addSolr(solrdoc, YaCySchema.language_txt,new String[]{UTF8.String(md.language())}); - if (allAttr || contains(YaCySchema.ranking_i)) addSolr(solrdoc, YaCySchema.ranking_i, md.ranking()); if (allAttr || contains(YaCySchema.size_i)) addSolr(solrdoc, YaCySchema.size_i, md.size()); if (allAttr || contains(YaCySchema.audiolinkscount_i)) addSolr(solrdoc, YaCySchema.audiolinkscount_i, md.laudio()); if (allAttr || contains(YaCySchema.videolinkscount_i)) addSolr(solrdoc, YaCySchema.videolinkscount_i, md.lvideo()); if (allAttr || contains(YaCySchema.applinkscount_i)) addSolr(solrdoc, YaCySchema.applinkscount_i, md.lapp()); - + return solrdoc; } @@ -585,7 +584,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable } if (allAttr || contains(YaCySchema.httpstatus_i)) addSolr(solrdoc, YaCySchema.httpstatus_i, header == null ? 
200 : header.getStatusCode()); - // fields that are additionally in URIMetadataRow + // fields that are additionally in URIMetadataRow if (allAttr || contains(YaCySchema.load_date_dt)) addSolr(solrdoc, YaCySchema.load_date_dt, metadata.loaddate()); if (allAttr || contains(YaCySchema.fresh_date_dt)) addSolr(solrdoc, YaCySchema.fresh_date_dt, metadata.freshdate()); if (allAttr || contains(YaCySchema.host_id_s)) addSolr(solrdoc, YaCySchema.host_id_s, metadata.hosthash()); @@ -593,12 +592,11 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable //if (allAttr || contains(SolrField.md5_s)) addSolr(solrdoc, SolrField.md5_s, new byte[0]); if (allAttr || contains(YaCySchema.publisher_t)) addSolr(solrdoc, YaCySchema.publisher_t, yacydoc.dc_publisher()); if ((allAttr || contains(YaCySchema.language_txt)) && metadata.language() != null) addSolr(solrdoc, YaCySchema.language_txt,new String[]{UTF8.String(metadata.language())}); - if (allAttr || contains(YaCySchema.ranking_i)) addSolr(solrdoc, YaCySchema.ranking_i, metadata.ranking()); if (allAttr || contains(YaCySchema.size_i)) addSolr(solrdoc, YaCySchema.size_i, metadata.size()); if (allAttr || contains(YaCySchema.audiolinkscount_i)) addSolr(solrdoc, YaCySchema.audiolinkscount_i, yacydoc.getAudiolinks().size()); if (allAttr || contains(YaCySchema.videolinkscount_i)) addSolr(solrdoc, YaCySchema.videolinkscount_i, yacydoc.getVideolinks().size()); if (allAttr || contains(YaCySchema.applinkscount_i)) addSolr(solrdoc, YaCySchema.applinkscount_i, yacydoc.getApplinks().size()); - + return solrdoc; } diff --git a/source/net/yacy/search/index/YaCySchema.java b/source/net/yacy/search/index/YaCySchema.java index 31cd03e9e..8380f7dc8 100644 --- a/source/net/yacy/search/index/YaCySchema.java +++ b/source/net/yacy/search/index/YaCySchema.java @@ -131,7 +131,6 @@ public enum YaCySchema implements Schema { md5_s(SolrType.string, true, true, "the md5 of the raw source"),// String md5(); 
publisher_t(SolrType.text_general, true, true, "the name of the publisher of the document"),// String dc_publisher(); language_txt(SolrType.string, true, true, "the language used in the document; starts with primary language"),// byte[] language(); - ranking_i(SolrType.integer, true, true, "an external ranking value"),// long ranking(); size_i(SolrType.integer, true, true, "the size of the raw source"),// int size(); audiolinkscount_i(SolrType.integer, true, true, "number of links to audio resources"),// int laudio(); videolinkscount_i(SolrType.integer, true, true, "number of links to video resources"),// int lvideo(); diff --git a/source/net/yacy/search/ranking/ReferenceOrder.java b/source/net/yacy/search/ranking/ReferenceOrder.java index 4bb72cba3..549d07bb2 100644 --- a/source/net/yacy/search/ranking/ReferenceOrder.java +++ b/source/net/yacy/search/ranking/ReferenceOrder.java @@ -213,7 +213,7 @@ public class ReferenceOrder { * @param t * @return a ranking: the higher the number, the better is the ranking */ - public long cardinal(final WordReferenceVars t) { + public long cardinal(final WordReference t) { //return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords); // the normalizedEntry must be a normalized indexEntry final Bitfield flags = t.flags(); @@ -254,7 +254,7 @@ public class ReferenceOrder { + ((flags.get(Condenser.flag_cat_hasaudio)) ? 255 << this.ranking.coeff_cathasaudio : 0) + ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << this.ranking.coeff_cathasvideo : 0) + ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << this.ranking.coeff_cathasapp : 0) - + ((ByteBuffer.equals(t.language, this.language)) ? 255 << this.ranking.coeff_language : 0) + + ((ByteBuffer.equals(t.getLanguage(), this.language)) ? 255 << this.ranking.coeff_language : 0) + ((DigestURI.probablyRootURL(t.urlhash())) ? 
15 << this.ranking.coeff_urllength : 0); //if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0; diff --git a/source/net/yacy/search/snippet/ResultEntry.java b/source/net/yacy/search/snippet/ResultEntry.java index 126f58c41..23b8c1a87 100644 --- a/source/net/yacy/search/snippet/ResultEntry.java +++ b/source/net/yacy/search/snippet/ResultEntry.java @@ -36,6 +36,8 @@ import net.yacy.document.Condenser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.word.Word; +import net.yacy.kelondro.data.word.WordReference; +import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.data.word.WordReferenceVars; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; @@ -188,10 +190,12 @@ public class ResultEntry implements Comparable, Comparator