- Implemented and integrated the URIMetadataNode object, which is a

metadata representation from the Solr index. This shall replace metadata
from the built-in database in the future.
- Added the Solr-driven metadata into the search index of YaCy, which
makes it now possible to run YaCy without the old metadata index. This
is a major step forward to a full migration to Solr.
Michael Peter Christen 13 years ago
parent b2b480fff2
commit f9c0e6e950

@ -292,9 +292,6 @@ publisher_t
## the language used in the document; starts with primary language
## an external ranking value
## the size of the raw source

@ -31,6 +31,7 @@ import java.io.IOException;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.peers.Protocol;
@ -115,7 +116,7 @@ public final class crawlReceipt {
// generating a new loaded URL entry
final URIMetadataRow entry = URIMetadataRow.importEntry(propStr);
final URIMetadata entry = URIMetadataRow.importEntry(propStr);
if (entry == null) {
if (log.isWarning()) log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) from peer " + iam + "\n\tURL properties: "+ propStr);
prop.put("delay", "3600");

@ -33,6 +33,7 @@ import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.peers.EventChannel;
@ -87,7 +88,7 @@ public final class transferURL {
final int sizeBefore = sb.index.urlMetadata().size();
// read the urls from the other properties and store
String urls;
URIMetadataRow lEntry;
URIMetadata lEntry;
for (int i = 0; i < urlc; i++) {

@ -223,7 +223,7 @@ public final class ResultURLs {
public static void main(final String[] args) {
try {
final DigestURI url = new DigestURI("http", "www.yacy.net", 80, "/");
final URIMetadataRow urlRef = new URIMetadataRow(url, "YaCy Homepage", "", "", "", 0.0d, 0.0d, new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), UTF8.getBytes("de"), 0, 0, 0, 0, 0, 0);
final URIMetadata urlRef = new URIMetadataRow(url, "YaCy Homepage", "", "", "", 0.0d, 0.0d, new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), UTF8.getBytes("de"), 0, 0, 0, 0, 0, 0);
final EventOrigin stackNo = EventOrigin.LOCAL_CRAWLING;
System.out.println("valid test:\n=======");
// add

@ -26,6 +26,7 @@ import java.util.Date;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.order.Bitfield;
import de.anomic.crawler.retrieval.Request;
public interface URIMetadata extends URIReference {
@ -82,4 +83,6 @@ public interface URIMetadata extends URIReference {
public byte[] referrerHash();
public Request toBalancerEntry(final String initiatorHash);

@ -1,7 +1,7 @@
* URIMetadataNode
* Copyright 2012 by Michael Peter Christen
* First released 3.4.2012 at http://yacy.net
* First released 10.8.2012 at http://yacy.net
* This file is part of YaCy Content Integration
@ -9,12 +9,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* Lesser General Public License for more details.
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -22,432 +22,260 @@
package net.yacy.kelondro.data.meta;
import net.yacy.cora.lod.Node;
import net.yacy.cora.lod.vocabulary.Rdf;
import net.yacy.kelondro.data.word.WordReferenceVars;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.List;
import java.util.regex.Pattern;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.Bitfield;
import net.yacy.search.index.YaCySchema;
public class URIMetadataNode /*extends URIReferenceNode implements URIMetadata*/ {
import org.apache.solr.common.SolrDocument;
private final Node entry;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.tools.crypt;
* This is the URIMetadata object implementation for Solr documents.
* The purpose of this object is the migration from the old metadata structure to solr document.
* Future implementations should try to replace URIMetadata objects completely by SolrDocument objects
public class URIMetadataNode implements URIMetadata {
private final byte[] hash;
private final String urlRaw, keywords;
private DigestURI url;
Bitfield flags;
private final int imagec, audioc, videoc, appc;
private final double lon, lat;
private long ranking; // during generation of a search result this value is set
private final SolrDocument doc;
private final String snippet;
private final WordReferenceVars word; // this is only used if the url is transported via remote search requests
private final long ranking; // during generation of a search result this value is set
public URIMetadataNode() {
// create a dummy entry, good to produce poison objects
this.entry = new Node(Rdf.Description);
this.snippet = null;
this.word = null;
this.ranking = 0;
public URIMetadataNode(
final DigestURI url,
final String dc_title,
final String dc_creator,
final String dc_subject,
final String dc_publisher,
final float lon, final float lat, // decimal degrees as in WGS84; if unknown both values may be 0.0f;
final Date mod,
final Date load,
final Date fresh,
final String referrer,
final byte[] md5,
final long size,
final int wc,
final char dt,
final Bitfield flags,
final byte[] lang,
final int llocal,
final int lother,
final int laudio,
final int limage,
final int lvideo,
final int lapp) {
// create new entry
this.entry = new Node();
this.entry.setSubject(UTF8.getBytes(url.toNormalform(true, false)));
this.entry.setObject(YaCyMetadata.hash, url.hash());
this.entry.setObject(DublinCore.Title, UTF8.getBytes(dc_title));
this.entry.setObject(DublinCore.Creator, UTF8.getBytes(dc_creator));
this.entry.setObject(DublinCore.Subject, UTF8.getBytes(dc_subject));
this.entry.setObject(DublinCore.Publisher, UTF8.getBytes(dc_publisher));
this.entry.setObject(Geo.Lat, ASCII.getBytes(Float.toString(lat)));
this.entry.setObject(Geo.Long, ASCII.getBytes(Float.toString(lon)));
encodeDate(col_mod, mod);
encodeDate(col_load, load);
encodeDate(col_fresh, fresh);
this.entry.setCol(col_referrer, (referrer == null) ? null : UTF8.getBytes(referrer));
this.entry.setCol(col_md5, md5);
this.entry.setCol(col_size, size);
this.entry.setCol(col_wc, wc);
this.entry.setCol(col_dt, new byte[]{(byte) dt});
this.entry.setCol(col_flags, flags.bytes());
this.entry.setCol(col_lang, lang);
this.entry.setCol(col_llocal, llocal);
this.entry.setCol(col_lother, lother);
this.entry.setCol(col_limage, limage);
this.entry.setCol(col_laudio, laudio);
this.entry.setCol(col_lvideo, lvideo);
this.entry.setCol(col_lapp, lapp);
//System.out.println("===DEBUG=== " + load.toString() + ", " + decodeDate(col_load).toString());
this.snippet = null;
this.word = null;
this.ranking = 0;
this.comp = null;
private byte[] encodeDate(final Date d) {
// calculates the number of days since 1.1.1970 and returns this as 4-byte array
// 86400000 is the number of milliseconds in one day
return NaturalOrder.encodeLong(d.getTime() / 86400000L, 4);
private Date decodeDate(final int col) {
final long t = this.entry.getColLong(col);
public static byte[] encodeComp(
final DigestURI url,
final String dc_title,
final String dc_creator,
final String dc_subject,
final String dc_publisher,
final float lat,
final float lon) {
final CharBuffer s = new CharBuffer(360);
s.append(url.toNormalform(false, true)).appendLF();
if (dc_creator.length() > 80) s.append(dc_creator, 0, 80); else s.append(dc_creator);
if (dc_subject.length() > 120) s.append(dc_subject, 0, 120); else s.append(dc_subject);
if (dc_publisher.length() > 80) s.append(dc_publisher, 0, 80); else s.append(dc_publisher);
if (lon == 0.0f && lat == 0.0f) s.appendLF(); else s.append(Float.toString(lat)).append(',').append(Float.toString(lon)).appendLF();
return UTF8.getBytes(s.toString());
public URIMetadataRow(final Row.Entry entry, final WordReferenceVars searchedWord, final long ranking) {
this.entry = entry;
this.snippet = null;
this.word = searchedWord;
this.ranking = ranking;
this.comp = null;
private WordReference word; // this is only used if the url is transported via remote search requests
public URIMetadataRow(final Properties prop) {
// generates a plasmaLURLEntry using the properties from the argument
// the property names must correspond to the one from toString
//System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
DigestURI url;
public URIMetadataNode(final SolrDocument doc) {
this.doc = doc;
this.snippet = "";
this.word = null;
this.ranking = Long.MIN_VALUE;
this.hash = ASCII.getBytes(getString(YaCySchema.id));
this.urlRaw = getString(YaCySchema.sku);
try {
url = new DigestURI(crypt.simpleDecode(prop.getProperty("url", ""), null), ASCII.getBytes(prop.getProperty("hash")));
} catch (final MalformedURLException e) {
url = null;
this.url = new DigestURI(this.urlRaw, this.hash);
} catch (MalformedURLException e) {
this.url = null;
String descr = crypt.simpleDecode(prop.getProperty("descr", ""), null); if (descr == null) descr = "";
String dc_creator = crypt.simpleDecode(prop.getProperty("author", ""), null); if (dc_creator == null) dc_creator = "";
String tags = crypt.simpleDecode(prop.getProperty("tags", ""), null); if (tags == null) tags = "";
String dc_publisher = crypt.simpleDecode(prop.getProperty("publisher", ""), null); if (dc_publisher == null) dc_publisher = "";
String lons = crypt.simpleDecode(prop.getProperty("lon", "0.0"), null); if (lons == null) lons = "0.0";
String lats = crypt.simpleDecode(prop.getProperty("lat", "0.0"), null); if (lats == null) lats = "0.0";
this.entry = rowdef.newEntry();
this.entry.setCol(col_hash, url.hash()); // FIXME potential null pointer access
this.entry.setCol(col_comp, encodeComp(url, descr, dc_creator, tags, dc_publisher, Float.parseFloat(lats), Float.parseFloat(lons)));
// create new formatters to make concurrency possible
final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute);
try {
encodeDate(col_mod, formatter.parse(prop.getProperty("mod", "20000101")));
} catch (final ParseException e) {
encodeDate(col_mod, new Date());
try {
encodeDate(col_load, formatter.parse(prop.getProperty("load", "20000101")));
} catch (final ParseException e) {
encodeDate(col_load, new Date());
try {
encodeDate(col_fresh, formatter.parse(prop.getProperty("fresh", "20000101")));
} catch (final ParseException e) {
encodeDate(col_fresh, new Date());
this.entry.setCol(col_referrer, UTF8.getBytes(prop.getProperty("referrer", "")));
this.entry.setCol(col_md5, Digest.decodeHex(prop.getProperty("md5", "")));
this.entry.setCol(col_size, Integer.parseInt(prop.getProperty("size", "0")));
this.entry.setCol(col_wc, Integer.parseInt(prop.getProperty("wc", "0")));
final String dt = prop.getProperty("dt", "t");
this.entry.setCol(col_dt, dt.length() > 0 ? new byte[]{(byte) dt.charAt(0)} : new byte[]{(byte) 't'});
final String flags = prop.getProperty("flags", "AAAAAA");
this.entry.setCol(col_flags, (flags.length() > 6) ? QueryParams.empty_constraint.bytes() : (new Bitfield(4, flags)).bytes());
this.entry.setCol(col_lang, UTF8.getBytes(prop.getProperty("lang", "uk")));
this.entry.setCol(col_llocal, Integer.parseInt(prop.getProperty("llocal", "0")));
this.entry.setCol(col_lother, Integer.parseInt(prop.getProperty("lother", "0")));
this.entry.setCol(col_limage, Integer.parseInt(prop.getProperty("limage", "0")));
this.entry.setCol(col_laudio, Integer.parseInt(prop.getProperty("laudio", "0")));
this.entry.setCol(col_lvideo, Integer.parseInt(prop.getProperty("lvideo", "0")));
this.entry.setCol(col_lapp, Integer.parseInt(prop.getProperty("lapp", "0")));
this.snippet = crypt.simpleDecode(prop.getProperty("snippet", ""), null);
this.word = null;
if (prop.containsKey("word")) throw new kelondroException("old database structure is not supported");
if (prop.containsKey("wi")) {
this.word = new WordReferenceVars(new WordReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""))));
this.ranking = 0;
this.comp = null;
// to set the flags bitfield we need to pre-load some values from the Solr document
// (the values are cached in fields so that the getters limage()/laudio()/lvideo()/lapp(),
// lat()/lon() and dc_subject() can return them without another document lookup)
this.keywords = getString(YaCySchema.keywords);
this.imagec = getInt(YaCySchema.imagescount_i);
this.audioc = getInt(YaCySchema.audiolinkscount_i);
this.videoc = getInt(YaCySchema.videolinkscount_i);
// NOTE(review): appc is read from videolinkscount_i, the same field as videoc above.
// This looks like a copy-paste slip; confirm whether a dedicated application-links
// count field of YaCySchema should be used here instead.
this.appc = getInt(YaCySchema.videolinkscount_i);
this.lon = getDouble(YaCySchema.lon_coordinate);
this.lat = getDouble(YaCySchema.lat_coordinate);
// derive the category flags from the pre-loaded values; a flag is only ever set, never cleared
this.flags = new Bitfield();
// getString() returns "" for a missing field, so the null check is purely defensive
if (this.keywords != null && this.keywords.indexOf("indexof") >= 0) this.flags.set(Condenser.flag_cat_indexof, true);
// a coordinate pair of exactly (0.0, 0.0) is treated as "no location"
if (this.lon != 0.0d || this.lat != 0.0d) this.flags.set(Condenser.flag_cat_haslocation, true);
if (this.imagec > 0) this.flags.set(Condenser.flag_cat_hasimage, true);
if (this.audioc > 0) this.flags.set(Condenser.flag_cat_hasaudio, true);
if (this.videoc > 0) this.flags.set(Condenser.flag_cat_hasvideo, true);
if (this.appc > 0) this.flags.set(Condenser.flag_cat_hasapp, true);
public URIMetadataNode(final SolrDocument doc, final WordReference searchedWord, final long ranking) {
this.word = searchedWord;
this.ranking = ranking;
public static URIMetadataRow importEntry(final String propStr) {
if (propStr == null || (propStr.length() > 0 && propStr.charAt(0) != '{') || !propStr.endsWith("}")) {
return null;
try {
return new URIMetadataRow(MapTools.s2p(propStr.substring(1, propStr.length() - 1)));
} catch (final kelondroException e) {
// wrong format
return null;
/**
 * Read an integer field from the wrapped Solr document.
 * @param field the schema field; its name() is used as the lookup key
 * @return the stored value, or 0 if the document has no value for this field
 */
private int getInt(YaCySchema field) {
// NOTE(review): the unchecked cast assumes the field is delivered as java.lang.Integer - confirm against the schema
Integer x = (Integer) this.doc.getFieldValue(field.name());
if (x == null) return 0;
return x.intValue();
private StringBuilder corePropList() {
// generate a parseable string; this is a simple property-list
final Components metadata = metadata();
final StringBuilder s = new StringBuilder(300);
if (metadata == null) return null;
//System.out.println("author=" + comp.author());
// create new formatters to make concurrency possible
final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute);
try {
assert (s.toString().indexOf(0) < 0);
s.append(",url=").append(crypt.simpleEncode(metadata.url().toNormalform(false, true)));
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
s.append(",referrer=").append(referrerHash() == null ? "" : ASCII.String(referrerHash()));
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
s.append(",lang=").append(language() == null ? "EN" : UTF8.String(language()));
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
/**
 * Read a long field from the wrapped Solr document.
 * @param field the schema field; its name() is used as the lookup key
 * @return the stored value, or 0 if the document has no value for this field
 */
private long getLong(YaCySchema field) {
// NOTE(review): the unchecked cast assumes the field is delivered as java.lang.Long - confirm against the schema
Long x = (Long) this.doc.getFieldValue(field.name());
if (x == null) return 0;
return x.longValue();
if (this.word != null) {
// append also word properties
final String wprop = this.word.toPropertyForm();
assert (s.toString().indexOf(0) < 0);
return s;
/**
 * Read a double field from the wrapped Solr document.
 * @param field the schema field; its name() is used as the lookup key
 * @return the stored value, or 0.0 if the document has no value for this field
 */
private double getDouble(YaCySchema field) {
// NOTE(review): the unchecked cast assumes the field is delivered as java.lang.Double - confirm against the schema
Double x = (Double) this.doc.getFieldValue(field.name());
if (x == null) return 0.0d;
return x.doubleValue();
} catch (final Throwable e) {
// serverLog.logFailure("plasmaLURL.corePropList", e.getMessage());
// if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null");
// if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null");
return null;
/**
 * Read a date field from the wrapped Solr document.
 * @param field the schema field; its name() is used as the lookup key
 * @return the stored date, or a Date for time value 0 (the epoch) if the document has no value for this field
 */
private Date getDate(YaCySchema field) {
// NOTE(review): the unchecked cast assumes the field is delivered as java.util.Date - confirm against the schema
Date x = (Date) this.doc.getFieldValue(field.name());
if (x == null) return new Date(0);
return x;
public Row.Entry toRowEntry() {
return this.entry;
/**
 * Read a string field from the wrapped Solr document.
 * @param field the schema field; its name() is used as the lookup key
 * @return the stored string, or the empty string (never null) if the document has no value for this field
 */
private String getString(YaCySchema field) {
// NOTE(review): the unchecked cast assumes a single-valued String field - confirm against the schema
String x = (String) this.doc.getFieldValue(field.name());
if (x == null) return "";
return x;
public byte[] hash() {
// return a url-hash, based on the md5 algorithm
// the result is a String of 12 bytes within a 72-bit space
// (each byte has an 6-bit range)
// that should be enough for all web pages on the world
return this.entry.getPrimaryKeyBytes();
return this.hash;
public long ranking() {
return this.ranking;
public String hosthash() {
return (String) this.doc.getFieldValue(YaCySchema.host_id_s.name());
public boolean matches(final Pattern matcher) {
return this.metadata().matches(matcher);
public Date moddate() {
return getDate(YaCySchema.last_modified);
public DigestURI url() {
return this.metadata().url();
return this.url;
public String dc_title() {
return this.metadata().dc_title();
/**
 * Check whether the url of this entry matches a given pattern.
 * The pattern is applied to the lower-cased raw url string (the value
 * read from the sku field in the constructor) and must match it entirely.
 * @param matcher the pattern to test against
 * @return true if the whole lower-cased url string matches
 */
public boolean matches(Pattern matcher) {
return matcher.matcher(this.urlRaw.toLowerCase()).matches();
public String dc_creator() {
return this.metadata().dc_creator();
/**
 * The title of the document.
 * @return the first entry of the title field, or the empty string if the field is missing or empty
 */
public String dc_title() {
// NOTE(review): unchecked cast; assumes the title field is delivered as a List<String> - confirm
List<String> titles = (List<String>) this.doc.getFieldValue(YaCySchema.title.name());
if (titles == null || titles.size() == 0) return "";
return titles.get(0);
public String dc_publisher() {
return this.metadata().dc_publisher();
public String dc_creator() {
return getString(YaCySchema.author);
public String dc_subject() {
return this.metadata().dc_subject();
public String dc_publisher() {
return getString(YaCySchema.publisher_t);
public float lat() {
return this.metadata().lat();
public String dc_subject() {
return this.keywords;
public float lon() {
return this.metadata().lon();
public double lat() {
return this.lat;
private Components metadata() {
// avoid double computation of metadata elements
if (this.comp != null) return this.comp;
// parse elements from comp field;
final byte[] c = this.entry.getColBytes(col_comp, true);
final List<byte[]> cl = ByteBuffer.split(c, (byte) 10);
this.comp = new Components(
(cl.size() > 0) ? UTF8.String(cl.get(0)) : "",
(cl.size() > 1) ? UTF8.String(cl.get(1)) : "",
(cl.size() > 2) ? UTF8.String(cl.get(2)) : "",
(cl.size() > 3) ? UTF8.String(cl.get(3)) : "",
(cl.size() > 4) ? UTF8.String(cl.get(4)) : "",
(cl.size() > 5) ? UTF8.String(cl.get(5)) : "");
return this.comp;
public double lon() {
return this.lon;
public Date moddate() {
return decodeDate(col_mod);
public long ranking() {
return this.ranking;
public Date loaddate() {
return decodeDate(col_load);
return getDate(YaCySchema.load_date_dt);
public Date freshdate() {
return decodeDate(col_fresh);
public byte[] referrerHash() {
// return the creator's hash or null if there is none
// FIXME: There seem to be some malformed entries in the databases like "null\0\0\0\0\0\0\0\0"
final byte[] r = this.entry.getColBytes(col_referrer, true);
if (r != null) {
int i = r.length;
while (i > 0) {
if (r[--i] == 0) return null;
return r;
return getDate(YaCySchema.fresh_date_dt);
public String md5() {
// returns the md5 in hex representation
return Digest.encodeHex(this.entry.getColBytes(col_md5, true));
return getString(YaCySchema.md5_s);
public char doctype() {
return (char) this.entry.getColByte(col_dt);
return Response.docType(getString(YaCySchema.content_type));
public byte[] language() {
byte[] b = this.entry.getColBytes(col_lang, true);
if (b == null || b[0] == (byte)'[') {
String tld = this.metadata().url.getTLD();
if (tld.length() < 2 || tld.length() > 2) return ASCII.getBytes("en");
return ASCII.getBytes(tld);
return b;
String[] languages = (String[]) this.doc.getFieldValue(YaCySchema.language_txt.name());
if (languages == null || languages.length == 0) return ASCII.getBytes("en");
return UTF8.getBytes(languages[0]);
public int size() {
return (int) this.entry.getColLong(col_size);
return getInt(YaCySchema.size_i);
public Bitfield flags() {
return new Bitfield(this.entry.getColBytes(col_flags, true));
return this.flags;
public int wordCount() {
return (int) this.entry.getColLong(col_wc);
return getInt(YaCySchema.wordcount_i);
public int llocal() {
return (int) this.entry.getColLong(col_llocal);
return getInt(YaCySchema.inboundlinkscount_i);
public int lother() {
return (int) this.entry.getColLong(col_lother);
return getInt(YaCySchema.outboundlinkscount_i);
public int limage() {
return (int) this.entry.getColLong(col_limage);
return this.imagec;
public int laudio() {
return (int) this.entry.getColLong(col_laudio);
return this.audioc;
public int lvideo() {
return (int) this.entry.getColLong(col_lvideo);
return this.videoc;
public int lapp() {
return (int) this.entry.getColLong(col_lapp);
return this.appc;
public String snippet() {
// the snippet may appear here if the url was transported in a remote search
// it will not be saved anywhere, but can only be requested here
return this.snippet;
public WordReferenceVars word() {
public WordReference word() {
return this.word;
public boolean isOlder(final URIMetadata other) {
public boolean isOlder(URIMetadata other) {
if (other == null) return false;
final Date tmoddate = moddate();
final Date omoddate = other.moddate();
@ -461,7 +289,84 @@ public class URIMetadataNode /*extends URIReferenceNode implements URIMetadata*/
return false;
public String toString(final String snippet) {
private StringBuilder corePropList() {
// generate a parseable string; this is a simple property-list
final StringBuilder s = new StringBuilder(300);
//System.out.println("author=" + comp.author());
// create new formatters to make concurrency possible
final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute);
try {
assert (s.toString().indexOf(0) < 0);
s.append(",url=").append(crypt.simpleEncode(url().toNormalform(false, true)));
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
s.append(",referrer=").append(referrerHash() == null ? "" : ASCII.String(referrerHash()));
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
s.append(",lang=").append(language() == null ? "EN" : UTF8.String(language()));
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
assert (s.toString().indexOf(0) < 0);
if (this.word != null) {
// append also word properties
final String wprop = this.word.toPropertyForm();
assert (s.toString().indexOf(0) < 0);
return s;
} catch (final Throwable e) {
return null;
* the toString format must be completely identical to URIMetadataRow because that is used
* to transport the data over p2p connections.
public String toString(String snippet) {
// add information needed for remote transport
final StringBuilder core = corePropList();
if (core == null)
@ -476,12 +381,20 @@ public class URIMetadataNode /*extends URIReferenceNode implements URIMetadata*/
//return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}";
/**
 * The hash of the referrer of this document.
 * @return the first entry of the referrer_id_txt field as ASCII bytes, or null if the field is missing or empty
 */
public byte[] referrerHash() {
// NOTE(review): this casts the field value to String[] while dc_title() casts its
// multi-valued field to List<String>; confirm which type getFieldValue actually
// delivers here, otherwise this cast may fail with a ClassCastException
String[] referrer = (String[]) this.doc.getFieldValue(YaCySchema.referrer_id_txt.name());
if (referrer == null || referrer.length == 0) return null;
return ASCII.getBytes(referrer[0]);
public Request toBalancerEntry(final String initiatorHash) {
return new Request(
@ -489,74 +402,4 @@ public class URIMetadataNode /*extends URIReferenceNode implements URIMetadata*/
public String toString() {
final StringBuilder core = corePropList();
if (core == null) return null;
core.insert(0, "{");
return core.toString();
//return "{" + core + "}";
private class Components {
private DigestURI url;
private String urlRaw;
private byte[] urlHash;
private final String dc_title, dc_creator, dc_subject, dc_publisher;
private final String latlon; // a comma-separated tuple as "<latitude>,<longitude>" where the coordinates are given as WGS84 spatial coordinates in decimal degrees
public Components(
final String urlRaw,
final byte[] urlhash,
final String title,
final String author,
final String tags,
final String publisher,
final String latlon) {
this.url = null;
this.urlRaw = urlRaw;
this.urlHash = urlhash;
this.dc_title = title;
this.dc_creator = author;
this.dc_subject = tags;
this.dc_publisher = publisher;
this.latlon = latlon;
public boolean matches(final Pattern matcher) {
if (this.urlRaw != null) return matcher.matcher(this.urlRaw.toLowerCase()).matches();
if (this.url != null) return matcher.matcher(this.url.toNormalform(true, true).toLowerCase()).matches();
return false;
public DigestURI url() {
if (this.url == null) {
try {
this.url = new DigestURI(this.urlRaw, this.urlHash);
} catch (final MalformedURLException e) {
this.url = null;
this.urlRaw = null;
this.urlHash = null;
return this.url;
public String dc_title() { return this.dc_title; }
public String dc_creator() { return this.dc_creator; }
public String dc_publisher() { return this.dc_publisher; }
public String dc_subject() { return this.dc_subject; }
public float lat() {
if (this.latlon == null || this.latlon.isEmpty()) return 0.0f;
final int p = this.latlon.indexOf(',');
return p < 0 ? 0.0f : Float.parseFloat(this.latlon.substring(0, p));
public float lon() {
if (this.latlon == null || this.latlon.isEmpty()) return 0.0f;
final int p = this.latlon.indexOf(',');
return p < 0 ? 0.0f : Float.parseFloat(this.latlon.substring(p + 1));

@ -30,7 +30,6 @@ import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;
@ -38,6 +37,7 @@ import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.index.Row;
@ -104,7 +104,7 @@ public class URIMetadataRow implements URIMetadata {
private final Row.Entry entry;
private final String snippet;
private WordReferenceVars word; // this is only used if the url is transported via remote search requests
private WordReference word; // this is only used if the url is transported via remote search requests
private final long ranking; // during generation of a search result this value is set
private Components comp;
@ -167,12 +167,6 @@ public class URIMetadataRow implements URIMetadata {
this.comp = null;
public Map<String, byte[]> toMap() {
// TODO to be implemented
return null;
private void encodeDate(final int col, final Date d) {
// calculates the number of days since 1.1.1970 and returns this as 4-byte array
// 86400000 is the number of milliseconds in one day
@ -211,7 +205,7 @@ public class URIMetadataRow implements URIMetadata {
return UTF8.getBytes(s0);
public URIMetadataRow(final Row.Entry entry, final WordReferenceVars searchedWord, final long ranking) {
public URIMetadataRow(final Row.Entry entry, final WordReference searchedWord, final long ranking) {
this.entry = entry;
this.snippet = null;
this.word = searchedWord;
@ -284,7 +278,7 @@ public class URIMetadataRow implements URIMetadata {
this.comp = null;
public static URIMetadataRow importEntry(final String propStr) {
public static URIMetadata importEntry(final String propStr) {
if (propStr == null || (!propStr.isEmpty() && propStr.charAt(0) != '{') || !propStr.endsWith("}")) {
return null;
@ -560,7 +554,7 @@ public class URIMetadataRow implements URIMetadata {
public WordReferenceVars word() {
public WordReference word() {
return this.word;

@ -9,12 +9,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* Lesser General Public License for more details.
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -23,7 +23,6 @@
package net.yacy.kelondro.data.meta;
import java.util.Date;
import java.util.Map;
import java.util.regex.Pattern;
public interface URIReference {
@ -40,7 +39,7 @@ public interface URIReference {
* @return
public String hosthash();
* The modification date of the URIReference is given if
* the record was created first and is defined with the
@ -48,26 +47,20 @@ public interface URIReference {
* @return the modification date of this record
public Date moddate();
* The DigestURI is the payload of the URIReference
* @return the url as DigestURI with assigned URL hash according to the record hash
public DigestURI url();
* check if the url matches against a given matcher
* @param matcher
* @return true if the url() matches
public boolean matches(final Pattern matcher);
* transform the record into a map which can be stored
* @return
public Map<String, byte[]> toMap();
* produce a visible representation of the record
* @return a string for the url()

@ -9,12 +9,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* Lesser General Public License for more details.
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -26,7 +26,6 @@ import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import net.yacy.cora.date.ISO8601Formatter;
@ -36,14 +35,14 @@ public class URIReferenceNode extends HashMap<String, byte[]> implements URIRefe
private static final long serialVersionUID = -1580155759116466570L;
private byte[] hash;
private final byte[] hash;
public URIReferenceNode(DigestURI uri, Date date) {
this.hash = uri.hash();
this.put(MetadataVocabulary.url.name(), ASCII.getBytes(uri.toNormalform(true, false)));
this.put(MetadataVocabulary.moddate.name(), ASCII.getBytes(ISO8601Formatter.FORMATTER.format(date)));
public byte[] hash() {
return this.hash;
@ -56,7 +55,7 @@ public class URIReferenceNode extends HashMap<String, byte[]> implements URIRefe
this.hostHash = ASCII.String(this.hash, 6, 6);
return this.hostHash;
public Date moddate() {
byte[] x = this.get(MetadataVocabulary.moddate.name());
@ -84,9 +83,4 @@ public class URIReferenceNode extends HashMap<String, byte[]> implements URIRefe
return matcher.matcher(ASCII.String(x)).matches();
public Map<String, byte[]> toMap() {
return this;

@ -9,7 +9,7 @@
// $LastChangedBy$
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@ -48,17 +48,19 @@ public interface WordReference extends Reference {
public char getType();
public int wordsintitle();
public int llocal();
public int lother();
public int urllength();
public int urlcomps();
public Bitfield flags();
public double termFrequency();
public String hosthash();

@ -431,4 +431,10 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
public String hosthash() {
return ASCII.String(this.urlhash(), 6, 6);

@ -285,6 +285,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
return this.urlHash;
public String hosthash() {
if (this.hostHash != null) return this.hostHash;
this.hostHash = ASCII.String(this.urlHash, 6, 6);

@ -265,6 +265,7 @@ public class SplitTable implements Index, Iterable<Row.Entry> {
public static void delete(final File path, final String tablename) {
if (path == null || tablename == null) return;
final File tabledir = new File(path, tablename);
if (!(tabledir.exists())) return;
if ((!(tabledir.isDirectory()))) {

@ -690,7 +690,7 @@ public final class Protocol
// insert results to containers
int term = count;
for ( final URIMetadataRow urlEntry : result.links ) {
for ( final URIMetadata urlEntry : result.links ) {
if ( term-- <= 0 ) {
break; // do not process more than requested (in case that evil peers fill us up with rubbish)
@ -890,7 +890,7 @@ public final class Protocol
public Map<byte[], Integer> indexcount; //
public long searchtime; // time that the peer actually spent to create the result
public String[] references; // search hints, the top-words
public List<URIMetadataRow> links; // LURLs of search
public List<URIMetadata> links; // LURLs of search
public Map<byte[], String> indexabstract; // index abstracts, a collection of url-hashes per word
public SearchResult(
@ -1015,14 +1015,14 @@ public final class Protocol
this.references = resultMap.get("references").split(",");
this.links = new ArrayList<URIMetadataRow>(this.urlcount);
this.links = new ArrayList<URIMetadata>(this.urlcount);
for ( int n = 0; n < this.urlcount; n++ ) {
// get one single search result
final String resultLine = resultMap.get("resource" + n);
if ( resultLine == null ) {
final URIMetadataRow urlEntry = URIMetadataRow.importEntry(resultLine);
final URIMetadata urlEntry = URIMetadataRow.importEntry(resultLine);
if ( urlEntry == null ) {
@ -1226,7 +1226,7 @@ public final class Protocol
} // all url's known
// extract the urlCache from the result
final URIMetadata[] urls = new URIMetadataRow[uhs.length];
final URIMetadata[] urls = new URIMetadata[uhs.length];
for ( int i = 0; i < uhs.length; i++ ) {
urls[i] = urlCache.get(ASCII.getBytes(uhs[i]));
if ( urls[i] == null ) {
@ -1540,7 +1540,7 @@ public final class Protocol
new RankingProfile(Classification.ContentDomain.TEXT), // rankingProfile,
null // constraint);
for ( final URIMetadataRow link : result.links ) {
for ( final URIMetadata link : result.links ) {
System.out.println(link.url().toNormalform(true, false));

@ -49,10 +49,6 @@ public class DocumentReference {
this.data = null;
public void store(final URIReference entry) {
this.data.put(entry.hash(), entry.toMap());
public URIReference load(final WeakPriorityBlockingQueue.Element<WordReference> obrwi) {
return null;

@ -49,8 +49,9 @@ import net.yacy.cora.util.SpaceExceededException;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.index.Cache;
import net.yacy.kelondro.index.Index;
import net.yacy.kelondro.index.Row;
@ -61,6 +62,7 @@ import net.yacy.search.Switchboard;
import net.yacy.search.solr.EmbeddedSolrConnector;
import org.apache.lucene.util.Version;
import org.apache.solr.common.SolrDocument;
public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]> {
@ -190,36 +192,34 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
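* @param wre the word reference whose url hash identifies the metadata to load
* @param weight a weight value handed to the loaded metadata entry
* @return the metadata for the referenced url, or null if wre is null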
public URIMetadata load(WordReferenceVars wre, long weight) {
public URIMetadata load(WordReference wre, long weight) {
if (wre == null) return null; // all time was already wasted in takeRWI to get another element
final byte[] urlHash = wre.urlhash();
if (urlHash == null) return null;
if (this.urlIndexFile != null) try {
final Row.Entry entry = this.urlIndexFile.get(urlHash, false);
if (entry == null) return null;
return new URIMetadataRow(entry, wre, weight);
} catch (final IOException e) {
try {
SolrDocument doc = this.solr.get(ASCII.String(urlHash));
} catch (IOException e) {
return null;
return load(wre.urlhash(), wre, weight);
public URIMetadata load(final byte[] urlHash) {
if (urlHash == null) return null;
return load(urlHash, null, 0);
private URIMetadata load(final byte[] urlHash, WordReference wre, long weight) {
// get the metadata from the old metadata index
if (this.urlIndexFile != null) try {
final Row.Entry entry = this.urlIndexFile.get(urlHash, false);
if (entry == null) return null;
return new URIMetadataRow(entry, null, 0);
if (entry != null) return new URIMetadataRow(entry, wre, weight);
} catch (final IOException e) {
return null;
// get the metadata from Solr
try {
SolrDocument doc = this.solr.get(ASCII.String(urlHash));
if (doc != null) return new URIMetadataNode(doc, wre, weight);
} catch (IOException e) {
return null;

@ -105,11 +105,11 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
private boolean contains(YaCySchema field) {
return this.contains(field.name());
protected void addSolr(final SolrDoc solrdoc, final YaCySchema key, final byte[] value) {
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.length != 0))) solrdoc.addSolr(key, UTF8.String(value));
protected void addSolr(final SolrDoc solrdoc, final YaCySchema key, final String value) {
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value);
@ -149,7 +149,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
protected void addSolr(final SolrDoc solrdoc, final YaCySchema key, final boolean value) {
if (isEmpty() || contains(key)) solrdoc.addSolr(key, value);
* save configuration to file and update enum SolrFields
* @throws IOException
@ -170,7 +170,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
} catch (final IOException e) {}
public SolrDoc metadata2solr(final URIMetadata md) {
final SolrDoc solrdoc = new SolrDoc();
final DigestURI digestURI = new DigestURI(md.url());
@ -190,18 +190,18 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.content_type)) addSolr(solrdoc, YaCySchema.content_type, Response.doctype2mime(digestURI.getFileExtension(), md.doctype()));
if (allAttr || contains(YaCySchema.last_modified)) addSolr(solrdoc, YaCySchema.last_modified, md.moddate());
if (allAttr || contains(YaCySchema.text_t)) addSolr(solrdoc, YaCySchema.text_t, ""); // not delivered in metadata
if (allAttr || contains(YaCySchema.wordcount_i)) addSolr(solrdoc, YaCySchema.wordcount_i, md.wordCount());
if (allAttr || contains(YaCySchema.wordcount_i)) addSolr(solrdoc, YaCySchema.wordcount_i, md.wordCount());
if (allAttr || contains(YaCySchema.keywords)) {
String keywords = md.dc_subject();
Bitfield flags = md.flags();
if (flags.get(Condenser.flag_cat_indexof)) {
if (keywords == null || keywords.isEmpty()) keywords = "indexof"; else {
if (keywords.indexOf(',') > 0) keywords += ", indexof"; else keywords += " indexof";
if (keywords.indexOf(',') > 0) keywords += ", indexof"; else keywords += " indexof";
addSolr(solrdoc, YaCySchema.keywords, keywords);
// path elements of link
final String path = digestURI.getPath();
if (path != null && (allAttr || contains(YaCySchema.paths_txt))) {
@ -229,12 +229,11 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.md5_s)) addSolr(solrdoc, YaCySchema.md5_s, md.md5());
if (allAttr || contains(YaCySchema.publisher_t)) addSolr(solrdoc, YaCySchema.publisher_t, md.dc_publisher());
if ((allAttr || contains(YaCySchema.language_txt)) && md.language() != null) addSolr(solrdoc, YaCySchema.language_txt,new String[]{UTF8.String(md.language())});
if (allAttr || contains(YaCySchema.ranking_i)) addSolr(solrdoc, YaCySchema.ranking_i, md.ranking());
if (allAttr || contains(YaCySchema.size_i)) addSolr(solrdoc, YaCySchema.size_i, md.size());
if (allAttr || contains(YaCySchema.audiolinkscount_i)) addSolr(solrdoc, YaCySchema.audiolinkscount_i, md.laudio());
if (allAttr || contains(YaCySchema.videolinkscount_i)) addSolr(solrdoc, YaCySchema.videolinkscount_i, md.lvideo());
if (allAttr || contains(YaCySchema.applinkscount_i)) addSolr(solrdoc, YaCySchema.applinkscount_i, md.lapp());
return solrdoc;
@ -585,7 +584,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.httpstatus_i)) addSolr(solrdoc, YaCySchema.httpstatus_i, header == null ? 200 : header.getStatusCode());
// fields that are additionally in URIMetadataRow
// fields that are additionally in URIMetadataRow
if (allAttr || contains(YaCySchema.load_date_dt)) addSolr(solrdoc, YaCySchema.load_date_dt, metadata.loaddate());
if (allAttr || contains(YaCySchema.fresh_date_dt)) addSolr(solrdoc, YaCySchema.fresh_date_dt, metadata.freshdate());
if (allAttr || contains(YaCySchema.host_id_s)) addSolr(solrdoc, YaCySchema.host_id_s, metadata.hosthash());
@ -593,12 +592,11 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
//if (allAttr || contains(SolrField.md5_s)) addSolr(solrdoc, SolrField.md5_s, new byte[0]);
if (allAttr || contains(YaCySchema.publisher_t)) addSolr(solrdoc, YaCySchema.publisher_t, yacydoc.dc_publisher());
if ((allAttr || contains(YaCySchema.language_txt)) && metadata.language() != null) addSolr(solrdoc, YaCySchema.language_txt,new String[]{UTF8.String(metadata.language())});
if (allAttr || contains(YaCySchema.ranking_i)) addSolr(solrdoc, YaCySchema.ranking_i, metadata.ranking());
if (allAttr || contains(YaCySchema.size_i)) addSolr(solrdoc, YaCySchema.size_i, metadata.size());
if (allAttr || contains(YaCySchema.audiolinkscount_i)) addSolr(solrdoc, YaCySchema.audiolinkscount_i, yacydoc.getAudiolinks().size());
if (allAttr || contains(YaCySchema.videolinkscount_i)) addSolr(solrdoc, YaCySchema.videolinkscount_i, yacydoc.getVideolinks().size());
if (allAttr || contains(YaCySchema.applinkscount_i)) addSolr(solrdoc, YaCySchema.applinkscount_i, yacydoc.getApplinks().size());
return solrdoc;

@ -131,7 +131,6 @@ public enum YaCySchema implements Schema {
md5_s(SolrType.string, true, true, "the md5 of the raw source"),// String md5();
publisher_t(SolrType.text_general, true, true, "the name of the publisher of the document"),// String dc_publisher();
language_txt(SolrType.string, true, true, "the language used in the document; starts with primary language"),// byte[] language();
ranking_i(SolrType.integer, true, true, "an external ranking value"),// long ranking();
size_i(SolrType.integer, true, true, "the size of the raw source"),// int size();
audiolinkscount_i(SolrType.integer, true, true, "number of links to audio resources"),// int laudio();
videolinkscount_i(SolrType.integer, true, true, "number of links to video resources"),// int lvideo();

@ -213,7 +213,7 @@ public class ReferenceOrder {
* @param t
* @return a ranking: the higher the number, the better is the ranking
public long cardinal(final WordReferenceVars t) {
public long cardinal(final WordReference t) {
//return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
// the normalizedEntry must be a normalized indexEntry
final Bitfield flags = t.flags();
@ -254,7 +254,7 @@ public class ReferenceOrder {
+ ((flags.get(Condenser.flag_cat_hasaudio)) ? 255 << this.ranking.coeff_cathasaudio : 0)
+ ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << this.ranking.coeff_cathasvideo : 0)
+ ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << this.ranking.coeff_cathasapp : 0)
+ ((ByteBuffer.equals(t.language, this.language)) ? 255 << this.ranking.coeff_language : 0)
+ ((ByteBuffer.equals(t.getLanguage(), this.language)) ? 255 << this.ranking.coeff_language : 0)
+ ((DigestURI.probablyRootURL(t.urlhash())) ? 15 << this.ranking.coeff_urllength : 0);
//if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0;

@ -36,6 +36,8 @@ import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
@ -188,10 +190,12 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
public double lon() {
return this.urlentry.lon();
public WordReferenceVars word() {
public WordReference word() {
final Reference word = this.urlentry.word();
assert word instanceof WordReferenceVars;
return (WordReferenceVars) word;
if (word instanceof WordReferenceVars) return (WordReferenceVars) word;
if (word instanceof WordReferenceRow) return (WordReferenceRow) word;
assert word instanceof WordReferenceRow || word instanceof WordReferenceVars : word == null ? "word = null" : "type = " + word.getClass().getCanonicalName();
return null;
public boolean hasTextSnippet() {
return (this.textSnippet != null) && (!this.textSnippet.getErrorCode().fail());
