diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index ae8cd6050..e11b347df 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -28,14 +28,13 @@ import java.io.IOException; - import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.federate.solr.FailCategory; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.ResultURLs; import net.yacy.crawler.data.ResultURLs.EventOrigin; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.peers.Protocol; import net.yacy.peers.Seed; import net.yacy.repository.Blacklist.BlacklistType; @@ -115,7 +114,7 @@ public final class crawlReceipt { } // generating a new loaded URL entry - final URIMetadataRow entry = URIMetadataRow.importEntry(propStr); + final URIMetadataNode entry = URIMetadataNode.importEntry(propStr); if (entry == null) { if (log.isWarn()) log.warn("crawlReceipt: RECEIVED wrong RECEIPT (entry null) from peer " + iam + "\n\tURL properties: "+ propStr); prop.put("delay", "3600"); @@ -130,7 +129,7 @@ public final class crawlReceipt { // check if the entry is in our network domain final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(entry.url()); - if (urlRejectReason != null) { + if (urlRejectReason != null) { log.warn("crawlReceipt: RECEIVED wrong RECEIPT (" + urlRejectReason + ") for hash " + ASCII.String(entry.hash()) + " from peer " + iam + "\n\tURL properties: "+ propStr); prop.put("delay", "9999"); return prop; @@ -139,7 +138,7 @@ public final class crawlReceipt { // Check URL against DHT blacklist if (Switchboard.urlBlacklist.isListed(BlacklistType.DHT, entry)) { // URL is blacklisted - log.warn("crawlReceipt: RECEIVED wrong RECEIPT (URL is blacklisted) for URL " + ASCII.String(entry.hash()) + ":" + entry.url().toNormalform(false) + " from peer " + iam); + log.warn("crawlReceipt: RECEIVED wrong RECEIPT (URL 
is blacklisted) for URL " + ASCII.String(entry.hash()) + ":" + entry.url().toNormalform(false) + " from peer " + iam); prop.put("delay", "9999"); return prop; } diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index 289d92b78..f2406971f 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -30,7 +30,6 @@ import java.io.IOException; import java.text.ParseException; import java.util.HashMap; import java.util.Map; - import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.feed.RSSMessage; @@ -38,7 +37,7 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.ResultURLs; import net.yacy.crawler.data.ResultURLs.EventOrigin; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.peers.EventChannel; import net.yacy.peers.Network; import net.yacy.peers.Protocol; @@ -89,8 +88,8 @@ public final class transferURL { int doublecheck = 0; // read the urls from the other properties and store String urls; - URIMetadataRow lEntry; - Map lEm = new HashMap(); + URIMetadataNode lEntry; + Map lEm = new HashMap(); for (int i = 0; i < urlc; i++) { // read new lurl-entry @@ -102,7 +101,7 @@ public final class transferURL { } // parse new lurl-entry - lEntry = URIMetadataRow.importEntry(urls); + lEntry = URIMetadataNode.importEntry(urls); if (lEntry == null) { if (Network.log.isWarn()) Network.log.warn("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls); blocked++; diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 8824568bf..c369e72c8 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -23,12 +23,14 @@ package 
net.yacy.kelondro.data.meta; import java.net.MalformedURLException; +import java.text.ParseException; import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; +import java.util.Properties; import java.util.regex.Pattern; import net.yacy.cora.date.GenericFormatter; @@ -44,8 +46,12 @@ import net.yacy.cora.order.Base64Order; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.retrieval.Response; import net.yacy.document.Condenser; +import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.data.word.WordReferenceVars; import net.yacy.kelondro.util.Bitfield; +import net.yacy.kelondro.util.MapTools; +import net.yacy.kelondro.util.kelondroException; +import net.yacy.search.query.QueryParams; import net.yacy.search.schema.CollectionSchema; import net.yacy.utils.crypt; @@ -59,17 +65,87 @@ import org.apache.solr.common.SolrDocument; */ public class URIMetadataNode { - private byte[] hash = null; - private String urlRaw = null, keywords = null; - private DigestURL url = null; - private Bitfield flags = null; - private int imagec = -1, audioc = -1, videoc = -1, appc = -1; - private double lat = Double.NaN, lon = Double.NaN; - private long ranking = 0; // during generation of a search result this value is set - private SolrDocument doc = null; - private String snippet = null; - private WordReferenceVars word = null; // this is only used if the url is transported via remote search requests - + protected byte[] hash = null; + protected String urlRaw = null, keywords = null; + protected DigestURL url = null; + protected Bitfield flags = null; + protected int imagec = -1, audioc = -1, videoc = -1, appc = -1; + protected double lat = Double.NaN, lon = Double.NaN; + protected long ranking = 0; // during generation of a search result this value is set + protected SolrDocument doc = null; + protected String snippet = null; + protected 
WordReferenceVars word = null; // this is only used if the url is transported via remote search requests
+
+    /**
+     * Creates an entry from a transferred property list; the property names must correspond to the ones from toString.
+     * @param prop the parsed properties of the entry
+     * @throws kelondroException if the "url" property is not a well-formed URL
+     * @throws NumberFormatException if a numeric property cannot be parsed
+     */
+    public URIMetadataNode(final Properties prop) {
+        this.doc = new SolrDocument();
+        urlRaw = crypt.simpleDecode(prop.getProperty("url", ""));
+        try {
+            url = new DigestURL(urlRaw);
+            this.hash = url.hash();
+        } catch (final MalformedURLException e) {
+            // reject the entry as URIMetadataRow did; importEntry() turns this into a null result instead of an entry without url/hash
+            throw new kelondroException("bad url: " + urlRaw + " (" + e.getMessage() + ")");
+        }
+        String descr = crypt.simpleDecode(prop.getProperty("descr", "")); if (descr == null) descr = "";
+        String dc_creator = crypt.simpleDecode(prop.getProperty("author", "")); if (dc_creator == null) dc_creator = "";
+        String tags = crypt.simpleDecode(prop.getProperty("tags", "")); if (tags == null) tags = "";
+        this.keywords = Tagging.cleanTagFromAutotagging(tags);
+        String dc_publisher = crypt.simpleDecode(prop.getProperty("publisher", "")); if (dc_publisher == null) dc_publisher = "";
+        String lons = crypt.simpleDecode(prop.getProperty("lon", "0.0")); if (lons == null) lons = "0.0";
+        String lats = crypt.simpleDecode(prop.getProperty("lat", "0.0")); if (lats == null) lats = "0.0";
+
+        this.doc.setField(CollectionSchema.title.name(), descr);
+        this.doc.setField(CollectionSchema.author.name(), dc_creator);
+        this.doc.setField(CollectionSchema.publisher_t.name(), dc_publisher);
+        this.lat = Float.parseFloat(lats);
+        this.lon = Float.parseFloat(lons);
+
+        // create new formatters to make concurrency possible
+        final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute);
+        try {
+            this.doc.setField(CollectionSchema.last_modified.name(), formatter.parse(prop.getProperty("mod", "20000101")));
+        } catch (final ParseException e) {
+            this.doc.setField(CollectionSchema.last_modified.name(), new Date());
+        }
+        try {
+            this.doc.setField(CollectionSchema.load_date_dt.name(), formatter.parse(prop.getProperty("load", "20000101")));
+        } catch (final ParseException e) {
+            this.doc.setField(CollectionSchema.load_date_dt.name(), new Date());
+        }
+        try {
+            this.doc.setField(CollectionSchema.fresh_date_dt.name(), formatter.parse(prop.getProperty("fresh", "20000101")));
+        } catch (final ParseException e) {
+            this.doc.setField(CollectionSchema.fresh_date_dt.name(), new Date());
+        }
+        this.doc.setField(CollectionSchema.referrer_id_s.name(), prop.getProperty("referrer", ""));
+        this.doc.setField(CollectionSchema.md5_s.name(), prop.getProperty("md5", ""));
+        this.doc.setField(CollectionSchema.size_i.name(), Integer.parseInt(prop.getProperty("size", "0")));
+        this.doc.setField(CollectionSchema.wordcount_i.name(), Integer.parseInt(prop.getProperty("wc", "0")));
+        final String dt = prop.getProperty("dt", "t");
+        String[] mime = Response.doctype2mime(null, dt.isEmpty() ? 't' : dt.charAt(0)); // an empty "dt" falls back to 't', as URIMetadataRow did
+        this.doc.setField(CollectionSchema.content_type.name(), mime);
+        final String flagsp = prop.getProperty("flags", "AAAAAA");
+        this.flags = (flagsp.length() > 6) ? QueryParams.empty_constraint : (new Bitfield(4, flagsp));
+        this.doc.setField(CollectionSchema.language_s.name(), prop.getProperty("lang", ""));
+        this.doc.setField(CollectionSchema.inboundlinkscount_i.name(), Integer.parseInt(prop.getProperty("llocal", "0")));
+        this.doc.setField(CollectionSchema.outboundlinkscount_i.name(), Integer.parseInt(prop.getProperty("lother", "0")));
+        this.imagec = Integer.parseInt(prop.getProperty("limage", "0"));
+        this.audioc = Integer.parseInt(prop.getProperty("laudio", "0"));
+        this.videoc = Integer.parseInt(prop.getProperty("lvideo", "0"));
+        this.appc = Integer.parseInt(prop.getProperty("lapp", "0"));
+        this.snippet = crypt.simpleDecode(prop.getProperty("snippet", ""));
+        this.word = null;
+        if (prop.containsKey("wi")) {
+            this.word = new WordReferenceVars(new WordReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""))), false);
+        }
+    }
+
     public URIMetadataNode(final SolrDocument doc) {
         this.doc = doc;
         this.snippet = "";
@@ -340,20 +416,21 @@ public class URIMetadataNode {
         return getStringList(CollectionSchema.description_txt);
     }
 
-    public boolean isOlder(URIMetadataRow other) {
-        if (other == null) return false;
-        final Date tmoddate = moddate();
-        final Date omoddate = other.moddate();
-        if (tmoddate.before(omoddate)) return true;
-        if (tmoddate.equals(omoddate)) {
-            final Date tloaddate = loaddate();
-            final Date oloaddate = other.loaddate();
-            if (tloaddate.before(oloaddate)) return true;
+    public static URIMetadataNode importEntry(final String propStr) {
+        if (propStr == null || propStr.isEmpty() || propStr.charAt(0) != '{' || !propStr.endsWith("}")) {
+            ConcurrentLog.severe("URIMetadataNode", "importEntry: propStr is not proper: " + propStr);
+            return null;
+        }
+        try {
+            return new URIMetadataNode(MapTools.s2p(propStr.substring(1, propStr.length() - 1)));
+        } catch (final kelondroException e) {
+            // wrong format
+            ConcurrentLog.severe("URIMetadataNode", e.getMessage());
+            return null;
         }
-        return false;
} - private static StringBuilder corePropList(URIMetadataNode md) { + protected static StringBuilder corePropList(URIMetadataNode md) { // generate a parseable string; this is a simple property-list final StringBuilder s = new StringBuilder(300); @@ -431,20 +508,6 @@ public class URIMetadataNode { return core.toString(); } - /* - private DigestURI getURL(CollectionSchema field) { - assert !field.isMultiValued(); - assert field.getType() == SolrType.string || field.getType() == SolrType.text_general || field.getType() == SolrType.text_en_splitting_tight; - Object x = this.doc.getFieldValue(field.getSolrFieldName()); - if (x == null) return null; - try { - return new DigestURI((String) x); - } catch (final MalformedURLException e) { - return null; - } - } - */ - private int getInt(CollectionSchema field) { assert !field.isMultiValued(); assert field.getType() == SolrType.num_integer; diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java deleted file mode 100644 index c9038943a..000000000 --- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java +++ /dev/null @@ -1,509 +0,0 @@ -// URLMetadataRow.java -// (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 2006 on http://www.anomic.de -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General private License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General private License for more details. 
-// -// You should have received a copy of the GNU General private License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package net.yacy.kelondro.data.meta; - -import java.net.MalformedURLException; -import java.text.ParseException; -import java.util.Date; -import java.util.List; -import java.util.Properties; - -import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.encoding.ASCII; -import net.yacy.cora.document.encoding.UTF8; -import net.yacy.cora.document.id.DigestURL; -import net.yacy.cora.lod.vocabulary.Tagging; -import net.yacy.cora.order.Base64Order; -import net.yacy.cora.order.Digest; -import net.yacy.cora.order.NaturalOrder; -import net.yacy.cora.util.ByteBuffer; -import net.yacy.cora.util.ConcurrentLog; -import net.yacy.kelondro.data.word.WordReference; -import net.yacy.kelondro.data.word.WordReferenceRow; -import net.yacy.kelondro.data.word.WordReferenceVars; -import net.yacy.kelondro.index.Row; -import net.yacy.kelondro.io.CharBuffer; -import net.yacy.kelondro.util.Bitfield; -import net.yacy.kelondro.util.MapTools; -import net.yacy.kelondro.util.kelondroException; -import net.yacy.search.query.QueryParams; -import net.yacy.utils.crypt; - -public class URIMetadataRow { - - // this object stores attributes for URL entries - - private static final Row rowdef = new Row( - "String hash-12, " + // the url's hash - "String comp-360, " + // components: the url, description, author, tags and publisher - "Cardinal mod-4 {b256}, " + // last-modified from the httpd - "Cardinal load-4 {b256}, " + // time when the url was loaded - "Cardinal fresh-4 {b256}, " + // time until this url is fresh - "String referrer-12, " + // (one of) the url's referrer hash(es) - "byte[] md5-8, " + // the md5 of the url content (to identify changes) - "Cardinal size-6 {b256}, " + // size of file in bytes - "Cardinal wc-3 {b256}, " + // size of file by number of words; for 
video and audio: seconds - "byte[] dt-1, " + // doctype, taken from extension or any other heuristic - "Bitfield flags-4, " + // flags; any stuff (see Word-Entity definition) - "byte[] lang-2, " + // language - "Cardinal llocal-2 {b256}, " + // # of outlinks to same domain; for video and image: width - "Cardinal lother-2 {b256}, " + // # of outlinks to outside domain; for video and image: height - "Cardinal limage-2 {b256}, " + // # of embedded image links - "Cardinal laudio-2 {b256}, " + // # of embedded audio links; for audio: track number; for video: number of audio tracks - "Cardinal lvideo-2 {b256}, " + // # of embedded video links - "Cardinal lapp-2 {b256}", // # of embedded links to applications - Base64Order.enhancedCoder - ); - - /* =========================================================================== - * Constants to access the various columns of an URL entry - * =========================================================================== */ - private static final int col_hash = 0; // the url's hash - private static final int col_comp = 1; // components: the url, description, author and tags. 
As 5th element, an ETag is possible - private static final int col_mod = 2; // the modifed-date time from the server (servertime in row) - private static final int col_load = 3; // time when the url was loaded - private static final int col_fresh = 4; // time until this url is fresh - private static final int col_referrer = 5; // a referrer of the url (there may be several, but this is the one that was acually referring to this one) - private static final int col_md5 = 6; // the md5 of the url content (to identify changes) - private static final int col_size = 7; // size of file in bytes - private static final int col_wc = 8; // size of file by number of words; for video and audio: seconds - private static final int col_dt = 9; // doctype, taken from extension or any other heuristic - private static final int col_flags = 10; // flags; any stuff (see Word-Entity definition) - private static final int col_lang = 11; // language - private static final int col_llocal = 12; // # of outlinks to same domain; for video and image: width - private static final int col_lother = 13; // # of outlinks to outside domain; for video and image: height - private static final int col_limage = 14; // # of embedded image links - private static final int col_laudio = 15; // # of embedded audio links; for audio: track number; for video: number of audio tracks - private static final int col_lvideo = 16; // # of embedded video links - private static final int col_lapp = 17; // # of embedded links to applications - - private final Row.Entry entry; - private final String snippet; - private WordReference word; // this is only used if the url is transported via remote search requests - private Components comp; - - private URIMetadataRow(final Row.Entry entry, final WordReference searchedWord) { - this.entry = entry; - this.snippet = ""; - this.word = searchedWord; - this.comp = null; - } - - private URIMetadataRow(final Properties prop) throws kelondroException { - // generates an 
plasmaLURLEntry using the properties from the argument - // the property names must correspond to the one from toString - //System.out.println("DEBUG-ENTRY: prop=" + prop.toString()); - DigestURL url; - String urls = crypt.simpleDecode(prop.getProperty("url", "")); - try { - url = new DigestURL(urls); - } catch (final MalformedURLException e) { - throw new kelondroException("bad url: " + urls); - } - String descr = crypt.simpleDecode(prop.getProperty("descr", "")); if (descr == null) descr = ""; - String dc_creator = crypt.simpleDecode(prop.getProperty("author", "")); if (dc_creator == null) dc_creator = ""; - String tags = crypt.simpleDecode(prop.getProperty("tags", "")); if (tags == null) tags = ""; - tags = Tagging.cleanTagFromAutotagging(tags); - String dc_publisher = crypt.simpleDecode(prop.getProperty("publisher", "")); if (dc_publisher == null) dc_publisher = ""; - String lons = crypt.simpleDecode(prop.getProperty("lon", "0.0")); if (lons == null) lons = "0.0"; - String lats = crypt.simpleDecode(prop.getProperty("lat", "0.0")); if (lats == null) lats = "0.0"; - - this.entry = rowdef.newEntry(); - this.entry.setCol(col_hash, url.hash()); // FIXME potential null pointer access - this.entry.setCol(col_comp, encodeComp(url, descr, dc_creator, tags, dc_publisher, Float.parseFloat(lats), Float.parseFloat(lons))); - - // create new formatters to make concurrency possible - final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute); - - try { - encodeDate(col_mod, formatter.parse(prop.getProperty("mod", "20000101"))); - } catch (final ParseException e) { - encodeDate(col_mod, new Date()); - } - try { - encodeDate(col_load, formatter.parse(prop.getProperty("load", "20000101"))); - } catch (final ParseException e) { - encodeDate(col_load, new Date()); - } - try { - encodeDate(col_fresh, formatter.parse(prop.getProperty("fresh", "20000101"))); - } catch (final ParseException e) { - encodeDate(col_fresh, new 
Date()); - } - this.entry.setCol(col_referrer, UTF8.getBytes(prop.getProperty("referrer", ""))); - this.entry.setCol(col_md5, Digest.decodeHex(prop.getProperty("md5", ""))); - this.entry.setCol(col_size, Integer.parseInt(prop.getProperty("size", "0"))); - this.entry.setCol(col_wc, Integer.parseInt(prop.getProperty("wc", "0"))); - final String dt = prop.getProperty("dt", "t"); - this.entry.setCol(col_dt, dt.isEmpty() ? new byte[]{(byte) 't'} : new byte[]{(byte) dt.charAt(0)}); - final String flags = prop.getProperty("flags", "AAAAAA"); - this.entry.setCol(col_flags, (flags.length() > 6) ? QueryParams.empty_constraint.bytes() : (new Bitfield(4, flags)).bytes()); - this.entry.setCol(col_lang, UTF8.getBytes(prop.getProperty("lang", ""))); - this.entry.setCol(col_llocal, Integer.parseInt(prop.getProperty("llocal", "0"))); - this.entry.setCol(col_lother, Integer.parseInt(prop.getProperty("lother", "0"))); - this.entry.setCol(col_limage, Integer.parseInt(prop.getProperty("limage", "0"))); - this.entry.setCol(col_laudio, Integer.parseInt(prop.getProperty("laudio", "0"))); - this.entry.setCol(col_lvideo, Integer.parseInt(prop.getProperty("lvideo", "0"))); - this.entry.setCol(col_lapp, Integer.parseInt(prop.getProperty("lapp", "0"))); - this.snippet = crypt.simpleDecode(prop.getProperty("snippet", "")); - this.word = null; - if (prop.containsKey("wi")) { - this.word = new WordReferenceVars(new WordReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""))), false); - } - this.comp = null; - } - - public static URIMetadataRow importEntry(final String propStr) { - if (propStr == null || propStr.isEmpty() || propStr.charAt(0) != '{' || !propStr.endsWith("}")) { - ConcurrentLog.severe("URIMetadataRow", "importEntry: propStr is not proper: " + propStr); - return null; - } - try { - return new URIMetadataRow(MapTools.s2p(propStr.substring(1, propStr.length() - 1))); - } catch (final kelondroException e) { - // wrong format - 
ConcurrentLog.severe("URIMetadataRow", e.getMessage()); - return null; - } - } - - private void encodeDate(final int col, final Date d) { - // calculates the number of days since 1.1.1970 and returns this as 4-byte array - // 86400000 is the number of milliseconds in one day - long time = d.getTime(); - long now = System.currentTimeMillis(); - this.entry.setCol(col, NaturalOrder.encodeLong((time > now ? now : time) / 86400000L, 4)); - } - - private Date decodeDate(final int col) { - final long t = this.entry.getColLong(col); - /*if (t < 14600) */return new Date(86400000L * t); // time was stored as number of days since epoch - /* - if (t < 350400) return new Date(3600000L * t); // hours since epoch - if (t < 21024000) return new Date(60000L * t); // minutes since epoch - */ - } - - private static byte[] encodeComp( - final DigestURL url, - final String dc_title, - final String dc_creator, - final String dc_subject, - final String dc_publisher, - final double lat, - final double lon) { - final CharBuffer s = new CharBuffer(3600, 360); - s.append(url.toNormalform(true)).appendLF(); - s.append(dc_title).appendLF(); - if (dc_creator.length() > 80) s.append(dc_creator, 0, 80); else s.append(dc_creator); - s.appendLF(); - if (dc_subject.length() > 120) s.append(dc_subject, 0, 120); else s.append(dc_subject); - s.appendLF(); - if (dc_publisher.length() > 80) s.append(dc_publisher, 0, 80); else s.append(dc_publisher); - s.appendLF(); - if (lon == 0.0 && lat == 0.0) s.appendLF(); else s.append(Double.toString(lat)).append(',').append(Double.toString(lon)).appendLF(); - String s0 = s.toString(); - s.close(); - return UTF8.getBytes(s0); - } - - public byte[] hash() { - // return a url-hash, based on the md5 algorithm - // the result is a String of 12 bytes within a 72-bit space - // (each byte has an 6-bit range) - // that should be enough for all web pages on the world - final byte[] h = this.entry.getPrimaryKeyBytes(); - return h; - } - - public DigestURL url() { - return 
this.metadata().url(); - } - - public String dc_title() { - return this.metadata().dc_title(); - } - - public String dc_creator() { - return this.metadata().dc_creator(); - } - - public String dc_publisher() { - return this.metadata().dc_publisher(); - } - - public String dc_subject() { - return this.metadata().dc_subject(); - } - - public double lat() { - return this.metadata().lat(); - } - - public double lon() { - return this.metadata().lon(); - } - - public Components metadata() { - // avoid double computation of metadata elements - if (this.comp != null) return this.comp; - // parse elements from comp field; - final byte[] c = this.entry.getColBytes(col_comp, true); - final List cl = ByteBuffer.split(c, (byte) 10); - this.comp = new Components( - (cl.size() > 0) ? UTF8.String(cl.get(0)) : "", - hash(), - (cl.size() > 1) ? UTF8.String(cl.get(1)) : "", - (cl.size() > 2) ? UTF8.String(cl.get(2)) : "", - (cl.size() > 3) ? UTF8.String(cl.get(3)) : "", - (cl.size() > 4) ? UTF8.String(cl.get(4)) : "", - (cl.size() > 5) ? 
UTF8.String(cl.get(5)) : ""); - return this.comp; - } - - public Date moddate() { - return decodeDate(col_mod); - } - - public Date loaddate() { - return decodeDate(col_load); - } - - public Date freshdate() { - return decodeDate(col_fresh); - } - - public byte[] referrerHash() { - // return the creator's hash or null if there is none - // FIXME: There seem to be some malformed entries in the databasees like "null\0\0\0\0\0\0\0\0" - final byte[] r = this.entry.getColBytes(col_referrer, true); - if (r != null) { - int i = r.length; - while (i > 0) { - if (r[--i] == 0) return null; - } - } - return r; - } - - public String md5() { - // returns the md5 in hex representation - return Digest.encodeHex(this.entry.getColBytes(col_md5, true)); - } - - public char doctype() { - return (char) this.entry.getColByte(col_dt); - } - - public byte[] language() { - byte[] b = this.entry.getColBytes(col_lang, true); - if ((b == null || b[0] == (byte)'[') && this.metadata().url != null) { - String lang = this.metadata().url.language(); // calculate lang by TLD - this.entry.setCol(col_lang, UTF8.getBytes(lang)); //remember calculation - return ASCII.getBytes(lang); - } - return b; - } - - public int size() { - return (int) this.entry.getColLong(col_size); - } - - public Bitfield flags() { - return new Bitfield(this.entry.getColBytes(col_flags, true)); - } - - public int wordCount() { - return (int) this.entry.getColLong(col_wc); - } - - public int llocal() { - return (int) this.entry.getColLong(col_llocal); - } - - public int lother() { - return (int) this.entry.getColLong(col_lother); - } - - public int limage() { - return (int) this.entry.getColLong(col_limage); - } - - public int laudio() { - return (int) this.entry.getColLong(col_laudio); - } - - public int lvideo() { - return (int) this.entry.getColLong(col_lvideo); - } - - public int lapp() { - return (int) this.entry.getColLong(col_lapp); - } - - public String snippet() { - // the snippet may appear here if the url was 
transported in a remote search - // it will not be saved anywhere, but can only be requested here - return this.snippet; - } - - - - public WordReference word() { - return this.word; - } - - private static StringBuilder corePropList(URIMetadataRow md) { - // generate a parseable string; this is a simple property-list - final StringBuilder s = new StringBuilder(300); - - // create new formatters to make concurrency possible - final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute); - - try { - s.append("hash=").append(ASCII.String(md.hash())); - s.append(",url=").append(crypt.simpleEncode(md.url().toNormalform(true))); - s.append(",descr=").append(crypt.simpleEncode(md.dc_title())); - s.append(",author=").append(crypt.simpleEncode(md.dc_creator())); - s.append(",tags=").append(crypt.simpleEncode(Tagging.cleanTagFromAutotagging(md.dc_subject()))); - s.append(",publisher=").append(crypt.simpleEncode(md.dc_publisher())); - s.append(",lat=").append(md.lat()); - s.append(",lon=").append(md.lon()); - s.append(",mod=").append(formatter.format(md.moddate())); - s.append(",load=").append(formatter.format(md.loaddate())); - s.append(",fresh=").append(formatter.format(md.freshdate())); - s.append(",referrer=").append(md.referrerHash() == null ? "" : ASCII.String(md.referrerHash())); - s.append(",md5=").append(md.md5()); - s.append(",size=").append(md.size()); - s.append(",wc=").append(md.wordCount()); - s.append(",dt=").append(md.doctype()); - s.append(",flags=").append(md.flags().exportB64()); - s.append(",lang=").append(md.language() == null ? 
"EN" : UTF8.String(md.language())); - s.append(",llocal=").append(md.llocal()); - s.append(",lother=").append(md.lother()); - s.append(",limage=").append(md.limage()); - s.append(",laudio=").append(md.laudio()); - s.append(",lvideo=").append(md.lvideo()); - s.append(",lapp=").append(md.lapp()); - if (md.word() != null) { - // append also word properties - final String wprop = md.word().toPropertyForm(); - s.append(",wi=").append(Base64Order.enhancedCoder.encodeString(wprop)); - } - return s; - } catch (final Throwable e) { - ConcurrentLog.logException(e); - return null; - } - } - - /** - * @return the object as String.
- * This e.g. looks like this: - *
{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}
- */ - @Override - public String toString() { - final StringBuilder core = corePropList(this); - if (core == null) return null; - - core.insert(0, "{"); - core.append("}"); - - return core.toString(); - //return "{" + core + "}"; - } - - private class Components { - private DigestURL url; - private String urlRaw; - private byte[] urlHash; - private final String dc_title, dc_creator, dc_subject, dc_publisher; - private String latlon; // a comma-separated tuple as "," where the coordinates are given as WGS84 spatial coordinates in decimal degrees - - private Components( - final String urlRaw, - final byte[] urlhash, - final String title, - final String author, - final String tags, - final String publisher, - final String latlon) { - this.url = null; - this.urlRaw = urlRaw; - this.urlHash = urlhash; - this.dc_title = title; - this.dc_creator = author; - this.dc_subject = tags; - this.dc_publisher = publisher; - this.latlon = latlon; - } - private DigestURL url() { - if (this.url == null) { - try { - this.url = new DigestURL(this.urlRaw, this.urlHash); - } catch (final MalformedURLException e) { - this.url = null; - } - this.urlRaw = null; - this.urlHash = null; - } - return this.url; - } - private String dc_title() { return this.dc_title; } - private String dc_creator() { return this.dc_creator; } - private String dc_publisher() { return this.dc_publisher; } - private String dc_subject() { return this.dc_subject; } - private double lat() { - if (this.latlon == null || this.latlon.isEmpty()) return 0.0d; - final int p = this.latlon.indexOf(','); - if (p < 0) return 0.0d; - try { - double lat = this.latlon.charAt(0) > '9' ? 
0.0d : Double.parseDouble(this.latlon.substring(0, p)); - if (lat >= -90.0d && lat <= 90.0d) return lat; - this.latlon = null; // wrong value - return 0.0d; - } catch (final NumberFormatException e) { - return 0.0d; - } - } - private double lon() { - if (this.latlon == null || this.latlon.isEmpty()) return 0.0d; - final int p = this.latlon.indexOf(','); - if (p < 0 || p == this.latlon.length() - 1) return 0.0d; - try { - double lon = this.latlon.charAt(p + 1) > '9' ? 0.0d : Double.parseDouble(this.latlon.substring(p + 1)); - if (lon >= -180.0d && lon <= 180.0d) return lon; - this.latlon = null; // wrong value - return 0.0d; - } catch (final NumberFormatException e) { - return 0.0d; - } - } - } - -} diff --git a/source/net/yacy/kelondro/data/word/WordReferenceVars.java b/source/net/yacy/kelondro/data/word/WordReferenceVars.java index 8cc47ecd9..3c2cf2137 100644 --- a/source/net/yacy/kelondro/data/word/WordReferenceVars.java +++ b/source/net/yacy/kelondro/data/word/WordReferenceVars.java @@ -31,7 +31,6 @@ import java.util.Comparator; import java.util.Queue; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; - import net.yacy.cora.date.MicroDate; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; @@ -39,7 +38,7 @@ import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.order.Base64Order; import net.yacy.cora.util.ByteArray; import net.yacy.cora.util.ConcurrentLog; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.index.Row; import net.yacy.kelondro.index.Row.Entry; import net.yacy.kelondro.rwi.AbstractReference; @@ -72,7 +71,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc private double termFrequency; private final boolean local; - public WordReferenceVars(final URIMetadataRow md, final boolean local) { + public WordReferenceVars(final URIMetadataNode md, 
final boolean local) { this.language = md.language(); this.flags = md.flags(); this.lastModified = md.moddate().getTime(); diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index 270ddcfa2..aa1f72da0 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -90,7 +90,6 @@ import net.yacy.cora.util.SpaceExceededException; import net.yacy.crawler.data.ResultURLs; import net.yacy.crawler.data.ResultURLs.EventOrigin; import net.yacy.kelondro.data.meta.URIMetadataNode; -import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceFactory; @@ -748,8 +747,8 @@ public final class Protocol { // insert results to containers int term = count; - Collection storeDocs = new ArrayList(result.links.size()); - for ( final URIMetadataRow urlEntry : result.links ) { + Collection storeDocs = new ArrayList(result.links.size()); + for ( final URIMetadataNode urlEntry : result.links ) { if ( term-- <= 0 ) { break; // do not process more that requested (in case that evil peers fill us up with rubbish) } @@ -822,7 +821,7 @@ public final class Protocol { } } - for (URIMetadataRow entry: storeDocs) { + for (URIMetadataNode entry: storeDocs) { try { event.query.getSegment().fulltext().putMetadata(entry); } catch (final IOException e) { @@ -864,7 +863,7 @@ public final class Protocol { public Map indexcount; // //public long searchtime; // time that the peer actually spent to create the result public String[] references; // search hints, the top-words - public List links; // LURLs of search + public List links; // LURLs of search public Map indexabstract; // index abstracts, a collection of url-hashes per word public SearchResult( @@ -984,14 +983,14 @@ public final class Protocol { } } this.references = resultMap.get("references").split(","); - this.links = new ArrayList(this.availableCount); + 
this.links = new ArrayList(this.availableCount); for ( int n = 0; n < this.availableCount; n++ ) { // get one single search result final String resultLine = resultMap.get("resource" + n); if ( resultLine == null ) { continue; } - final URIMetadataRow urlEntry = URIMetadataRow.importEntry(resultLine); + final URIMetadataNode urlEntry = URIMetadataNode.importEntry(resultLine); if ( urlEntry == null ) { continue; } diff --git a/source/net/yacy/repository/Blacklist.java b/source/net/yacy/repository/Blacklist.java index 012fe5cf7..f2b7748ac 100644 --- a/source/net/yacy/repository/Blacklist.java +++ b/source/net/yacy/repository/Blacklist.java @@ -54,7 +54,6 @@ import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; import net.yacy.data.ListManager; import net.yacy.kelondro.data.meta.URIMetadataNode; -import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.util.FileUtils; @@ -461,9 +460,6 @@ public class Blacklist { public final boolean isListed(final BlacklistType blacklistType, final URIMetadataNode entry) { return isListed(blacklistType, entry.url()); } - public final boolean isListed(final BlacklistType blacklistType, final URIMetadataRow entry) { - return isListed(blacklistType, entry.url()); - } /** * Checks whether the given entry is listed in given blacklist type. 
diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index b3dda7d73..82de4dfd3 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -62,7 +62,6 @@ import net.yacy.cora.storage.ZIPWriter; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.data.meta.URIMetadataNode; -import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.WordReferenceVars; import net.yacy.kelondro.util.MemoryControl; import net.yacy.search.Switchboard; @@ -330,7 +329,7 @@ public final class Fulltext { /** * deprecated method to store document metadata, use Solr documents wherever possible */ - public void putMetadata(final URIMetadataRow entry) throws IOException { + public void putMetadata(final URIMetadataNode entry) throws IOException { byte[] idb = entry.hash(); String id = ASCII.String(idb); try { diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 8950dcd9f..461bcd69f 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -45,16 +45,15 @@ import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Pattern; - import net.yacy.cora.document.analysis.EnhancedTextProfileSignature; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; -import net.yacy.cora.federate.solr.Ranking; -import net.yacy.cora.federate.solr.SchemaConfiguration; import net.yacy.cora.federate.solr.FailType; import net.yacy.cora.federate.solr.ProcessType; +import net.yacy.cora.federate.solr.Ranking; +import 
net.yacy.cora.federate.solr.SchemaConfiguration; import net.yacy.cora.federate.solr.SchemaDeclaration; import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector; @@ -77,7 +76,7 @@ import net.yacy.document.content.DCEntry; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.citation.CitationReference; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.index.RowHandleMap; import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.util.Bitfield; @@ -88,7 +87,6 @@ import net.yacy.search.index.Segment.ReferenceReport; import net.yacy.search.index.Segment.ReferenceReportCache; import net.yacy.search.query.QueryParams; import net.yacy.search.schema.WebgraphConfiguration.Subgraph; - import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; @@ -243,7 +241,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri return us; } - public SolrInputDocument metadata2solr(final URIMetadataRow md) { + public SolrInputDocument metadata2solr(final URIMetadataNode md) { final SolrInputDocument doc = new SolrInputDocument(); boolean allAttr = this.isEmpty();