diff --git a/nbproject/project.xml b/nbproject/project.xml index f3c66e0c5..631cd4b51 100644 --- a/nbproject/project.xml +++ b/nbproject/project.xml @@ -1,85 +1,85 @@ - - - org.netbeans.modules.ant.freeform - - - YaCy - - - - YaCy - - - - - java - source - UTF-8 - - - - java - htroot - UTF-8 - - - - . - UTF-8 - - - - - compile - - - clean - - - javadoc - - - run - - - test - - - clean - compile - - - - - - - source - - - - htroot - - - build.xml - - - - - - - - - - - - - - - source - htroot - lib/activation.jar;lib/apache-mime4j-0.6.jar;lib/apache-solr-core-3.6.1.jar;lib/apache-solr-solrj-3.6.1.jar;lib/arq-2.8.7.jar;lib/bcmail-jdk15-145.jar;lib/bcprov-jdk15-145.jar;lib/commons-codec-1.6.jar;lib/commons-compress-1.4.1.jar;lib/commons-fileupload-1.2.2.jar;lib/commons-httpclient-3.1.jar;lib/commons-io-2.1.jar;lib/commons-jxpath-1.3.jar;lib/commons-lang-2.6.jar;lib/commons-logging-1.1.1.jar;lib/fontbox-1.7.0.jar;lib/geronimo-stax-api_1.0_spec-1.0.1.jar;lib/guava-r05.jar;lib/htmllexer.jar;lib/htmlparser.jar;lib/httpclient-4.2.2.jar;lib/httpcore-4.2.2.jar;lib/httpmime-4.2.2.jar;lib/icu4j-core.jar;lib/iri-0.8.jar;lib/J7Zip-modified.jar;lib/jakarta-oro-2.0.8.jar;lib/jaudiotagger-2.0.4-20111207.115108-15.jar;lib/jcifs-1.3.15.jar;lib/jcl-over-slf4j-1.6.1.jar;lib/jempbox-1.7.0.jar;lib/jena-2.6.4.jar;lib/jetty-6.1.26-patched-JETTY-1340.jar;lib/jetty-util-6.1.26-patched-JETTY-1340.jar;lib/jsch-0.1.42.jar;lib/json-simple-1.1.jar;lib/jsoup-1.6.3.jar;lib/log4j-1.2.16.jar;lib/log4j-over-slf4j-1.6.1.jar;lib/lucene-analyzers-3.6.1.jar;lib/lucene-core-3.6.1.jar;lib/lucene-highlighter-3.6.1.jar;lib/lucene-phonetic-3.6.1.jar;lib/lucene-spatial-3.6.1.jar;lib/lucene-spellchecker-3.6.1.jar;lib/metadata-extractor-2.4.0-beta-1.jar;lib/mysql-connector-java-5.1.12-bin.jar;lib/pdfbox-1.7.0.jar;lib/poi-3.6-20091214.jar;lib/poi-scratchpad-3.6-20091214.jar;lib/sax-2.0.1.jar;lib/servlet-api-2.5-20081211.jar;lib/slf4j-api-1.6.1.jar;lib/slf4j-jdk14-1.6.1.jar;lib/webcat-0.1-swf.jar;lib/wstx-asl-3.2.7.jar;lib/xercesImpl.jar;lib/xml-apis.jar - 1.6 - - - - + + + org.netbeans.modules.ant.freeform + + + YaCy + + + + YaCy + + + + + java + source + UTF-8 + + + + java + htroot + UTF-8 + + + + . + UTF-8 + + + + + compile + + + clean + + + javadoc + + + run + + + test + + + clean + compile + + + + + + + source + + + + htroot + + + build.xml + + + + + + + + + + + + + + + source + htroot + lib/activation.jar;lib/apache-mime4j-0.6.jar;lib/apache-solr-core-4.0.0.jar;lib/apache-solr-solrj-4.0.0.jar;lib/arq-2.8.7.jar;lib/bcmail-jdk15-145.jar;lib/bcprov-jdk15-145.jar;lib/commons-codec-1.7.jar;lib/commons-compress-1.4.1.jar;lib/commons-fileupload-1.2.2.jar;lib/commons-httpclient-3.1.jar;lib/commons-io-2.1.jar;lib/commons-jxpath-1.3.jar;lib/commons-lang-2.6.jar;lib/commons-logging-1.1.1.jar;lib/fontbox-1.7.0.jar;lib/geronimo-stax-api_1.0_spec-1.0.1.jar;lib/guava-r05.jar;lib/htmllexer.jar;lib/htmlparser.jar;lib/httpclient-4.2.2.jar;lib/httpcore-4.2.2.jar;lib/httpmime-4.2.2.jar;lib/icu4j-core.jar;lib/iri-0.8.jar;lib/J7Zip-modified.jar;lib/jakarta-oro-2.0.8.jar;lib/jaudiotagger-2.0.4-20111207.115108-15.jar;lib/jcifs-1.3.15.jar;lib/jcl-over-slf4j-1.6.1.jar;lib/jempbox-1.7.0.jar;lib/jena-2.6.4.jar;lib/jetty-6.1.26-patched-JETTY-1340.jar;lib/jetty-util-6.1.26-patched-JETTY-1340.jar;lib/jsch-0.1.42.jar;lib/json-simple-1.1.jar;lib/jsoup-1.6.3.jar;lib/log4j-1.2.16.jar;lib/log4j-over-slf4j-1.6.1.jar;lib/lucene-analyzers-4.0.0.jar;lib/lucene-core-4.0.0.jar;lib/lucene-highlighter-4.0.0.jar;lib/lucene-phonetic-4.0.0.jar;lib/lucene-spatial-4.0.0.jar;lib/lucene-spellchecker-4.0.0.jar;lib/metadata-extractor-2.4.0-beta-1.jar;lib/mysql-connector-java-5.1.12-bin.jar;lib/pdfbox-1.7.0.jar;lib/poi-3.6-20091214.jar;lib/poi-scratchpad-3.6-20091214.jar;lib/sax-2.0.1.jar;lib/servlet-api-2.5-20081211.jar;lib/slf4j-api-1.6.1.jar;lib/slf4j-jdk14-1.6.1.jar;lib/webcat-0.1-swf.jar;lib/wstx-asl-3.2.7.jar;lib/xercesImpl.jar;lib/xml-apis.jar + 1.6 + + + + diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java index c771ded1b..f2f29b241 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java @@ -1,523 +1,531 @@ -// URLMetadataRow.java -// (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 2006 on http://www.anomic.de -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package net.yacy.kelondro.data.meta; - -import java.net.MalformedURLException; -import java.text.ParseException; -import java.util.Date; -import java.util.List; -import java.util.Properties; -import java.util.regex.Pattern; - -import net.yacy.cora.date.GenericFormatter; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; -import net.yacy.cora.lod.vocabulary.Tagging; -import net.yacy.cora.order.Base64Order; -import net.yacy.cora.order.Digest; -import net.yacy.cora.order.NaturalOrder; -import net.yacy.crawler.retrieval.Request; -import net.yacy.kelondro.data.word.WordReference; -import net.yacy.kelondro.data.word.WordReferenceRow; -import net.yacy.kelondro.data.word.WordReferenceVars; -import net.yacy.kelondro.index.Row; -import net.yacy.kelondro.io.CharBuffer; -import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.Bitfield; -import net.yacy.kelondro.util.ByteBuffer; -import net.yacy.kelondro.util.MapTools; -import net.yacy.kelondro.util.kelondroException; -import net.yacy.search.query.QueryParams; -import net.yacy.utils.crypt; - -public class URIMetadataRow { - - // this object stores attributes for URL entries - - public static final Row rowdef = new Row( - "String hash-12, " + // the url's hash - "String comp-360, " + // components: the url, description, author, tags and publisher - "Cardinal mod-4 {b256}, " + // last-modified from the httpd - "Cardinal load-4 {b256}, " + // time when the url was loaded - "Cardinal fresh-4 {b256}, " + // time until this url is fresh - "String referrer-12, " + // (one of) the url's referrer hash(es) - "byte[] md5-8, " + // the md5 of the url content (to identify changes) - "Cardinal size-6 {b256}, " + // size of file in bytes - "Cardinal wc-3 {b256}, " + // size of file by number of words; for video and audio: seconds - "byte[] dt-1, " + // doctype, taken from extension or any other heuristic - "Bitfield flags-4, " + // flags; any stuff (see Word-Entity definition) - "byte[] lang-2, " + // language - "Cardinal llocal-2 {b256}, " + // # of outlinks to same domain; for video and image: width - "Cardinal lother-2 {b256}, " + // # of outlinks to outside domain; for video and image: height - "Cardinal limage-2 {b256}, " + // # of embedded image links - "Cardinal laudio-2 {b256}, " + // # of embedded audio links; for audio: track number; for video: number of audio tracks - "Cardinal lvideo-2 {b256}, " + // # of embedded video links - "Cardinal lapp-2 {b256}", // # of embedded links to applications - Base64Order.enhancedCoder - ); - - /* =========================================================================== - * Constants to access the various columns of an URL entry - * =========================================================================== */ - private static final int col_hash = 0; // the url's hash - private static final int col_comp = 1; // components: the url, description, author and tags. As 5th element, an ETag is possible - private static final int col_mod = 2; // the modifed-date time from the server (servertime in row) - private static final int col_load = 3; // time when the url was loaded - private static final int col_fresh = 4; // time until this url is fresh - private static final int col_referrer = 5; // a referrer of the url (there may be several, but this is the one that was acually referring to this one) - private static final int col_md5 = 6; // the md5 of the url content (to identify changes) - private static final int col_size = 7; // size of file in bytes - private static final int col_wc = 8; // size of file by number of words; for video and audio: seconds - private static final int col_dt = 9; // doctype, taken from extension or any other heuristic - private static final int col_flags = 10; // flags; any stuff (see Word-Entity definition) - private static final int col_lang = 11; // language - private static final int col_llocal = 12; // # of outlinks to same domain; for video and image: width - private static final int col_lother = 13; // # of outlinks to outside domain; for video and image: height - private static final int col_limage = 14; // # of embedded image links - private static final int col_laudio = 15; // # of embedded audio links; for audio: track number; for video: number of audio tracks - private static final int col_lvideo = 16; // # of embedded video links - private static final int col_lapp = 17; // # of embedded links to applications - - private final Row.Entry entry; - private final String snippet; - private WordReference word; // this is only used if the url is transported via remote search requests - private Components comp; - - public URIMetadataRow(final Row.Entry entry, final WordReference searchedWord) { - this.entry = entry; - this.snippet = ""; - this.word = searchedWord; - this.comp = null; - } - - private URIMetadataRow(final Properties prop) throws kelondroException { - // generates an plasmaLURLEntry using the properties from the argument - // the property names must correspond to the one from toString - //System.out.println("DEBUG-ENTRY: prop=" + prop.toString()); - DigestURI url; - String urls = crypt.simpleDecode(prop.getProperty("url", "")); - try { - url = new DigestURI(urls); - } catch (final MalformedURLException e) { - throw new kelondroException("bad url: " + urls); - } - String descr = crypt.simpleDecode(prop.getProperty("descr", "")); if (descr == null) descr = ""; - String dc_creator = crypt.simpleDecode(prop.getProperty("author", "")); if (dc_creator == null) dc_creator = ""; - String tags = crypt.simpleDecode(prop.getProperty("tags", "")); if (tags == null) tags = ""; - tags = Tagging.cleanTagFromAutotagging(tags); - String dc_publisher = crypt.simpleDecode(prop.getProperty("publisher", "")); if (dc_publisher == null) dc_publisher = ""; - String lons = crypt.simpleDecode(prop.getProperty("lon", "0.0")); if (lons == null) lons = "0.0"; - String lats = crypt.simpleDecode(prop.getProperty("lat", "0.0")); if (lats == null) lats = "0.0"; - - this.entry = rowdef.newEntry(); - this.entry.setCol(col_hash, url.hash()); // FIXME potential null pointer access - this.entry.setCol(col_comp, encodeComp(url, descr, dc_creator, tags, dc_publisher, Float.parseFloat(lats), Float.parseFloat(lons))); - - // create new formatters to make concurrency possible - final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute); - - try { - encodeDate(col_mod, formatter.parse(prop.getProperty("mod", "20000101"))); - } catch (final ParseException e) { - encodeDate(col_mod, new Date()); - } - try { - encodeDate(col_load, formatter.parse(prop.getProperty("load", "20000101"))); - } catch (final ParseException e) { - encodeDate(col_load, new Date()); - } - try { - encodeDate(col_fresh, formatter.parse(prop.getProperty("fresh", "20000101"))); - } catch (final ParseException e) { - encodeDate(col_fresh, new Date()); - } - this.entry.setCol(col_referrer, UTF8.getBytes(prop.getProperty("referrer", ""))); - this.entry.setCol(col_md5, Digest.decodeHex(prop.getProperty("md5", ""))); - this.entry.setCol(col_size, Integer.parseInt(prop.getProperty("size", "0"))); - this.entry.setCol(col_wc, Integer.parseInt(prop.getProperty("wc", "0"))); - final String dt = prop.getProperty("dt", "t"); - this.entry.setCol(col_dt, dt.isEmpty() ? new byte[]{(byte) 't'} : new byte[]{(byte) dt.charAt(0)}); - final String flags = prop.getProperty("flags", "AAAAAA"); - this.entry.setCol(col_flags, (flags.length() > 6) ? QueryParams.empty_constraint.bytes() : (new Bitfield(4, flags)).bytes()); - this.entry.setCol(col_lang, UTF8.getBytes(prop.getProperty("lang", "uk"))); - this.entry.setCol(col_llocal, Integer.parseInt(prop.getProperty("llocal", "0"))); - this.entry.setCol(col_lother, Integer.parseInt(prop.getProperty("lother", "0"))); - this.entry.setCol(col_limage, Integer.parseInt(prop.getProperty("limage", "0"))); - this.entry.setCol(col_laudio, Integer.parseInt(prop.getProperty("laudio", "0"))); - this.entry.setCol(col_lvideo, Integer.parseInt(prop.getProperty("lvideo", "0"))); - this.entry.setCol(col_lapp, Integer.parseInt(prop.getProperty("lapp", "0"))); - this.snippet = crypt.simpleDecode(prop.getProperty("snippet", "")); - this.word = null; - if (prop.containsKey("wi")) { - this.word = new WordReferenceVars(new WordReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", "")))); - } - this.comp = null; - } - - public static URIMetadataRow importEntry(final String propStr) { - if (propStr == null || propStr.isEmpty() || propStr.charAt(0) != '{' || !propStr.endsWith("}")) { - Log.logSevere("URIMetadataRow", "importEntry: propStr is not proper: " + propStr); - return null; - } - try { - return new URIMetadataRow(MapTools.s2p(propStr.substring(1, propStr.length() - 1))); - } catch (final kelondroException e) { - // wrong format - Log.logSevere("URIMetadataRow", e.getMessage()); - return null; - } - } - - private void encodeDate(final int col, final Date d) { - // calculates the number of days since 1.1.1970 and returns this as 4-byte array - // 86400000 is the number of milliseconds in one day - long time = d.getTime(); - long now = System.currentTimeMillis(); - this.entry.setCol(col, NaturalOrder.encodeLong((time > now ? now : time) / 86400000L, 4)); - } - - private Date decodeDate(final int col) { - final long t = this.entry.getColLong(col); - /*if (t < 14600) */return new Date(86400000L * t); // time was stored as number of days since epoch - /* - if (t < 350400) return new Date(3600000L * t); // hours since epoch - if (t < 21024000) return new Date(60000L * t); // minutes since epoch - */ - } - - private static byte[] encodeComp( - final DigestURI url, - final String dc_title, - final String dc_creator, - final String dc_subject, - final String dc_publisher, - final double lat, - final double lon) { - final CharBuffer s = new CharBuffer(3600, 360); - s.append(url.toNormalform(true)).appendLF(); - s.append(dc_title).appendLF(); - if (dc_creator.length() > 80) s.append(dc_creator, 0, 80); else s.append(dc_creator); - s.appendLF(); - if (dc_subject.length() > 120) s.append(dc_subject, 0, 120); else s.append(dc_subject); - s.appendLF(); - if (dc_publisher.length() > 80) s.append(dc_publisher, 0, 80); else s.append(dc_publisher); - s.appendLF(); - if (lon == 0.0f && lat == 0.0f) s.appendLF(); else s.append(Double.toString(lat)).append(',').append(Double.toString(lon)).appendLF(); - String s0 = s.toString(); - s.close(); - return UTF8.getBytes(s0); - } - - public byte[] hash() { - // return a url-hash, based on the md5 algorithm - // the result is a String of 12 bytes within a 72-bit space - // (each byte has an 6-bit range) - // that should be enough for all web pages on the world - return this.entry.getPrimaryKeyBytes(); - } - - private String hostHash = null; - public String hosthash() { - if (this.hostHash != null) return this.hostHash; - this.hostHash = ASCII.String(this.entry.getPrimaryKeyBytes(), 6, 6); - return this.hostHash; - } - - public boolean matches(final Pattern matcher) { - return this.metadata().matches(matcher); - } - - public DigestURI url() { - return this.metadata().url(); - } - - public String dc_title() { - return this.metadata().dc_title(); - } - - public String dc_creator() { - return this.metadata().dc_creator(); - } - - public String dc_publisher() { - return this.metadata().dc_publisher(); - } - - public String dc_subject() { - return this.metadata().dc_subject(); - } - - public double lat() { - return this.metadata().lat(); - } - - public double lon() { - return this.metadata().lon(); - } - - private Components metadata() { - // avoid double computation of metadata elements - if (this.comp != null) return this.comp; - // parse elements from comp field; - final byte[] c = this.entry.getColBytes(col_comp, true); - final List cl = ByteBuffer.split(c, (byte) 10); - this.comp = new Components( - (cl.size() > 0) ? UTF8.String(cl.get(0)) : "", - hash(), - (cl.size() > 1) ? UTF8.String(cl.get(1)) : "", - (cl.size() > 2) ? UTF8.String(cl.get(2)) : "", - (cl.size() > 3) ? UTF8.String(cl.get(3)) : "", - (cl.size() > 4) ? UTF8.String(cl.get(4)) : "", - (cl.size() > 5) ? UTF8.String(cl.get(5)) : ""); - return this.comp; - } - - public Date moddate() { - return decodeDate(col_mod); - } - - public Date loaddate() { - return decodeDate(col_load); - } - - public Date freshdate() { - return decodeDate(col_fresh); - } - - public byte[] referrerHash() { - // return the creator's hash or null if there is none - // FIXME: There seem to be some malformed entries in the databasees like "null\0\0\0\0\0\0\0\0" - final byte[] r = this.entry.getColBytes(col_referrer, true); - if (r != null) { - int i = r.length; - while (i > 0) { - if (r[--i] == 0) return null; - } - } - return r; - } - - public String md5() { - // returns the md5 in hex representation - return Digest.encodeHex(this.entry.getColBytes(col_md5, true)); - } - - public char doctype() { - return (char) this.entry.getColByte(col_dt); - } - - public byte[] language() { - byte[] b = this.entry.getColBytes(col_lang, true); - if ((b == null || b[0] == (byte)'[') && this.metadata().url != null) { - String tld = this.metadata().url.getTLD(); - if (tld.length() < 2 || tld.length() > 2) return ASCII.getBytes("en"); - return ASCII.getBytes(tld); - } - return b; - } - - public int size() { - return (int) this.entry.getColLong(col_size); - } - - public Bitfield flags() { - return new Bitfield(this.entry.getColBytes(col_flags, true)); - } - - public int wordCount() { - return (int) this.entry.getColLong(col_wc); - } - - public int llocal() { - return (int) this.entry.getColLong(col_llocal); - } - - public int lother() { - return (int) this.entry.getColLong(col_lother); - } - - public int limage() { - return (int) this.entry.getColLong(col_limage); - } - - public int laudio() { - return (int) this.entry.getColLong(col_laudio); - } - - public int lvideo() { - return (int) this.entry.getColLong(col_lvideo); - } - - public int lapp() { - return (int) this.entry.getColLong(col_lapp); - } - - public String snippet() { - // the snippet may appear here if the url was transported in a remote search - // it will not be saved anywhere, but can only be requested here - return this.snippet; - } - - - - public WordReference word() { - return this.word; - } - - private static StringBuilder corePropList(URIMetadataRow md) { - // generate a parseable string; this is a simple property-list - final StringBuilder s = new StringBuilder(300); - - // create new formatters to make concurrency possible - final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute); - - try { - s.append("hash=").append(ASCII.String(md.hash())); - s.append(",url=").append(crypt.simpleEncode(md.url().toNormalform(true))); - s.append(",descr=").append(crypt.simpleEncode(md.dc_title())); - s.append(",author=").append(crypt.simpleEncode(md.dc_creator())); - s.append(",tags=").append(crypt.simpleEncode(Tagging.cleanTagFromAutotagging(md.dc_subject()))); - s.append(",publisher=").append(crypt.simpleEncode(md.dc_publisher())); - s.append(",lat=").append(md.lat()); - s.append(",lon=").append(md.lon()); - s.append(",mod=").append(formatter.format(md.moddate())); - s.append(",load=").append(formatter.format(md.loaddate())); - s.append(",fresh=").append(formatter.format(md.freshdate())); - s.append(",referrer=").append(md.referrerHash() == null ? "" : ASCII.String(md.referrerHash())); - s.append(",md5=").append(md.md5()); - s.append(",size=").append(md.size()); - s.append(",wc=").append(md.wordCount()); - s.append(",dt=").append(md.doctype()); - s.append(",flags=").append(md.flags().exportB64()); - s.append(",lang=").append(md.language() == null ? "EN" : UTF8.String(md.language())); - s.append(",llocal=").append(md.llocal()); - s.append(",lother=").append(md.lother()); - s.append(",limage=").append(md.limage()); - s.append(",laudio=").append(md.laudio()); - s.append(",lvideo=").append(md.lvideo()); - s.append(",lapp=").append(md.lapp()); - if (md.word() != null) { - // append also word properties - final String wprop = md.word().toPropertyForm(); - s.append(",wi=").append(Base64Order.enhancedCoder.encodeString(wprop)); - } - return s; - } catch (final Throwable e) { - Log.logException(e); - return null; - } - } - - public Request toBalancerEntry(final String initiatorHash) { - return new Request( - ASCII.getBytes(initiatorHash), - metadata().url(), - referrerHash(), - metadata().dc_title(), - moddate(), - null, - 0, - 0, - 0, - 0); - } - - /** - * @return the object as String.
- * This e.g. looks like this: - *
{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}
- */ - @Override - public String toString() { - final StringBuilder core = corePropList(this); - if (core == null) return null; - - core.insert(0, "{"); - core.append("}"); - - return core.toString(); - //return "{" + core + "}"; - } - - private class Components { - private DigestURI url; - private String urlRaw; - private byte[] urlHash; - private final String dc_title, dc_creator, dc_subject, dc_publisher; - private final String latlon; // a comma-separated tuple as "," where the coordinates are given as WGS84 spatial coordinates in decimal degrees - - public Components( - final String urlRaw, - final byte[] urlhash, - final String title, - final String author, - final String tags, - final String publisher, - final String latlon) { - this.url = null; - this.urlRaw = urlRaw; - this.urlHash = urlhash; - this.dc_title = title; - this.dc_creator = author; - this.dc_subject = tags; - this.dc_publisher = publisher; - this.latlon = latlon; - } - public boolean matches(final Pattern matcher) { - if (this.urlRaw != null) return matcher.matcher(this.urlRaw.toLowerCase()).matches(); - if (this.url != null) return matcher.matcher(this.url.toNormalform(true).toLowerCase()).matches(); - return false; - } - public DigestURI url() { - if (this.url == null) { - try { - this.url = new DigestURI(this.urlRaw, this.urlHash); - } catch (final MalformedURLException e) { - this.url = null; - } - this.urlRaw = null; - this.urlHash = null; - } - return this.url; - } - public String dc_title() { return this.dc_title; } - public String dc_creator() { return this.dc_creator; } - public String dc_publisher() { return this.dc_publisher; } - public String dc_subject() { return this.dc_subject; } - public double lat() { - if (this.latlon == null || this.latlon.isEmpty()) return 0.0d; - final int p = this.latlon.indexOf(','); - return p < 0 ? 0.0f : Double.parseDouble(this.latlon.substring(0, p)); - } - public double lon() { - if (this.latlon == null || this.latlon.isEmpty()) return 0.0d; - final int p = this.latlon.indexOf(','); - return p < 0 ? 0.0f : Double.parseDouble(this.latlon.substring(p + 1)); - } - } - -} +// URLMetadataRow.java +// (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 2006 on http://www.anomic.de +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package net.yacy.kelondro.data.meta; + +import java.net.MalformedURLException; +import java.text.ParseException; +import java.util.Date; +import java.util.List; +import java.util.Properties; +import java.util.regex.Pattern; + +import net.yacy.cora.date.GenericFormatter; +import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.UTF8; +import net.yacy.cora.lod.vocabulary.Tagging; +import net.yacy.cora.order.Base64Order; +import net.yacy.cora.order.Digest; +import net.yacy.cora.order.NaturalOrder; +import net.yacy.crawler.retrieval.Request; +import net.yacy.kelondro.data.word.WordReference; +import net.yacy.kelondro.data.word.WordReferenceRow; +import net.yacy.kelondro.data.word.WordReferenceVars; +import net.yacy.kelondro.index.Row; +import net.yacy.kelondro.io.CharBuffer; +import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.util.Bitfield; +import net.yacy.kelondro.util.ByteBuffer; +import net.yacy.kelondro.util.MapTools; +import net.yacy.kelondro.util.kelondroException; +import net.yacy.search.query.QueryParams; +import net.yacy.utils.crypt; + +public class URIMetadataRow { + + // this object stores attributes for URL entries + + public static final Row rowdef = new Row( + "String hash-12, " + // the url's hash + "String comp-360, " + // components: the url, description, author, tags and publisher + "Cardinal mod-4 {b256}, " + // last-modified from the httpd + "Cardinal load-4 {b256}, " + // time when the url was loaded + "Cardinal fresh-4 {b256}, " + // time until this url is fresh + "String referrer-12, " + // (one of) the url's referrer hash(es) + "byte[] md5-8, " + // the md5 of the url content (to identify changes) + "Cardinal size-6 {b256}, " + // size of file in bytes + "Cardinal wc-3 {b256}, " + // size of file by number of words; for video and audio: seconds + "byte[] dt-1, " + // doctype, taken from extension or any other heuristic + "Bitfield flags-4, " + // flags; any stuff (see Word-Entity definition) + "byte[] lang-2, " + // language + "Cardinal llocal-2 {b256}, " + // # of outlinks to same domain; for video and image: width + "Cardinal lother-2 {b256}, " + // # of outlinks to outside domain; for video and image: height + "Cardinal limage-2 {b256}, " + // # of embedded image links + "Cardinal laudio-2 {b256}, " + // # of embedded audio links; for audio: track number; for video: number of audio tracks + "Cardinal lvideo-2 {b256}, " + // # of embedded video links + "Cardinal lapp-2 {b256}", // # of embedded links to applications + Base64Order.enhancedCoder + ); + + /* =========================================================================== + * Constants to access the various columns of an URL entry + * =========================================================================== */ + private static final int col_hash = 0; // the url's hash + private static final int col_comp = 1; // components: the url, description, author and tags. As 5th element, an ETag is possible + private static final int col_mod = 2; // the modifed-date time from the server (servertime in row) + private static final int col_load = 3; // time when the url was loaded + private static final int col_fresh = 4; // time until this url is fresh + private static final int col_referrer = 5; // a referrer of the url (there may be several, but this is the one that was acually referring to this one) + private static final int col_md5 = 6; // the md5 of the url content (to identify changes) + private static final int col_size = 7; // size of file in bytes + private static final int col_wc = 8; // size of file by number of words; for video and audio: seconds + private static final int col_dt = 9; // doctype, taken from extension or any other heuristic + private static final int col_flags = 10; // flags; any stuff (see Word-Entity definition) + private static final int col_lang = 11; // language + private static final int col_llocal = 12; // # of outlinks to same domain; for video and image: width + private static final int col_lother = 13; // # of outlinks to outside domain; for video and image: height + private static final int col_limage = 14; // # of embedded image links + private static final int col_laudio = 15; // # of embedded audio links; for audio: track number; for video: number of audio tracks + private static final int col_lvideo = 16; // # of embedded video links + private static final int col_lapp = 17; // # of embedded links to applications + + private final Row.Entry entry; + private final String snippet; + private WordReference word; // this is only used if the url is transported via remote search requests + private Components comp; + + public URIMetadataRow(final Row.Entry entry, final WordReference searchedWord) { + this.entry = entry; + this.snippet = ""; + this.word = searchedWord; + this.comp = null; + } + + private URIMetadataRow(final Properties prop) throws kelondroException { + // generates an plasmaLURLEntry using the properties from the argument + // the property names must correspond to the one from toString + //System.out.println("DEBUG-ENTRY: prop=" + prop.toString()); + DigestURI url; + String urls = crypt.simpleDecode(prop.getProperty("url", "")); + try { + url = new DigestURI(urls); + } catch (final MalformedURLException e) { + throw new kelondroException("bad url: " + urls); + } + String descr = crypt.simpleDecode(prop.getProperty("descr", "")); if (descr == null) descr = ""; + String dc_creator = crypt.simpleDecode(prop.getProperty("author", "")); if (dc_creator == null) dc_creator = ""; + String tags = crypt.simpleDecode(prop.getProperty("tags", "")); if (tags == null) tags = ""; + tags = Tagging.cleanTagFromAutotagging(tags); + String dc_publisher = crypt.simpleDecode(prop.getProperty("publisher", "")); if (dc_publisher == null) dc_publisher = ""; + String lons = crypt.simpleDecode(prop.getProperty("lon", "0.0")); if (lons == null) lons = "0.0"; + String lats = crypt.simpleDecode(prop.getProperty("lat", "0.0")); if (lats == null) lats = "0.0"; + + this.entry = rowdef.newEntry(); + this.entry.setCol(col_hash, url.hash()); // FIXME potential null pointer access + this.entry.setCol(col_comp, encodeComp(url, descr, dc_creator, tags, dc_publisher, Float.parseFloat(lats), Float.parseFloat(lons))); + + // create new formatters to make concurrency possible + final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute); + + try { + encodeDate(col_mod, formatter.parse(prop.getProperty("mod", "20000101"))); + } catch (final ParseException e) { + encodeDate(col_mod, new Date()); + } + try { + encodeDate(col_load, formatter.parse(prop.getProperty("load", "20000101"))); + } catch (final ParseException e) { + encodeDate(col_load, new Date()); + } + try { + encodeDate(col_fresh, formatter.parse(prop.getProperty("fresh", "20000101"))); + } catch (final ParseException e) { + encodeDate(col_fresh, new Date()); + } + this.entry.setCol(col_referrer, UTF8.getBytes(prop.getProperty("referrer", ""))); + this.entry.setCol(col_md5, Digest.decodeHex(prop.getProperty("md5", ""))); + this.entry.setCol(col_size, Integer.parseInt(prop.getProperty("size", "0"))); + this.entry.setCol(col_wc, Integer.parseInt(prop.getProperty("wc", "0"))); + final String dt = prop.getProperty("dt", "t"); + this.entry.setCol(col_dt, dt.isEmpty() ? new byte[]{(byte) 't'} : new byte[]{(byte) dt.charAt(0)}); + final String flags = prop.getProperty("flags", "AAAAAA"); + this.entry.setCol(col_flags, (flags.length() > 6) ? QueryParams.empty_constraint.bytes() : (new Bitfield(4, flags)).bytes()); + this.entry.setCol(col_lang, UTF8.getBytes(prop.getProperty("lang", ""))); + this.entry.setCol(col_llocal, Integer.parseInt(prop.getProperty("llocal", "0"))); + this.entry.setCol(col_lother, Integer.parseInt(prop.getProperty("lother", "0"))); + this.entry.setCol(col_limage, Integer.parseInt(prop.getProperty("limage", "0"))); + this.entry.setCol(col_laudio, Integer.parseInt(prop.getProperty("laudio", "0"))); + this.entry.setCol(col_lvideo, Integer.parseInt(prop.getProperty("lvideo", "0"))); + this.entry.setCol(col_lapp, Integer.parseInt(prop.getProperty("lapp", "0"))); + this.snippet = crypt.simpleDecode(prop.getProperty("snippet", "")); + this.word = null; + if (prop.containsKey("wi")) { + this.word = new WordReferenceVars(new WordReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", "")))); + } + this.comp = null; + } + + public static URIMetadataRow importEntry(final String propStr) { + if (propStr == null || propStr.isEmpty() || propStr.charAt(0) != '{' || !propStr.endsWith("}")) { + Log.logSevere("URIMetadataRow", "importEntry: propStr is not proper: " + propStr); + return null; + } + try { + return new URIMetadataRow(MapTools.s2p(propStr.substring(1, propStr.length() - 1))); + } catch (final kelondroException e) { + // wrong format + Log.logSevere("URIMetadataRow", e.getMessage()); + return null; + } + } + + private void encodeDate(final int col, final Date d) { + // calculates the number of days since 1.1.1970 and returns this as 4-byte array + // 86400000 is the number of milliseconds in one day + long time = d.getTime(); + long now = System.currentTimeMillis(); + this.entry.setCol(col, NaturalOrder.encodeLong((time > now ? now : time) / 86400000L, 4)); + } + + private Date decodeDate(final int col) { + final long t = this.entry.getColLong(col); + /*if (t < 14600) */return new Date(86400000L * t); // time was stored as number of days since epoch + /* + if (t < 350400) return new Date(3600000L * t); // hours since epoch + if (t < 21024000) return new Date(60000L * t); // minutes since epoch + */ + } + + private static byte[] encodeComp( + final DigestURI url, + final String dc_title, + final String dc_creator, + final String dc_subject, + final String dc_publisher, + final double lat, + final double lon) { + final CharBuffer s = new CharBuffer(3600, 360); + s.append(url.toNormalform(true)).appendLF(); + s.append(dc_title).appendLF(); + if (dc_creator.length() > 80) s.append(dc_creator, 0, 80); else s.append(dc_creator); + s.appendLF(); + if (dc_subject.length() > 120) s.append(dc_subject, 0, 120); else s.append(dc_subject); + s.appendLF(); + if (dc_publisher.length() > 80) s.append(dc_publisher, 0, 80); else s.append(dc_publisher); + s.appendLF(); + if (lon == 0.0f && lat == 0.0f) s.appendLF(); else s.append(Double.toString(lat)).append(',').append(Double.toString(lon)).appendLF(); + String s0 = s.toString(); + s.close(); + return UTF8.getBytes(s0); + } + + public byte[] hash() { + // return a url-hash, based on the md5 algorithm + // the result is a String of 12 bytes within a 72-bit space + // (each byte has an 6-bit range) + // that should be enough for all web pages on the world + return this.entry.getPrimaryKeyBytes(); + } + + private String hostHash = null; + public String hosthash() { + if (this.hostHash != null) return this.hostHash; + this.hostHash = ASCII.String(this.entry.getPrimaryKeyBytes(), 6, 6); + return this.hostHash; + } + + public boolean matches(final Pattern matcher) { + return this.metadata().matches(matcher); + } + + public DigestURI url() { + return this.metadata().url(); + } + + public String dc_title() { + return this.metadata().dc_title(); + } + + public String dc_creator() { + return this.metadata().dc_creator(); + } + + public String dc_publisher() { + return this.metadata().dc_publisher(); + } + + public String dc_subject() { + return this.metadata().dc_subject(); + } + + public double lat() { + return this.metadata().lat(); + } + + public double lon() { + return this.metadata().lon(); + } + + private Components metadata() { + // avoid double computation of metadata elements + if (this.comp != null) return this.comp; + // parse elements from comp field; + final byte[] c = this.entry.getColBytes(col_comp, true); + final List cl = ByteBuffer.split(c, (byte) 10); + this.comp = new Components( + (cl.size() > 0) ? UTF8.String(cl.get(0)) : "", + hash(), + (cl.size() > 1) ? UTF8.String(cl.get(1)) : "", + (cl.size() > 2) ? UTF8.String(cl.get(2)) : "", + (cl.size() > 3) ? UTF8.String(cl.get(3)) : "", + (cl.size() > 4) ? UTF8.String(cl.get(4)) : "", + (cl.size() > 5) ? UTF8.String(cl.get(5)) : ""); + return this.comp; + } + + public Date moddate() { + return decodeDate(col_mod); + } + + public Date loaddate() { + return decodeDate(col_load); + } + + public Date freshdate() { + return decodeDate(col_fresh); + } + + public byte[] referrerHash() { + // return the creator's hash or null if there is none + // FIXME: There seem to be some malformed entries in the databasees like "null\0\0\0\0\0\0\0\0" + final byte[] r = this.entry.getColBytes(col_referrer, true); + if (r != null) { + int i = r.length; + while (i > 0) { + if (r[--i] == 0) return null; + } + } + return r; + } + + public String md5() { + // returns the md5 in hex representation + return Digest.encodeHex(this.entry.getColBytes(col_md5, true)); + } + + public char doctype() { + return (char) this.entry.getColByte(col_dt); + } + + public byte[] language() { + byte[] b = this.entry.getColBytes(col_lang, true); + if ((b == null || b[0] == (byte)'[') && this.metadata().url != null) { + String lang = this.metadata().url.language(); // calculate lang by TLD + this.entry.setCol(col_lang, UTF8.getBytes(lang)); //remember calculation + return ASCII.getBytes(lang); + } + return b; + } + + public int size() { + return (int) this.entry.getColLong(col_size); + } + + public Bitfield flags() { + return new Bitfield(this.entry.getColBytes(col_flags, true)); + } + + public int wordCount() { + return (int) this.entry.getColLong(col_wc); + } + + public int llocal() { + return (int) this.entry.getColLong(col_llocal); + } + + public int lother() { + return (int) this.entry.getColLong(col_lother); + } + + public int limage() { + return (int) this.entry.getColLong(col_limage); + } + + public int laudio() { + return (int) this.entry.getColLong(col_laudio); + } + + public int lvideo() { + return (int) this.entry.getColLong(col_lvideo); + } + + public int lapp() { + return (int) this.entry.getColLong(col_lapp); + } + + public String snippet() { + // the snippet may appear here if the url was transported in a remote search + // it will not be saved anywhere, but can only be requested here + return this.snippet; + } + + + + public WordReference word() { + return this.word; + } + + private static StringBuilder corePropList(URIMetadataRow md) { + // generate a parseable string; this is a simple property-list + final StringBuilder s = new StringBuilder(300); + + // create new formatters to make concurrency possible + final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute); + + try { + s.append("hash=").append(ASCII.String(md.hash())); + s.append(",url=").append(crypt.simpleEncode(md.url().toNormalform(true))); + s.append(",descr=").append(crypt.simpleEncode(md.dc_title())); + s.append(",author=").append(crypt.simpleEncode(md.dc_creator())); + s.append(",tags=").append(crypt.simpleEncode(Tagging.cleanTagFromAutotagging(md.dc_subject()))); + s.append(",publisher=").append(crypt.simpleEncode(md.dc_publisher())); + s.append(",lat=").append(md.lat()); + s.append(",lon=").append(md.lon()); + s.append(",mod=").append(formatter.format(md.moddate())); + s.append(",load=").append(formatter.format(md.loaddate())); + s.append(",fresh=").append(formatter.format(md.freshdate())); + s.append(",referrer=").append(md.referrerHash() == null ? "" : ASCII.String(md.referrerHash())); + s.append(",md5=").append(md.md5()); + s.append(",size=").append(md.size()); + s.append(",wc=").append(md.wordCount()); + s.append(",dt=").append(md.doctype()); + s.append(",flags=").append(md.flags().exportB64()); + s.append(",lang=").append(md.language() == null ? "EN" : UTF8.String(md.language())); + s.append(",llocal=").append(md.llocal()); + s.append(",lother=").append(md.lother()); + s.append(",limage=").append(md.limage()); + s.append(",laudio=").append(md.laudio()); + s.append(",lvideo=").append(md.lvideo()); + s.append(",lapp=").append(md.lapp()); + if (md.word() != null) { + // append also word properties + final String wprop = md.word().toPropertyForm(); + s.append(",wi=").append(Base64Order.enhancedCoder.encodeString(wprop)); + } + return s; + } catch (final Throwable e) { + Log.logException(e); + return null; + } + } + + public Request toBalancerEntry(final String initiatorHash) { + return new Request( + ASCII.getBytes(initiatorHash), + metadata().url(), + referrerHash(), + metadata().dc_title(), + moddate(), + null, + 0, + 0, + 0, + 0); + } + + /** + * @return the object as String.
+ * This e.g. looks like this: + *
{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}
+ */ + @Override + public String toString() { + final StringBuilder core = corePropList(this); + if (core == null) return null; + + core.insert(0, "{"); + core.append("}"); + + return core.toString(); + //return "{" + core + "}"; + } + + private class Components { + private DigestURI url; + private String urlRaw; + private byte[] urlHash; + private final String dc_title, dc_creator, dc_subject, dc_publisher; + private final String latlon; // a comma-separated tuple as "," where the coordinates are given as WGS84 spatial coordinates in decimal degrees + + public Components( + final String urlRaw, + final byte[] urlhash, + final String title, + final String author, + final String tags, + final String publisher, + final String latlon) { + this.url = null; + this.urlRaw = urlRaw; + this.urlHash = urlhash; + this.dc_title = title; + this.dc_creator = author; + this.dc_subject = tags; + this.dc_publisher = publisher; + this.latlon = latlon; + } + public boolean matches(final Pattern matcher) { + if (this.urlRaw != null) return matcher.matcher(this.urlRaw.toLowerCase()).matches(); + if (this.url != null) return matcher.matcher(this.url.toNormalform(true).toLowerCase()).matches(); + return false; + } + public DigestURI url() { + if (this.url == null) { + try { + this.url = new DigestURI(this.urlRaw, this.urlHash); + } catch (final MalformedURLException e) { + this.url = null; + } + this.urlRaw = null; + this.urlHash = null; + } + return this.url; + } + public String dc_title() { return this.dc_title; } + public String dc_creator() { return this.dc_creator; } + public String dc_publisher() { return this.dc_publisher; } + public String dc_subject() { return this.dc_subject; } + public double lat() { + if (this.latlon == null || this.latlon.isEmpty()) return 0.0d; + final int p = this.latlon.indexOf(','); + if (p < 0) { + return 0.0d; + } else { // old index entries might contain text "NaN,NaN" + return this.latlon.charAt(0) > '9' ? 0.0d : Double.parseDouble(this.latlon.substring(0, p)); + } + } + public double lon() { + if (this.latlon == null || this.latlon.isEmpty()) return 0.0d; + final int p = this.latlon.indexOf(','); + if (p < 0) { + return 0.0d; + } else { // old index entries might contain text "NaN,NaN" + return this.latlon.charAt(p + 1) > '9' ? 0.0d : Double.parseDouble(this.latlon.substring(p + 1)); + } + } + } + +}