Refactoring and redesign of data architecture to make URIMetadataRow

superfluous. The goal is to make a Solr document the core of YaCy
documents, which would allow many conversions to be removed. On the
way to this goal, the equivalence of URIMetadataRow and URIMetadataNode
had to be removed to expose the remaining usages of the old
URIMetadataRow data structure.
This refactoring already removes unnecessary conversions and should
reduce memory usage during indexing.
pull/1/head
Michael Peter Christen 13 years ago
parent 7f71dfab03
commit ccc3760a47

@ -45,7 +45,6 @@ import net.yacy.cora.util.SpaceExceededException;
import net.yacy.data.ListManager;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
@ -270,8 +269,8 @@ public class IndexControlRWIs_p {
index = segment.termIndex().get(keyhash, null);
// built urlCache
final Iterator<WordReference> urlIter = index.entries();
final TreeMap<byte[], URIMetadataRow> knownURLs =
new TreeMap<byte[], URIMetadataRow>(Base64Order.enhancedCoder);
final TreeMap<byte[], URIMetadataNode> knownURLs =
new TreeMap<byte[], URIMetadataNode>(Base64Order.enhancedCoder);
final HandleSet unknownURLEntries =
new RowHandleSet(
WordReferenceRow.urlEntryRow.primaryKeyLength,
@ -290,7 +289,7 @@ public class IndexControlRWIs_p {
}
urlIter.remove();
} else {
knownURLs.put(iEntry.urlhash(), lurl.toRow());
knownURLs.put(iEntry.urlhash(), lurl);
}
}
@ -376,7 +375,7 @@ public class IndexControlRWIs_p {
} catch ( final SpaceExceededException e ) {
Log.logException(e);
}
final URIMetadata e = segment.fulltext().getMetadata(b);
final URIMetadataNode e = segment.fulltext().getMetadata(b);
segment.fulltext().remove(b);
if ( e != null ) {
url = e.url();
@ -411,7 +410,7 @@ public class IndexControlRWIs_p {
} catch ( final SpaceExceededException e ) {
Log.logException(e);
}
final URIMetadata e = segment.fulltext().getMetadata(b);
final URIMetadataNode e = segment.fulltext().getMetadata(b);
segment.fulltext().remove(b);
if ( e != null ) {
url = e.url();

@ -41,7 +41,6 @@ import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.ResultURLs;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
@ -206,7 +205,7 @@ public class IndexControlURLs_p {
final DigestURI url = new DigestURI(urlstring);
urlhash = ASCII.String(url.hash());
prop.put("urlhash", urlhash);
final URIMetadata entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash));
final URIMetadataNode entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash));
if (entry == null) {
prop.putHTML("result", "No Entry for URL " + url.toNormalform(true));
prop.putHTML("urlstring", urlstring);
@ -222,7 +221,7 @@ public class IndexControlURLs_p {
}
if (post.containsKey("urlhashsearch")) {
final URIMetadata entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash));
final URIMetadataNode entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash));
if (entry == null) {
prop.putHTML("result", "No Entry for URL hash " + urlhash);
} else {
@ -236,7 +235,7 @@ public class IndexControlURLs_p {
if (post.containsKey("urlhashsimilar")) {
final Iterator<URIMetadataNode> entryIt = new RotateIterator<URIMetadataNode>(segment.fulltext().entries(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), (int) segment.RWICount());
final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
URIMetadata entry;
URIMetadataNode entry;
int i = 0, rows = 0, cols = 0;
prop.put("urlhashsimilar", "1");
while (entryIt.hasNext() && i < 256) {
@ -341,7 +340,7 @@ public class IndexControlURLs_p {
return prop;
}
private static serverObjects genUrlProfile(final Segment segment, final URIMetadata entry, final String urlhash) {
private static serverObjects genUrlProfile(final Segment segment, final URIMetadataNode entry, final String urlhash) {
final serverObjects prop = new serverObjects();
if (entry == null) {
prop.put("genUrlProfile", "1");

@ -36,7 +36,6 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.RequestHeader.FileType;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
@ -48,7 +47,6 @@ import net.yacy.server.serverSwitch;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.RDFNode;
public class yacydoc {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
@ -106,7 +104,7 @@ public class yacydoc {
if (entry.url() == null) {
return prop;
}
final URIMetadata le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.fulltext().getMetadata(entry.referrerHash());
final URIMetadataNode le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.fulltext().getMetadata(entry.referrerHash());
prop.putXML("dc_title", entry.dc_title());
prop.putXML("dc_creator", entry.dc_creator());

@ -34,7 +34,6 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.data.ResultURLs;
import net.yacy.crawler.data.ResultURLs.EventOrigin;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.peers.Protocol;
@ -116,7 +115,7 @@ public final class crawlReceipt {
}
// generating a new loaded URL entry
final URIMetadata entry = URIMetadataRow.importEntry(propStr);
final URIMetadataRow entry = URIMetadataRow.importEntry(propStr);
if (entry == null) {
if (log.isWarning()) log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) from peer " + iam + "\n\tURL properties: "+ propStr);
prop.put("delay", "3600");
@ -148,7 +147,7 @@ public final class crawlReceipt {
if ("fill".equals(result)) try {
// put new entry into database
sb.index.fulltext().putMetadata(entry);
ResultURLs.stack(entry, youare.getBytes(), iam.getBytes(), EventOrigin.REMOTE_RECEIPTS);
ResultURLs.stack(ASCII.String(entry.url().hash()), entry.url().getHost(), youare.getBytes(), iam.getBytes(), EventOrigin.REMOTE_RECEIPTS);
sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work has been done
if (log.isInfo()) log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + ASCII.String(entry.hash()) + ":" + entry.url().toNormalform(false));

@ -30,6 +30,7 @@ import java.io.IOException;
import java.text.ParseException;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.data.ResultURLs;
@ -149,7 +150,7 @@ public final class transferURL {
if (Network.log.isFine()) Network.log.logFine("Accepting URL " + i + "/" + urlc + " from peer " + otherPeerName + ": " + lEntry.url().toNormalform(true));
try {
sb.index.fulltext().putMetadata(lEntry);
ResultURLs.stack(lEntry, iam.getBytes(), iam.getBytes(), EventOrigin.DHT_TRANSFER);
ResultURLs.stack(ASCII.String(lEntry.url().hash()), lEntry.url().getHost(), iam.getBytes(), iam.getBytes(), EventOrigin.DHT_TRANSFER);
if (Network.log.isFine()) Network.log.logFine("transferURL: received URL '" + lEntry.url().toNormalform(false) + "' from peer " + otherPeerName);
received++;
} catch (final IOException e) {

@ -1976,16 +1976,18 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
return false;
}
public long length() throws IOException {
if (isFile()) return getFSFile().length();
public long length() {
if (isFile()) try {
return getFSFile().length();
} catch (final Throwable e) {
return -1;
}
if (isSMB()) try {
return TimeoutRequest.length(getSmbFile(), SMB_TIMEOUT);
} catch (final SmbException e) {
throw new IOException("SMB.length SmbException (" + e.getMessage() + ") for " + toString());
} catch (final MalformedURLException e) {
throw new IOException("SMB.length MalformedURLException (" + e.getMessage() + ") for " + toString());
} catch (final Throwable e) {
return -1;
}
return 0;
return -1;
}
public long lastModified() throws IOException {

@ -25,22 +25,13 @@
package net.yacy.crawler.data;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.util.ReverseMapIterator;
public final class ResultURLs {
@ -97,17 +88,18 @@ public final class ResultURLs {
}
public static void stack(
final URIMetadata urlEntry,
final String urlhash,
final String hostname,
final byte[] initiatorHash,
final byte[] executorHash,
final EventOrigin stackType) {
// assert initiatorHash != null; // null == proxy !
assert executorHash != null;
if (urlEntry == null) { return; }
if (urlhash == null || hostname == null) { return; }
try {
final Map<String, InitExecEntry> resultStack = getStack(stackType);
if (resultStack != null) {
resultStack.put(ASCII.String(urlEntry.hash()), new InitExecEntry(initiatorHash, executorHash));
resultStack.put(urlhash, new InitExecEntry(initiatorHash, executorHash));
}
} catch (final Exception ex) {
System.out.println("INTERNAL ERROR in newEntry/2: " + ex.toString());
@ -116,7 +108,7 @@ public final class ResultURLs {
try {
final ScoreMap<String> domains = getDomains(stackType);
if (domains != null) {
domains.inc(urlEntry.url().getHost());
domains.inc(hostname);
}
} catch (final Exception ex) {
System.out.println("INTERNAL ERROR in newEntry/3: " + ex.toString());
@ -216,23 +208,4 @@ public final class ResultURLs {
return true;
}
/**
 * Test and benchmark entry point: stacks one metadata entry for the
 * YaCy homepage on the LOCAL_CRAWLING stack and prints the stack size.
 *
 * @param args command line arguments (not read)
 */
public static void main(final String[] args) {
    try {
        final DigestURI homepage = new DigestURI("http", "www.yacy.net", 80, "/");
        final URIMetadata sample = new URIMetadataRow(
                homepage, "YaCy Homepage", "", "", "", 0.0d, 0.0d,
                new Date(), new Date(), new Date(),
                "", new byte[] {}, 123, 42, '?', new Bitfield(), UTF8.getBytes("de"),
                0, 0, 0, 0, 0, 0, new String[0]);
        final EventOrigin origin = EventOrigin.LOCAL_CRAWLING;

        System.out.println("valid test:\n=======");

        // add the sample entry to the stack
        stack(sample, sample.hash(), homepage.hash(), origin);

        // report how many entries the stack holds now
        System.out.println("size of stack:\t" + getStackSize(origin));
    } catch (final MalformedURLException e) {
        Log.logException(e);
    }
}
}

@ -293,7 +293,7 @@ dc_rights
return this.source.toNormalform(true);
}
public MultiProtocolURI dc_source() {
public DigestURI dc_source() {
return this.source;
}

@ -197,6 +197,10 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
return this.hash;
}
/**
 * The host hash of this URI, taken from the URL hash.
 *
 * @return the part of {@link #hash()} selected by the arguments (6, 6),
 *         converted to a String
 *         (NOTE(review): presumably offset 6, length 6 — confirm against ASCII.String)
 */
public String hosthash() {
    final byte[] urlhash = this.hash();
    return ASCII.String(urlhash, 6, 6);
}
/**
* calculated YaCy-Hash of this URI
*

@ -1,132 +0,0 @@
/**
* URIMetadata
* Copyright 2012 by Michael Peter Christen
* First released 3.4.2012 at http://yacy.net
*
* This file is part of YaCy Content Integration
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.kelondro.data.meta;
import java.util.Date;
import java.util.regex.Pattern;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.util.Bitfield;
/**
 * Read access to the metadata record that is stored for one URL.
 */
public interface URIMetadata {
/**
 * The hash of a URIReference is a unique key for the stored URL.
 * It is in fact equal to url().hash()
 * @return the hash of the stored url
 */
public byte[] hash();
/**
 * The second half of a uri hash is the host hash.
 * @return the host hash as a String
 */
public String hosthash();
/**
 * The modification date of the URIReference is given if
 * the record was created first and is defined with the
 * creation date. If the record is modified later, the date shall change.
 * @return the modification date of this record
 */
public Date moddate();
/**
 * The DigestURI is the payload of the URIReference
 * @return the url as DigestURI with assigned URL hash according to the record hash
 */
public DigestURI url();
/**
 * check if the url matches against a given matcher
 * @param matcher the pattern that is applied to the url
 * @return true if the url() matches
 */
public boolean matches(final Pattern matcher);
/**
 * produce a visible representation of the record
 * @return a string for the url()
 */
@Override
public String toString();
// Descriptive text attributes of the record.
// NOTE(review): the dc_ prefix presumably refers to Dublin Core element names — confirm.
public String dc_title();
public String dc_creator();
public String dc_publisher();
public String dc_subject();
// Geographic position of the record.
// NOTE(review): presumably decimal degrees, with 0.0 for "unknown" — confirm against the implementations.
public double lat();
public double lon();
/** @return the ranking value attached to this record */
public long ranking();
/** @return the load date of this record */
public Date loaddate();
/**
 * @return the fresh date of this record
 *         (NOTE(review): presumably the date until which the record counts as fresh — confirm)
 */
public Date freshdate();
/** @return the md5 of the record as a String (NOTE(review): presumably hex encoded — confirm) */
public String md5();
/** @return the document type as a single character code */
public char doctype();
/** @return the language of the document as bytes (NOTE(review): presumably a language code such as "en" — confirm) */
public byte[] language();
/** @return the size value of the record (NOTE(review): unit presumably bytes — confirm) */
public int size();
/** @return the flags of the record as a Bitfield */
public Bitfield flags();
/** @return the number of words counted for the document */
public int wordCount();
// Link counters of the document.
// NOTE(review): presumably llocal = links to the same host, lother = links to other hosts,
// limage/laudio/lvideo/lapp = links to image/audio/video/application resources — confirm.
public int llocal();
public int lother();
public int limage();
public int laudio();
public int lvideo();
public int lapp();
/** @return the snippet text attached to this record */
public String snippet();
/** @return the names of the collections this record is assigned to */
public String[] collections();
/**
 * @return the word reference attached to this record
 *         (NOTE(review): presumably only set for remote search results and null otherwise — confirm)
 */
public WordReference word();
/**
 * Compare the age of this record with another record.
 * @param other the record to compare with
 * @return true if this record is older than the other record
 *         (NOTE(review): the exact date fields compared are defined by the implementation — confirm)
 */
public boolean isOlder(final URIMetadata other);
/**
 * Produce a string representation of the record that includes a snippet.
 * @param snippet the snippet text to include
 * @return the string representation of the record
 */
public String toString(final String snippet);
/**
 * @return the hash of the referrer of this url
 *         (NOTE(review): presumably null if there is no referrer — confirm)
 */
public byte[] referrerHash();
/**
 * Convert this record into a crawl Request.
 * @param initiatorHash the hash of the initiator to put into the request
 * @return a Request object for the url of this record
 */
public Request toBalancerEntry(final String initiatorHash);
}

@ -35,7 +35,6 @@ import net.yacy.cora.federate.solr.SolrType;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.order.Base64Order;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.word.WordReference;
@ -43,7 +42,9 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.utils.crypt;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
/**
@ -51,7 +52,7 @@ import org.apache.solr.common.SolrDocument;
* The purpose of this object is the migration from the old metadata structure to solr document.
* Future implementations should try to replace URIMetadata objects completely by SolrDocument objects
*/
public class URIMetadataNode implements URIMetadata {
public class URIMetadataNode {
private byte[] hash = null;
private String urlRaw = null, keywords = null;
@ -64,6 +65,10 @@ public class URIMetadataNode implements URIMetadata {
private String snippet = null;
private WordReference word = null; // this is only used if the url is transported via remote search requests
public URIMetadataNode(final SolrInputDocument doc) {
this(ClientUtils.toSolrDocument(doc));
}
public URIMetadataNode(final SolrDocument doc) {
this.doc = doc;
this.snippet = "";
@ -79,123 +84,58 @@ public class URIMetadataNode implements URIMetadata {
}
}
public URIMetadataNode(final SolrDocument doc, final WordReference searchedWord, final long ranking) {
this(doc);
public URIMetadataNode(final SolrInputDocument doc, final WordReference searchedWord, final long ranking) {
this(ClientUtils.toSolrDocument(doc));
this.word = searchedWord;
this.ranking = ranking;
}
public URIMetadataRow toRow() {
return URIMetadataRow.importEntry(this.toString());
public URIMetadataNode(final SolrDocument doc, final WordReference searchedWord, final long ranking) {
this(doc);
this.word = searchedWord;
this.ranking = ranking;
}
public SolrDocument getDocument() {
return this.doc;
}
/**
 * Read a single-valued integer field from the wrapped Solr document.
 * The stored value is not cast blindly: a Long value is accepted as
 * well and narrowed to int, so that no ClassCastException is thrown
 * when the document delivers the number as Long.
 *
 * @param field the schema field to read; must be single-valued and of type integer
 * @return the field value; 0 if the field is missing or holds neither an Integer nor a Long
 */
private int getInt(YaCySchema field) {
    assert !field.isMultiValued();
    assert field.getType() == SolrType.integer;
    final Object x = this.doc.getFieldValue(field.name());
    if (x instanceof Integer) return ((Integer) x).intValue();
    if (x instanceof Long) return ((Long) x).intValue();
    // covers null as well as any unexpected value type
    return 0;
}
/**
 * Read a single-valued date field from the wrapped Solr document.
 *
 * @param field the schema field to read; must be single-valued and of type date
 * @return the stored date; the epoch date (new Date(0)) if the field is missing;
 *         the current time if the stored date lies in the future
 */
private Date getDate(YaCySchema field) {
    assert !field.isMultiValued();
    assert field.getType() == SolrType.date;
    final Date stored = (Date) this.doc.getFieldValue(field.name());
    if (stored == null) {
        return new Date(0);
    }
    final Date now = new Date();
    if (stored.after(now)) {
        // never report a date from the future
        return now;
    }
    return stored;
}
/**
 * Read a single-valued text field from the wrapped Solr document.
 *
 * @param field the schema field to read; must be single-valued and of a string/text type
 * @return the field value; the first element if the value is delivered as an ArrayList;
 *         the empty string if the field is missing or the list is empty
 */
private String getString(YaCySchema field) {
    assert !field.isMultiValued();
    assert field.getType() == SolrType.string || field.getType() == SolrType.text_general || field.getType() == SolrType.text_en_splitting_tight;
    final Object value = this.doc.getFieldValue(field.name());
    if (value == null) {
        return "";
    }
    if (!(value instanceof ArrayList)) {
        return (String) value;
    }
    // the value arrived as a list even though the field is declared single-valued
    @SuppressWarnings("unchecked")
    final ArrayList<String> values = (ArrayList<String>) value;
    return values.isEmpty() ? "" : values.get(0);
}
/**
 * Read a multi-valued text field from the wrapped Solr document.
 *
 * @param field the schema field to read; must be multi-valued and of type string or text_general
 * @return the stored list itself if the value is an ArrayList; otherwise a new list
 *         that is empty for a missing field or holds the single stored value
 */
@SuppressWarnings("unchecked")
private ArrayList<String> getStringList(YaCySchema field) {
    assert field.isMultiValued();
    assert field.getType() == SolrType.string || field.getType() == SolrType.text_general;
    final Object value = this.doc.getFieldValue(field.name());
    if (value instanceof ArrayList) {
        return (ArrayList<String>) value;
    }
    // null gives an empty list, a single value gives a one-element list
    final ArrayList<String> result = new ArrayList<String>(value == null ? 0 : 1);
    if (value != null) {
        result.add((String) value);
    }
    return result;
}
/**
 * Read a multi-valued integer field from the wrapped Solr document.
 *
 * @param field the schema field to read; must be multi-valued and of type integer
 * @return the stored list itself if the value is an ArrayList; otherwise a new list
 *         that is empty for a missing field or holds the single stored value
 */
@SuppressWarnings("unchecked")
private ArrayList<Integer> getIntList(YaCySchema field) {
    assert field.isMultiValued();
    assert field.getType() == SolrType.integer;
    final Object value = this.doc.getFieldValue(field.name());
    if (value instanceof ArrayList) {
        return (ArrayList<Integer>) value;
    }
    // null gives an empty list, a single value gives a one-element list
    final ArrayList<Integer> result = new ArrayList<Integer>(value == null ? 0 : 1);
    if (value != null) {
        result.add((Integer) value);
    }
    return result;
}
@Override
public byte[] hash() {
return this.hash;
}
@Override
/**
 * The host hash of this record.
 *
 * @return the value of the host_id_s field of the document if present,
 *         otherwise the host part computed from the url hash of this record
 */
public String hosthash() {
    final String stored = (String) this.doc.getFieldValue(YaCySchema.host_id_s.name());
    return stored != null ? stored : ASCII.String(this.hash, 6, 6);
}
@Override
public Date moddate() {
return getDate(YaCySchema.last_modified);
}
@Override
public DigestURI url() {
return this.url;
}
@Override
/**
 * Check if the lower-cased raw url matches a given pattern.
 * The url is lower-cased with the locale-independent ROOT locale so that
 * the result does not depend on the default locale of the platform
 * (the default-locale variant maps 'I' to a dotless 'i' in Turkish locales).
 *
 * @param matcher the pattern that must match the complete lower-cased url
 * @return true if the whole url matches the pattern
 */
public boolean matches(Pattern matcher) {
    final String lowerUrl = this.urlRaw.toLowerCase(java.util.Locale.ROOT);
    return matcher.matcher(lowerUrl).matches();
}
@Override
/**
 * The title of the document.
 *
 * @return the first entry of the title field, or the empty string if there is none
 */
public String dc_title() {
    final ArrayList<String> titles = getStringList(YaCySchema.title);
    final boolean none = titles == null || titles.size() == 0;
    return none ? "" : titles.get(0);
}
@Override
public String dc_creator() {
return getString(YaCySchema.author);
}
@Override
public String dc_publisher() {
return getString(YaCySchema.publisher_t);
}
@Override
public String dc_subject() {
if (this.keywords == null) {
this.keywords = getString(YaCySchema.keywords);
@ -203,7 +143,6 @@ public class URIMetadataNode implements URIMetadata {
return this.keywords;
}
@Override
public double lat() {
if (this.lat == Double.NaN) {
this.lon = 0.0d;
@ -220,60 +159,49 @@ public class URIMetadataNode implements URIMetadata {
return this.lat;
}
@Override
/**
 * The longitude of the document location.
 * If the cached longitude is NaN, lat() is called first because lat()
 * assigns the longitude as part of its own evaluation.
 * The NaN test uses Double.isNaN: a comparison with == Double.NaN is
 * always false and would never trigger that evaluation.
 * NOTE(review): assumes NaN is the "not yet computed" marker of the lon field — confirm at the field declaration.
 *
 * @return the cached longitude value
 */
public double lon() {
    if (Double.isNaN(this.lon)) lat();
    return this.lon;
}
@Override
public long ranking() {
return this.ranking;
}
@Override
public Date loaddate() {
return getDate(YaCySchema.load_date_dt);
}
@Override
public Date freshdate() {
return getDate(YaCySchema.fresh_date_dt);
}
@Override
public String md5() {
return getString(YaCySchema.md5_s);
}
@Override
/**
 * The document type code of this record.
 *
 * @return the type derived from the first content_type entry of the document,
 *         or the type derived from the url if no content type is stored
 */
public char doctype() {
    final ArrayList<String> types = getStringList(YaCySchema.content_type);
    if (types != null && types.size() > 0) {
        return Response.docType(types.get(0));
    }
    // no content type stored: derive the type from the url
    return Response.docType(url());
}
@Override
/**
 * The language of the document.
 *
 * @return the bytes of the language_s field; the bytes of "en" if that field is empty
 */
public byte[] language() {
    final String code = getString(YaCySchema.language_s);
    final boolean missing = code == null || code.length() == 0;
    return missing ? ASCII.getBytes("en") : UTF8.getBytes(code);
}
@Override
/**
 * The hash of the referrer of this url.
 *
 * @return the bytes of the first referrer_id_txt entry, or null if no referrer is stored
 */
public byte[] referrerHash() {
    final ArrayList<String> ids = getStringList(YaCySchema.referrer_id_txt);
    if (ids != null && ids.size() > 0) {
        return ASCII.getBytes(ids.get(0));
    }
    return null;
}
@Override
public int size() {
return getInt(YaCySchema.size_i);
}
@Override
public Bitfield flags() {
if (flags == null) {
this.flags = new Bitfield();
@ -287,22 +215,18 @@ public class URIMetadataNode implements URIMetadata {
return this.flags;
}
@Override
public int wordCount() {
return getInt(YaCySchema.wordcount_i);
}
@Override
public int llocal() {
return getInt(YaCySchema.inboundlinkscount_i);
}
@Override
public int lother() {
return getInt(YaCySchema.outboundlinkscount_i);
}
@Override
public int limage() {
if (this.imagec == -1) {
this.imagec = getInt(YaCySchema.imagescount_i);
@ -310,7 +234,6 @@ public class URIMetadataNode implements URIMetadata {
return this.imagec;
}
@Override
public int laudio() {
if (this.audioc == -1) {
this.audioc = getInt(YaCySchema.audiolinkscount_i);
@ -318,7 +241,6 @@ public class URIMetadataNode implements URIMetadata {
return this.audioc;
}
@Override
public int lvideo() {
if (this.videoc == -1) {
this.videoc = getInt(YaCySchema.videolinkscount_i);
@ -326,7 +248,6 @@ public class URIMetadataNode implements URIMetadata {
return this.videoc;
}
@Override
public int lapp() {
if (this.appc == -1) {
this.appc = getInt(YaCySchema.videolinkscount_i);
@ -348,24 +269,20 @@ public class URIMetadataNode implements URIMetadata {
return getInt(YaCySchema.url_chars_i);
}
@Override
public String snippet() {
return this.snippet;
}
@Override
/**
 * The collections this record is assigned to.
 *
 * @return the entries of the collection_sxt field as an array (empty if there are none)
 */
public String[] collections() {
    final ArrayList<String> names = getStringList(YaCySchema.collection_sxt);
    final String[] result = new String[names.size()];
    return names.toArray(result);
}
@Override
public WordReference word() {
return this.word;
}
@Override
public boolean isOlder(URIMetadata other) {
public boolean isOlder(URIMetadataRow other) {
if (other == null) return false;
final Date tmoddate = moddate();
final Date omoddate = other.moddate();
@ -379,7 +296,7 @@ public class URIMetadataNode implements URIMetadata {
return false;
}
protected static StringBuilder corePropList(URIMetadata md) {
private static StringBuilder corePropList(URIMetadataNode md) {
// generate a parseable string; this is a simple property-list
final StringBuilder s = new StringBuilder(300);
@ -427,7 +344,6 @@ public class URIMetadataNode implements URIMetadata {
* the toString format must be completely identical to URIMetadataRow because that is used
* to transport the data over p2p connections.
*/
@Override
public String toString(String snippet) {
// add information needed for remote transport
final StringBuilder core = corePropList(this);
@ -457,19 +373,65 @@ public class URIMetadataNode implements URIMetadata {
core.append('}');
return core.toString();
}
/**
 * Read a single-valued integer field from the wrapped Solr document.
 *
 * @param field the schema field to read; must be single-valued and of type integer
 * @return the value if it is stored as Integer, the value narrowed to int if it is
 *         stored as Long, and 0 if the field is missing or has any other type
 */
private int getInt(YaCySchema field) {
    assert !field.isMultiValued();
    assert field.getType() == SolrType.integer;
    final Object value = this.doc.getFieldValue(field.name());
    if (value instanceof Integer) {
        return ((Integer) value).intValue();
    }
    if (value instanceof Long) {
        return ((Long) value).intValue();
    }
    // null or an unexpected type
    return 0;
}
@Override
public Request toBalancerEntry(final String initiatorHash) {
return new Request(
ASCII.getBytes(initiatorHash),
url(),
referrerHash(),
dc_title(),
moddate(),
null,
0,
0,
0,
0);
/**
 * Read a single-valued date field from the wrapped Solr document.
 *
 * @param field the schema field to read; must be single-valued and of type date
 * @return the stored date; the epoch date (new Date(0)) if the field is missing;
 *         the current time if the stored date lies in the future
 */
private Date getDate(YaCySchema field) {
    assert !field.isMultiValued();
    assert field.getType() == SolrType.date;
    final Date stored = (Date) this.doc.getFieldValue(field.name());
    if (stored == null) {
        return new Date(0);
    }
    final Date now = new Date();
    if (stored.after(now)) {
        // never report a date from the future
        return now;
    }
    return stored;
}
/**
 * Read a single-valued text field from the wrapped Solr document.
 *
 * @param field the schema field to read; must be single-valued and of a string/text type
 * @return the field value; the first element if the value is delivered as an ArrayList;
 *         the empty string if the field is missing or the list is empty
 */
private String getString(YaCySchema field) {
    assert !field.isMultiValued();
    assert field.getType() == SolrType.string || field.getType() == SolrType.text_general || field.getType() == SolrType.text_en_splitting_tight;
    final Object value = this.doc.getFieldValue(field.name());
    if (value == null) {
        return "";
    }
    if (!(value instanceof ArrayList)) {
        return (String) value;
    }
    // the value arrived as a list even though the field is declared single-valued
    @SuppressWarnings("unchecked")
    final ArrayList<String> values = (ArrayList<String>) value;
    return values.isEmpty() ? "" : values.get(0);
}
/**
 * Read a multi-valued text field from the wrapped Solr document.
 *
 * @param field the schema field to read; must be multi-valued and of type string or text_general
 * @return the stored list itself if the value is an ArrayList; otherwise a new list
 *         that is empty for a missing field or holds the single stored value
 */
@SuppressWarnings("unchecked")
private ArrayList<String> getStringList(YaCySchema field) {
    assert field.isMultiValued();
    assert field.getType() == SolrType.string || field.getType() == SolrType.text_general;
    final Object value = this.doc.getFieldValue(field.name());
    if (value instanceof ArrayList) {
        return (ArrayList<String>) value;
    }
    // null gives an empty list, a single value gives a one-element list
    final ArrayList<String> result = new ArrayList<String>(value == null ? 0 : 1);
    if (value != null) {
        result.add((String) value);
    }
    return result;
}
/**
 * Read a multi-valued integer field from the wrapped Solr document.
 *
 * @param field the schema field to read; must be multi-valued and of type integer
 * @return the stored list itself if the value is an ArrayList; otherwise a new list
 *         that is empty for a missing field or holds the single stored value
 */
@SuppressWarnings("unchecked")
private ArrayList<Integer> getIntList(YaCySchema field) {
    assert field.isMultiValued();
    assert field.getType() == SolrType.integer;
    final Object value = this.doc.getFieldValue(field.name());
    if (value instanceof ArrayList) {
        return (ArrayList<Integer>) value;
    }
    // null gives an empty list, a single value gives a one-element list
    final ArrayList<Integer> result = new ArrayList<Integer>(value == null ? 0 : 1);
    if (value != null) {
        result.add((Integer) value);
    }
    return result;
}
}

@ -54,7 +54,7 @@ import net.yacy.kelondro.util.kelondroException;
import net.yacy.search.query.QueryParams;
import net.yacy.utils.crypt;
public class URIMetadataRow implements URIMetadata {
public class URIMetadataRow {
// this object stores attributes for URL entries
@ -104,120 +104,14 @@ public class URIMetadataRow implements URIMetadata {
private final Row.Entry entry;
private final String snippet;
private final String[] collections;
private WordReference word; // this is only used if the url is transported via remote search requests
private final long ranking; // during generation of a search result this value is set
private Components comp;
public URIMetadataRow() {
// create a dummy entry, good to produce poison objects
this.entry = rowdef.newEntry();
this.snippet = "";
this.collections = new String[0];
this.word = null;
this.ranking = 0;
this.comp = null;
}
/**
 * Create a new metadata row from individual attribute values.
 * A fresh row entry is allocated from rowdef and every column is filled
 * from the given arguments; snippet, word, ranking and comp are set to
 * their empty defaults ("", null, 0, null).
 * Note the argument order: this constructor takes lon before lat, while
 * encodeComp is called with lat before lon.
 *
 * @param url the url of the record; its hash becomes the hash column
 * @param dc_title title text, written into the comp column
 * @param dc_creator creator text, written into the comp column
 * @param dc_subject subject text, written into the comp column
 * @param dc_publisher publisher text, written into the comp column
 * @param lon longitude
 * @param lat latitude
 * @param mod modification date, stored via encodeDate
 * @param load load date, stored via encodeDate
 * @param fresh fresh date, stored via encodeDate
 * @param referrer referrer value; may be null, then a null column value is written
 * @param md5 md5 bytes of the document
 * @param size size value of the document
 * @param wc word count
 * @param dt doctype character; stored as a single byte
 * @param flags flags of the record; stored as bytes
 * @param lang language bytes
 * @param llocal value of the llocal counter column
 * @param lother value of the lother counter column
 * @param laudio value of the laudio counter column
 * @param limage value of the limage counter column
 * @param lvideo value of the lvideo counter column
 * @param lapp value of the lapp counter column
 * @param collections collection names; kept by reference, not copied
 */
public URIMetadataRow(
final DigestURI url,
final String dc_title,
final String dc_creator,
final String dc_subject,
final String dc_publisher,
final double lon, final double lat, // decimal degrees as in WGS84; if unknown both values may be 0.0d;
final Date mod,
final Date load,
final Date fresh,
final String referrer,
final byte[] md5,
final long size,
final int wc,
final char dt,
final Bitfield flags,
final byte[] lang,
final int llocal,
final int lother,
final int laudio,
final int limage,
final int lvideo,
final int lapp,
final String[] collections) {
// create new entry
this.entry = rowdef.newEntry();
this.entry.setCol(col_hash, url.hash());
// url, descriptive texts and the location are packed into one column
this.entry.setCol(col_comp, encodeComp(url, dc_title, dc_creator, dc_subject, dc_publisher, lat, lon));
encodeDate(col_mod, mod);
encodeDate(col_load, load);
encodeDate(col_fresh, fresh);
this.entry.setCol(col_referrer, (referrer == null) ? null : UTF8.getBytes(referrer));
this.entry.setCol(col_md5, md5);
this.entry.setCol(col_size, size);
this.entry.setCol(col_wc, wc);
// the doctype character is narrowed to a single byte
this.entry.setCol(col_dt, new byte[]{(byte) dt});
this.entry.setCol(col_flags, flags.bytes());
this.entry.setCol(col_lang, lang);
this.entry.setCol(col_llocal, llocal);
this.entry.setCol(col_lother, lother);
this.entry.setCol(col_limage, limage);
this.entry.setCol(col_laudio, laudio);
this.entry.setCol(col_lvideo, lvideo);
this.entry.setCol(col_lapp, lapp);
//System.out.println("===DEBUG=== " + load.toString() + ", " + decodeDate(col_load).toString());
// attributes that are not part of the row entry start with empty defaults
this.snippet = "";
this.collections = collections;
this.word = null;
this.ranking = 0;
this.comp = null;
}
/**
 * Store a date in a column of the row entry as the number of days
 * since 1.1.1970, encoded into 4 bytes. Dates that lie in the future
 * are replaced by the current time before encoding.
 *
 * @param col the column to write
 * @param d the date to store
 */
private void encodeDate(final int col, final Date d) {
    // 86400000 is the number of milliseconds in one day
    final long millisPerDay = 86400000L;
    final long clamped = Math.min(d.getTime(), System.currentTimeMillis());
    this.entry.setCol(col, NaturalOrder.encodeLong(clamped / millisPerDay, 4));
}
/**
 * Read a date from a column of the row entry.
 * The column value is interpreted as the number of days since epoch;
 * earlier variants that interpreted larger values as hours or minutes
 * since epoch are disabled.
 *
 * @param col the column to read
 * @return the date that corresponds to the stored day count
 */
private Date decodeDate(final int col) {
    final long days = this.entry.getColLong(col);
    // time was stored as number of days since epoch
    return new Date(86400000L * days);
}
/**
 * Pack the url, the descriptive texts and the location into one
 * line-feed separated text and return its UTF-8 bytes.
 * The lines are written in this order: url in normal form, title,
 * creator (at most 80 characters), subject (at most 120 characters),
 * publisher (at most 80 characters) and "lat,lon". The location line
 * is left empty if both lat and lon are 0.
 *
 * @param url the url of the record
 * @param dc_title title text
 * @param dc_creator creator text
 * @param dc_subject subject text
 * @param dc_publisher publisher text
 * @param lat latitude
 * @param lon longitude
 * @return the UTF-8 encoded component text
 */
private static byte[] encodeComp(
        final DigestURI url,
        final String dc_title,
        final String dc_creator,
        final String dc_subject,
        final String dc_publisher,
        final double lat,
        final double lon) {
    final CharBuffer buffer = new CharBuffer(3600, 360);
    buffer.append(url.toNormalform(true)).appendLF();
    buffer.append(dc_title).appendLF();

    // the text attributes are cut to a maximum length
    if (dc_creator.length() > 80) {
        buffer.append(dc_creator, 0, 80);
    } else {
        buffer.append(dc_creator);
    }
    buffer.appendLF();

    if (dc_subject.length() > 120) {
        buffer.append(dc_subject, 0, 120);
    } else {
        buffer.append(dc_subject);
    }
    buffer.appendLF();

    if (dc_publisher.length() > 80) {
        buffer.append(dc_publisher, 0, 80);
    } else {
        buffer.append(dc_publisher);
    }
    buffer.appendLF();

    // a location of 0/0 is written as an empty line
    if (lat != 0.0f || lon != 0.0f) {
        buffer.append(Double.toString(lat)).append(',').append(Double.toString(lon)).appendLF();
    } else {
        buffer.appendLF();
    }

    final String text = buffer.toString();
    buffer.close();
    return UTF8.getBytes(text);
}
public URIMetadataRow(final Row.Entry entry, final WordReference searchedWord, final long ranking) {
public URIMetadataRow(final Row.Entry entry, final WordReference searchedWord) {
this.entry = entry;
this.snippet = "";
this.word = searchedWord;
this.ranking = ranking;
this.comp = null;
this.collections = new String[0];
}
private URIMetadataRow(final Properties prop) throws kelondroException {
@ -278,17 +172,15 @@ public class URIMetadataRow implements URIMetadata {
this.entry.setCol(col_lapp, Integer.parseInt(prop.getProperty("lapp", "0")));
this.snippet = crypt.simpleDecode(prop.getProperty("snippet", ""));
this.word = null;
if (prop.containsKey("word")) throw new kelondroException("old database structure is not supported");
if (prop.containsKey("wi")) {
this.word = new WordReferenceVars(new WordReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""))));
}
this.ranking = 0;
this.comp = null;
this.collections = new String[0];
}
public static URIMetadataRow importEntry(final String propStr) {
if (propStr == null || (!propStr.isEmpty() && propStr.charAt(0) != '{') || !propStr.endsWith("}")) {
if (propStr == null || propStr.isEmpty() || propStr.charAt(0) != '{' || !propStr.endsWith("}")) {
Log.logSevere("URIMetadataRow", "importEntry: propStr is not proper: " + propStr);
return null;
}
try {
@ -300,7 +192,46 @@ public class URIMetadataRow implements URIMetadata {
}
}
@Override
/**
 * Store a date in a column of the row entry as the number of days
 * since 1.1.1970, encoded into 4 bytes. Dates that lie in the future
 * are replaced by the current time before encoding.
 *
 * @param col the column to write
 * @param d the date to store
 */
private void encodeDate(final int col, final Date d) {
    // 86400000 is the number of milliseconds in one day
    final long millisPerDay = 86400000L;
    final long clamped = Math.min(d.getTime(), System.currentTimeMillis());
    this.entry.setCol(col, NaturalOrder.encodeLong(clamped / millisPerDay, 4));
}
/**
 * Read a date from a column of the row entry.
 * The column value is interpreted as the number of days since epoch;
 * earlier variants that interpreted larger values as hours or minutes
 * since epoch are disabled.
 *
 * @param col the column to read
 * @return the date that corresponds to the stored day count
 */
private Date decodeDate(final int col) {
    final long days = this.entry.getColLong(col);
    // time was stored as number of days since epoch
    return new Date(86400000L * days);
}
/**
 * Pack the url, the descriptive texts and the location into one
 * line-feed separated text and return its UTF-8 bytes.
 * The lines are written in this order: url in normal form, title,
 * creator (at most 80 characters), subject (at most 120 characters),
 * publisher (at most 80 characters) and "lat,lon". The location line
 * is left empty if both lat and lon are 0.
 *
 * @param url the url of the record
 * @param dc_title title text
 * @param dc_creator creator text
 * @param dc_subject subject text
 * @param dc_publisher publisher text
 * @param lat latitude
 * @param lon longitude
 * @return the UTF-8 encoded component text
 */
private static byte[] encodeComp(
        final DigestURI url,
        final String dc_title,
        final String dc_creator,
        final String dc_subject,
        final String dc_publisher,
        final double lat,
        final double lon) {
    final CharBuffer buffer = new CharBuffer(3600, 360);
    buffer.append(url.toNormalform(true)).appendLF();
    buffer.append(dc_title).appendLF();

    // the text attributes are cut to a maximum length
    if (dc_creator.length() > 80) {
        buffer.append(dc_creator, 0, 80);
    } else {
        buffer.append(dc_creator);
    }
    buffer.appendLF();

    if (dc_subject.length() > 120) {
        buffer.append(dc_subject, 0, 120);
    } else {
        buffer.append(dc_subject);
    }
    buffer.appendLF();

    if (dc_publisher.length() > 80) {
        buffer.append(dc_publisher, 0, 80);
    } else {
        buffer.append(dc_publisher);
    }
    buffer.appendLF();

    // a location of 0/0 is written as an empty line
    if (lat != 0.0f || lon != 0.0f) {
        buffer.append(Double.toString(lat)).append(',').append(Double.toString(lon)).appendLF();
    } else {
        buffer.appendLF();
    }

    final String text = buffer.toString();
    buffer.close();
    return UTF8.getBytes(text);
}
public byte[] hash() {
// return a url-hash, based on the md5 algorithm
// the result is a String of 12 bytes within a 72-bit space
@ -310,54 +241,40 @@ public class URIMetadataRow implements URIMetadata {
}
private String hostHash = null;
/**
 * Returns the host hash of this entry.
 * The value is derived from the primary key bytes of the row entry on the
 * first call and then kept in the hostHash field for later calls.
 *
 * @return the host hash as string
 */
@Override
public String hosthash() {
    if (this.hostHash == null) {
        // arguments 6, 6: presumably offset 6 and length 6 within the primary key - confirm against ASCII.String
        this.hostHash = ASCII.String(this.entry.getPrimaryKeyBytes(), 6, 6);
    }
    return this.hostHash;
}
/**
 * @return the value of the ranking field of this entry
 */
@Override
public long ranking() {
return this.ranking;
}
/**
 * Checks the metadata components of this entry against a pattern.
 * The check itself is delegated to metadata().matches(matcher).
 *
 * @param matcher the pattern to test with
 * @return the result of the components' matches call
 */
@Override
public boolean matches(final Pattern matcher) {
return this.metadata().matches(matcher);
}
/**
 * @return the url as provided by the metadata components of this entry
 */
@Override
public DigestURI url() {
return this.metadata().url();
}
/**
 * @return the title as provided by the metadata components of this entry
 */
@Override
public String dc_title() {
return this.metadata().dc_title();
}
/**
 * @return the creator (author) as provided by the metadata components of this entry
 */
@Override
public String dc_creator() {
return this.metadata().dc_creator();
}
/**
 * @return the publisher as provided by the metadata components of this entry
 */
@Override
public String dc_publisher() {
return this.metadata().dc_publisher();
}
/**
 * @return the subject (tags) as provided by the metadata components of this entry
 */
@Override
public String dc_subject() {
return this.metadata().dc_subject();
}
/**
 * @return the latitude as provided by the metadata components of this entry
 */
@Override
public double lat() {
return this.metadata().lat();
}
/**
 * @return the longitude as provided by the metadata components of this entry
 */
@Override
public double lon() {
return this.metadata().lon();
}
@ -379,22 +296,18 @@ public class URIMetadataRow implements URIMetadata {
return this.comp;
}
/**
 * @return the date decoded from column col_mod (the modification date);
 *         decodeDate yields day resolution only
 */
@Override
public Date moddate() {
return decodeDate(col_mod);
}
/**
 * @return the date decoded from column col_load (the load date);
 *         decodeDate yields day resolution only
 */
@Override
public Date loaddate() {
return decodeDate(col_load);
}
/**
 * @return the date decoded from column col_fresh (the fresh date);
 *         decodeDate yields day resolution only
 */
@Override
public Date freshdate() {
return decodeDate(col_fresh);
}
@Override
public byte[] referrerHash() {
// return the creator's hash or null if there is none
// FIXME: There seem to be some malformed entries in the databases like "null\0\0\0\0\0\0\0\0"
@ -408,18 +321,15 @@ public class URIMetadataRow implements URIMetadata {
return r;
}
/**
 * @return the bytes of column col_md5, hex-encoded with Digest.encodeHex
 */
@Override
public String md5() {
// returns the md5 in hex representation
return Digest.encodeHex(this.entry.getColBytes(col_md5, true));
}
/**
 * @return the byte stored in column col_dt, cast to a char
 */
@Override
public char doctype() {
return (char) this.entry.getColByte(col_dt);
}
@Override
public byte[] language() {
byte[] b = this.entry.getColBytes(col_lang, true);
if ((b == null || b[0] == (byte)'[') && this.metadata().url != null) {
@ -430,100 +340,98 @@ public class URIMetadataRow implements URIMetadata {
return b;
}
/**
 * @return the numeric value of column col_size, narrowed from long to int
 */
@Override
public int size() {
return (int) this.entry.getColLong(col_size);
}
/**
 * @return a new Bitfield built from the bytes of column col_flags;
 *         every call creates a fresh instance
 */
@Override
public Bitfield flags() {
return new Bitfield(this.entry.getColBytes(col_flags, true));
}
/**
 * @return the numeric value of column col_wc (the word count), narrowed from long to int
 */
@Override
public int wordCount() {
return (int) this.entry.getColLong(col_wc);
}
/**
 * @return the numeric value of column col_llocal, narrowed from long to int;
 *         presumably the number of local (inbound) links - confirm against the row definition
 */
@Override
public int llocal() {
return (int) this.entry.getColLong(col_llocal);
}
/**
 * @return the numeric value of column col_lother, narrowed from long to int;
 *         presumably the number of links to other hosts (outbound) - confirm against the row definition
 */
@Override
public int lother() {
return (int) this.entry.getColLong(col_lother);
}
/**
 * @return the numeric value of column col_limage, narrowed from long to int;
 *         presumably the number of image links - confirm against the row definition
 */
@Override
public int limage() {
return (int) this.entry.getColLong(col_limage);
}
/**
 * @return the numeric value of column col_laudio, narrowed from long to int;
 *         presumably the number of audio links - confirm against the row definition
 */
@Override
public int laudio() {
return (int) this.entry.getColLong(col_laudio);
}
/**
 * @return the numeric value of column col_lvideo, narrowed from long to int;
 *         presumably the number of video links - confirm against the row definition
 */
@Override
public int lvideo() {
return (int) this.entry.getColLong(col_lvideo);
}
/**
 * @return the numeric value of column col_lapp, narrowed from long to int;
 *         presumably the number of application links - confirm against the row definition
 */
@Override
public int lapp() {
return (int) this.entry.getColLong(col_lapp);
}
/**
 * @return the value of the snippet field of this entry
 */
@Override
public String snippet() {
// the snippet may appear here if the url was transported in a remote search
// it will not be saved anywhere, but can only be requested here
return this.snippet;
}
/**
 * @return the collections field of this entry; the array itself is returned, not a copy
 */
@Override
public String[] collections() {
return this.collections;
}
/**
 * @return the value of the word field of this entry
 */
@Override
public WordReference word() {
return this.word;
}
/**
 * Compares the age of this entry with another one.
 * The modification dates decide first. If they are equal, the load dates
 * decide, and an equal load date also counts as "older".
 *
 * @param other the entry to compare with; may be null
 * @return true if this entry has an earlier modification date, or the same
 *         modification date and a load date that is not later than the
 *         other's; false otherwise and whenever other is null
 */
@Override
public boolean isOlder(final URIMetadata other) {
    if (other == null) {
        return false;
    }
    final Date ownMod = moddate();
    final Date otherMod = other.moddate();
    if (ownMod.before(otherMod)) {
        return true;
    }
    if (!ownMod.equals(otherMod)) {
        return false;
    }
    // same modification date: fall back to the load dates
    final Date ownLoad = loaddate();
    final Date otherLoad = other.loaddate();
    return ownLoad.before(otherLoad) || ownLoad.equals(otherLoad);
}
private static StringBuilder corePropList(URIMetadataRow md) {
// generate a parseable string; this is a simple property-list
final StringBuilder s = new StringBuilder(300);
@Override
public String toString(final String snippet) {
// add information needed for remote transport
final StringBuilder core = URIMetadataNode.corePropList(this);
if (core == null)
return null;
core.ensureCapacity(core.length() + snippet.length() * 2);
core.insert(0, "{");
core.append(",snippet=").append(crypt.simpleEncode(snippet));
core.append("}");
// create new formatters to make concurrency possible
final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute);
return core.toString();
//return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}";
try {
s.append("hash=").append(ASCII.String(md.hash()));
s.append(",url=").append(crypt.simpleEncode(md.url().toNormalform(true)));
s.append(",descr=").append(crypt.simpleEncode(md.dc_title()));
s.append(",author=").append(crypt.simpleEncode(md.dc_creator()));
s.append(",tags=").append(crypt.simpleEncode(Tagging.cleanTagFromAutotagging(md.dc_subject())));
s.append(",publisher=").append(crypt.simpleEncode(md.dc_publisher()));
s.append(",lat=").append(md.lat());
s.append(",lon=").append(md.lon());
s.append(",mod=").append(formatter.format(md.moddate()));
s.append(",load=").append(formatter.format(md.loaddate()));
s.append(",fresh=").append(formatter.format(md.freshdate()));
s.append(",referrer=").append(md.referrerHash() == null ? "" : ASCII.String(md.referrerHash()));
s.append(",md5=").append(md.md5());
s.append(",size=").append(md.size());
s.append(",wc=").append(md.wordCount());
s.append(",dt=").append(md.doctype());
s.append(",flags=").append(md.flags().exportB64());
s.append(",lang=").append(md.language() == null ? "EN" : UTF8.String(md.language()));
s.append(",llocal=").append(md.llocal());
s.append(",lother=").append(md.lother());
s.append(",limage=").append(md.limage());
s.append(",laudio=").append(md.laudio());
s.append(",lvideo=").append(md.lvideo());
s.append(",lapp=").append(md.lapp());
if (md.word() != null) {
// append also word properties
final String wprop = md.word().toPropertyForm();
s.append(",wi=").append(Base64Order.enhancedCoder.encodeString(wprop));
}
return s;
} catch (final Throwable e) {
Log.logException(e);
return null;
}
}
@Override
public Request toBalancerEntry(final String initiatorHash) {
return new Request(
ASCII.getBytes(initiatorHash),
@ -545,7 +453,7 @@ public class URIMetadataRow implements URIMetadata {
*/
@Override
public String toString() {
final StringBuilder core = URIMetadataNode.corePropList(this);
final StringBuilder core = corePropList(this);
if (core == null) return null;
core.insert(0, "{");

@ -37,7 +37,7 @@ import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.Base64Order;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.Row.Entry;
import net.yacy.kelondro.logging.Log;
@ -71,7 +71,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
private final Queue<Integer> positions;
private double termFrequency;
public WordReferenceVars(final URIMetadata md) {
public WordReferenceVars(final URIMetadataRow md) {
this.language = md.language();
this.flags = md.flags();
this.lastModified = md.moddate().getTime();

@ -81,7 +81,6 @@ import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.ResultURLs;
import net.yacy.crawler.data.ResultURLs.EventOrigin;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
@ -812,7 +811,8 @@ public final class Protocol
try {
event.getQuery().getSegment().fulltext().putMetadata(urlEntry);
ResultURLs.stack(
urlEntry,
ASCII.String(urlEntry.url().hash()),
urlEntry.url().getHost(),
event.peers.mySeed().hash.getBytes(),
UTF8.getBytes(target.hash),
EventOrigin.QUERIES);
@ -1103,7 +1103,8 @@ public final class Protocol
try {
event.getQuery().getSegment().fulltext().putDocument(ClientUtils.toSolrInputDocument(doc));
ResultURLs.stack(
urlEntry,
ASCII.String(urlEntry.url().hash()),
urlEntry.url().getHost(),
event.peers.mySeed().hash.getBytes(),
UTF8.getBytes(target.hash),
EventOrigin.QUERIES);
@ -1187,7 +1188,7 @@ public final class Protocol
final String process,
final String result,
final String reason,
final URIMetadata entry,
final URIMetadataNode entry,
final String wordhashes) {
assert (target != null);
assert (mySeed != null);
@ -1225,8 +1226,7 @@ public final class Protocol
// send request
try {
// prepare request
final Map<String, ContentBody> parts =
basicRequestParts(Switchboard.getSwitchboard(), target.hash, salt);
final Map<String, ContentBody> parts = basicRequestParts(Switchboard.getSwitchboard(), target.hash, salt);
parts.put("process", UTF8.StringBody(process));
parts.put("urlhash", UTF8.StringBody(((entry == null) ? "" : ASCII.String(entry.hash()))));
parts.put("result", UTF8.StringBody(result));
@ -1266,7 +1266,7 @@ public final class Protocol
public static String transferIndex(
final Seed targetSeed,
final ReferenceContainerCache<WordReference> indexes,
final SortedMap<byte[], URIMetadataRow> urlCache,
final SortedMap<byte[], URIMetadataNode> urlCache,
final boolean gzipBody,
final int timeout) {
@ -1327,7 +1327,7 @@ public final class Protocol
} // all url's known
// extract the urlCache from the result
final URIMetadata[] urls = new URIMetadata[uhs.length];
final URIMetadataNode[] urls = new URIMetadataNode[uhs.length];
for ( int i = 0; i < uhs.length; i++ ) {
urls[i] = urlCache.get(ASCII.getBytes(uhs[i]));
if ( urls[i] == null ) {
@ -1435,7 +1435,7 @@ public final class Protocol
private static Map<String, String> transferURL(
final Seed targetSeed,
final URIMetadata[] urls,
final URIMetadataNode[] urls,
boolean gzipBody,
final int timeout) {
// this post a message to the remote message board
@ -1457,7 +1457,7 @@ public final class Protocol
String resource;
int urlc = 0;
int urlPayloadSize = 0;
for ( final URIMetadata url : urls ) {
for ( final URIMetadataNode url : urls ) {
if ( url != null ) {
resource = url.toString();
//System.out.println("*** DEBUG resource = " + resource);

@ -36,7 +36,6 @@ import net.yacy.cora.order.Base64Order;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceRow;
@ -89,7 +88,7 @@ public class Transmission {
*/
private final byte[] primaryTarget;
private final ReferenceContainerCache<WordReference> containers;
private final SortedMap<byte[], URIMetadataRow> references;
private final SortedMap<byte[], URIMetadataNode> references;
private final HandleSet badReferences;
private final List<Seed> targets;
private int hit, miss;
@ -105,7 +104,7 @@ public class Transmission {
super();
this.primaryTarget = primaryTarget;
this.containers = new ReferenceContainerCache<WordReference>(Segment.wordReferenceFactory, Segment.wordOrder, Word.commonHashLength);
this.references = new TreeMap<byte[], URIMetadataRow>(Base64Order.enhancedCoder);
this.references = new TreeMap<byte[], URIMetadataNode>(Base64Order.enhancedCoder);
this.badReferences = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0);
this.targets = targets;
this.hit = 0;
@ -179,7 +178,7 @@ public class Transmission {
notFoundx.add(e.urlhash());
this.badReferences.put(e.urlhash());
} else {
this.references.put(e.urlhash(), r.toRow());
this.references.put(e.urlhash(), r);
}
}
// now delete all references that were not found

@ -46,7 +46,7 @@ import java.util.regex.PatternSyntaxException;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.logging.Log;
@ -328,17 +328,19 @@ public class Blacklist {
return ret;
}
/**
 * Checks whether the url of the given metadata node is listed in the given blacklist type.
 * The check is delegated to the isListed variant that takes the url returned by entry.url().
 * @param blacklistType The used blacklist
 * @param entry Entry to be checked; must not be null because its url is read
 * @return the result of the url-based check
 */
public boolean isListed(final BlacklistType blacklistType, final URIMetadataNode entry) {
return isListed(blacklistType, entry.url());
}
/**
 * Checks whether the url of the given metadata row is listed in the given blacklist type.
 * The check is delegated to the isListed variant that takes the url returned by entry.url().
 * @param blacklistType The used blacklist
 * @param entry Entry to be checked; must not be null because its url is read
 * @return the result of the url-based check
 */
public boolean isListed(final BlacklistType blacklistType, final URIMetadataRow entry) {
return isListed(blacklistType, entry.url());
}
/**
 * Checks whether the given entry is listed in given blacklist type.
 * The check is delegated to the isListed variant that takes the url returned by entry.url().
 * @param blacklistType The used blacklist
 * @param entry Entry to be checked; must not be null because its url is read
 * @return Whether the given entry is blacklisted
 */
public boolean isListed(final BlacklistType blacklistType, final URIMetadata entry) {
// Call inner method
return isListed(blacklistType, entry.url());
}
public boolean isListed(final BlacklistType blacklistType, final DigestURI url) {
if (url == null) {
throw new IllegalArgumentException("url may not be null");

@ -76,6 +76,8 @@ import java.util.zip.GZIPOutputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.Classification;
@ -146,9 +148,7 @@ import net.yacy.interaction.contentcontrol.ContentControlFilterUpdateThread;
import net.yacy.interaction.contentcontrol.ContentControlImportThread;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.ReferenceContainer;
@ -2589,13 +2589,10 @@ public final class Switchboard extends serverSwitch
this.log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + url);
// STORE WORD INDEX
URIMetadataRow newEntry =
SolrInputDocument newEntry =
this.index.storeDocument(
url,
referrerURL,
queueEntry.lastModified(),
new Date(),
queueEntry.size(),
queueEntry.profile(),
queueEntry.getResponseHeader(),
document,
@ -2628,7 +2625,9 @@ public final class Switchboard extends serverSwitch
}
// update url result list statistics
ResultURLs.stack(newEntry, // loaded url db entry
ResultURLs.stack(
ASCII.String(url.hash()), // loaded url db entry
url.getHost(),
queueEntry.initiator(), // initiator peer hash
UTF8.getBytes(this.peers.mySeed().hash), // executor peer hash
processCase // process case
@ -2654,8 +2653,7 @@ public final class Switchboard extends serverSwitch
initiatorPeer.setAlternativeAddress(this.clusterhashes.get(queueEntry.initiator()));
}
// start a thread for receipt sending to avoid a blocking here
new Thread(new receiptSending(initiatorPeer, newEntry), "sending receipt to "
+ ASCII.String(queueEntry.initiator())).start();
new Thread(new receiptSending(initiatorPeer, new URIMetadataNode(newEntry)), "sending receipt to " + ASCII.String(queueEntry.initiator())).start();
}
}
}
@ -2820,9 +2818,9 @@ public final class Switchboard extends serverSwitch
public class receiptSending implements Runnable
{
private final Seed initiatorPeer;
private final URIMetadata reference;
private final URIMetadataNode reference;
public receiptSending(final Seed initiatorPeer, final URIMetadata reference) {
public receiptSending(final Seed initiatorPeer, final URIMetadataNode reference) {
this.initiatorPeer = initiatorPeer;
this.reference = reference;
}

@ -30,10 +30,11 @@ import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.Classification;
import net.yacy.cora.document.UTF8;
import net.yacy.document.Condenser;
@ -41,7 +42,6 @@ import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.query.QueryParams;
@ -101,12 +101,12 @@ public class DocumentIndex extends Segment {
@Override
public void run() {
DigestURI f;
URIMetadata[] resultRows;
SolrInputDocument[] resultRows;
try {
while ( (f = DocumentIndex.this.queue.take()) != poison ) {
try {
resultRows = add(f);
for ( final URIMetadata resultRow : resultRows ) {
for ( final SolrInputDocument resultRow : resultRows ) {
if ( DocumentIndex.this.callback != null ) {
if ( resultRow == null ) {
DocumentIndex.this.callback.fail(f, "result is null");
@ -138,7 +138,7 @@ public class DocumentIndex extends Segment {
this.queue.clear();
}
private URIMetadata[] add(final DigestURI url) throws IOException {
private SolrInputDocument[] add(final DigestURI url) throws IOException {
if ( url == null ) {
throw new IOException("file = null");
}
@ -161,7 +161,7 @@ public class DocumentIndex extends Segment {
throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
}
//Document document = Document.mergeDocuments(url, null, documents);
final URIMetadata[] rows = new URIMetadata[documents.length];
final SolrInputDocument[] rows = new SolrInputDocument[documents.length];
int c = 0;
for ( final Document document : documents ) {
if (document == null) continue;
@ -170,9 +170,6 @@ public class DocumentIndex extends Segment {
super.storeDocument(
url,
null,
new Date(url.lastModified()),
new Date(),
url.length(),
null,
null,
document,
@ -275,7 +272,7 @@ public class DocumentIndex extends Segment {
/**
 * Listener that receives the outcome for a url handled by the indexing worker.
 * The worker calls fail(f, "result is null") when a result row is null;
 * commit is presumably called for the non-null result rows - confirm
 * against the worker's run() method.
 */
public interface CallbackListener
{
// reports a result row for the given url
public void commit(DigestURI f, URIMetadata resultRow);
// reports a result row (as solr input document) for the given url
public void commit(DigestURI f, SolrInputDocument resultRow);
// reports that handling the given url failed, with a textual reason
public void fail(DigestURI f, String failReason);
}
@ -296,7 +293,7 @@ public class DocumentIndex extends Segment {
System.out.println("using index files at " + segmentPath.getAbsolutePath());
final CallbackListener callback = new CallbackListener() {
@Override
public void commit(final DigestURI f, final URIMetadata resultRow) {
public void commit(final DigestURI f, final SolrInputDocument resultRow) {
System.out.println("indexed: " + f.toString());
}

@ -50,7 +50,6 @@ import net.yacy.cora.storage.ZIPReader;
import net.yacy.cora.storage.ZIPWriter;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.WordReference;
@ -64,7 +63,6 @@ import net.yacy.kelondro.util.MergeIterator;
import net.yacy.search.Switchboard;
import org.apache.lucene.util.Version;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
@ -226,10 +224,10 @@ public final class Fulltext implements Iterable<byte[]> {
// slow migration to solr
final Row.Entry entry = this.urlIndexFile.remove(urlHash);
if (entry == null) return null;
URIMetadataRow row = new URIMetadataRow(entry, wre, weight);
URIMetadataRow row = new URIMetadataRow(entry, wre);
SolrInputDocument solrInput = this.solrScheme.metadata2solr(row);
this.putDocument(solrInput);
return new URIMetadataNode(ClientUtils.toSolrDocument(solrInput), wre, weight);
return new URIMetadataNode(solrInput, wre, weight);
} catch (final IOException e) {
Log.logException(e);
}
@ -244,9 +242,9 @@ public final class Fulltext implements Iterable<byte[]> {
if (this.urlIndexFile != null) this.urlIndexFile.remove(idb);
SolrDocument sd = this.solr.get(id);
Date now = new Date();
Date sdDate = this.solrScheme.getDate(sd, YaCySchema.last_modified);
if (sdDate.after(now)) sdDate = now;
Date docDate = this.solrScheme.getDate(doc, YaCySchema.last_modified);
Date sdDate = sd == null ? null : SolrConfiguration.getDate(sd, YaCySchema.last_modified);
if (sdDate == null || sdDate.after(now)) sdDate = now;
Date docDate = SolrConfiguration.getDate(doc, YaCySchema.last_modified);
if (docDate.after(now)) docDate = now;
if (sd == null || sdDate.before(docDate)) {
if (this.solrScheme.contains(YaCySchema.ip_s)) {
@ -263,13 +261,8 @@ public final class Fulltext implements Iterable<byte[]> {
if (MemoryControl.shortStatus()) clearCache();
}
public void putMetadata(final URIMetadata entry) throws IOException {
if (entry instanceof URIMetadataNode) {
putDocument(ClientUtils.toSolrInputDocument(((URIMetadataNode) entry).getDocument()));
return;
}
assert entry instanceof URIMetadataRow;
URIMetadataRow row = (URIMetadataRow) entry;
public void putMetadata(final URIMetadataRow entry) throws IOException {
URIMetadataRow row = entry;
byte[] idb = row.hash();
String id = ASCII.String(idb);
@ -516,7 +509,7 @@ public final class Fulltext implements Iterable<byte[]> {
}
} else {
final Iterator<URIMetadataNode> i = entries(); // iterates indexURLEntry objects
URIMetadata entry;
URIMetadataNode entry;
String url;
while (i.hasNext()) {
entry = i.next();

@ -35,6 +35,8 @@ import java.util.Properties;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
@ -57,7 +59,6 @@ import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.citation.CitationReferenceFactory;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceFactory;
@ -284,7 +285,7 @@ public class Segment {
if (this.urlCitationIndex != null) this.urlCitationIndex.close();
}
private String votedLanguage(
private static String votedLanguage(
final DigestURI url,
final String urlNormalform,
final Document document,
@ -295,23 +296,17 @@ public class Segment {
if (language == null) {
// no statistics available, we take either the metadata (if given) or the TLD
language = (bymetadata == null) ? url.language() : bymetadata;
if (this.log.isFine()) this.log.logFine("LANGUAGE-BY-STATISTICS: " + url + " FAILED, taking " + ((bymetadata == null) ? "TLD" : "metadata") + ": " + language);
} else {
if (bymetadata == null) {
// two possible results: compare and report conflicts
if (language.equals(url.language()))
if (this.log.isFine()) this.log.logFine("LANGUAGE-BY-STATISTICS: " + url + " CONFIRMED - TLD IDENTICAL: " + language);
else {
final String error = "LANGUAGE-BY-STATISTICS: " + url + " CONFLICTING: " + language + " (the language given by the TLD is " + url.language() + ")";
if (!language.equals(url.language())) {
// see if we have a hint in the url that the statistic was right
final String u = urlNormalform.toLowerCase();
if (!u.contains("/" + language + "/") && !u.contains("/" + ISO639.country(language).toLowerCase() + "/")) {
// no confirmation using the url, use the TLD
language = url.language();
if (this.log.isFine()) this.log.logFine(error + ", corrected using the TLD");
} else {
// this is a strong hint that the statistics was in fact correct
if (this.log.isFine()) this.log.logFine(error + ", but the url proves that the statistic is correct");
}
}
} else {
@ -340,12 +335,9 @@ public class Segment {
if (this.termIndex != null) this.termIndex.add(termHash, entry);
}
public URIMetadataRow storeDocument(
public SolrInputDocument storeDocument(
final DigestURI url,
final DigestURI referrerURL,
Date modDate,
final Date loadDate,
final long sourcesize,
final CrawlProfile profile,
final ResponseHeader responseHeader,
final Document document,
@ -359,44 +351,21 @@ public class Segment {
// CREATE INDEX
// load some document metadata
final Date loadDate = new Date();
final String id = ASCII.String(url.hash());
final String dc_title = document.dc_title();
final String urlNormalform = url.toNormalform(true);
final String language = votedLanguage(url, urlNormalform, document, condenser); // identification of the language
// STORE URL TO LOADED-URL-DB
if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; // TODO: compare with modTime from responseHeader
Date modDate = responseHeader.lastModified();
if (modDate.getTime() > loadDate.getTime()) modDate = loadDate;
char docType = Response.docType(document.dc_format());
final URIMetadataRow metadata = new URIMetadataRow(
url, // URL
dc_title, // document description
document.dc_creator(), // author
document.dc_subject(' '), // tags
document.dc_publisher(), // publisher (may be important to get location data)
document.lon(), // decimal degrees as in WGS84;
document.lat(), // if unknown both values may be 0.0d;
modDate, // modification date
loadDate, // loaded date
new Date(loadDate.getTime() + Math.max(0, loadDate.getTime() - modDate.getTime()) / 2), // freshdate, computed with Proxy-TTL formula
(referrerURL == null) ? null : ASCII.String(referrerURL.hash()), // referer hash
new byte[0], // md5
(int) sourcesize, // size
condenser.RESULT_NUMB_WORDS, // word count
docType, // doctype
condenser.RESULT_FLAGS, // flags
UTF8.getBytes(language), // language
document.inboundLinks().size(), // inbound links
document.outboundLinks().size(), // outbound links
document.getAudiolinks().size(), // laudio
document.getImages().size(), // limage
document.getVideolinks().size(), // lvideo
document.getApplinks().size(), // lapp
profile.collections() // collections
);
// STORE TO SOLR
final SolrInputDocument solrInputDoc = this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, condenser, referrerURL, language);
try {
this.fulltext.putDocument(this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, condenser, metadata));
this.fulltext.putDocument(solrInputDoc);
} catch ( final IOException e ) {
Log.logWarning("SOLR", "failed to send " + urlNormalform + " to solr: " + e.getMessage());
}
@ -487,7 +456,7 @@ public class Segment {
}
// finished
return metadata;
return solrInputDoc;
}
public void removeAllUrlReferences(final HandleSet urls, final LoaderDispatcher loader, final CacheStrategy cacheStrategy) {

@ -55,13 +55,10 @@ import net.yacy.document.Document;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.Bitfield;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
@ -174,18 +171,12 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (isEmpty() || contains(key)) key.add(doc, value);
}
public Date getDate(SolrInputDocument doc, final YaCySchema key) {
/**
 * Reads a date field from a solr input document.
 * A missing field value yields the epoch (new Date(0)). A value that lies
 * after the current time is replaced by the current time.
 *
 * @param doc the document to read from; must not be null
 * @param key the schema field whose name selects the value
 * @return the stored date, limited to "now", or the epoch if the field has no value
 */
public static Date getDate(SolrInputDocument doc, final YaCySchema key) {
    final Date stored = (Date) doc.getFieldValue(key.name());
    final Date now = new Date();
    if (stored == null) {
        return new Date(0);
    }
    if (stored.after(now)) {
        return now;
    }
    return stored;
}
/**
 * Reads a date field from a solr document.
 * A null document or a missing field value yields the epoch (new Date(0)).
 * A value that lies after the current time is replaced by the current time.
 *
 * @param doc the document to read from; may be null
 * @param key the schema field whose name selects the value
 * @return the stored date, limited to "now", or the epoch if there is no value
 */
public Date getDate(SolrDocument doc, final YaCySchema key) {
    Date stored = null;
    if (doc != null) {
        stored = (Date) doc.getFieldValue(key.name());
    }
    final Date now = new Date();
    if (stored == null) {
        return new Date(0);
    }
    if (stored.after(now)) {
        return now;
    }
    return stored;
}
/**
* save configuration to file and update enum SolrFields
* @throws IOException
@ -207,11 +198,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
} catch (final IOException e) {}
}
public SolrInputDocument metadata2solr(final URIMetadata md) {
assert md instanceof URIMetadataRow;
if (md instanceof URIMetadataNode) {
return ClientUtils.toSolrInputDocument(((URIMetadataNode) md).getDocument());
}
public SolrInputDocument metadata2solr(final URIMetadataRow md) {
final SolrInputDocument doc = new SolrInputDocument();
final DigestURI digestURI = DigestURI.toDigestURI(md.url());
@ -339,10 +326,10 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.');
}
public SolrInputDocument yacy2solr(final String id, final CrawlProfile profile, final ResponseHeader header, final Document yacydoc, Condenser condenser, final URIMetadata metadata) {
public SolrInputDocument yacy2solr(final String id, final CrawlProfile profile, final ResponseHeader responseHeader, final Document document, Condenser condenser, DigestURI referrerURL, String language) {
// we use the SolrCell design as index scheme
final SolrInputDocument doc = new SolrInputDocument();
final DigestURI digestURI = DigestURI.toDigestURI(yacydoc.dc_source());
final DigestURI digestURI = DigestURI.toDigestURI(document.dc_source());
boolean allAttr = this.isEmpty();
add(doc, YaCySchema.id, id);
if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before)
@ -377,7 +364,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.host_subdomain_s)) add(doc, YaCySchema.host_subdomain_s, subdom);
}
List<String> titles = yacydoc.titles();
List<String> titles = document.titles();
if (allAttr || contains(YaCySchema.title)) add(doc, YaCySchema.title, titles);
if (allAttr || contains(YaCySchema.title_count_i)) add(doc, YaCySchema.title_count_i, titles.size());
if (allAttr || contains(YaCySchema.title_chars_val)) {
@ -391,7 +378,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
add(doc, YaCySchema.title_words_val, cv);
}
String description = yacydoc.dc_description();
String description = document.dc_description();
List<String> descriptions = new ArrayList<String>();
for (String s: description.split("\n")) descriptions.add(s);
if (allAttr || contains(YaCySchema.description)) add(doc, YaCySchema.description, description);
@ -407,11 +394,11 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
add(doc, YaCySchema.description_words_val, cv);
}
if (allAttr || contains(YaCySchema.author)) add(doc, YaCySchema.author, yacydoc.dc_creator());
if (allAttr || contains(YaCySchema.content_type)) add(doc, YaCySchema.content_type, new String[]{yacydoc.dc_format()});
if (allAttr || contains(YaCySchema.last_modified)) add(doc, YaCySchema.last_modified, header == null ? new Date() : header.lastModified());
if (allAttr || contains(YaCySchema.keywords)) add(doc, YaCySchema.keywords, yacydoc.dc_subject(' '));
final String content = yacydoc.getTextString();
if (allAttr || contains(YaCySchema.author)) add(doc, YaCySchema.author, document.dc_creator());
if (allAttr || contains(YaCySchema.content_type)) add(doc, YaCySchema.content_type, new String[]{document.dc_format()});
if (allAttr || contains(YaCySchema.last_modified)) add(doc, YaCySchema.last_modified, responseHeader == null ? new Date() : responseHeader.lastModified());
if (allAttr || contains(YaCySchema.keywords)) add(doc, YaCySchema.keywords, document.dc_subject(' '));
final String content = document.getTextString();
if (allAttr || contains(YaCySchema.text_t)) add(doc, YaCySchema.text_t, content);
if (allAttr || contains(YaCySchema.wordcount_i)) {
final int contentwc = content.split(" ").length;
@ -427,11 +414,11 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.url_file_ext_s)) add(doc, YaCySchema.url_file_ext_s, digestURI.getFileExtension());
// get the list of all links; it will be reduced by the urls that appear in other fields of the solr schema
Set<MultiProtocolURI> inboundLinks = yacydoc.inboundLinks();
Set<MultiProtocolURI> outboundLinks = yacydoc.outboundLinks();
Set<MultiProtocolURI> inboundLinks = document.inboundLinks();
Set<MultiProtocolURI> outboundLinks = document.outboundLinks();
int c = 0;
final Object parser = yacydoc.getParserObject();
final Object parser = document.getParserObject();
Map<MultiProtocolURI, ImageEntry> images = new HashMap<MultiProtocolURI, ImageEntry>();
if (parser instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) parser;
@ -482,10 +469,10 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (robots_meta.indexOf("nofollow",0) >= 0) b += 8; // set bit 3
}
String x_robots_tag = "";
if (header != null) {
x_robots_tag = header.get(HeaderFramework.X_ROBOTS_TAG, "");
if (responseHeader != null) {
x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS_TAG, "");
if (x_robots_tag.isEmpty()) {
x_robots_tag = header.get(HeaderFramework.X_ROBOTS, "");
x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS, "");
}
}
if (!x_robots_tag.isEmpty()) {
@ -670,14 +657,14 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
}
// response time
add(doc, YaCySchema.responsetime_i, header == null ? 0 : Integer.parseInt(header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0")));
add(doc, YaCySchema.responsetime_i, responseHeader == null ? 0 : Integer.parseInt(responseHeader.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0")));
}
// list all links
final Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors();
final Map<MultiProtocolURI, Properties> alllinks = document.getAnchors();
c = 0;
if (allAttr || contains(YaCySchema.inboundlinkscount_i)) add(doc, YaCySchema.inboundlinkscount_i, inboundLinks.size());
if (allAttr || contains(YaCySchema.inboundlinksnofollowcount_i)) add(doc, YaCySchema.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount());
if (allAttr || contains(YaCySchema.inboundlinksnofollowcount_i)) add(doc, YaCySchema.inboundlinksnofollowcount_i, document.inboundLinkNofollowCount());
final List<String> inboundlinksTag = new ArrayList<String>(inboundLinks.size());
final List<String> inboundlinksURLProtocol = new ArrayList<String>(inboundLinks.size());
final List<String> inboundlinksURLStub = new ArrayList<String>(inboundLinks.size());
@ -725,7 +712,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
c = 0;
if (allAttr || contains(YaCySchema.outboundlinkscount_i)) add(doc, YaCySchema.outboundlinkscount_i, outboundLinks.size());
if (allAttr || contains(YaCySchema.outboundlinksnofollowcount_i)) add(doc, YaCySchema.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount());
if (allAttr || contains(YaCySchema.outboundlinksnofollowcount_i)) add(doc, YaCySchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount());
final List<String> outboundlinksTag = new ArrayList<String>(outboundLinks.size());
final List<String> outboundlinksURLProtocol = new ArrayList<String>(outboundLinks.size());
final List<String> outboundlinksURLStub = new ArrayList<String>(outboundLinks.size());
@ -772,26 +759,30 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.outboundlinks_alttag_txt)) add(doc, YaCySchema.outboundlinks_alttag_txt, outboundlinksAltTag);
// charset
if (allAttr || contains(YaCySchema.charset_s)) add(doc, YaCySchema.charset_s, yacydoc.getCharset());
if (allAttr || contains(YaCySchema.charset_s)) add(doc, YaCySchema.charset_s, document.getCharset());
// coordinates
if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
if (allAttr || contains(YaCySchema.coordinate_p)) add(doc, YaCySchema.coordinate_p, Double.toString(yacydoc.lat()) + "," + Double.toString(yacydoc.lon()));
if (document.lat() != 0.0f && document.lon() != 0.0f) {
if (allAttr || contains(YaCySchema.coordinate_p)) add(doc, YaCySchema.coordinate_p, Double.toString(document.lat()) + "," + Double.toString(document.lon()));
}
if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, header == null ? 200 : header.getStatusCode());
// fields that are additionally in URIMetadataRow
if (allAttr || contains(YaCySchema.load_date_dt)) add(doc, YaCySchema.load_date_dt, metadata.loaddate());
if (allAttr || contains(YaCySchema.fresh_date_dt)) add(doc, YaCySchema.fresh_date_dt, metadata.freshdate());
if (allAttr || contains(YaCySchema.host_id_s)) add(doc, YaCySchema.host_id_s, metadata.hosthash());
if ((allAttr || contains(YaCySchema.referrer_id_txt)) && metadata.referrerHash() != null) add(doc, YaCySchema.referrer_id_txt, new String[]{ASCII.String(metadata.referrerHash())});
if (allAttr || contains(YaCySchema.httpstatus_i)) add(doc, YaCySchema.httpstatus_i, responseHeader == null ? 200 : responseHeader.getStatusCode());
// fields that were additionally in URIMetadataRow
Date loadDate = new Date();
Date modDate = responseHeader.lastModified();
if (modDate.getTime() > loadDate.getTime()) modDate = loadDate;
int size = (int) Math.max(document.dc_source().length(), responseHeader.getContentLength());
if (allAttr || contains(YaCySchema.load_date_dt)) add(doc, YaCySchema.load_date_dt, loadDate);
if (allAttr || contains(YaCySchema.fresh_date_dt)) add(doc, YaCySchema.fresh_date_dt, new Date(loadDate.getTime() + Math.max(0, loadDate.getTime() - modDate.getTime()) / 2)); // freshdate, computed with Proxy-TTL formula
if (allAttr || contains(YaCySchema.host_id_s)) add(doc, YaCySchema.host_id_s, document.dc_source().hosthash());
if ((allAttr || contains(YaCySchema.referrer_id_txt)) && referrerURL != null) add(doc, YaCySchema.referrer_id_txt, new String[]{ASCII.String(referrerURL.hash())});
//if (allAttr || contains(SolrField.md5_s)) add(solrdoc, SolrField.md5_s, new byte[0]);
if (allAttr || contains(YaCySchema.publisher_t)) add(doc, YaCySchema.publisher_t, yacydoc.dc_publisher());
if ((allAttr || contains(YaCySchema.language_s)) && metadata.language() != null) add(doc, YaCySchema.language_s, UTF8.String(metadata.language()));
if (allAttr || contains(YaCySchema.size_i)) add(doc, YaCySchema.size_i, metadata.size());
if (allAttr || contains(YaCySchema.audiolinkscount_i)) add(doc, YaCySchema.audiolinkscount_i, yacydoc.getAudiolinks().size());
if (allAttr || contains(YaCySchema.videolinkscount_i)) add(doc, YaCySchema.videolinkscount_i, yacydoc.getVideolinks().size());
if (allAttr || contains(YaCySchema.applinkscount_i)) add(doc, YaCySchema.applinkscount_i, yacydoc.getApplinks().size());
if (allAttr || contains(YaCySchema.publisher_t)) add(doc, YaCySchema.publisher_t, document.dc_publisher());
if ((allAttr || contains(YaCySchema.language_s)) && language != null) add(doc, YaCySchema.language_s, language);
if (allAttr || contains(YaCySchema.size_i)) add(doc, YaCySchema.size_i, size);
if (allAttr || contains(YaCySchema.audiolinkscount_i)) add(doc, YaCySchema.audiolinkscount_i, document.getAudiolinks().size());
if (allAttr || contains(YaCySchema.videolinkscount_i)) add(doc, YaCySchema.videolinkscount_i, document.getVideolinks().size());
if (allAttr || contains(YaCySchema.applinkscount_i)) add(doc, YaCySchema.applinkscount_i, document.getApplinks().size());
return doc;
}
@ -827,6 +818,25 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
return a;
}
/**
 * Translates every entry of a list of anchor rel values into a small bit field:
 * bit 0 (value 1) is set when the entry is "me",
 * bit 1 (value 2) is set when the entry is "nofollow".
 * Each entry is lower-cased and trimmed before it is compared and must match one of
 * the two keywords as a whole; every other entry is encoded as 0.
 * @param rel the rel values to encode
 * @return a list holding one encoded integer per entry of rel, in the same order
 */
private static List<Integer> relEval(final List<String> rel) {
    final List<Integer> encoded = new ArrayList<Integer>(rel.size());
    for (final String entry : rel) {
        final String normalized = entry.toLowerCase().trim();
        final int bits;
        if ("me".equals(normalized)) {
            bits = 1;
        } else if ("nofollow".equals(normalized)) {
            bits = 2;
        } else {
            bits = 0;
        }
        encoded.add(bits);
    }
    return encoded;
}
public static Iterator<String> getLinks(SolrDocument doc, boolean inbound) {
Collection<Object> urlstub = doc.getFieldValues((inbound ? YaCySchema.inboundlinks_urlstub_txt : YaCySchema.outboundlinks_urlstub_txt).name());
Collection<String> urlprot = urlstub == null ? null : indexedList2protocolList(doc.getFieldValues((inbound ? YaCySchema.inboundlinks_protocol_sxt : YaCySchema.outboundlinks_protocol_sxt).name()), urlstub.size());
@ -846,30 +856,17 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
return list.iterator();
}
/**
* encode a string containing attributes from anchor rel properties binary:
* bit 0: "me" contained in rel
* bit 1: "nofollow" contained in rel
* @param rel
* @return binary encoded information about rel
*/
private static List<Integer> relEval(final List<String> rel) {
List<Integer> il = new ArrayList<Integer>(rel.size());
for (final String s: rel) {
int i = 0;
final String s0 = s.toLowerCase().trim();
if ("me".equals(s0)) i += 1;
if ("nofollow".equals(s0)) i += 2;
il.add(i);
}
return il;
/**
 * Reads a date field from a solr document.
 * The field is looked up under key.name(). If the document is null or holds no value
 * for that field, the epoch (new Date(0)) is returned. A stored date that lies after
 * the current time is replaced by the current time.
 * @param doc the solr document to read from, may be null
 * @param key the schema field whose value is read
 * @return the stored date capped at the current time, or the epoch if nothing is stored
 */
public static Date getDate(SolrDocument doc, final YaCySchema key) {
    if (doc == null) {
        return new Date(0);
    }
    final Date stored = (Date) doc.getFieldValue(key.name());
    if (stored == null) {
        return new Date(0);
    }
    final Date now = new Date();
    if (stored.after(now)) {
        return now;
    }
    return stored;
}
public String solrGetID(final SolrDocument solr) {
/**
 * Reads the id field of a solr document.
 * @param solr the solr document to read from
 * @return the value stored under the solr field name of YaCySchema.id, cast to String
 */
public static String solrGetID(final SolrDocument solr) {
    final String fieldName = YaCySchema.id.getSolrFieldName();
    final Object id = solr.getFieldValue(fieldName);
    return (String) id;
}
public DigestURI solrGetURL(final SolrDocument solr) {
public static DigestURI solrGetURL(final SolrDocument solr) {
try {
return new DigestURI((String) solr.getFieldValue(YaCySchema.sku.getSolrFieldName()));
} catch (final MalformedURLException e) {
@ -877,29 +874,29 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
}
}
public String solrGetTitle(final SolrDocument solr) {
/**
 * Reads the title field of a solr document.
 * @param solr the solr document to read from
 * @return the value stored under the solr field name of YaCySchema.title, cast to String
 */
public static String solrGetTitle(final SolrDocument solr) {
    final String fieldName = YaCySchema.title.getSolrFieldName();
    final Object title = solr.getFieldValue(fieldName);
    return (String) title;
}
public String solrGetText(final SolrDocument solr) {
/**
 * Reads the text_t field of a solr document.
 * @param solr the solr document to read from
 * @return the value stored under the solr field name of YaCySchema.text_t, cast to String
 */
public static String solrGetText(final SolrDocument solr) {
    final String fieldName = YaCySchema.text_t.getSolrFieldName();
    final Object text = solr.getFieldValue(fieldName);
    return (String) text;
}
public String solrGetAuthor(final SolrDocument solr) {
/**
 * Reads the author field of a solr document.
 * @param solr the solr document to read from
 * @return the value stored under the solr field name of YaCySchema.author, cast to String
 */
public static String solrGetAuthor(final SolrDocument solr) {
    final String fieldName = YaCySchema.author.getSolrFieldName();
    final Object author = solr.getFieldValue(fieldName);
    return (String) author;
}
public String solrGetDescription(final SolrDocument solr) {
/**
 * Reads the description field of a solr document.
 * @param solr the solr document to read from
 * @return the value stored under the solr field name of YaCySchema.description, cast to String
 */
public static String solrGetDescription(final SolrDocument solr) {
    final String fieldName = YaCySchema.description.getSolrFieldName();
    final Object description = solr.getFieldValue(fieldName);
    return (String) description;
}
public Date solrGetDate(final SolrDocument solr) {
/**
 * Reads the last-modified date of a solr document.
 * A stored date that lies after the current time is replaced by the current time.
 * If the document holds no value for the last_modified field, the epoch (new Date(0))
 * is returned, which is the same fallback getDate uses for a missing date field;
 * without this check the call to date.after() threw a NullPointerException.
 * @param solr the solr document to read from
 * @return the stored date capped at the current time, or the epoch if no date is stored
 */
public static Date solrGetDate(final SolrDocument solr) {
    final Date date = (Date) solr.getFieldValue(YaCySchema.last_modified.getSolrFieldName());
    if (date == null) {
        // field absent in this document: fall back to the epoch instead of failing on date.after()
        return new Date(0);
    }
    final Date now = new Date();
    return date.after(now) ? now : date;
}
public Collection<String> solrGetKeywords(final SolrDocument solr) {
public static Collection<String> solrGetKeywords(final SolrDocument solr) {
final Collection<Object> c = solr.getFieldValues(YaCySchema.keywords.getSolrFieldName());
final ArrayList<String> a = new ArrayList<String>();
for (final Object s: c) {

@ -44,7 +44,6 @@ import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.Cache;
import net.yacy.data.WorkTables;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowHandleSet;

@ -48,7 +48,6 @@ import net.yacy.document.SnippetExtractor;
import net.yacy.document.WordTokenizer;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.util.ByteArray;

Loading…
Cancel
Save