refactor URIMetadataNode to further unify interaction with index

- make URIMetadataNode extend SolrDocument
- use language as stored (String), reducing conversions between byte[] and String
- optimize debug code in transferIndex
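
The practical effect, as a minimal caller-side sketch (illustrative only; this snippet and its variable names are not part of the commit, it just restates what the diff below changes):

    URIMetadataNode node = new URIMetadataNode(solrDocument); // the Solr fields are copied into the node itself
    SolrDocument doc = node;                                   // URIMetadataNode now is a SolrDocument, no getDocument() indirection needed
    String lang = node.language();                             // language is returned as the stored String, with "en" as fallback
    if (Switchboard.urlBlacklist.isListed(BlacklistType.DHT, node.url())) {
        // blacklist checks now take the URL directly instead of the metadata node
    }
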
pull/1/head
reger 11 years ago
parent 79e7947442
commit 727dfb5875

@@ -108,7 +108,7 @@ public class yacydoc {
     prop.putXML("dc_date", ISO8601Formatter.FORMATTER.format(entry.moddate()));
     prop.putXML("dc_type", String.valueOf(entry.doctype()));
     prop.putXML("dc_identifier", entry.url().toNormalform(true));
-    prop.putXML("dc_language", ASCII.String(entry.language()));
+    prop.putXML("dc_language", entry.language());
     prop.putXML("collection", Arrays.toString(entry.collections()));
     prop.put("geo_lat", entry.lat());
     prop.put("geo_long", entry.lon());

@@ -136,7 +136,7 @@ public final class crawlReceipt {
         }
         // Check URL against DHT blacklist
-        if (Switchboard.urlBlacklist.isListed(BlacklistType.DHT, entry)) {
+        if (Switchboard.urlBlacklist.isListed(BlacklistType.DHT, entry.url())) {
             // URL is blacklisted
             log.warn("crawlReceipt: RECEIVED wrong RECEIPT (URL is blacklisted) for URL " + ASCII.String(entry.hash()) + ":" + entry.url().toNormalform(false) + " from peer " + iam);
             prop.put("delay", "9999");

@@ -123,7 +123,7 @@ public final class transferURL {
         }
         // check if the entry is blacklisted
-        if ((blockBlacklist) && (Switchboard.urlBlacklist.isListed(BlacklistType.DHT, lEntry))) {
+        if ((blockBlacklist) && (Switchboard.urlBlacklist.isListed(BlacklistType.DHT, lEntry.url()))) {
             if (Network.log.isFine()) Network.log.fine("transferURL: blocked blacklisted URL '" + lEntry.url().toNormalform(false) + "' from peer " + otherPeerName);
             lEntry = null;
             blocked++;

@@ -32,7 +32,6 @@ import java.util.EnumMap;
 import net.yacy.cora.date.ISO8601Formatter;
 import net.yacy.cora.document.encoding.ASCII;
-import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.protocol.ClientIdentification;
@@ -117,7 +116,7 @@ public class YMarkMetadata {
         metadata.put(METADATA.SNIPPET, String.valueOf(urlEntry.snippet()));
         metadata.put(METADATA.WORDCOUNT, String.valueOf(urlEntry.wordCount()));
         metadata.put(METADATA.MIMETYPE, String.valueOf(urlEntry.doctype()));
-        metadata.put(METADATA.LANGUAGE, UTF8.String(urlEntry.language()));
+        metadata.put(METADATA.LANGUAGE, urlEntry.language());
         metadata.put(METADATA.TITLE, urlEntry.dc_title());
         metadata.put(METADATA.CREATOR, urlEntry.dc_creator());
         metadata.put(METADATA.KEYWORDS, urlEntry.dc_subject());

@@ -38,7 +38,6 @@ import net.yacy.cora.date.MicroDate;
 import net.yacy.cora.document.analysis.Classification;
 import net.yacy.cora.document.analysis.Classification.ContentDomain;
 import net.yacy.cora.document.encoding.ASCII;
-import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.federate.solr.SolrType;
 import net.yacy.cora.lod.vocabulary.Tagging;
@@ -63,7 +62,7 @@ import org.apache.solr.common.SolrDocument;
  * The purpose of this object is the migration from the old metadata structure to solr document.
  * Future implementations should try to replace URIMetadata objects completely by SolrDocument objects
  */
-public class URIMetadataNode {
+public class URIMetadataNode extends SolrDocument {
     protected byte[] hash = null;
     protected String urlRaw = null, keywords = null;
@@ -72,7 +71,6 @@ public class URIMetadataNode {
     protected int imagec = -1, audioc = -1, videoc = -1, appc = -1;
     protected double lat = Double.NaN, lon = Double.NaN;
     protected long ranking = 0; // during generation of a search result this value is set
-    protected SolrDocument doc = null;
     protected String snippet = null;
     protected WordReferenceVars word = null; // this is only used if the url is transported via remote search requests
@@ -80,7 +78,7 @@ public class URIMetadataNode {
         // generates an plasmaLURLEntry using the properties from the argument
         // the property names must correspond to the one from toString
         //System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
-        this.doc = new SolrDocument();
+        super();
         urlRaw = crypt.simpleDecode(prop.getProperty("url", ""));
         try {
             url = new DigestURL(urlRaw);
@@ -98,10 +96,9 @@ public class URIMetadataNode {
         String lons = crypt.simpleDecode(prop.getProperty("lon", "0.0")); if (lons == null) lons = "0.0";
         String lats = crypt.simpleDecode(prop.getProperty("lat", "0.0")); if (lats == null) lats = "0.0";
-        this.doc.setField(CollectionSchema.title.name(), descr);
-        this.doc.setField(CollectionSchema.author.name(), dc_creator);
-        this.doc.setField(CollectionSchema.publisher_t.name(), dc_publisher);
+        this.setField(CollectionSchema.title.name(), descr);
+        this.setField(CollectionSchema.author.name(), dc_creator);
+        this.setField(CollectionSchema.publisher_t.name(), dc_publisher);
         this.lat = Float.parseFloat(lats);
         this.lon = Float.parseFloat(lons);
@@ -109,32 +106,32 @@ public class URIMetadataNode {
         final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute);
         try {
-            this.doc.setField(CollectionSchema.last_modified.name(), formatter.parse(prop.getProperty("mod", "20000101")));
+            this.setField(CollectionSchema.last_modified.name(), formatter.parse(prop.getProperty("mod", "20000101")));
         } catch (final ParseException e) {
-            this.doc.setField(CollectionSchema.last_modified.name(), new Date());
+            this.setField(CollectionSchema.last_modified.name(), new Date());
         }
         try {
-            this.doc.setField(CollectionSchema.load_date_dt.name(), formatter.parse(prop.getProperty("load", "20000101")));
+            this.setField(CollectionSchema.load_date_dt.name(), formatter.parse(prop.getProperty("load", "20000101")));
         } catch (final ParseException e) {
-            this.doc.setField(CollectionSchema.load_date_dt.name(), new Date());
+            this.setField(CollectionSchema.load_date_dt.name(), new Date());
         }
         try {
-            this.doc.setField(CollectionSchema.fresh_date_dt.name(), formatter.parse(prop.getProperty("fresh", "20000101")));
+            this.setField(CollectionSchema.fresh_date_dt.name(), formatter.parse(prop.getProperty("fresh", "20000101")));
         } catch (final ParseException e) {
-            this.doc.setField(CollectionSchema.fresh_date_dt.name(), new Date());
+            this.setField(CollectionSchema.fresh_date_dt.name(), new Date());
         }
-        this.doc.setField(CollectionSchema.referrer_id_s.name(), prop.getProperty("referrer", ""));
-        this.doc.setField(CollectionSchema.md5_s.name(), prop.getProperty("md5", ""));
-        this.doc.setField(CollectionSchema.size_i.name(), Integer.parseInt(prop.getProperty("size", "0")));
-        this.doc.setField(CollectionSchema.wordcount_i.name(), Integer.parseInt(prop.getProperty("wc", "0")));
+        this.setField(CollectionSchema.referrer_id_s.name(), prop.getProperty("referrer", ""));
+        this.setField(CollectionSchema.md5_s.name(), prop.getProperty("md5", ""));
+        this.setField(CollectionSchema.size_i.name(), Integer.parseInt(prop.getProperty("size", "0")));
+        this.setField(CollectionSchema.wordcount_i.name(), Integer.parseInt(prop.getProperty("wc", "0")));
         final String dt = prop.getProperty("dt", "t");
         String[] mime = Response.doctype2mime(null,dt.charAt(0));
-        this.doc.setField(CollectionSchema.content_type.name(), mime);
+        this.setField(CollectionSchema.content_type.name(), mime);
         final String flagsp = prop.getProperty("flags", "AAAAAA");
         this.flags = (flagsp.length() > 6) ? QueryParams.empty_constraint : (new Bitfield(4, flagsp));
-        this.doc.setField(CollectionSchema.language_s.name(), prop.getProperty("lang", ""));
-        this.doc.setField(CollectionSchema.inboundlinkscount_i.name(), Integer.parseInt(prop.getProperty("llocal", "0")));
-        this.doc.setField(CollectionSchema.outboundlinkscount_i.name(), Integer.parseInt(prop.getProperty("lother", "0")));
+        this.setField(CollectionSchema.language_s.name(), prop.getProperty("lang", ""));
+        this.setField(CollectionSchema.inboundlinkscount_i.name(), Integer.parseInt(prop.getProperty("llocal", "0")));
+        this.setField(CollectionSchema.outboundlinkscount_i.name(), Integer.parseInt(prop.getProperty("lother", "0")));
         this.imagec = Integer.parseInt(prop.getProperty("limage", "0"));
         this.audioc = Integer.parseInt(prop.getProperty("laudio", "0"));
         this.videoc = Integer.parseInt(prop.getProperty("lvideo", "0"));
@@ -147,9 +144,11 @@ public class URIMetadataNode {
     }
     public URIMetadataNode(final SolrDocument doc) {
-        this.doc = doc;
+        super();
+        for (String name : doc.getFieldNames()) {
+            this.addField(name, doc.getFieldValue(name));
+        }
         this.snippet = "";
-        this.word = null;
         Float score = (Float) doc.getFieldValue("score"); // this is a special field containing the ranking score of a search result
         this.ranking = score == null ? 0 : (long) (1000000.0f * score.floatValue()); // solr score values are sometimes very low
         this.hash = ASCII.getBytes(getString(CollectionSchema.id));
@@ -174,24 +173,19 @@
      * @return the content domain which classifies the content type
      */
     public ContentDomain getContentDomain() {
-        if (this.doc == null) return this.url.getContentDomainFromExt();
         String mime = mime();
         if (mime == null) return this.url.getContentDomainFromExt();
         ContentDomain contentDomain = Classification.getContentDomainFromMime(mime);
         if (contentDomain != ContentDomain.ALL) return contentDomain;
         return this.url.getContentDomainFromExt();
     }
-    public SolrDocument getDocument() {
-        return this.doc;
-    }
     public byte[] hash() {
         return this.hash;
     }
     public String hosthash() {
-        String hosthash = (String) this.doc.getFieldValue(CollectionSchema.host_id_s.getSolrFieldName());
+        String hosthash = (String) this.getFieldValue(CollectionSchema.host_id_s.getSolrFieldName());
         if (hosthash == null) hosthash = ASCII.String(this.hash, 6, 6);
         return hosthash;
     }
@@ -233,7 +227,7 @@ public class URIMetadataNode {
         if (Double.isNaN(this.lat)) {
             this.lon = 0.0d;
             this.lat = 0.0d;
-            String latlon = (String) this.doc.getFieldValue(CollectionSchema.coordinate_p.getSolrFieldName());
+            String latlon = (String) this.getFieldValue(CollectionSchema.coordinate_p.getSolrFieldName());
             if (latlon != null) {
                 int p = latlon.indexOf(',');
                 if (p > 0) {
@@ -277,10 +271,10 @@ public class URIMetadataNode {
         return mime == null || mime.size() == 0 ? null : mime.get(0);
     }
-    public byte[] language() {
+    public String language() {
         String language = getString(CollectionSchema.language_s);
-        if (language == null || language.length() == 0) return ASCII.getBytes("en");
-        return UTF8.getBytes(language);
+        if (language == null || language.length() == 0) return "en";
+        return language;
     }
     public byte[] referrerHash() {
@@ -401,7 +395,7 @@ public class URIMetadataNode {
         }
         return list.iterator();
     }
     public static Date getDate(SolrDocument doc, final CollectionSchema key) {
         Date x = doc == null ? null : (Date) doc.getFieldValue(key.getSolrFieldName());
         Date now = new Date();
@@ -430,7 +424,7 @@ public class URIMetadataNode {
         }
     }
-    protected static StringBuilder corePropList(URIMetadataNode md) {
+    protected StringBuilder corePropList() {
         // generate a parseable string; this is a simple property-list
         final StringBuilder s = new StringBuilder(300);
@@ -438,33 +432,33 @@ public class URIMetadataNode {
         final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute);
         try {
-            s.append("hash=").append(ASCII.String(md.hash()));
-            s.append(",url=").append(crypt.simpleEncode(md.url().toNormalform(true)));
-            s.append(",descr=").append(crypt.simpleEncode(md.dc_title()));
-            s.append(",author=").append(crypt.simpleEncode(md.dc_creator()));
-            s.append(",tags=").append(crypt.simpleEncode(Tagging.cleanTagFromAutotagging(md.dc_subject())));
-            s.append(",publisher=").append(crypt.simpleEncode(md.dc_publisher()));
-            s.append(",lat=").append(md.lat());
-            s.append(",lon=").append(md.lon());
-            s.append(",mod=").append(formatter.format(md.moddate()));
-            s.append(",load=").append(formatter.format(md.loaddate()));
-            s.append(",fresh=").append(formatter.format(md.freshdate()));
-            s.append(",referrer=").append(md.referrerHash() == null ? "" : ASCII.String(md.referrerHash()));
-            s.append(",md5=").append(md.md5());
-            s.append(",size=").append(md.size());
-            s.append(",wc=").append(md.wordCount());
-            s.append(",dt=").append(md.doctype());
-            s.append(",flags=").append(md.flags().exportB64());
-            s.append(",lang=").append(md.language() == null ? "EN" : UTF8.String(md.language()));
-            s.append(",llocal=").append(md.llocal());
-            s.append(",lother=").append(md.lother());
-            s.append(",limage=").append(md.limage());
-            s.append(",laudio=").append(md.laudio());
-            s.append(",lvideo=").append(md.lvideo());
-            s.append(",lapp=").append(md.lapp());
-            if (md.word() != null) {
+            s.append("hash=").append(ASCII.String(this.hash()));
+            s.append(",url=").append(crypt.simpleEncode(this.url().toNormalform(true)));
+            s.append(",descr=").append(crypt.simpleEncode(this.dc_title()));
+            s.append(",author=").append(crypt.simpleEncode(this.dc_creator()));
+            s.append(",tags=").append(crypt.simpleEncode(Tagging.cleanTagFromAutotagging(this.dc_subject())));
+            s.append(",publisher=").append(crypt.simpleEncode(this.dc_publisher()));
+            s.append(",lat=").append(this.lat());
+            s.append(",lon=").append(this.lon());
+            s.append(",mod=").append(formatter.format(this.moddate()));
+            s.append(",load=").append(formatter.format(this.loaddate()));
+            s.append(",fresh=").append(formatter.format(this.freshdate()));
+            s.append(",referrer=").append(this.referrerHash() == null ? "" : ASCII.String(this.referrerHash()));
+            s.append(",md5=").append(this.md5());
+            s.append(",size=").append(this.size());
+            s.append(",wc=").append(this.wordCount());
+            s.append(",dt=").append(this.doctype());
+            s.append(",flags=").append(this.flags().exportB64());
+            s.append(",lang=").append(this.language());
+            s.append(",llocal=").append(this.llocal());
+            s.append(",lother=").append(this.lother());
+            s.append(",limage=").append(this.limage());
+            s.append(",laudio=").append(this.laudio());
+            s.append(",lvideo=").append(this.lvideo());
+            s.append(",lapp=").append(this.lapp());
+            if (this.word() != null) {
                 // append also word properties
-                final String wprop = md.word().toPropertyForm();
+                final String wprop = this.word().toPropertyForm();
                 s.append(",wi=").append(Base64Order.enhancedCoder.encodeString(wprop));
             }
             return s;
@@ -480,7 +474,7 @@ public class URIMetadataNode {
      */
     public String toString(String snippet) {
         // add information needed for remote transport
-        final StringBuilder core = corePropList(this);
+        final StringBuilder core = corePropList();
         if (core == null)
             return null;
@@ -501,7 +495,7 @@ public class URIMetadataNode {
      */
     @Override
     public String toString() {
-        final StringBuilder core = corePropList(this);
+        final StringBuilder core = corePropList();
         if (core == null) return null;
         core.insert(0, '{');
         core.append('}');
@@ -511,7 +505,7 @@ public class URIMetadataNode {
     private int getInt(CollectionSchema field) {
         assert !field.isMultiValued();
         assert field.getType() == SolrType.num_integer;
-        Object x = this.doc.getFieldValue(field.getSolrFieldName());
+        Object x = this.getFieldValue(field.getSolrFieldName());
         if (x == null) return 0;
         if (x instanceof Integer) return ((Integer) x).intValue();
         if (x instanceof Long) return ((Long) x).intValue();
@@ -521,7 +515,7 @@ public class URIMetadataNode {
     private Date getDate(CollectionSchema field) {
         assert !field.isMultiValued();
         assert field.getType() == SolrType.date;
-        Date x = (Date) this.doc.getFieldValue(field.getSolrFieldName());
+        Date x = (Date) this.getFieldValue(field.getSolrFieldName());
         if (x == null) return new Date(0);
         Date now = new Date();
         return x.after(now) ? now : x;
@@ -530,7 +524,7 @@ public class URIMetadataNode {
     private String getString(CollectionSchema field) {
         assert !field.isMultiValued();
         assert field.getType() == SolrType.string || field.getType() == SolrType.text_general || field.getType() == SolrType.text_en_splitting_tight;
-        Object x = this.doc.getFieldValue(field.getSolrFieldName());
+        Object x = this.getFieldValue(field.getSolrFieldName());
         if (x == null) return "";
         if (x instanceof ArrayList) {
             @SuppressWarnings("unchecked")
@@ -544,7 +538,7 @@ public class URIMetadataNode {
     private ArrayList<String> getStringList(CollectionSchema field) {
         assert field.isMultiValued();
         assert field.getType() == SolrType.string || field.getType() == SolrType.text_general;
-        Object r = this.doc.getFieldValue(field.getSolrFieldName());
+        Object r = this.getFieldValue(field.getSolrFieldName());
         if (r == null) return new ArrayList<String>(0);
         if (r instanceof ArrayList) {
             return (ArrayList<String>) r;
@@ -558,7 +552,7 @@ public class URIMetadataNode {
     private ArrayList<Integer> getIntList(CollectionSchema field) {
         assert field.isMultiValued();
         assert field.getType() == SolrType.num_integer;
-        Object r = this.doc.getFieldValue(field.getSolrFieldName());
+        Object r = this.getFieldValue(field.getSolrFieldName());
         if (r == null) return new ArrayList<Integer>(0);
         if (r instanceof ArrayList) {
             return (ArrayList<Integer>) r;

@@ -58,7 +58,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
     private final Bitfield flags;
     private long lastModified;
-    private final byte[] language;
+    private final String language;
     public final byte[] urlHash;
     private String hostHash = null;
     private final char type;
@@ -108,7 +108,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
                     final int posinphrase, // position of word in its phrase
                     final int posofphrase, // number of the phrase where word appears
                     final long lastmodified, // last-modified time of the document where word appears
-                    byte[] language, // (guessed) language of document
+                    String language, // (guessed) language of document
                     final char doctype, // type of document
                     final int outlinksSame, // outlinks to same domain
                     final int outlinksOther, // outlinks to other domain
@@ -143,7 +143,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
         this.flags = e.flags();
         //this.freshUntil = e.freshUntil();
         this.lastModified = e.lastModified();
-        this.language = e.getLanguage();
+        this.language = ASCII.String(e.getLanguage());
         this.urlHash = e.urlhash();
         this.type = e.getType();
         this.hitcount = e.hitcount();
@@ -229,7 +229,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
     @Override
     public byte[] getLanguage() {
-        return this.language;
+        return ASCII.getBytes(this.language);
     }
     @Override
@@ -291,7 +291,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
                 this.posofphrase, // number of the phrase where word appears
                 this.lastModified, // last-modified time of the document where word appears
                 System.currentTimeMillis(), // update time;
-                this.language, // (guessed) language of document
+                ASCII.getBytes(this.language), // (guessed) language of document
                 this.type, // type of document
                 this.llocal, // outlinks to same domain
                 this.lother, // outlinks to other domain

@@ -1362,22 +1362,22 @@ public final class Protocol {
         final int timeout) {
         // check if we got all necessary urls in the urlCache (only for debugging)
-        Iterator<WordReference> eenum;
-        Reference entry;
-        for ( final ReferenceContainer<WordReference> ic : indexes ) {
-            eenum = ic.entries();
-            while ( eenum.hasNext() ) {
-                entry = eenum.next();
-                if ( !urlRefs.has(entry.urlhash()) ) {
-                    if ( Network.log.isFine() ) {
-                        Network.log.fine("DEBUG transferIndex: to-send url hash '"
-                            + ASCII.String(entry.urlhash())
-                            + "' is not contained in urlCache");
-                    }
-                }
-            }
-        }
+        if (Network.log.isFine()) {
+            Iterator<WordReference> eenum;
+            Reference entry;
+            for ( final ReferenceContainer<WordReference> ic : indexes ) {
+                eenum = ic.entries();
+                while ( eenum.hasNext() ) {
+                    entry = eenum.next();
+                    if ( !urlRefs.has(entry.urlhash()) ) {
+                        Network.log.fine("DEBUG transferIndex: to-send url hash '"
+                            + ASCII.String(entry.urlhash())
+                            + "' is not contained in urlCache");
+                    }
+                }
+            }
+        }
         // transfer the RWI without the URLs
         Map<String, String> in = transferRWI(targetSeed, indexes, gzipBody, timeout);

@@ -53,7 +53,6 @@ import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.data.ListManager;
-import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.index.RowHandleSet;
 import net.yacy.kelondro.util.FileUtils;
@@ -457,10 +456,6 @@ public class Blacklist {
         return ret;
     }
-    public final boolean isListed(final BlacklistType blacklistType, final URIMetadataNode entry) {
-        return isListed(blacklistType, entry.url());
-    }
     /**
      * Checks whether the given entry is listed in given blacklist type.
      * @param blacklistType The used blacklist

@@ -47,7 +47,6 @@ import net.yacy.contentcontrol.ContentControlFilterUpdateThread;
 import net.yacy.cora.document.analysis.Classification;
 import net.yacy.cora.document.analysis.Classification.ContentDomain;
 import net.yacy.cora.document.encoding.ASCII;
-import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.federate.yacy.CacheStrategy;
@@ -279,7 +278,7 @@ public final class SearchEvent {
         this.snippetFetchAlive = new AtomicInteger(0);
         this.addRunning = true;
         this.receivedRemoteReferences = new AtomicInteger(0);
-        this.order = new ReferenceOrder(this.query.ranking, UTF8.getBytes(this.query.targetlang));
+        this.order = new ReferenceOrder(this.query.ranking, this.query.targetlang);
         this.urlhashes = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 100);
         this.taggingPredicates = new HashMap<String, String>();
         for (Tagging t: LibraryProvider.autotagging.getVocabularies()) {
@@ -897,7 +896,7 @@ public final class SearchEvent {
                 }
                 if (this.query.modifier.language != null) {
-                    if (!this.query.modifier.language.equals(UTF8.String(iEntry.language()))) {
+                    if (!this.query.modifier.language.equals(iEntry.language())) {
                         if (log.isFine()) log.fine("dropped Node: language");
                         continue pollloop;
                     }
@@ -1083,7 +1082,7 @@ public final class SearchEvent {
             // check modifier constraint (language)
             // TODO: : page.language() never null but defaults to "en" (may cause false drop of result)
-            if (this.query.modifier.language != null && !this.query.modifier.language.equals(ASCII.String(page.language()))) {
+            if (this.query.modifier.language != null && !this.query.modifier.language.equals(page.language())) {
                 if (log.isFine()) log.fine("dropped RWI: language constraint = " + this.query.modifier.language);
                 if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
                 continue;
@@ -1165,7 +1164,7 @@ public final class SearchEvent {
             // TODO: vocabulary is only valid and available in local Solr index (considere to auto-switch to Searchdom.LOCAL)
             if (this.query.metatags != null && !this.query.metatags.isEmpty()) {
                 tagloop: for (Tagging.Metatag tag : this.query.metatags) {
-                    SolrDocument sdoc = page.getDocument();
+                    SolrDocument sdoc = page;
                     if (sdoc != null) {
                         Collection<Object> tagvalues = sdoc.getFieldValues(CollectionSchema.VOCABULARY_PREFIX + tag.getVocabularyName() + CollectionSchema.VOCABULARY_SUFFIX);
                         if (tagvalues != null && tagvalues.contains(tag.getObject())) {
@@ -1462,7 +1461,7 @@ public final class SearchEvent {
             ResultEntry ms = oneResult(item, timeout);
             // check if the match was made in the url or in the image links
             if (ms != null) {
-                SolrDocument doc = ms.getNode().getDocument();
+                SolrDocument doc = ms.getNode();
                 Collection<Object> alt = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName());
                 Collection<Object> img = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName());
                 Collection<Object> prt = doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName());

@@ -33,6 +33,7 @@ import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.Semaphore;
+import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.sorting.ConcurrentScoreMap;
 import net.yacy.cora.util.ByteBuffer;
@@ -55,9 +56,9 @@ public class ReferenceOrder {
     private WordReferenceVars min, max;
     private final ConcurrentScoreMap<String> doms; // collected for "authority" heuristic
     private final RankingProfile ranking;
-    private final byte[] language;
+    private final String language;
-    public ReferenceOrder(final RankingProfile profile, final byte[] language) {
+    public ReferenceOrder(final RankingProfile profile, final String language) {
         this.min = null;
         this.max = null;
         this.ranking = profile;
@@ -256,7 +257,7 @@ public class ReferenceOrder {
             + ((flags.get(Condenser.flag_cat_hasaudio)) ? 255 << this.ranking.coeff_cathasaudio : 0)
             + ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << this.ranking.coeff_cathasvideo : 0)
             + ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << this.ranking.coeff_cathasapp : 0)
-            + ((ByteBuffer.equals(t.getLanguage(), this.language)) ? 255 << this.ranking.coeff_language : 0);
+            + ((ByteBuffer.equals(t.getLanguage(), ASCII.getBytes(this.language))) ? 255 << this.ranking.coeff_language : 0);
         //if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0;
@@ -289,7 +290,7 @@ public class ReferenceOrder {
             + ((flags.get(Condenser.flag_cat_hasaudio)) ? 255 << this.ranking.coeff_cathasaudio : 0)
             + ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << this.ranking.coeff_cathasvideo : 0)
             + ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << this.ranking.coeff_cathasapp : 0)
-            + ((ByteBuffer.equals(t.language(), this.language)) ? 255 << this.ranking.coeff_language : 0);
+            + ((this.language.equals(t.language())) ? 255 << this.ranking.coeff_language : 0);
         return r; // the higher the number the better the ranking.
     }

@@ -48,7 +48,6 @@ import java.util.regex.Pattern;
 import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
 import net.yacy.cora.document.encoding.ASCII;
-import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
@@ -306,7 +305,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         if ((allAttr || contains(CollectionSchema.referrer_id_s)) && md.referrerHash() != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(md.referrerHash()));
         if (allAttr || contains(CollectionSchema.md5_s)) add(doc, CollectionSchema.md5_s, md.md5());
         if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, md.dc_publisher());
-        if ((allAttr || contains(CollectionSchema.language_s)) && md.language() != null) add(doc, CollectionSchema.language_s, UTF8.String(md.language()));
+        if (allAttr || contains(CollectionSchema.language_s)) add(doc, CollectionSchema.language_s, md.language());
         if (allAttr || contains(CollectionSchema.size_i)) add(doc, CollectionSchema.size_i, md.size());
         if (allAttr || contains(CollectionSchema.audiolinkscount_i)) add(doc, CollectionSchema.audiolinkscount_i, md.laudio());
         if (allAttr || contains(CollectionSchema.videolinkscount_i)) add(doc, CollectionSchema.videolinkscount_i, md.lvideo());

@@ -70,7 +70,7 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
                       final List<MediaSnippet> mediaSnippets,
                       final long snippetComputationTime) {
         this.urlentry = urlentry;
-        this.urlentry.getDocument().setField(CollectionSchema.text_t.getSolrFieldName(), ""); // clear the text field which eats up most of the space; it was used for snippet computation which is in a separate field here
+        this.urlentry.setField(CollectionSchema.text_t.getSolrFieldName(), ""); // clear the text field which eats up most of the space; it was used for snippet computation which is in a separate field here
         this.indexSegment = indexSegment;
         this.alternative_urlstring = null;
         this.alternative_urlname = null;
