diff --git a/htroot/api/yacydoc.java b/htroot/api/yacydoc.java
index ad9d43b1e..33b14ad67 100644
--- a/htroot/api/yacydoc.java
+++ b/htroot/api/yacydoc.java
@@ -108,7 +108,7 @@ public class yacydoc {
         prop.putXML("dc_date", ISO8601Formatter.FORMATTER.format(entry.moddate()));
         prop.putXML("dc_type", String.valueOf(entry.doctype()));
         prop.putXML("dc_identifier", entry.url().toNormalform(true));
-        prop.putXML("dc_language", ASCII.String(entry.language()));
+        prop.putXML("dc_language", entry.language());
         prop.putXML("collection", Arrays.toString(entry.collections()));
         prop.put("geo_lat", entry.lat());
         prop.put("geo_long", entry.lon());
diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java
index 1b175b121..e73ed3ae9 100644
--- a/htroot/yacy/crawlReceipt.java
+++ b/htroot/yacy/crawlReceipt.java
@@ -136,7 +136,7 @@ public final class crawlReceipt {
         }

         // Check URL against DHT blacklist
-        if (Switchboard.urlBlacklist.isListed(BlacklistType.DHT, entry)) {
+        if (Switchboard.urlBlacklist.isListed(BlacklistType.DHT, entry.url())) {
             // URL is blacklisted
             log.warn("crawlReceipt: RECEIVED wrong RECEIPT (URL is blacklisted) for URL " + ASCII.String(entry.hash()) + ":" + entry.url().toNormalform(false) + " from peer " + iam);
             prop.put("delay", "9999");
diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java
index f2406971f..54650c2e3 100644
--- a/htroot/yacy/transferURL.java
+++ b/htroot/yacy/transferURL.java
@@ -123,7 +123,7 @@ public final class transferURL {
                 }

                 // check if the entry is blacklisted
-                if ((blockBlacklist) && (Switchboard.urlBlacklist.isListed(BlacklistType.DHT, lEntry))) {
+                if ((blockBlacklist) && (Switchboard.urlBlacklist.isListed(BlacklistType.DHT, lEntry.url()))) {
                     if (Network.log.isFine()) Network.log.fine("transferURL: blocked blacklisted URL '" + lEntry.url().toNormalform(false) + "' from peer " + otherPeerName);
                     lEntry = null;
                     blocked++;
diff --git a/source/net/yacy/data/ymark/YMarkMetadata.java b/source/net/yacy/data/ymark/YMarkMetadata.java
index 9e52fb571..ab7b47e7d 100644
--- a/source/net/yacy/data/ymark/YMarkMetadata.java
+++ b/source/net/yacy/data/ymark/YMarkMetadata.java
@@ -32,7 +32,6 @@ import java.util.EnumMap;

 import net.yacy.cora.date.ISO8601Formatter;
 import net.yacy.cora.document.encoding.ASCII;
-import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.protocol.ClientIdentification;
@@ -117,7 +116,7 @@ public class YMarkMetadata {
         metadata.put(METADATA.SNIPPET, String.valueOf(urlEntry.snippet()));
         metadata.put(METADATA.WORDCOUNT, String.valueOf(urlEntry.wordCount()));
         metadata.put(METADATA.MIMETYPE, String.valueOf(urlEntry.doctype()));
-        metadata.put(METADATA.LANGUAGE, UTF8.String(urlEntry.language()));
+        metadata.put(METADATA.LANGUAGE, urlEntry.language());
         metadata.put(METADATA.TITLE, urlEntry.dc_title());
         metadata.put(METADATA.CREATOR, urlEntry.dc_creator());
         metadata.put(METADATA.KEYWORDS, urlEntry.dc_subject());
diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
index c369e72c8..e0dd984da 100644
--- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
+++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
@@ -38,7 +38,6 @@ import net.yacy.cora.date.MicroDate;
 import net.yacy.cora.document.analysis.Classification;
 import net.yacy.cora.document.analysis.Classification.ContentDomain;
 import net.yacy.cora.document.encoding.ASCII;
-import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.federate.solr.SolrType;
 import net.yacy.cora.lod.vocabulary.Tagging;
@@ -63,7 +62,7 @@ import org.apache.solr.common.SolrDocument;
  * The purpose of this object is the migration from the old metadata structure to solr document.
  * Future implementations should try to replace URIMetadata objects completely by SolrDocument objects
  */
-public class URIMetadataNode {
+public class URIMetadataNode extends SolrDocument {

     protected byte[] hash = null;
     protected String urlRaw = null, keywords = null;
@@ -72,7 +71,6 @@ public class URIMetadataNode {
     protected int imagec = -1, audioc = -1, videoc = -1, appc = -1;
     protected double lat = Double.NaN, lon = Double.NaN;
     protected long ranking = 0; // during generation of a search result this value is set
-    protected SolrDocument doc = null;
     protected String snippet = null;
     protected WordReferenceVars word = null; // this is only used if the url is transported via remote search requests
@@ -80,7 +78,7 @@ public class URIMetadataNode {
         // generates an plasmaLURLEntry using the properties from the argument
         // the property names must correspond to the one from toString
         //System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
-        this.doc = new SolrDocument();
+        super();
         urlRaw = crypt.simpleDecode(prop.getProperty("url", ""));
         try {
             url = new DigestURL(urlRaw);
@@ -98,10 +96,9 @@ public class URIMetadataNode {
         String lons = crypt.simpleDecode(prop.getProperty("lon", "0.0")); if (lons == null) lons = "0.0";
         String lats = crypt.simpleDecode(prop.getProperty("lat", "0.0")); if (lats == null) lats = "0.0";
-
-        this.doc.setField(CollectionSchema.title.name(), descr);
-        this.doc.setField(CollectionSchema.author.name(), dc_creator);
-        this.doc.setField(CollectionSchema.publisher_t.name(), dc_publisher);
+        this.setField(CollectionSchema.title.name(), descr);
+        this.setField(CollectionSchema.author.name(), dc_creator);
+        this.setField(CollectionSchema.publisher_t.name(), dc_publisher);
         this.lat = Float.parseFloat(lats);
         this.lon = Float.parseFloat(lons);
@@ -109,32 +106,32 @@ public class URIMetadataNode {

         final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute);
         try {
-            this.doc.setField(CollectionSchema.last_modified.name(), formatter.parse(prop.getProperty("mod", "20000101")));
+            this.setField(CollectionSchema.last_modified.name(), formatter.parse(prop.getProperty("mod", "20000101")));
         } catch (final ParseException e) {
-            this.doc.setField(CollectionSchema.last_modified.name(), new Date());
+            this.setField(CollectionSchema.last_modified.name(), new Date());
         }
         try {
-            this.doc.setField(CollectionSchema.load_date_dt.name(), formatter.parse(prop.getProperty("load", "20000101")));
+            this.setField(CollectionSchema.load_date_dt.name(), formatter.parse(prop.getProperty("load", "20000101")));
         } catch (final ParseException e) {
-            this.doc.setField(CollectionSchema.load_date_dt.name(), new Date());
+            this.setField(CollectionSchema.load_date_dt.name(), new Date());
         }
         try {
-            this.doc.setField(CollectionSchema.fresh_date_dt.name(), formatter.parse(prop.getProperty("fresh", "20000101")));
+            this.setField(CollectionSchema.fresh_date_dt.name(), formatter.parse(prop.getProperty("fresh", "20000101")));
         } catch (final ParseException e) {
-            this.doc.setField(CollectionSchema.fresh_date_dt.name(), new Date());
+            this.setField(CollectionSchema.fresh_date_dt.name(), new Date());
         }
-        this.doc.setField(CollectionSchema.referrer_id_s.name(), prop.getProperty("referrer", ""));
-        this.doc.setField(CollectionSchema.md5_s.name(), prop.getProperty("md5", ""));
-        this.doc.setField(CollectionSchema.size_i.name(), Integer.parseInt(prop.getProperty("size", "0")));
-        this.doc.setField(CollectionSchema.wordcount_i.name(), Integer.parseInt(prop.getProperty("wc", "0")));
+        this.setField(CollectionSchema.referrer_id_s.name(), prop.getProperty("referrer", ""));
+        this.setField(CollectionSchema.md5_s.name(), prop.getProperty("md5", ""));
+        this.setField(CollectionSchema.size_i.name(), Integer.parseInt(prop.getProperty("size", "0")));
+        this.setField(CollectionSchema.wordcount_i.name(), Integer.parseInt(prop.getProperty("wc", "0")));
         final String dt = prop.getProperty("dt", "t");
         String[] mime = Response.doctype2mime(null,dt.charAt(0));
-        this.doc.setField(CollectionSchema.content_type.name(), mime);
+        this.setField(CollectionSchema.content_type.name(), mime);
         final String flagsp = prop.getProperty("flags", "AAAAAA");
         this.flags = (flagsp.length() > 6) ? QueryParams.empty_constraint : (new Bitfield(4, flagsp));
-        this.doc.setField(CollectionSchema.language_s.name(), prop.getProperty("lang", ""));
-        this.doc.setField(CollectionSchema.inboundlinkscount_i.name(), Integer.parseInt(prop.getProperty("llocal", "0")));
-        this.doc.setField(CollectionSchema.outboundlinkscount_i.name(), Integer.parseInt(prop.getProperty("lother", "0")));
+        this.setField(CollectionSchema.language_s.name(), prop.getProperty("lang", ""));
+        this.setField(CollectionSchema.inboundlinkscount_i.name(), Integer.parseInt(prop.getProperty("llocal", "0")));
+        this.setField(CollectionSchema.outboundlinkscount_i.name(), Integer.parseInt(prop.getProperty("lother", "0")));
         this.imagec = Integer.parseInt(prop.getProperty("limage", "0"));
         this.audioc = Integer.parseInt(prop.getProperty("laudio", "0"));
         this.videoc = Integer.parseInt(prop.getProperty("lvideo", "0"));
@@ -147,9 +144,11 @@ public class URIMetadataNode {
     }

     public URIMetadataNode(final SolrDocument doc) {
-        this.doc = doc;
+        super();
+        for (String name : doc.getFieldNames()) {
+            this.addField(name, doc.getFieldValue(name));
+        }
         this.snippet = "";
-        this.word = null;
         Float score = (Float) doc.getFieldValue("score"); // this is a special field containing the ranking score of a search result
         this.ranking = score == null ? 0 : (long) (1000000.0f * score.floatValue()); // solr score values are sometimes very low
         this.hash = ASCII.getBytes(getString(CollectionSchema.id));
@@ -174,24 +173,19 @@ public class URIMetadataNode {
      * @return the content domain which classifies the content type
      */
     public ContentDomain getContentDomain() {
-        if (this.doc == null) return this.url.getContentDomainFromExt();
         String mime = mime();
         if (mime == null) return this.url.getContentDomainFromExt();
         ContentDomain contentDomain = Classification.getContentDomainFromMime(mime);
         if (contentDomain != ContentDomain.ALL) return contentDomain;
         return this.url.getContentDomainFromExt();
     }
-
-    public SolrDocument getDocument() {
-        return this.doc;
-    }

     public byte[] hash() {
         return this.hash;
     }

     public String hosthash() {
-        String hosthash = (String) this.doc.getFieldValue(CollectionSchema.host_id_s.getSolrFieldName());
+        String hosthash = (String) this.getFieldValue(CollectionSchema.host_id_s.getSolrFieldName());
         if (hosthash == null) hosthash = ASCII.String(this.hash, 6, 6);
         return hosthash;
     }
@@ -233,7 +227,7 @@ public class URIMetadataNode {
         if (Double.isNaN(this.lat)) {
             this.lon = 0.0d;
             this.lat = 0.0d;
-            String latlon = (String) this.doc.getFieldValue(CollectionSchema.coordinate_p.getSolrFieldName());
+            String latlon = (String) this.getFieldValue(CollectionSchema.coordinate_p.getSolrFieldName());
             if (latlon != null) {
                 int p = latlon.indexOf(',');
                 if (p > 0) {
@@ -277,10 +271,10 @@ public class URIMetadataNode {
         return mime == null || mime.size() == 0 ? null : mime.get(0);
     }

-    public byte[] language() {
+    public String language() {
         String language = getString(CollectionSchema.language_s);
-        if (language == null || language.length() == 0) return ASCII.getBytes("en");
-        return UTF8.getBytes(language);
+        if (language == null || language.length() == 0) return "en";
+        return language;
     }

     public byte[] referrerHash() {
@@ -401,7 +395,7 @@ public class URIMetadataNode {
         }
         return list.iterator();
     }
-
+
     public static Date getDate(SolrDocument doc, final CollectionSchema key) {
         Date x = doc == null ? null : (Date) doc.getFieldValue(key.getSolrFieldName());
         Date now = new Date();
@@ -430,7 +424,7 @@ public class URIMetadataNode {
         }
     }

-    protected static StringBuilder corePropList(URIMetadataNode md) {
+    protected StringBuilder corePropList() {
         // generate a parseable string; this is a simple property-list
         final StringBuilder s = new StringBuilder(300);

@@ -438,33 +432,33 @@ public class URIMetadataNode {
         final GenericFormatter formatter = new GenericFormatter(GenericFormatter.FORMAT_SHORT_DAY, GenericFormatter.time_minute);

         try {
-            s.append("hash=").append(ASCII.String(md.hash()));
-            s.append(",url=").append(crypt.simpleEncode(md.url().toNormalform(true)));
-            s.append(",descr=").append(crypt.simpleEncode(md.dc_title()));
-            s.append(",author=").append(crypt.simpleEncode(md.dc_creator()));
-            s.append(",tags=").append(crypt.simpleEncode(Tagging.cleanTagFromAutotagging(md.dc_subject())));
-            s.append(",publisher=").append(crypt.simpleEncode(md.dc_publisher()));
-            s.append(",lat=").append(md.lat());
-            s.append(",lon=").append(md.lon());
-            s.append(",mod=").append(formatter.format(md.moddate()));
-            s.append(",load=").append(formatter.format(md.loaddate()));
-            s.append(",fresh=").append(formatter.format(md.freshdate()));
-            s.append(",referrer=").append(md.referrerHash() == null ? "" : ASCII.String(md.referrerHash()));
-            s.append(",md5=").append(md.md5());
-            s.append(",size=").append(md.size());
-            s.append(",wc=").append(md.wordCount());
-            s.append(",dt=").append(md.doctype());
-            s.append(",flags=").append(md.flags().exportB64());
-            s.append(",lang=").append(md.language() == null ? "EN" : UTF8.String(md.language()));
-            s.append(",llocal=").append(md.llocal());
-            s.append(",lother=").append(md.lother());
-            s.append(",limage=").append(md.limage());
-            s.append(",laudio=").append(md.laudio());
-            s.append(",lvideo=").append(md.lvideo());
-            s.append(",lapp=").append(md.lapp());
-            if (md.word() != null) {
+            s.append("hash=").append(ASCII.String(this.hash()));
+            s.append(",url=").append(crypt.simpleEncode(this.url().toNormalform(true)));
+            s.append(",descr=").append(crypt.simpleEncode(this.dc_title()));
+            s.append(",author=").append(crypt.simpleEncode(this.dc_creator()));
+            s.append(",tags=").append(crypt.simpleEncode(Tagging.cleanTagFromAutotagging(this.dc_subject())));
+            s.append(",publisher=").append(crypt.simpleEncode(this.dc_publisher()));
+            s.append(",lat=").append(this.lat());
+            s.append(",lon=").append(this.lon());
+            s.append(",mod=").append(formatter.format(this.moddate()));
+            s.append(",load=").append(formatter.format(this.loaddate()));
+            s.append(",fresh=").append(formatter.format(this.freshdate()));
+            s.append(",referrer=").append(this.referrerHash() == null ? "" : ASCII.String(this.referrerHash()));
+            s.append(",md5=").append(this.md5());
+            s.append(",size=").append(this.size());
+            s.append(",wc=").append(this.wordCount());
+            s.append(",dt=").append(this.doctype());
+            s.append(",flags=").append(this.flags().exportB64());
+            s.append(",lang=").append(this.language());
+            s.append(",llocal=").append(this.llocal());
+            s.append(",lother=").append(this.lother());
+            s.append(",limage=").append(this.limage());
+            s.append(",laudio=").append(this.laudio());
+            s.append(",lvideo=").append(this.lvideo());
+            s.append(",lapp=").append(this.lapp());
+            if (this.word() != null) {
                 // append also word properties
-                final String wprop = md.word().toPropertyForm();
+                final String wprop = this.word().toPropertyForm();
                 s.append(",wi=").append(Base64Order.enhancedCoder.encodeString(wprop));
             }
             return s;
@@ -480,7 +474,7 @@ public class URIMetadataNode {
      */
     public String toString(String snippet) {
         // add information needed for remote transport
-        final StringBuilder core = corePropList(this);
+        final StringBuilder core = corePropList();

         if (core == null)
             return null;
@@ -501,7 +495,7 @@ public class URIMetadataNode {
      */
     @Override
     public String toString() {
-        final StringBuilder core = corePropList(this);
+        final StringBuilder core = corePropList();
         if (core == null) return null;
         core.insert(0, '{');
         core.append('}');
@@ -511,7 +505,7 @@ public class URIMetadataNode {
     private int getInt(CollectionSchema field) {
         assert !field.isMultiValued();
         assert field.getType() == SolrType.num_integer;
-        Object x = this.doc.getFieldValue(field.getSolrFieldName());
+        Object x = this.getFieldValue(field.getSolrFieldName());
         if (x == null) return 0;
         if (x instanceof Integer) return ((Integer) x).intValue();
         if (x instanceof Long) return ((Long) x).intValue();
@@ -521,7 +515,7 @@ public class URIMetadataNode {
     private Date getDate(CollectionSchema field) {
         assert !field.isMultiValued();
         assert field.getType() == SolrType.date;
-        Date x = (Date) this.doc.getFieldValue(field.getSolrFieldName());
+        Date x = (Date) this.getFieldValue(field.getSolrFieldName());
         if (x == null) return new Date(0);
         Date now = new Date();
         return x.after(now) ? now : x;
@@ -530,7 +524,7 @@ public class URIMetadataNode {
     private String getString(CollectionSchema field) {
         assert !field.isMultiValued();
         assert field.getType() == SolrType.string || field.getType() == SolrType.text_general || field.getType() == SolrType.text_en_splitting_tight;
-        Object x = this.doc.getFieldValue(field.getSolrFieldName());
+        Object x = this.getFieldValue(field.getSolrFieldName());
         if (x == null) return "";
         if (x instanceof ArrayList) {
             @SuppressWarnings("unchecked")
@@ -544,7 +538,7 @@ public class URIMetadataNode {
     private ArrayList getStringList(CollectionSchema field) {
         assert field.isMultiValued();
         assert field.getType() == SolrType.string || field.getType() == SolrType.text_general;
-        Object r = this.doc.getFieldValue(field.getSolrFieldName());
+        Object r = this.getFieldValue(field.getSolrFieldName());
         if (r == null) return new ArrayList(0);
         if (r instanceof ArrayList) {
             return (ArrayList) r;
@@ -558,7 +552,7 @@ public class URIMetadataNode {
     private ArrayList getIntList(CollectionSchema field) {
         assert field.isMultiValued();
         assert field.getType() == SolrType.num_integer;
-        Object r = this.doc.getFieldValue(field.getSolrFieldName());
+        Object r = this.getFieldValue(field.getSolrFieldName());
         if (r == null) return new ArrayList(0);
         if (r instanceof ArrayList) {
             return (ArrayList) r;
diff --git a/source/net/yacy/kelondro/data/word/WordReferenceVars.java b/source/net/yacy/kelondro/data/word/WordReferenceVars.java
index 3c2cf2137..734574948 100644
--- a/source/net/yacy/kelondro/data/word/WordReferenceVars.java
+++ b/source/net/yacy/kelondro/data/word/WordReferenceVars.java
@@ -58,7 +58,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc

     private final Bitfield flags;
     private long lastModified;
-    private final byte[] language;
+    private final String language;
     public final byte[] urlHash;
     private String hostHash = null;
     private final char type;
@@ -108,7 +108,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
            final int posinphrase, // position of word in its phrase
            final int posofphrase, // number of the phrase where word appears
            final long lastmodified, // last-modified time of the document where word appears
-           byte[] language, // (guessed) language of document
+           String language, // (guessed) language of document
            final char doctype, // type of document
            final int outlinksSame, // outlinks to same domain
            final int outlinksOther, // outlinks to other domain
@@ -143,7 +143,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
         this.flags = e.flags();
         //this.freshUntil = e.freshUntil();
         this.lastModified = e.lastModified();
-        this.language = e.getLanguage();
+        this.language = ASCII.String(e.getLanguage());
         this.urlHash = e.urlhash();
         this.type = e.getType();
         this.hitcount = e.hitcount();
@@ -229,7 +229,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc

     @Override
     public byte[] getLanguage() {
-        return this.language;
+        return ASCII.getBytes(this.language);
     }

     @Override
@@ -291,7 +291,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
                 this.posofphrase,      // number of the phrase where word appears
                 this.lastModified,     // last-modified time of the document where word appears
                 System.currentTimeMillis(), // update time;
-                this.language,         // (guessed) language of document
+                ASCII.getBytes(this.language), // (guessed) language of document
                 this.type,             // type of document
                 this.llocal,           // outlinks to same domain
                 this.lother,           // outlinks to other domain
diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java
index aa1f72da0..03af2b969 100644
--- a/source/net/yacy/peers/Protocol.java
+++ b/source/net/yacy/peers/Protocol.java
@@ -1362,22 +1362,22 @@ public final class Protocol {
         final int timeout) {

         // check if we got all necessary urls in the urlCache (only for debugging)
-        Iterator eenum;
-        Reference entry;
-        for ( final ReferenceContainer ic : indexes ) {
-            eenum = ic.entries();
-            while ( eenum.hasNext() ) {
-                entry = eenum.next();
-                if ( !urlRefs.has(entry.urlhash()) ) {
-                    if ( Network.log.isFine() ) {
+        if (Network.log.isFine()) {
+            Iterator eenum;
+            Reference entry;
+            for ( final ReferenceContainer ic : indexes ) {
+                eenum = ic.entries();
+                while ( eenum.hasNext() ) {
+                    entry = eenum.next();
+                    if ( !urlRefs.has(entry.urlhash()) ) {
                         Network.log.fine("DEBUG transferIndex: to-send url hash '"
-                            + ASCII.String(entry.urlhash())
-                            + "' is not contained in urlCache");
+                                + ASCII.String(entry.urlhash())
+                                + "' is not contained in urlCache");
                     }
                 }
             }
         }
-
+
         // transfer the RWI without the URLs
         Map in = transferRWI(targetSeed, indexes, gzipBody, timeout);
diff --git a/source/net/yacy/repository/Blacklist.java b/source/net/yacy/repository/Blacklist.java
index f2b7748ac..f1c11cfb0 100644
--- a/source/net/yacy/repository/Blacklist.java
+++ b/source/net/yacy/repository/Blacklist.java
@@ -53,7 +53,6 @@ import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.data.ListManager;
-import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.index.RowHandleSet;
 import net.yacy.kelondro.util.FileUtils;
@@ -457,10 +456,6 @@ public class Blacklist {
         return ret;
     }

-    public final boolean isListed(final BlacklistType blacklistType, final URIMetadataNode entry) {
-        return isListed(blacklistType, entry.url());
-    }
-
     /**
      * Checks whether the given entry is listed in given blacklist type.
      * @param blacklistType The used blacklist
diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java
index 7e34d616a..003f25b11 100644
--- a/source/net/yacy/search/query/SearchEvent.java
+++ b/source/net/yacy/search/query/SearchEvent.java
@@ -47,7 +46,6 @@ import net.yacy.contentcontrol.ContentControlFilterUpdateThread;
 import net.yacy.cora.document.analysis.Classification;
 import net.yacy.cora.document.analysis.Classification.ContentDomain;
 import net.yacy.cora.document.encoding.ASCII;
-import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.federate.yacy.CacheStrategy;
@@ -279,7 +278,7 @@ public final class SearchEvent {
         this.snippetFetchAlive = new AtomicInteger(0);
         this.addRunning = true;
         this.receivedRemoteReferences = new AtomicInteger(0);
-        this.order = new ReferenceOrder(this.query.ranking, UTF8.getBytes(this.query.targetlang));
+        this.order = new ReferenceOrder(this.query.ranking, this.query.targetlang);
         this.urlhashes = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 100);
         this.taggingPredicates = new HashMap();
         for (Tagging t: LibraryProvider.autotagging.getVocabularies()) {
@@ -897,7 +896,7 @@ public final class SearchEvent {
                     }

                     if (this.query.modifier.language != null) {
-                        if (!this.query.modifier.language.equals(UTF8.String(iEntry.language()))) {
+                        if (!this.query.modifier.language.equals(iEntry.language())) {
                             if (log.isFine()) log.fine("dropped Node: language");
                             continue pollloop;
                         }
@@ -1083,7 +1082,7 @@ public final class SearchEvent {

             // check modifier constraint (language)
             // TODO: : page.language() never null but defaults to "en" (may cause false drop of result)
-            if (this.query.modifier.language != null && !this.query.modifier.language.equals(ASCII.String(page.language()))) {
+            if (this.query.modifier.language != null && !this.query.modifier.language.equals(page.language())) {
                 if (log.isFine()) log.fine("dropped RWI: language constraint = " + this.query.modifier.language);
                 if (page.word().local()) this.local_rwi_available.decrementAndGet(); else this.remote_rwi_available.decrementAndGet();
                 continue;
@@ -1165,7 +1164,7 @@ public final class SearchEvent {
             // TODO: vocabulary is only valid and available in local Solr index (considere to auto-switch to Searchdom.LOCAL)
             if (this.query.metatags != null && !this.query.metatags.isEmpty()) {
                 tagloop: for (Tagging.Metatag tag : this.query.metatags) {
-                    SolrDocument sdoc = page.getDocument();
+                    SolrDocument sdoc = page;
                     if (sdoc != null) {
                         Collection tagvalues = sdoc.getFieldValues(CollectionSchema.VOCABULARY_PREFIX + tag.getVocabularyName() + CollectionSchema.VOCABULARY_SUFFIX);
                         if (tagvalues != null && tagvalues.contains(tag.getObject())) {
@@ -1462,7 +1461,7 @@ public final class SearchEvent {
             ResultEntry ms = oneResult(item, timeout);
             // check if the match was made in the url or in the image links
             if (ms != null) {
-                SolrDocument doc = ms.getNode().getDocument();
+                SolrDocument doc = ms.getNode();
                 Collection alt = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName());
                 Collection img = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName());
                 Collection prt = doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName());
diff --git a/source/net/yacy/search/ranking/ReferenceOrder.java b/source/net/yacy/search/ranking/ReferenceOrder.java
index 10fd7373c..6f0391252 100644
--- a/source/net/yacy/search/ranking/ReferenceOrder.java
+++ b/source/net/yacy/search/ranking/ReferenceOrder.java
@@ -33,6 +33,7 @@ import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.Semaphore;

+import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.sorting.ConcurrentScoreMap;
 import net.yacy.cora.util.ByteBuffer;
@@ -55,9 +56,9 @@ public class ReferenceOrder {
     private WordReferenceVars min, max;
     private final ConcurrentScoreMap doms; // collected for "authority" heuristic
     private final RankingProfile ranking;
-    private final byte[] language;
+    private final String language;

-    public ReferenceOrder(final RankingProfile profile, final byte[] language) {
+    public ReferenceOrder(final RankingProfile profile, final String language) {
         this.min = null;
         this.max = null;
         this.ranking = profile;
@@ -256,7 +257,7 @@ public class ReferenceOrder {
                         + ((flags.get(Condenser.flag_cat_hasaudio)) ? 255 << this.ranking.coeff_cathasaudio : 0)
                         + ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << this.ranking.coeff_cathasvideo : 0)
                         + ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << this.ranking.coeff_cathasapp : 0)
-                        + ((ByteBuffer.equals(t.getLanguage(), this.language)) ? 255 << this.ranking.coeff_language : 0);
+                        + ((ByteBuffer.equals(t.getLanguage(), ASCII.getBytes(this.language))) ? 255 << this.ranking.coeff_language : 0);

         //if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0;

@@ -289,7 +290,7 @@ public class ReferenceOrder {
                        + ((flags.get(Condenser.flag_cat_hasaudio)) ? 255 << this.ranking.coeff_cathasaudio : 0)
                        + ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << this.ranking.coeff_cathasvideo : 0)
                        + ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << this.ranking.coeff_cathasapp : 0)
-                       + ((ByteBuffer.equals(t.language(), this.language)) ? 255 << this.ranking.coeff_language : 0);
+                       + ((this.language.equals(t.language())) ? 255 << this.ranking.coeff_language : 0);

        return r; // the higher the number the better the ranking.
     }
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index cd6f729bb..09166bf1c 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -48,7 +48,6 @@ import java.util.regex.Pattern;

 import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
 import net.yacy.cora.document.encoding.ASCII;
-import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
@@ -306,7 +305,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         if ((allAttr || contains(CollectionSchema.referrer_id_s)) && md.referrerHash() != null) add(doc, CollectionSchema.referrer_id_s, ASCII.String(md.referrerHash()));
         if (allAttr || contains(CollectionSchema.md5_s)) add(doc, CollectionSchema.md5_s, md.md5());
         if (allAttr || contains(CollectionSchema.publisher_t)) add(doc, CollectionSchema.publisher_t, md.dc_publisher());
-        if ((allAttr || contains(CollectionSchema.language_s)) && md.language() != null) add(doc, CollectionSchema.language_s, UTF8.String(md.language()));
+        if (allAttr || contains(CollectionSchema.language_s)) add(doc, CollectionSchema.language_s, md.language());
         if (allAttr || contains(CollectionSchema.size_i)) add(doc, CollectionSchema.size_i, md.size());
         if (allAttr || contains(CollectionSchema.audiolinkscount_i)) add(doc, CollectionSchema.audiolinkscount_i, md.laudio());
         if (allAttr || contains(CollectionSchema.videolinkscount_i)) add(doc, CollectionSchema.videolinkscount_i, md.lvideo());
diff --git a/source/net/yacy/search/snippet/ResultEntry.java b/source/net/yacy/search/snippet/ResultEntry.java
index e74f6e108..7eb6b855f 100644
--- a/source/net/yacy/search/snippet/ResultEntry.java
+++ b/source/net/yacy/search/snippet/ResultEntry.java
@@ -70,7 +70,7 @@ public class ResultEntry implements Comparable, Comparator
                        final List mediaSnippets,
                        final long snippetComputationTime) {
         this.urlentry = urlentry;
-        this.urlentry.getDocument().setField(CollectionSchema.text_t.getSolrFieldName(), ""); // clear the text field which eats up most of the space; it was used for snippet computation which is in a separate field here
+        this.urlentry.setField(CollectionSchema.text_t.getSolrFieldName(), ""); // clear the text field which eats up most of the space; it was used for snippet computation which is in a separate field here
         this.indexSegment = indexSegment;
         this.alternative_urlstring = null;
         this.alternative_urlname = null;
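
For orientation on the central change: the class comment in URIMetadataNode.java describes the goal of migrating the old metadata structure toward SolrDocument, and this patch takes that step by making URIMetadataNode a SolrDocument subclass, dropping the getDocument() indirection and letting language() return a plain String. A minimal sketch of the resulting call pattern, assuming hypothetical surrounding class, method, and variable names that are not part of the patch:

import org.apache.solr.common.SolrDocument;

import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.search.schema.CollectionSchema;

public class URIMetadataNodeUsageSketch {

    // Hypothetical helper: with this patch a URIMetadataNode *is* a SolrDocument,
    // so callers read Solr fields on the node itself instead of unwrapping it
    // through the removed getDocument() accessor.
    static String describe(final URIMetadataNode node) {
        final SolrDocument doc = node; // widening assignment, no unwrapping needed
        final Object title = doc.getFieldValue(CollectionSchema.title.getSolrFieldName());
        final String lang = node.language(); // now a String; defaults to "en" when the field is empty
        return title + " [" + lang + "]";
    }
}

The same pattern explains the call-site simplifications elsewhere in the patch (SearchEvent, ResultEntry, Blacklist): anything that previously unwrapped the inner document or converted the language byte[] can use the node and the String directly.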