From ca3d26a4016df325e3611331a3a00ba580eb54ec Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 15 Nov 2015 06:06:37 +0100 Subject: [PATCH] harmonize wordsintitle & CollectionSchema.title_words_val calculation, remove obsolete partial init of wordreference from urimetadata --- .../kelondro/data/word/WordReferenceRow.java | 8 ++-- .../kelondro/data/word/WordReferenceVars.java | 37 +++---------------- source/net/yacy/search/index/Segment.java | 5 ++- 3 files changed, 12 insertions(+), 38 deletions(-) diff --git a/source/net/yacy/kelondro/data/word/WordReferenceRow.java b/source/net/yacy/kelondro/data/word/WordReferenceRow.java index 45614a3a8..5575c06d7 100644 --- a/source/net/yacy/kelondro/data/word/WordReferenceRow.java +++ b/source/net/yacy/kelondro/data/word/WordReferenceRow.java @@ -118,7 +118,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef final byte[] urlHash, final int urlLength, // byte-length of complete URL final int urlComps, // number of path components - final int titleLength, // length of description/length (longer are better?) + final int titlewordcount,// number of words in the title (stored as col_wordsInTitle) 
final int hitcount, // how often appears this word in the text final int wordcount, // total number of words final int phrasecount, // total number of phrases @@ -141,7 +141,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef this.entry.setCol(col_urlhash, urlHash); this.entry.setCol(col_lastModified, mddlm); this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation - this.entry.setCol(col_wordsInTitle, titleLength / 6); // word count estimation; TODO: change value handover to number of words + this.entry.setCol(col_wordsInTitle, titlewordcount); this.entry.setCol(col_wordsInText, wordcount); this.entry.setCol(col_phrasesInText, phrasecount); this.entry.setCol(col_doctype, new byte[]{(byte) doctype}); @@ -163,7 +163,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef public WordReferenceRow(final byte[] urlHash, final int urlLength, // byte-length of complete URL final int urlComps, // number of path components - final int titleLength, // length of description/length (longer are better?) + final int titlewordcount,// number of words in the title (stored as col_wordsInTitle) 
final int wordcount, // total number of words final int phrasecount, // total number of phrases final long lastmodified, // last-modified time of the document where word appears @@ -180,7 +180,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef this.entry.setCol(col_urlhash, urlHash); this.entry.setCol(col_lastModified, mddlm); this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation - this.entry.setCol(col_wordsInTitle, titleLength / 6); // word count estimation; TODO: change value handover to number of words + this.entry.setCol(col_wordsInTitle, titlewordcount); this.entry.setCol(col_wordsInText, wordcount); this.entry.setCol(col_phrasesInText, phrasecount); this.entry.setCol(col_doctype, new byte[]{(byte) doctype}); diff --git a/source/net/yacy/kelondro/data/word/WordReferenceVars.java b/source/net/yacy/kelondro/data/word/WordReferenceVars.java index 734574948..af30c4db7 100644 --- a/source/net/yacy/kelondro/data/word/WordReferenceVars.java +++ b/source/net/yacy/kelondro/data/word/WordReferenceVars.java @@ -34,11 +34,9 @@ import java.util.concurrent.LinkedBlockingQueue; import net.yacy.cora.date.MicroDate; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; -import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.order.Base64Order; import net.yacy.cora.util.ByteArray; import net.yacy.cora.util.ConcurrentLog; -import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.index.Row; import net.yacy.kelondro.index.Row.Entry; import net.yacy.kelondro.rwi.AbstractReference; @@ -50,11 +48,11 @@ import net.yacy.kelondro.workflow.WorkflowProcessor; public class WordReferenceVars extends AbstractReference implements WordReference, Reference, Cloneable, Comparable, Comparator { - /** - * object for termination of concurrent blocking queue processing - */ - public static final WordReferenceVars poison = new WordReferenceVars(); - 
protected static final byte[] default_language = UTF8.getBytes("en"); + /** + * object for termination of concurrent blocking queue processing + */ + public static final WordReferenceVars poison = new WordReferenceVars(); + protected static final byte[] default_language = UTF8.getBytes("en"); private final Bitfield flags; private long lastModified; @@ -71,31 +69,6 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc private double termFrequency; private final boolean local; - public WordReferenceVars(final URIMetadataNode md, final boolean local) { - this.language = md.language(); - this.flags = md.flags(); - this.lastModified = md.moddate().getTime(); - this.urlHash = md.hash(); - this.type = md.doctype(); - this.llocal = md.llocal(); - this.lother = md.lother(); - this.positions = new LinkedBlockingQueue(); - this.positions.add(1); - String urlNormalform = md.url().toNormalform(true); - this.urlcomps = MultiProtocolURL.urlComps(urlNormalform).length; - this.urllength = urlNormalform.length(); - this.virtualAge = -1; // compute that later - // the following fields cannot be computed here very easy and are just filled with dummy values - this.phrasesintext = 1; - this.hitcount = 1; - this.posinphrase = 1; - this.posofphrase = 1; - this.wordsintext = 1; - this.wordsintitle = 1; - this.termFrequency = 1; - this.local = local; - } - public WordReferenceVars( final byte[] urlHash, final int urlLength, // byte-length of complete URL diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 2308af7e2..d8683e356 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -56,6 +56,7 @@ import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ByteBuffer; +import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; import 
net.yacy.cora.util.LookAheadIterator; import net.yacy.cora.util.SpaceExceededException; @@ -686,10 +687,10 @@ public class Segment { // create a word prototype which is re-used for all entries if ((this.termIndex != null && storeToRWI) || searchEvent != null) { - final int len = (document == null) ? urlLength : document.dc_title().length(); + final int wordsintitle = CommonPattern.SPACES.split(dc_title).length; // same calculation as for CollectionSchema.title_words_val final WordReferenceRow ientry = new WordReferenceRow( url.hash(), - urlLength, urlComps, len, + urlLength, urlComps, wordsintitle, condenser.RESULT_NUMB_WORDS, condenser.RESULT_NUMB_SENTENCES, modDate.getTime(),