Harmonize the calculation of wordsintitle and CollectionSchema.title_words_val.

Remove the obsolete partial initialization of WordReferenceVars from URIMetadataNode.
pull/27/head
reger 9 years ago
parent 7bf03856d1
commit ca3d26a401

@ -118,7 +118,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
final byte[] urlHash, final byte[] urlHash,
final int urlLength, // byte-length of complete URL final int urlLength, // byte-length of complete URL
final int urlComps, // number of path components final int urlComps, // number of path components
final int titleLength, // length of description/length (longer are better?) final int titlewordcount,// length of description/length (longer are better?)
final int hitcount, // how often appears this word in the text final int hitcount, // how often appears this word in the text
final int wordcount, // total number of words final int wordcount, // total number of words
final int phrasecount, // total number of phrases final int phrasecount, // total number of phrases
@ -141,7 +141,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
this.entry.setCol(col_urlhash, urlHash); this.entry.setCol(col_urlhash, urlHash);
this.entry.setCol(col_lastModified, mddlm); this.entry.setCol(col_lastModified, mddlm);
this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation
this.entry.setCol(col_wordsInTitle, titleLength / 6); // word count estimation; TODO: change value handover to number of words this.entry.setCol(col_wordsInTitle, titlewordcount);
this.entry.setCol(col_wordsInText, wordcount); this.entry.setCol(col_wordsInText, wordcount);
this.entry.setCol(col_phrasesInText, phrasecount); this.entry.setCol(col_phrasesInText, phrasecount);
this.entry.setCol(col_doctype, new byte[]{(byte) doctype}); this.entry.setCol(col_doctype, new byte[]{(byte) doctype});
@ -163,7 +163,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
public WordReferenceRow(final byte[] urlHash, public WordReferenceRow(final byte[] urlHash,
final int urlLength, // byte-length of complete URL final int urlLength, // byte-length of complete URL
final int urlComps, // number of path components final int urlComps, // number of path components
final int titleLength, // length of description/length (longer are better?) final int titlewordcount,// length of description/length (longer are better?)
final int wordcount, // total number of words final int wordcount, // total number of words
final int phrasecount, // total number of phrases final int phrasecount, // total number of phrases
final long lastmodified, // last-modified time of the document where word appears final long lastmodified, // last-modified time of the document where word appears
@ -180,7 +180,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
this.entry.setCol(col_urlhash, urlHash); this.entry.setCol(col_urlhash, urlHash);
this.entry.setCol(col_lastModified, mddlm); this.entry.setCol(col_lastModified, mddlm);
this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation
this.entry.setCol(col_wordsInTitle, titleLength / 6); // word count estimation; TODO: change value handover to number of words this.entry.setCol(col_wordsInTitle, titlewordcount);
this.entry.setCol(col_wordsInText, wordcount); this.entry.setCol(col_wordsInText, wordcount);
this.entry.setCol(col_phrasesInText, phrasecount); this.entry.setCol(col_phrasesInText, phrasecount);
this.entry.setCol(col_doctype, new byte[]{(byte) doctype}); this.entry.setCol(col_doctype, new byte[]{(byte) doctype});

@ -34,11 +34,9 @@ import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.date.MicroDate; import net.yacy.cora.date.MicroDate;
import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Base64Order;
import net.yacy.cora.util.ByteArray; import net.yacy.cora.util.ByteArray;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.index.Row; import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.Row.Entry; import net.yacy.kelondro.index.Row.Entry;
import net.yacy.kelondro.rwi.AbstractReference; import net.yacy.kelondro.rwi.AbstractReference;
@ -50,11 +48,11 @@ import net.yacy.kelondro.workflow.WorkflowProcessor;
public class WordReferenceVars extends AbstractReference implements WordReference, Reference, Cloneable, Comparable<WordReferenceVars>, Comparator<WordReferenceVars> { public class WordReferenceVars extends AbstractReference implements WordReference, Reference, Cloneable, Comparable<WordReferenceVars>, Comparator<WordReferenceVars> {
/** /**
* object for termination of concurrent blocking queue processing * object for termination of concurrent blocking queue processing
*/ */
public static final WordReferenceVars poison = new WordReferenceVars(); public static final WordReferenceVars poison = new WordReferenceVars();
protected static final byte[] default_language = UTF8.getBytes("en"); protected static final byte[] default_language = UTF8.getBytes("en");
private final Bitfield flags; private final Bitfield flags;
private long lastModified; private long lastModified;
@ -71,31 +69,6 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
private double termFrequency; private double termFrequency;
private final boolean local; private final boolean local;
public WordReferenceVars(final URIMetadataNode md, final boolean local) {
this.language = md.language();
this.flags = md.flags();
this.lastModified = md.moddate().getTime();
this.urlHash = md.hash();
this.type = md.doctype();
this.llocal = md.llocal();
this.lother = md.lother();
this.positions = new LinkedBlockingQueue<Integer>();
this.positions.add(1);
String urlNormalform = md.url().toNormalform(true);
this.urlcomps = MultiProtocolURL.urlComps(urlNormalform).length;
this.urllength = urlNormalform.length();
this.virtualAge = -1; // compute that later
// the following fields cannot be computed here very easy and are just filled with dummy values
this.phrasesintext = 1;
this.hitcount = 1;
this.posinphrase = 1;
this.posofphrase = 1;
this.wordsintext = 1;
this.wordsintitle = 1;
this.termFrequency = 1;
this.local = local;
}
public WordReferenceVars( public WordReferenceVars(
final byte[] urlHash, final byte[] urlHash,
final int urlLength, // byte-length of complete URL final int urlLength, // byte-length of complete URL

@ -56,6 +56,7 @@ import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.storage.HandleSet; import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ByteBuffer; import net.yacy.cora.util.ByteBuffer;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.LookAheadIterator; import net.yacy.cora.util.LookAheadIterator;
import net.yacy.cora.util.SpaceExceededException; import net.yacy.cora.util.SpaceExceededException;
@ -686,10 +687,10 @@ public class Segment {
// create a word prototype which is re-used for all entries // create a word prototype which is re-used for all entries
if ((this.termIndex != null && storeToRWI) || searchEvent != null) { if ((this.termIndex != null && storeToRWI) || searchEvent != null) {
final int len = (document == null) ? urlLength : document.dc_title().length(); final int wordsintitle = CommonPattern.SPACES.split(dc_title).length; // same calculation as for CollectionSchema.title_words_val
final WordReferenceRow ientry = new WordReferenceRow( final WordReferenceRow ientry = new WordReferenceRow(
url.hash(), url.hash(),
urlLength, urlComps, len, urlLength, urlComps, wordsintitle,
condenser.RESULT_NUMB_WORDS, condenser.RESULT_NUMB_WORDS,
condenser.RESULT_NUMB_SENTENCES, condenser.RESULT_NUMB_SENTENCES,
modDate.getTime(), modDate.getTime(),

Loading…
Cancel
Save