You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
345 lines
17 KiB
345 lines
17 KiB
// indexRWIEntry.java
|
|
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
|
|
// first published 20.05.2006 on http://www.anomic.de
|
|
//
|
|
// This is a part of YaCy, a peer-to-peer based web search engine
|
|
//
|
|
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
|
|
// $LastChangedRevision: 1986 $
|
|
// $LastChangedBy: orbiter $
|
|
//
|
|
// LICENSE
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
package de.anomic.index;
|
|
|
|
import de.anomic.kelondro.kelondroBase64Order;
|
|
import de.anomic.kelondro.kelondroBitfield;
|
|
import de.anomic.kelondro.kelondroColumn;
|
|
import de.anomic.kelondro.kelondroRow;
|
|
import de.anomic.kelondro.kelondroRow.Entry;
|
|
import de.anomic.plasma.plasmaWordIndex;
|
|
import de.anomic.yacy.yacySeedDB;
|
|
|
|
public class indexRWIEntry implements Cloneable {
|
|
|
|
// this object stores attributes to URL references inside RWI collections
|
|
|
|
|
|
public static kelondroRow urlEntryRow = new kelondroRow(new kelondroColumn[]{
|
|
new kelondroColumn("h", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, yacySeedDB.commonHashLength, "urlhash"),
|
|
new kelondroColumn("a", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 2, "lastModified"),
|
|
new kelondroColumn("s", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 2, "freshUntil"),
|
|
new kelondroColumn("u", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "wordsInTitle"),
|
|
new kelondroColumn("w", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 2, "wordsInText"),
|
|
new kelondroColumn("p", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 2, "phrasesInText"),
|
|
new kelondroColumn("d", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "doctype"),
|
|
new kelondroColumn("l", kelondroColumn.celltype_string, kelondroColumn.encoder_bytes, 2, "language"),
|
|
new kelondroColumn("x", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "llocal"),
|
|
new kelondroColumn("y", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "lother"),
|
|
new kelondroColumn("m", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "urlLength"),
|
|
new kelondroColumn("n", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "urlComps"),
|
|
new kelondroColumn("g", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "typeofword"),
|
|
new kelondroColumn("z", kelondroColumn.celltype_bitfield, kelondroColumn.encoder_bytes, 4, "flags"),
|
|
new kelondroColumn("c", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "hitcount"),
|
|
new kelondroColumn("t", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 2, "posintext"),
|
|
new kelondroColumn("r", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "posinphrase"),
|
|
new kelondroColumn("o", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "posofphrase"),
|
|
new kelondroColumn("i", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "worddistance"),
|
|
new kelondroColumn("k", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "reserve")
|
|
},
|
|
kelondroBase64Order.enhancedCoder,
|
|
0);
|
|
// available chars: b,e,j,q
|
|
|
|
// static properties
|
|
private static final int col_urlhash = 0; // h 12 the url hash b64-encoded
|
|
private static final int col_lastModified = 1; // a 2 last-modified time of the document where word appears
|
|
private static final int col_freshUntil = 2; // s 2 TTL for the word, so it can be removed easily if the TTL is short
|
|
private static final int col_wordsInTitle = 3; // u 1 words in description/length (longer are better?)
|
|
private static final int col_wordsInText = 4; // w 2 total number of words in document
|
|
private static final int col_phrasesInText = 5; // p 2 total number of phrases in document
|
|
private static final int col_doctype = 6; // d 1 type of document
|
|
private static final int col_language = 7; // l 2 (guessed) language of document
|
|
private static final int col_llocal = 8; // x 1 outlinks to same domain
|
|
private static final int col_lother = 9; // y 1 outlinks to other domain
|
|
private static final int col_urlLength = 10; // m 1 byte-length of complete URL
|
|
private static final int col_urlComps = 11; // n 1 number of path components
|
|
|
|
// dynamic properties
|
|
private static final int col_typeofword = 12; // g 1 grammatical classification
|
|
private static final int col_flags = 13; // z 4 b64-encoded appearance flags (24 bit, see definition below)
|
|
private static final int col_hitcount = 14; // c 1 number of occurrences of this word in text
|
|
private static final int col_posintext = 15; // t 2 first appearance of word in text
|
|
private static final int col_posinphrase = 16; // r 1 position of word in its phrase
|
|
private static final int col_posofphrase = 17; // o 1 number of the phrase where word appears
|
|
private static final int col_worddistance = 18; // i 1 initial zero; may be used as reserve: is filled during search
|
|
private static final int col_reserve = 19; // k 1 reserve
|
|
|
|
// appearance flags, used in RWI entry
|
|
// the flags 0..23 are identical to the category flags in plasmaCondenser
|
|
public static final int flag_app_url = 24; // word appears in url
|
|
public static final int flag_app_descr = 25; // word appears in headline (or any description part)
|
|
public static final int flag_app_author = 26; // word appears in author
|
|
public static final int flag_app_tags = 27; // word appears in header tags
|
|
public static final int flag_app_reference = 28; // word appears in anchor description text (the reference to an url), or any alternative text field of a link
|
|
public static final int flag_app_emphasized = 29; // word is emphasized in text (i.e. bold, italics, special size)
|
|
|
|
/**
 * The backing row (laid out as described by {@code urlEntryRow}).
 * The reference is assigned exactly once by every constructor and never
 * re-assigned afterwards, hence {@code final}; the row's content is still
 * mutated in place by the combine/min/max operations.
 */
private final kelondroRow.Entry entry;
|
|
|
|
public indexRWIEntry(String urlHash,
|
|
int urlLength, // byte-length of complete URL
|
|
int urlComps, // number of path components
|
|
int titleLength, // length of description/length (longer are better?)
|
|
int hitcount, // how often appears this word in the text
|
|
int wordcount, // total number of words
|
|
int phrasecount, // total number of phrases
|
|
int posintext, // position of word in all words
|
|
int posinphrase, // position of word in its phrase
|
|
int posofphrase, // number of the phrase where word appears
|
|
int worddistance, // word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search
|
|
int sizeOfPage, // # of bytes of the page TODO: not needed any more
|
|
long lastmodified, // last-modified time of the document where word appears
|
|
long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
|
|
String language, // (guessed) language of document
|
|
char doctype, // type of document
|
|
int outlinksSame, // outlinks to same domain
|
|
int outlinksOther, // outlinks to other domain
|
|
kelondroBitfield flags // attributes to the url and to the word according the url
|
|
) {
|
|
|
|
assert (urlHash.length() == 12) : "urlhash = " + urlHash;
|
|
if ((language == null) || (language.length() != urlEntryRow.width(col_language))) language = "uk";
|
|
this.entry = urlEntryRow.newEntry();
|
|
int mddlm = plasmaWordIndex.microDateDays(lastmodified);
|
|
int mddct = plasmaWordIndex.microDateDays(updatetime);
|
|
this.entry.setCol(col_urlhash, urlHash, null);
|
|
this.entry.setCol(col_lastModified, mddlm);
|
|
this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation
|
|
this.entry.setCol(col_wordsInTitle, titleLength / 6); // word count estimation; TODO: change value handover to number of words
|
|
this.entry.setCol(col_wordsInText, wordcount);
|
|
this.entry.setCol(col_phrasesInText, phrasecount);
|
|
this.entry.setCol(col_doctype, new byte[]{(byte) doctype});
|
|
this.entry.setCol(col_language, language, null);
|
|
this.entry.setCol(col_llocal, outlinksSame);
|
|
this.entry.setCol(col_lother, outlinksOther);
|
|
this.entry.setCol(col_urlLength, urlLength);
|
|
this.entry.setCol(col_urlComps, urlComps);
|
|
this.entry.setCol(col_typeofword, new byte[]{(byte) 0}); // TODO: grammatical classification
|
|
this.entry.setCol(col_flags, flags.bytes());
|
|
this.entry.setCol(col_hitcount, hitcount);
|
|
this.entry.setCol(col_posintext, posintext);
|
|
this.entry.setCol(col_posinphrase, posinphrase);
|
|
this.entry.setCol(col_posofphrase, posofphrase);
|
|
this.entry.setCol(col_worddistance, worddistance);
|
|
this.entry.setCol(col_reserve, 0);
|
|
}
|
|
|
|
public indexRWIEntry(String urlHash, String code) {
|
|
// the code is the external form of the row minus the leading urlHash entry
|
|
this.entry = urlEntryRow.newEntry((urlHash + code).getBytes());
|
|
}
|
|
|
|
public indexRWIEntry(String external) {
|
|
this.entry = urlEntryRow.newEntry(external, true);
|
|
}
|
|
|
|
public indexRWIEntry(byte[] row) {
|
|
this.entry = urlEntryRow.newEntry(row);
|
|
}
|
|
|
|
public indexRWIEntry(kelondroRow.Entry rentry) {
|
|
// FIXME: see if cloning is necessary
|
|
this.entry = rentry;
|
|
}
|
|
|
|
public static int days(long time) {
|
|
// calculates the number of days since 1.1.1970 and returns this as 4-byte array
|
|
return (int) (time / 86400000);
|
|
}
|
|
|
|
public Object clone() {
|
|
byte[] b = new byte[urlEntryRow.objectsize()];
|
|
System.arraycopy(entry.bytes(), 0, b, 0, urlEntryRow.objectsize());
|
|
return new indexRWIEntry(b);
|
|
}
|
|
|
|
public String toPropertyForm() {
|
|
return entry.toPropertyForm(true, true, false);
|
|
}
|
|
|
|
public Entry toKelondroEntry() {
|
|
return this.entry;
|
|
}
|
|
|
|
public String urlHash() {
|
|
return this.entry.getColString(col_urlhash, null);
|
|
}
|
|
|
|
public int quality() {
|
|
return 0; // not used any more
|
|
}
|
|
|
|
public int virtualAge() {
|
|
return (int) this.entry.getColLong(col_lastModified); // this is the time in MicoDateDays format
|
|
}
|
|
|
|
public long lastModified() {
|
|
return plasmaWordIndex.reverseMicroDateDays((int) this.entry.getColLong(col_lastModified));
|
|
}
|
|
|
|
public long freshUntil() {
|
|
return plasmaWordIndex.reverseMicroDateDays((int) this.entry.getColLong(col_freshUntil));
|
|
}
|
|
|
|
public int hitcount() {
|
|
return (int) this.entry.getColLong(col_hitcount);
|
|
}
|
|
|
|
public int posintext() {
|
|
return (int) this.entry.getColLong(col_posintext);
|
|
}
|
|
|
|
public int posinphrase() {
|
|
return (int) this.entry.getColLong(col_posinphrase);
|
|
}
|
|
|
|
public int posofphrase() {
|
|
return (int) this.entry.getColLong(col_posofphrase);
|
|
}
|
|
|
|
public int wordsintext() {
|
|
return (int) this.entry.getColLong(col_wordsInText);
|
|
}
|
|
|
|
public int phrasesintext() {
|
|
return (int) this.entry.getColLong(col_phrasesInText);
|
|
}
|
|
|
|
public String getLanguage() {
|
|
return this.entry.getColString(col_language, null);
|
|
}
|
|
|
|
public char getType() {
|
|
return (char) this.entry.getColByte(col_doctype);
|
|
}
|
|
|
|
public int wordsintitle() {
|
|
return (int) this.entry.getColLong(col_wordsInTitle);
|
|
}
|
|
|
|
public int llocal() {
|
|
return (int) this.entry.getColLong(col_llocal);
|
|
}
|
|
|
|
public int lother() {
|
|
return (int) this.entry.getColLong(col_lother);
|
|
}
|
|
|
|
public int urllength() {
|
|
return (int) this.entry.getColLong(col_urlLength);
|
|
}
|
|
|
|
public int urlcomps() {
|
|
return (int) this.entry.getColLong(col_urlComps);
|
|
}
|
|
|
|
public kelondroBitfield flags() {
|
|
return new kelondroBitfield(this.entry.getColBytes(col_flags));
|
|
}
|
|
|
|
public String toString() {
|
|
return toPropertyForm();
|
|
}
|
|
|
|
public static indexRWIEntry combineDistance(indexRWIEntry ie1, indexRWIEntry ie2) {
|
|
// returns a modified entry of the first argument
|
|
ie1.entry.setCol(col_worddistance, ie1.worddistance() + ie2.worddistance() + Math.abs(ie1.posintext() - ie2.posintext()));
|
|
ie1.entry.setCol(col_posintext, Math.min(ie1.posintext(), ie2.posintext()));
|
|
ie1.entry.setCol(col_posinphrase, (ie1.posofphrase() == ie2.posofphrase()) ? ie1.posofphrase() : 0 /*unknown*/);
|
|
ie1.entry.setCol(col_posofphrase, Math.min(ie1.posofphrase(), ie2.posofphrase()));
|
|
ie1.entry.setCol(col_wordsInText, (ie1.wordsintext() + ie2.wordsintext()) / 2);
|
|
return ie1;
|
|
}
|
|
|
|
public void combineDistance(indexRWIEntry oe) {
|
|
combineDistance(this, oe);
|
|
}
|
|
|
|
public int worddistance() {
|
|
return (int) this.entry.getColLong(col_worddistance);
|
|
}
|
|
|
|
public static final void min(indexRWIEntry t, indexRWIEntry other) {
|
|
int v;
|
|
long w;
|
|
if (t.hitcount() > (v = other.hitcount())) t.entry.setCol(col_hitcount, other.hitcount());
|
|
if (t.wordsintext() > (v = other.wordsintext())) t.entry.setCol(col_wordsInText, v);
|
|
if (t.phrasesintext() > (v = other.phrasesintext())) t.entry.setCol(col_phrasesInText, v);
|
|
if (t.posintext() > (v = other.posintext())) t.entry.setCol(col_posintext, v);
|
|
if (t.posinphrase() > (v = other.posinphrase())) t.entry.setCol(col_posinphrase, v);
|
|
if (t.posofphrase() > (v = other.posofphrase())) t.entry.setCol(col_posofphrase, v);
|
|
if (t.worddistance() > (v = other.worddistance())) t.entry.setCol(col_worddistance, v);
|
|
if (t.lastModified() > (w = other.lastModified())) t.entry.setCol(col_lastModified, w);
|
|
if (t.urllength() > (v = other.urllength())) t.entry.setCol(col_urlLength, v);
|
|
if (t.urlcomps() > (v = other.urlcomps())) t.entry.setCol(col_urlComps, v);
|
|
if (t.wordsintitle() > (v = other.wordsintitle())) t.entry.setCol(col_wordsInTitle, v);
|
|
}
|
|
|
|
public static final void max(indexRWIEntry t, indexRWIEntry other) {
|
|
int v;
|
|
long w;
|
|
if (t.hitcount() < (v = other.hitcount())) t.entry.setCol(col_hitcount, v);
|
|
if (t.wordsintext() < (v = other.wordsintext())) t.entry.setCol(col_wordsInText, v);
|
|
if (t.phrasesintext() < (v = other.phrasesintext())) t.entry.setCol(col_phrasesInText, v);
|
|
if (t.posintext() < (v = other.posintext())) t.entry.setCol(col_posintext, v);
|
|
if (t.posinphrase() < (v = other.posinphrase())) t.entry.setCol(col_posinphrase, v);
|
|
if (t.posofphrase() < (v = other.posofphrase())) t.entry.setCol(col_posofphrase, v);
|
|
if (t.worddistance() < (v = other.worddistance())) t.entry.setCol(col_worddistance, v);
|
|
if (t.lastModified() < (w = other.lastModified())) t.entry.setCol(col_lastModified, w);
|
|
if (t.urllength() < (v = other.urllength())) t.entry.setCol(col_urlLength, v);
|
|
if (t.urlcomps() < (v = other.urlcomps())) t.entry.setCol(col_urlComps, v);
|
|
if (t.wordsintitle() < (v = other.wordsintitle())) t.entry.setCol(col_wordsInTitle, v);
|
|
}
|
|
|
|
|
|
public void min(indexRWIEntry other) {
|
|
min(this, other);
|
|
}
|
|
|
|
public void max(indexRWIEntry other) {
|
|
max(this, other);
|
|
}
|
|
|
|
public boolean isNewer(indexRWIEntry other) {
|
|
if (other == null) return true;
|
|
if (this.lastModified() > other.lastModified()) return true;
|
|
if (this.lastModified() == other.lastModified()) {
|
|
if (this.quality() > other.quality()) return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
public boolean isOlder(indexRWIEntry other) {
|
|
if (other == null) return false;
|
|
if (this.lastModified() < other.lastModified()) return true;
|
|
if (this.lastModified() == other.lastModified()) {
|
|
if (this.quality() < other.quality()) return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
} |