You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
257 lines
9.2 KiB
257 lines
9.2 KiB
// indexRWIVarEntry.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 07.11.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
package de.anomic.index;
|
|
|
|
import de.anomic.kelondro.kelondroBitfield;
|
|
|
|
public class indexRWIVarEntry implements indexRWIEntry {
|
|
|
|
public kelondroBitfield flags;
|
|
public long freshUntil, lastModified;
|
|
public String language, urlHash;
|
|
public char type;
|
|
public int hitcount, llocal, lother, phrasesintext, posintext,
|
|
posinphrase, posofphrase,
|
|
urlcomps, urllength, virtualAge,
|
|
worddistance, wordsintext, wordsintitle;
|
|
public double termFrequency;
|
|
|
|
public indexRWIVarEntry(indexRWIEntry e) {
|
|
this.flags = e.flags();
|
|
this.freshUntil = e.freshUntil();
|
|
this.lastModified = e.lastModified();
|
|
this.language = e.getLanguage();
|
|
this.urlHash = e.urlHash();
|
|
this.type = e.getType();
|
|
this.hitcount = e.hitcount();
|
|
this.llocal = e.llocal();
|
|
this.lother = e.lother();
|
|
this.phrasesintext = e.phrasesintext();
|
|
this.posintext = e.posintext();
|
|
this.posinphrase = e.posinphrase();
|
|
this.posofphrase = e.posofphrase();
|
|
this.urlcomps = e.urlcomps();
|
|
this.urllength = e.urllength();
|
|
this.virtualAge = e.virtualAge();
|
|
this.worddistance = 0;
|
|
this.wordsintext = e.wordsintext();
|
|
this.wordsintitle = e.wordsintitle();
|
|
this.termFrequency = 0.0;
|
|
}
|
|
|
|
public void join(indexRWIVarEntry oe) {
|
|
// combine the distance
|
|
this.worddistance = this.worddistance() + oe.worddistance() + Math.abs(this.posintext() - oe.posintext());
|
|
this.posintext = Math.min(this.posintext(), oe.posintext());
|
|
this.posinphrase = (this.posofphrase() == oe.posofphrase()) ? Math.min(this.posinphrase(), oe.posinphrase()) : 0;
|
|
this.posofphrase = Math.min(this.posofphrase(), oe.posofphrase());
|
|
|
|
// combine term frequency
|
|
this.wordsintext = this.wordsintext() + oe.wordsintext();
|
|
}
|
|
|
|
public kelondroBitfield flags() {
|
|
return flags;
|
|
}
|
|
|
|
public long freshUntil() {
|
|
return freshUntil;
|
|
}
|
|
|
|
public String getLanguage() {
|
|
return language;
|
|
}
|
|
|
|
public char getType() {
|
|
return type;
|
|
}
|
|
|
|
public int hitcount() {
|
|
return hitcount;
|
|
}
|
|
|
|
public boolean isNewer(indexRWIEntry other) {
|
|
assert false; // should not be used
|
|
return false;
|
|
}
|
|
|
|
public boolean isOlder(indexRWIEntry other) {
|
|
assert false; // should not be used
|
|
return false;
|
|
}
|
|
|
|
public long lastModified() {
|
|
return lastModified;
|
|
}
|
|
|
|
public int llocal() {
|
|
return llocal;
|
|
}
|
|
|
|
public int lother() {
|
|
return lother;
|
|
}
|
|
|
|
public int phrasesintext() {
|
|
return phrasesintext;
|
|
}
|
|
|
|
public int posinphrase() {
|
|
return posinphrase;
|
|
}
|
|
|
|
public int posintext() {
|
|
return posintext;
|
|
}
|
|
|
|
public int posofphrase() {
|
|
return posofphrase;
|
|
}
|
|
|
|
public indexRWIRowEntry toRowEntry() {
|
|
return new indexRWIRowEntry(
|
|
urlHash,
|
|
urllength, // byte-length of complete URL
|
|
urlcomps, // number of path components
|
|
wordsintitle, // length of description/length (longer are better?)
|
|
hitcount, // how often appears this word in the text
|
|
wordsintext, // total number of words
|
|
phrasesintext, // total number of phrases
|
|
posintext, // position of word in all words
|
|
posinphrase, // position of word in its phrase
|
|
posofphrase, // number of the phrase where word appears
|
|
lastModified, // last-modified time of the document where word appears
|
|
System.currentTimeMillis(), // update time;
|
|
language, // (guessed) language of document
|
|
type, // type of document
|
|
llocal, // outlinks to same domain
|
|
lother, // outlinks to other domain
|
|
flags // attributes to the url and to the word according the url
|
|
);
|
|
}
|
|
|
|
public String toPropertyForm() {
|
|
return toRowEntry().toPropertyForm();
|
|
}
|
|
|
|
public String urlHash() {
|
|
return urlHash;
|
|
}
|
|
|
|
public int urlcomps() {
|
|
return urlcomps;
|
|
}
|
|
|
|
public int urllength() {
|
|
return urllength;
|
|
}
|
|
|
|
public int virtualAge() {
|
|
return virtualAge;
|
|
}
|
|
|
|
public int worddistance() {
|
|
return worddistance;
|
|
}
|
|
|
|
public int wordsintext() {
|
|
return wordsintext;
|
|
}
|
|
|
|
public int wordsintitle() {
|
|
return wordsintitle;
|
|
}
|
|
|
|
public double termFrequency() {
|
|
if (this.termFrequency == 0.0) this.termFrequency = (((double) this.hitcount()) / ((double) (this.wordsintext() + this.wordsintitle() + 1)));
|
|
return this.termFrequency;
|
|
}
|
|
|
|
public static final void min(indexRWIVarEntry t, indexRWIVarEntry other) {
|
|
int v;
|
|
long w;
|
|
double d;
|
|
if (t.hitcount() > (v = other.hitcount())) t.hitcount = v;
|
|
if (t.llocal() > (v = other.llocal())) t.llocal = v;
|
|
if (t.lother() > (v = other.lother())) t.lother = v;
|
|
if (t.virtualAge() > (v = other.virtualAge())) t.virtualAge = v;
|
|
if (t.wordsintext() > (v = other.wordsintext())) t.wordsintext = v;
|
|
if (t.phrasesintext() > (v = other.phrasesintext())) t.phrasesintext = v;
|
|
if (t.posintext() > (v = other.posintext())) t.posintext = v;
|
|
if (t.posinphrase() > (v = other.posinphrase())) t.posinphrase = v;
|
|
if (t.posofphrase() > (v = other.posofphrase())) t.posofphrase = v;
|
|
if (t.worddistance() > (v = other.worddistance())) t.worddistance = v;
|
|
if (t.lastModified() > (w = other.lastModified())) t.lastModified = w;
|
|
if (t.freshUntil() > (w = other.freshUntil())) t.freshUntil = w;
|
|
if (t.urllength() > (v = other.urllength())) t.urllength = v;
|
|
if (t.urlcomps() > (v = other.urlcomps())) t.urlcomps = v;
|
|
if (t.wordsintitle() > (v = other.wordsintitle())) t.wordsintitle = v;
|
|
if (t.termFrequency > (d = other.termFrequency())) t.termFrequency = d;
|
|
}
|
|
|
|
public static final void max(indexRWIVarEntry t, indexRWIVarEntry other) {
|
|
int v;
|
|
long w;
|
|
double d;
|
|
if (t.hitcount() < (v = other.hitcount())) t.hitcount = v;
|
|
if (t.llocal() < (v = other.llocal())) t.llocal = v;
|
|
if (t.lother() < (v = other.lother())) t.lother = v;
|
|
if (t.virtualAge() < (v = other.virtualAge())) t.virtualAge = v;
|
|
if (t.wordsintext() < (v = other.wordsintext())) t.wordsintext = v;
|
|
if (t.phrasesintext() < (v = other.phrasesintext())) t.phrasesintext = v;
|
|
if (t.posintext() < (v = other.posintext())) t.posintext = v;
|
|
if (t.posinphrase() < (v = other.posinphrase())) t.posinphrase = v;
|
|
if (t.posofphrase() < (v = other.posofphrase())) t.posofphrase = v;
|
|
if (t.worddistance() < (v = other.worddistance())) t.worddistance = v;
|
|
if (t.lastModified() < (w = other.lastModified())) t.lastModified = w;
|
|
if (t.freshUntil() < (w = other.freshUntil())) t.freshUntil = w;
|
|
if (t.urllength() < (v = other.urllength())) t.urllength = v;
|
|
if (t.urlcomps() < (v = other.urlcomps())) t.urlcomps = v;
|
|
if (t.wordsintitle() < (v = other.wordsintitle())) t.wordsintitle = v;
|
|
if (t.termFrequency < (d = other.termFrequency())) t.termFrequency = d;
|
|
}
|
|
|
|
public static void join(indexRWIVarEntry ie1, indexRWIEntry ie2) {
|
|
// returns a modified entry of the first argument
|
|
|
|
// combine the distance
|
|
ie1.worddistance = ie1.worddistance + ((ie2 instanceof indexRWIVarEntry) ? ((indexRWIVarEntry) ie2).worddistance() : 0) + Math.abs(ie1.posintext() - ie2.posintext());
|
|
ie1.posintext = Math.min(ie1.posintext(), ie2.posintext());
|
|
ie1.posinphrase = (ie1.posofphrase() == ie2.posofphrase()) ? Math.min(ie1.posinphrase(), ie2.posinphrase()) : 0;
|
|
ie1.posofphrase = Math.min(ie1.posofphrase(), ie2.posofphrase());
|
|
|
|
// combine term frequency
|
|
ie1.termFrequency = ie1.termFrequency + ie2.termFrequency();
|
|
ie1.wordsintext = ie1.wordsintext() + ie2.wordsintext();
|
|
}
|
|
|
|
public void join(indexRWIEntry oe) {
|
|
join(this, oe);
|
|
}
|
|
|
|
}
|