// indexRWIEntry.java
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 20.05.2006 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.index ;
import de.anomic.kelondro.kelondroBase64Order ;
import de.anomic.kelondro.kelondroBitfield ;
import de.anomic.kelondro.kelondroColumn ;
import de.anomic.kelondro.kelondroRow ;
import de.anomic.kelondro.kelondroRow.Entry ;
import de.anomic.plasma.plasmaWordIndex ;
import de.anomic.yacy.yacySeedDB ;
public class indexRWIEntry implements Cloneable {
// this object stores attributes to URL references inside RWI collections
public static kelondroRow urlEntryRow = new kelondroRow ( new kelondroColumn [ ] {
new kelondroColumn ( "h" , kelondroColumn . celltype_string , kelondroColumn . encoder_bytes , yacySeedDB . commonHashLength , "urlhash" ) ,
new kelondroColumn ( "a" , kelondroColumn . celltype_cardinal , kelondroColumn . encoder_b256 , 2 , "lastModified" ) ,
new kelondroColumn ( "s" , kelondroColumn . celltype_cardinal , kelondroColumn . encoder_b256 , 2 , "freshUntil" ) ,
new kelondroColumn ( "u" , kelondroColumn . celltype_cardinal , kelondroColumn . encoder_b256 , 1 , "wordsInTitle" ) ,
new kelondroColumn ( "w" , kelondroColumn . celltype_cardinal , kelondroColumn . encoder_b256 , 2 , "wordsInText" ) ,
new kelondroColumn ( "p" , kelondroColumn . celltype_cardinal , kelondroColumn . encoder_b256 , 2 , "phrasesInText" ) ,
new kelondroColumn ( "d" , kelondroColumn . celltype_binary , kelondroColumn . encoder_bytes , 1 , "doctype" ) ,
new kelondroColumn ( "l" , kelondroColumn . celltype_string , kelondroColumn . encoder_bytes , 2 , "language" ) ,
new kelondroColumn ( "x" , kelondroColumn . celltype_cardinal , kelondroColumn . encoder_b256 , 1 , "llocal" ) ,
new kelondroColumn ( "y" , kelondroColumn . celltype_cardinal , kelondroColumn . encoder_b256 , 1 , "lother" ) ,
new kelondroColumn ( "m" , kelondroColumn . celltype_cardinal , kelondroColumn . encoder_b256 , 1 , "urlLength" ) ,
new kelondroColumn ( "n" , kelondroColumn . celltype_cardinal , kelondroColumn . encoder_b256 , 1 , "urlComps" ) ,
new kelondroColumn ( "g" , kelondroColumn . celltype_binary , kelondroColumn . encoder_bytes , 1 , "typeofword" ) ,
new kelondroColumn ( "z" , kelondroColumn . celltype_bitfield , kelondroColumn . encoder_bytes , 4 , "flags" ) ,
new kelondroColumn ( "c" , kelondroColumn . celltype_cardinal , kelondroColumn . encoder_b256 , 1 , "hitcount" ) ,
new kelondroColumn ( "t" , kelondroColumn . celltype_cardinal , kelondroColumn . encoder_b256 , 2 , "posintext" ) ,
new kelondroColumn ( "r" , kelondroColumn . celltype_cardinal , kelondroColumn . encoder_b256 , 1 , "posinphrase" ) ,
new kelondroColumn ( "o" , kelondroColumn . celltype_cardinal , kelondroColumn . encoder_b256 , 1 , "posofphrase" ) ,
new kelondroColumn ( "i" , kelondroColumn . celltype_cardinal , kelondroColumn . encoder_b256 , 1 , "worddistance" ) ,
new kelondroColumn ( "k" , kelondroColumn . celltype_cardinal , kelondroColumn . encoder_b256 , 1 , "reserve" )
} ,
kelondroBase64Order . enhancedCoder ,
0 ) ;
// available chars: b,e,j,q
// static properties
private static final int col_urlhash = 0 ; // h 12 the url hash b64-encoded
private static final int col_lastModified = 1 ; // a 2 last-modified time of the document where word appears
private static final int col_freshUntil = 2 ; // s 2 TTL for the word, so it can be removed easily if the TTL is short
private static final int col_wordsInTitle = 3 ; // u 1 words in description/length (longer are better?)
private static final int col_wordsInText = 4 ; // w 2 total number of words in document
private static final int col_phrasesInText = 5 ; // p 2 total number of phrases in document
private static final int col_doctype = 6 ; // d 1 type of document
private static final int col_language = 7 ; // l 2 (guessed) language of document
private static final int col_llocal = 8 ; // x 1 outlinks to same domain
private static final int col_lother = 9 ; // y 1 outlinks to other domain
private static final int col_urlLength = 10 ; // m 1 byte-length of complete URL
private static final int col_urlComps = 11 ; // n 1 number of path components
// dynamic properties
private static final int col_typeofword = 12 ; // g 1 grammatical classification
private static final int col_flags = 13 ; // z 4 b64-encoded appearance flags (24 bit, see definition below)
private static final int col_hitcount = 14 ; // c 1 number of occurrences of this word in text
private static final int col_posintext = 15 ; // t 2 first appearance of word in text
private static final int col_posinphrase = 16 ; // r 1 position of word in its phrase
private static final int col_posofphrase = 17 ; // o 1 number of the phrase where word appears
private static final int col_worddistance = 18 ; // i 1 initial zero; may be used as reserve: is filled during search
private static final int col_reserve = 19 ; // k 1 reserve
// appearance flags, used in RWI entry
// the flags 0..23 are identical to the category flags in plasmaCondenser
public static final int flag_app_url = 24 ; // word appears in url
public static final int flag_app_descr = 25 ; // word appears in headline (or any description part)
public static final int flag_app_author = 26 ; // word appears in author
public static final int flag_app_tags = 27 ; // word appears in header tags
public static final int flag_app_reference = 28 ; // word appears in anchor description text (the reference to an url), or any alternative text field of a link
public static final int flag_app_emphasized = 29 ; // word is emphasized in text (i.e. bold, italics, special size)
private kelondroRow . Entry entry ;
public indexRWIEntry ( String urlHash ,
int urlLength , // byte-length of complete URL
int urlComps , // number of path components
int titleLength , // length of description/length (longer are better?)
int hitcount , // how often appears this word in the text
int wordcount , // total number of words
int phrasecount , // total number of phrases
int posintext , // position of word in all words
int posinphrase , // position of word in its phrase
int posofphrase , // number of the phrase where word appears
int worddistance , // word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search
int sizeOfPage , // # of bytes of the page TODO: not needed any more
long lastmodified , // last-modified time of the document where word appears
long updatetime , // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
String language , // (guessed) language of document
char doctype , // type of document
int outlinksSame , // outlinks to same domain
int outlinksOther , // outlinks to other domain
kelondroBitfield flags // attributes to the url and to the word according the url
) {
assert ( urlHash . length ( ) = = 12 ) : "urlhash = " + urlHash ;
if ( ( language = = null ) | | ( language . length ( ) ! = urlEntryRow . width ( col_language ) ) ) language = "uk" ;
this . entry = urlEntryRow . newEntry ( ) ;
int mddlm = plasmaWordIndex . microDateDays ( lastmodified ) ;
int mddct = plasmaWordIndex . microDateDays ( updatetime ) ;
this . entry . setCol ( col_urlhash , urlHash , null ) ;
this . entry . setCol ( col_lastModified , mddlm ) ;
this . entry . setCol ( col_freshUntil , Math . max ( 0 , mddlm + ( mddct - mddlm ) * 2 ) ) ; // TTL computation
this . entry . setCol ( col_wordsInTitle , titleLength / 6 ) ; // word count estimation; TODO: change value handover to number of words
this . entry . setCol ( col_wordsInText , wordcount ) ;
this . entry . setCol ( col_phrasesInText , phrasecount ) ;
this . entry . setCol ( col_doctype , new byte [ ] { ( byte ) doctype } ) ;
this . entry . setCol ( col_language , language , null ) ;
this . entry . setCol ( col_llocal , outlinksSame ) ;
this . entry . setCol ( col_lother , outlinksOther ) ;
this . entry . setCol ( col_urlLength , urlLength ) ;
this . entry . setCol ( col_urlComps , urlComps ) ;
this . entry . setCol ( col_typeofword , new byte [ ] { ( byte ) 0 } ) ; // TODO: grammatical classification
this . entry . setCol ( col_flags , flags . bytes ( ) ) ;
this . entry . setCol ( col_hitcount , hitcount ) ;
this . entry . setCol ( col_posintext , posintext ) ;
this . entry . setCol ( col_posinphrase , posinphrase ) ;
this . entry . setCol ( col_posofphrase , posofphrase ) ;
this . entry . setCol ( col_worddistance , worddistance ) ;
this . entry . setCol ( col_reserve , 0 ) ;
}
public indexRWIEntry ( String urlHash , String code ) {
// the code is the external form of the row minus the leading urlHash entry
this . entry = urlEntryRow . newEntry ( ( urlHash + code ) . getBytes ( ) ) ;
}
public indexRWIEntry ( String external ) {
this . entry = urlEntryRow . newEntry ( external , true ) ;
}
public indexRWIEntry ( byte [ ] row ) {
this . entry = urlEntryRow . newEntry ( row ) ;
}
public indexRWIEntry ( kelondroRow . Entry rentry ) {
// FIXME: see if cloning is necessary
this . entry = rentry ;
}
public static int days ( long time ) {
// calculates the number of days since 1.1.1970 and returns this as 4-byte array
return ( int ) ( time / 86400000 ) ;
}
public Object clone ( ) {
byte [ ] b = new byte [ urlEntryRow . objectsize ( ) ] ;
System . arraycopy ( entry . bytes ( ) , 0 , b , 0 , urlEntryRow . objectsize ( ) ) ;
return new indexRWIEntry ( b ) ;
}
public String toPropertyForm ( ) {
return entry . toPropertyForm ( true , true , false ) ;
}
public Entry toKelondroEntry ( ) {
return this . entry ;
}
public String urlHash ( ) {
return this . entry . getColString ( col_urlhash , null ) ;
}
public int quality ( ) {
return 0 ; // not used any more
}
public int virtualAge ( ) {
return ( int ) this . entry . getColLong ( col_lastModified ) ; // this is the time in MicoDateDays format
}
public long lastModified ( ) {
return plasmaWordIndex . reverseMicroDateDays ( ( int ) this . entry . getColLong ( col_lastModified ) ) ;
}
public long freshUntil ( ) {
return plasmaWordIndex . reverseMicroDateDays ( ( int ) this . entry . getColLong ( col_freshUntil ) ) ;
}
public int hitcount ( ) {
return ( int ) this . entry . getColLong ( col_hitcount ) ;
}
public int posintext ( ) {
return ( int ) this . entry . getColLong ( col_posintext ) ;
}
public int posinphrase ( ) {
return ( int ) this . entry . getColLong ( col_posinphrase ) ;
}
public int posofphrase ( ) {
return ( int ) this . entry . getColLong ( col_posofphrase ) ;
}
public int wordsintext ( ) {
return ( int ) this . entry . getColLong ( col_wordsInText ) ;
}
public int phrasesintext ( ) {
return ( int ) this . entry . getColLong ( col_phrasesInText ) ;
}
public String getLanguage ( ) {
return this . entry . getColString ( col_language , null ) ;
}
public char getType ( ) {
return ( char ) this . entry . getColByte ( col_doctype ) ;
}
public int wordsintitle ( ) {
return ( int ) this . entry . getColLong ( col_wordsInTitle ) ;
}
public int llocal ( ) {
return ( int ) this . entry . getColLong ( col_llocal ) ;
}
public int lother ( ) {
return ( int ) this . entry . getColLong ( col_lother ) ;
}
public int urllength ( ) {
return ( int ) this . entry . getColLong ( col_urlLength ) ;
}
public int urlcomps ( ) {
return ( int ) this . entry . getColLong ( col_urlComps ) ;
}
public kelondroBitfield flags ( ) {
return new kelondroBitfield ( this . entry . getColBytes ( col_flags ) ) ;
}
public String toString ( ) {
return toPropertyForm ( ) ;
}
public static indexRWIEntry combineDistance ( indexRWIEntry ie1 , indexRWIEntry ie2 ) {
// returns a modified entry of the first argument
ie1 . entry . setCol ( col_worddistance , ie1 . worddistance ( ) + ie2 . worddistance ( ) + Math . abs ( ie1 . posintext ( ) - ie2 . posintext ( ) ) ) ;
ie1 . entry . setCol ( col_posintext , Math . min ( ie1 . posintext ( ) , ie2 . posintext ( ) ) ) ;
ie1 . entry . setCol ( col_posinphrase , ( ie1 . posofphrase ( ) = = ie2 . posofphrase ( ) ) ? ie1 . posofphrase ( ) : 0 /*unknown*/ ) ;
ie1 . entry . setCol ( col_posofphrase , Math . min ( ie1 . posofphrase ( ) , ie2 . posofphrase ( ) ) ) ;
ie1 . entry . setCol ( col_wordsInText , ( ie1 . wordsintext ( ) + ie2 . wordsintext ( ) ) / 2 ) ;
return ie1 ;
}
public void combineDistance ( indexRWIEntry oe ) {
combineDistance ( this , oe ) ;
}
public int worddistance ( ) {
return ( int ) this . entry . getColLong ( col_worddistance ) ;
}
public static final void min ( indexRWIEntry t , indexRWIEntry other ) {
int v ;
long w ;
if ( t . hitcount ( ) > ( v = other . hitcount ( ) ) ) t . entry . setCol ( col_hitcount , other . hitcount ( ) ) ;
if ( t . wordsintext ( ) > ( v = other . wordsintext ( ) ) ) t . entry . setCol ( col_wordsInText , v ) ;
if ( t . phrasesintext ( ) > ( v = other . phrasesintext ( ) ) ) t . entry . setCol ( col_phrasesInText , v ) ;
if ( t . posintext ( ) > ( v = other . posintext ( ) ) ) t . entry . setCol ( col_posintext , v ) ;
if ( t . posinphrase ( ) > ( v = other . posinphrase ( ) ) ) t . entry . setCol ( col_posinphrase , v ) ;
if ( t . posofphrase ( ) > ( v = other . posofphrase ( ) ) ) t . entry . setCol ( col_posofphrase , v ) ;
if ( t . worddistance ( ) > ( v = other . worddistance ( ) ) ) t . entry . setCol ( col_worddistance , v ) ;
if ( t . lastModified ( ) > ( w = other . lastModified ( ) ) ) t . entry . setCol ( col_lastModified , w ) ;
if ( t . urllength ( ) > ( v = other . urllength ( ) ) ) t . entry . setCol ( col_urlLength , v ) ;
if ( t . urlcomps ( ) > ( v = other . urlcomps ( ) ) ) t . entry . setCol ( col_urlComps , v ) ;
if ( t . wordsintitle ( ) > ( v = other . wordsintitle ( ) ) ) t . entry . setCol ( col_wordsInTitle , v ) ;
}
public static final void max ( indexRWIEntry t , indexRWIEntry other ) {
int v ;
long w ;
if ( t . hitcount ( ) < ( v = other . hitcount ( ) ) ) t . entry . setCol ( col_hitcount , v ) ;
if ( t . wordsintext ( ) < ( v = other . wordsintext ( ) ) ) t . entry . setCol ( col_wordsInText , v ) ;
if ( t . phrasesintext ( ) < ( v = other . phrasesintext ( ) ) ) t . entry . setCol ( col_phrasesInText , v ) ;
if ( t . posintext ( ) < ( v = other . posintext ( ) ) ) t . entry . setCol ( col_posintext , v ) ;
if ( t . posinphrase ( ) < ( v = other . posinphrase ( ) ) ) t . entry . setCol ( col_posinphrase , v ) ;
if ( t . posofphrase ( ) < ( v = other . posofphrase ( ) ) ) t . entry . setCol ( col_posofphrase , v ) ;
if ( t . worddistance ( ) < ( v = other . worddistance ( ) ) ) t . entry . setCol ( col_worddistance , v ) ;
if ( t . lastModified ( ) < ( w = other . lastModified ( ) ) ) t . entry . setCol ( col_lastModified , w ) ;
if ( t . urllength ( ) < ( v = other . urllength ( ) ) ) t . entry . setCol ( col_urlLength , v ) ;
if ( t . urlcomps ( ) < ( v = other . urlcomps ( ) ) ) t . entry . setCol ( col_urlComps , v ) ;
if ( t . wordsintitle ( ) < ( v = other . wordsintitle ( ) ) ) t . entry . setCol ( col_wordsInTitle , v ) ;
}
public void min ( indexRWIEntry other ) {
min ( this , other ) ;
}
public void max ( indexRWIEntry other ) {
max ( this , other ) ;
}
public boolean isNewer ( indexRWIEntry other ) {
if ( other = = null ) return true ;
if ( this . lastModified ( ) > other . lastModified ( ) ) return true ;
if ( this . lastModified ( ) = = other . lastModified ( ) ) {
if ( this . quality ( ) > other . quality ( ) ) return true ;
}
return false ;
}
public boolean isOlder ( indexRWIEntry other ) {
if ( other = = null ) return false ;
if ( this . lastModified ( ) < other . lastModified ( ) ) return true ;
if ( this . lastModified ( ) = = other . lastModified ( ) ) {
if ( this . quality ( ) < other . quality ( ) ) return true ;
}
return false ;
}
}