@ -67,24 +67,26 @@ public final class plasmaWordIndexEntry {
// length of a URL hash, taken over from the seed database's common hash length
public static final int urlHashLength = yacySeedDB . commonHashLength ; // 12
// the size of the index entry attributes
//public static final int attrSpaceShort = 12;
// 12-character head + posintext, posinphrase, posofphrase (2 characters each)
public static final int attrSpaceLong = 18 ;
// length of the full encoded attribute string; used to presize the buffer in toEncodedForm()
public static final int attrSpace = 24 ;
// the associated hash
private final String urlHash ;
// discrete values
private int count ; // NOTE(review): looks like the older name of hitcount (both are decoded from the same characters of the code string) - confirm before removing
private int hitcount ; // how often the word appears in the text
private int wordcount ; // total number of words
private int phrasecount ; // total number of phrases
private int posintext ; // first position of the word in text as number of word; 0=unknown or irrelevant position
private int posinphrase ; // position within a phrase of the word
private int posofphrase ; // position of the phrase in the text as count of sentences; 0=unknown; 1=path; 2=keywords; 3=headline; >4: in text
private int worddistance ; // 0 by default; grows by the posintext difference when two entries are combined, see combineDistance()
private long lastModified ; // calculated by using last-modified
private int quality ; // result of a heuristic on the source file
private byte [ ] language ; // essentially the country code (the TLD as heuristic), two letters lowercase only
private char doctype ; // type of source, one of the DT_* codes
private char localflag ; // indicates if the index was created locally; compared against LT_LOCAL in isLocal()
// doctypes (one-character codes stored in the doctype field):
public static final char DT_PDFPS = 'p' ;
public static final char DT_TEXT = 't' ;
public static final char DT_HTML = 'h' ;
@ -97,6 +99,19 @@ public final class plasmaWordIndexEntry {
public static final char DT_BINARY = 'b' ;
public static final char DT_UNKNOWN = 'u' ;
// appearance locations: (used for flags)
// NOTE(review): the consecutive values look like bit positions in a flag word; no use is visible in this part of the file - confirm
public static final int AP_TITLE = 0 ; // title tag from html header
public static final int AP_H1 = 1 ; // h1 heading tag (was documented as "h0-tag", presumably a typo)
public static final int AP_H2 = 2 ; // AP_H2..AP_H6: presumably the h2..h6 heading tags, by analogy with AP_H1
public static final int AP_H3 = 3 ;
public static final int AP_H4 = 4 ;
public static final int AP_H5 = 5 ;
public static final int AP_H6 = 6 ;
public static final int AP_ANCHOR = 7 ; // anchor description
public static final int AP_URL = 8 ; // word inside an url
public static final int AP_IMG = 9 ; // tag inside image references
public static final int AP_TAG = 10 ; // for tagged indexing (e.g. using mp3 tags)
// local flag attributes
public static final char LT_LOCAL = 'L' ; // localflag value for which isLocal() returns true
public static final char LT_GLOBAL = 'G' ; // presumably the localflag value for entries that were not created locally
@ -187,23 +202,22 @@ public final class plasmaWordIndexEntry {
// the class instantiation can only be done by a plasmaStore method
// therefore they are all public
public plasmaWordIndexEntry ( String urlHash ,
int count , // how often appears this word in the text
int posintext ,
int posinphrase ,
int posofphrase ,
long time ,
int quality ,
String language ,
char doctype ,
int hitcount , // how often appears this word in the text
int wordcount , // total number of words
int phrasecount , // total number of phrases
int posintext , // position of word in all words
int posinphrase , // position of word in its phrase
int posofphrase , // number of the phrase where word appears
int distance , // word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search
long time , // last-modified time of the document where word appears
int quality , //
String language , //
char doctype , //
boolean local ) {
// more needed attributes:
// - int: length of text / total number of words
// - int: length of text / total number of sentences
// - long: update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
// - int: word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultaneous search). If stored, this shows that the result was obtained by remote search
// - char: category of appearance (header, title, section, text, anchor-descr, image-tag etc)
// - boolean: appears in title, appears in header, appears in ....
// - boolean: appears in title, appears in header, anchor-descr, image-tag etc
// - int: url-length (shorter are better)
// - int: url-number of components / length of path
// - int: length of description tag / title tag (longer are better)
@ -211,10 +225,13 @@ public final class plasmaWordIndexEntry {
if ( ( language = = null ) | | ( language . length ( ) ! = plasmaURL . urlLanguageLength ) ) language = "uk" ;
this . urlHash = urlHash ;
this . count = count ;
this . hitcount = hitcount ;
this . wordcount = wordcount ;
this . phrasecount = phrasecount ;
this . posintext = posintext ;
this . posinphrase = posinphrase ;
this . posofphrase = posofphrase ;
this . worddistance = distance ;
this . lastModified = time ;
this . quality = quality ;
this . language = language . getBytes ( ) ;
@ -225,15 +242,18 @@ public final class plasmaWordIndexEntry {
public plasmaWordIndexEntry ( String urlHash , String code ) {
// the code is not parsed but used later on
this . urlHash = urlHash ;
this . count = ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( code . substring ( 6 , 8 ) ) ;
this . posintext = ( code . length ( ) > = 14 ) ? ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( code . substring ( 12 , 14 ) ) : 0 ;
this . posinphrase = ( code . length ( ) > = 15 ) ? ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( code . substring ( 14 , 16 ) ) : 0 ;
this . posofphrase = ( code . length ( ) > = 16 ) ? ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( code . substring ( 16 , 18 ) ) : 0 ;
this . hitcount = ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( code . substring ( 6 , 8 ) ) ;
this . lastModified = plasmaWordIndex . reverseMicroDateDays ( ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( code . substring ( 3 , 6 ) ) ) ;
this . quality = ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( code . substring ( 0 , 3 ) ) ;
this . language = code . substring ( 8 , 10 ) . getBytes ( ) ;
this . doctype = code . charAt ( 10 ) ;
this . localflag = code . charAt ( 11 ) ;
this . posintext = ( code . length ( ) > = 14 ) ? ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( code . substring ( 12 , 14 ) ) : 0 ;
this . posinphrase = ( code . length ( ) > = 15 ) ? ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( code . substring ( 14 , 16 ) ) : 0 ;
this . posofphrase = ( code . length ( ) > = 17 ) ? ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( code . substring ( 16 , 18 ) ) : 0 ;
this . worddistance = ( code . length ( ) > = 19 ) ? ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( code . substring ( 18 , 20 ) ) : 0 ;
this . wordcount = ( code . length ( ) > = 21 ) ? ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( code . substring ( 20 , 22 ) ) : 0 ;
this . phrasecount = ( code . length ( ) > = 23 ) ? ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( code . substring ( 22 , 24 ) ) : 0 ;
}
public plasmaWordIndexEntry ( String external ) {
@ -246,10 +266,13 @@ public final class plasmaWordIndexEntry {
}
// set values
this . urlHash = pr . getProperty ( "h" , "" ) ;
this . count = ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( pr . getProperty ( "c" , "A" ) ) ;
this . hitcount = ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( pr . getProperty ( "c" , "A" ) ) ;
this . wordcount = ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( pr . getProperty ( "w" , "__" ) ) ;
this . phrasecount = ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( pr . getProperty ( "p" , "__" ) ) ;
this . posintext = ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( pr . getProperty ( "t" , "__" ) ) ;
this . posinphrase = ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( pr . getProperty ( "r" , "__" ) ) ;
this . posofphrase = ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( pr . getProperty ( "o" , "__" ) ) ;
this . worddistance = ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( pr . getProperty ( "i" , "__" ) ) ;
this . lastModified = plasmaWordIndex . reverseMicroDateDays ( ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( pr . getProperty ( "a" , "A" ) ) ) ;
this . quality = ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( pr . getProperty ( "q" , "__" ) ) ;
this . language = pr . getProperty ( "l" , "uk" ) . getBytes ( ) ;
@ -260,85 +283,64 @@ public final class plasmaWordIndexEntry {
public String toEncodedForm ( ) {
// attention: this integrates NOT the URL hash into the encoding
// if you need a complete dump, use toExternalForm()
StringBuffer buf = new StringBuffer ( attrSpace Long ) ;
StringBuffer buf = new StringBuffer ( attrSpace ) ;
buf . append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( this . quality , plasmaURL . urlQualityLength ) )
. append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( plasmaWordIndex . microDateDays ( this . lastModified ) , 3 ) )
. append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( this . count, 2 ) )
. append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( this . hit count, 2 ) )
. append ( new String ( this . language ) )
. append ( this . doctype )
. append ( this . localflag ) ; // 3 + 3 + 2 + 2 + 1 + 1 = 12 bytes
buf . append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( this . posintext , 2 ) )
. append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( this . posinphrase , 2 ) )
. append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( this . posofphrase , 2 ) ) ;
. append ( this . localflag )
. append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( this . posintext , 2 ) )
. append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( this . posinphrase , 2 ) )
. append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( this . posofphrase , 2 ) )
. append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( this . worddistance , 2 ) )
. append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( this . wordcount , 2 ) )
. append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( this . phrasecount , 2 ) ) ; // 3+3+2+2+1+1+2+2+2+2+2+2= 24 bytes
return buf . toString ( ) ;
}
}
public String toExternalForm ( ) {
public String toExternalForm ( ) {
StringBuffer str = new StringBuffer ( 61 ) ;
str . append ( "{" )
. append ( "h=" ) . append ( this . urlHash )
. append ( "h=" ) . append ( this . urlHash )
. append ( ",q=" ) . append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( this . quality , plasmaURL . urlQualityLength ) )
. append ( ",a=" ) . append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( plasmaWordIndex . microDateDays ( this . lastModified ) , 3 ) )
. append ( ",c=" ) . append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( this . count, 2 ) )
. append ( ",c=" ) . append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( this . hit count, 2 ) )
. append ( ",l=" ) . append ( new String ( this . language ) )
. append ( ",d=" ) . append ( this . doctype )
. append ( ",f=" ) . append ( this . localflag )
. append ( ",t=" ) . append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( this . posintext , 2 ) )
. append ( ",r=" ) . append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( this . posinphrase , 2 ) )
. append ( ",o=" ) . append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( this . posofphrase , 2 ) )
. append ( ",i=" ) . append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( this . worddistance , 2 ) )
. append ( ",w=" ) . append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( this . wordcount , 2 ) )
. append ( ",p=" ) . append ( kelondroBase64Order . enhancedCoder . encodeLongSmart ( this . phrasecount , 2 ) )
. append ( "}" ) ;
return str . toString ( ) ;
}
public String getUrlHash ( ) {
return urlHash ;
}
public int getQuality ( ) {
return quality ;
}
public int getVirtualAge ( ) {
return plasmaWordIndex . microDateDays ( lastModified ) ;
}
public long getLastModified ( ) {
return lastModified ;
public void combineDistance ( plasmaWordIndexEntry oe ) {
this . worddistance = this . worddistance + oe . worddistance + Math . abs ( this . posintext - oe . posintext ) ;
}
public int getCount ( ) {
return count ;
}
public int posintext ( ) {
return posintext ;
}
public int posinphrase ( ) {
return posinphrase ;
}
public int posofphrase ( ) {
return posofphrase ;
}
public String getLanguage ( ) {
return new String ( language ) ;
}
public char getType ( ) {
return doctype ;
}
public boolean isLocal ( ) {
return localflag = = LT_LOCAL ;
}
public String getUrlHash ( ) { return urlHash ; }
public int getQuality ( ) { return quality ; }
public int getVirtualAge ( ) { return plasmaWordIndex . microDateDays ( lastModified ) ; }
public long getLastModified ( ) { return lastModified ; }
public int getCount ( ) { return hitcount ; }
public int posintext ( ) { return posintext ; }
public int posinphrase ( ) { return posinphrase ; }
public int posofphrase ( ) { return posofphrase ; }
public int worddistance ( ) { return worddistance ; }
public int wordcount ( ) { return wordcount ; }
public int phrasecount ( ) { return phrasecount ; }
public String getLanguage ( ) { return new String ( language ) ; }
public char getType ( ) { return doctype ; }
public boolean isLocal ( ) { return localflag = = LT_LOCAL ; }
public static void main ( String [ ] args ) {
// outputs the word hash to a given word