@ -9,7 +9,7 @@
// $LastChangedBy$
//
// LICENSE
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@ -46,9 +46,8 @@ import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.order.Digest ;
import net.yacy.kelondro.order.NaturalOrder ;
import net.yacy.kelondro.util.ByteBuffer ;
import net.yacy.kelondro.util.kelondroException ;
import net.yacy.kelondro.util.MapTools ;
import net.yacy.kelondro.util.kelondroException ;
import de.anomic.crawler.retrieval.Request ;
import de.anomic.search.QueryParams ;
import de.anomic.tools.crypt ;
@ -56,7 +55,7 @@ import de.anomic.tools.crypt;
public class URIMetadataRow implements URIMetadata {
// this object stores attributes for URL entries
public static final Row rowdef = new Row (
"String hash-12, " + // the url's hash
"String comp-360, " + // components: the url, description, author, tags and publisher
@ -70,15 +69,15 @@ public class URIMetadataRow implements URIMetadata {
"byte[] dt-1, " + // doctype, taken from extension or any other heuristic
"Bitfield flags-4, " + // flags; any stuff (see Word-Entity definition)
"byte[] lang-2, " + // language
"Cardinal llocal-2 {b256}, " + // # of outlinks to same domain; for video and image: width
"Cardinal llocal-2 {b256}, " + // # of outlinks to same domain; for video and image: width
"Cardinal lother-2 {b256}, " + // # of outlinks to outside domain; for video and image: height
"Cardinal limage-2 {b256}, " + // # of embedded image links
"Cardinal laudio-2 {b256}, " + // # of embedded audio links; for audio: track number; for video: number of audio tracks
"Cardinal lvideo-2 {b256}, " + // # of embedded video links
"Cardinal lapp-2 {b256}" , // # of embedded links to applications
Base64Order . enhancedCoder
) ;
) ;
/* =========================================================================
 * Constants to access the various columns of a URL entry
 * ========================================================================= */
@ -100,13 +99,13 @@ public class URIMetadataRow implements URIMetadata {
private static final int col_laudio = 15 ; // # of embedded audio links; for audio: track number; for video: number of audio tracks
private static final int col_lvideo = 16 ; // # of embedded video links
private static final int col_lapp = 17 ; // # of embedded links to applications
private final Row . Entry entry ;
private final String snippet ;
private WordReferenceVars word ; // this is only used if the url is transported via remote search requests
private final long ranking ; // during generation of a search result this value is set
private Components comp ;
public URIMetadataRow ( ) {
// create a dummy entry, good to produce poison objects
this . entry = rowdef . newEntry ( ) ;
@ -115,14 +114,14 @@ public class URIMetadataRow implements URIMetadata {
this . ranking = 0 ;
this . comp = null ;
}
public URIMetadataRow (
final DigestURI url ,
final String dc_title ,
final String dc_creator ,
final String dc_subject ,
final String dc_publisher ,
final float lon , final float lat , // decimal degrees as in WGS84; if unknown both values may be 0.0f;
final float lon , final float lat , // decimal degrees as in WGS84; if unknown both values may be 0.0f;
final Date mod ,
final Date load ,
final Date fresh ,
@ -173,14 +172,14 @@ public class URIMetadataRow implements URIMetadata {
}
private Date decodeDate ( final int col ) {
long t = this . entry . getColLong ( col ) ;
final long t = this . entry . getColLong ( col ) ;
/*if (t < 14600) */ return new Date ( 86400000L * t ) ; // time was stored as number of days since epoch
/ *
if ( t < 350400 ) return new Date ( 3600000L * t ) ; // hours since epoch
if ( t < 21024000 ) return new Date ( 60000L * t ) ; // minutes since epoch
* /
}
public static byte [ ] encodeComp (
final DigestURI url ,
final String dc_title ,
@ -198,7 +197,7 @@ public class URIMetadataRow implements URIMetadata {
if ( lon = = 0.0f & & lat = = 0.0f ) s . append ( 10 ) ; else s . append ( Float . toString ( lat ) ) . append ( ',' ) . append ( Float . toString ( lon ) ) . append ( 10 ) ;
return UTF8 . getBytes ( s . toString ( ) ) ;
}
public URIMetadataRow ( final Row . Entry entry , final WordReferenceVars searchedWord , final long ranking ) {
this . entry = entry ;
this . snippet = null ;
@ -223,14 +222,14 @@ public class URIMetadataRow implements URIMetadata {
String dc_publisher = crypt . simpleDecode ( prop . getProperty ( "publisher" , "" ) , null ) ; if ( dc_publisher = = null ) dc_publisher = "" ;
String lons = crypt . simpleDecode ( prop . getProperty ( "lon" , "0.0" ) , null ) ; if ( lons = = null ) lons = "0.0" ;
String lats = crypt . simpleDecode ( prop . getProperty ( "lat" , "0.0" ) , null ) ; if ( lats = = null ) lats = "0.0" ;
this . entry = rowdef . newEntry ( ) ;
this . entry . setCol ( col_hash , url . hash ( ) ) ; // FIXME potential null pointer access
this . entry . setCol ( col_comp , encodeComp ( url , descr , dc_creator , tags , dc_publisher , Float . parseFloat ( lats ) , Float . parseFloat ( lons ) ) ) ;
// create new formatters to make concurrency possible
GenericFormatter formatter = new GenericFormatter ( GenericFormatter . FORMAT_SHORT_DAY , GenericFormatter . time_minute ) ;
final GenericFormatter formatter = new GenericFormatter ( GenericFormatter . FORMAT_SHORT_DAY , GenericFormatter . time_minute ) ;
try {
encodeDate ( col_mod , formatter . parse ( prop . getProperty ( "mod" , "20000101" ) ) ) ;
} catch ( final ParseException e ) {
@ -250,7 +249,7 @@ public class URIMetadataRow implements URIMetadata {
this . entry . setCol ( col_md5 , Digest . decodeHex ( prop . getProperty ( "md5" , "" ) ) ) ;
this . entry . setCol ( col_size , Integer . parseInt ( prop . getProperty ( "size" , "0" ) ) ) ;
this . entry . setCol ( col_wc , Integer . parseInt ( prop . getProperty ( "wc" , "0" ) ) ) ;
String dt = prop . getProperty ( "dt" , "t" ) ;
final String dt = prop . getProperty ( "dt" , "t" ) ;
this . entry . setCol ( col_dt , dt . length ( ) > 0 ? new byte [ ] { ( byte ) dt . charAt ( 0 ) } : new byte [ ] { ( byte ) 't' } ) ;
final String flags = prop . getProperty ( "flags" , "AAAAAA" ) ;
this . entry . setCol ( col_flags , ( flags . length ( ) > 6 ) ? QueryParams . empty_constraint . bytes ( ) : ( new Bitfield ( 4 , flags ) ) . bytes ( ) ) ;
@ -285,14 +284,14 @@ public class URIMetadataRow implements URIMetadata {
private StringBuilder corePropList ( ) {
// generate a parseable string; this is a simple property-list
final Components metadata = this . metadata( ) ;
final Components metadata = metadata( ) ;
final StringBuilder s = new StringBuilder ( 300 ) ;
if ( metadata = = null ) return null ;
//System.out.println("author=" + comp.author());
// create new formatters to make concurrency possible
GenericFormatter formatter = new GenericFormatter ( GenericFormatter . FORMAT_SHORT_DAY , GenericFormatter . time_minute ) ;
final GenericFormatter formatter = new GenericFormatter ( GenericFormatter . FORMAT_SHORT_DAY , GenericFormatter . time_minute ) ;
try {
s . append ( "hash=" ) . append ( ASCII . String ( hash ( ) ) ) ;
assert ( s . toString ( ) . indexOf ( 0 ) < 0 ) ;
@ -342,15 +341,15 @@ public class URIMetadataRow implements URIMetadata {
assert ( s . toString ( ) . indexOf ( 0 ) < 0 ) ;
s . append ( ",lapp=" ) . append ( lapp ( ) ) ;
assert ( s . toString ( ) . indexOf ( 0 ) < 0 ) ;
if ( this . word ! = null ) {
// append also word properties
s . append ( ",wi=" ) . append ( Base64Order . enhancedCoder . encodeString ( word. toPropertyForm ( ) ) ) ;
s . append ( ",wi=" ) . append ( Base64Order . enhancedCoder . encodeString ( this . word. toPropertyForm ( ) ) ) ;
}
assert ( s . toString ( ) . indexOf ( 0 ) < 0 ) ;
return s ;
} catch ( final Exception e ) {
} catch ( final Throwable e ) {
// serverLog.logFailure("plasmaLURL.corePropList", e.getMessage());
// if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null");
// if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null");
@ -374,13 +373,13 @@ public class URIMetadataRow implements URIMetadata {
/**
 * Returns the ranking value stored in this entry.
 *
 * @return the content of the {@code ranking} field
 */
public long ranking() {
    final long rank = this.ranking;
    return rank;
}
public Components metadata ( ) {
// avoid double computation of metadata elements
if ( this . comp ! = null ) return this . comp ;
// parse elements from comp field;
byte [ ] c = this . entry . getColBytes ( col_comp , true ) ;
List < byte [ ] > cl = ByteBuffer . split ( c , ( byte ) 10 ) ;
final byte [ ] c = this . entry . getColBytes ( col_comp , true ) ;
final List < byte [ ] > cl = ByteBuffer . split ( c , ( byte ) 10 ) ;
this . comp = new Components (
( cl . size ( ) > 0 ) ? UTF8 . String ( cl . get ( 0 ) ) : "" ,
hash ( ) ,
@ -391,7 +390,7 @@ public class URIMetadataRow implements URIMetadata {
( cl . size ( ) > 5 ) ? UTF8 . String ( cl . get ( 5 ) ) : "" ) ;
return this . comp ;
}
/**
 * Returns the date decoded from the {@code col_mod} column.
 *
 * @return the result of {@code decodeDate(col_mod)}
 */
public Date moddate() {
    final Date modified = decodeDate(col_mod);
    return modified;
}
@ -407,7 +406,7 @@ public class URIMetadataRow implements URIMetadata {
public byte [ ] referrerHash ( ) {
// return the creator's hash or null if there is none
// FIXME: There seem to be some malformed entries in the databasees like "null\0\0\0\0\0\0\0\0"
final byte [ ] r = entry. getColBytes ( col_referrer , true ) ;
final byte [ ] r = this . entry. getColBytes ( col_referrer , true ) ;
if ( r ! = null ) {
int i = r . length ;
while ( i > 0 ) {
@ -419,11 +418,11 @@ public class URIMetadataRow implements URIMetadata {
/**
 * Returns the md5 of this entry in hex representation.
 *
 * @return the bytes of the {@code col_md5} column, hex-encoded via {@code Digest.encodeHex}
 */
public String md5() {
    // exactly one return: a second, identical return after it is unreachable and does not compile
    return Digest.encodeHex(this.entry.getColBytes(col_md5, true));
}
/**
 * Returns the doctype marker of this entry.
 *
 * @return the single byte of the {@code col_dt} column, cast to {@code char}
 */
public char doctype() {
    // exactly one return: a second, identical return after it is unreachable and does not compile
    return (char) this.entry.getColByte(col_dt);
}
public byte [ ] language ( ) {
@ -465,15 +464,15 @@ public class URIMetadataRow implements URIMetadata {
/**
 * Returns the value of the {@code col_lapp} column (number of embedded
 * links to applications), narrowed to {@code int}.
 *
 * @return the column value cast from long to int
 */
public int lapp() {
    final long appLinks = this.entry.getColLong(col_lapp);
    return (int) appLinks;
}
/**
 * Returns the snippet attached to this entry.
 * The snippet may be present if the url was transported in a remote search;
 * it is not saved anywhere and can only be requested here.
 *
 * @return the content of the {@code snippet} field, possibly {@code null}
 */
public String snippet() {
    // exactly one return: a second return after it is unreachable and does not compile
    return this.snippet;
}
/**
 * Returns the word reference attached to this entry.
 * The field is only used if the url is transported via remote search requests.
 *
 * @return the content of the {@code word} field, possibly {@code null}
 */
public WordReferenceVars word() {
    // exactly one return: a second return after it is unreachable and does not compile
    return this.word;
}
public boolean isOlder ( final URIMetadata other ) {
@ -507,20 +506,20 @@ public class URIMetadataRow implements URIMetadata {
// Builds a Request from this entry.
// The arguments passed are: the ASCII bytes of the initiator hash, the url taken from
// the parsed metadata, the referrer hash, the dc_title from the metadata, the
// modification date, a null, and a run of zero values.
// NOTE(review): metadata() is dereferenced without a null check here — confirm it
// cannot return null on this path.
public Request toBalancerEntry ( final String initiatorHash ) {
return new Request (
// NOTE(review): the next three arguments are repeated verbatim in the three lines
// that follow them; this looks like merge/diff residue — confirm against the
// parameter list of Request's constructor before removing either group.
ASCII . getBytes ( initiatorHash ) ,
metadata ( ) . url ( ) ,
referrerHash ( ) ,
ASCII . getBytes ( initiatorHash ) ,
metadata ( ) . url ( ) ,
referrerHash ( ) ,
metadata ( ) . dc_title ( ) ,
moddate ( ) ,
null ,
// NOTE(review): meaning of the trailing zeros is not visible here — verify against Request
0 ,
0 ,
0 ,
0 ,
0 ,
0 ) ;
}
/**
 * @return the object as String.<br>
 * This e.g. looks like this:
 * <pre>{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}</pre>
 */
@ -535,14 +534,14 @@ public class URIMetadataRow implements URIMetadata {
return core . toString ( ) ;
//return "{" + core + "}";
}
public class Components {
private DigestURI url ;
private String urlRaw ;
private byte [ ] urlHash ;
private final String dc_title , dc_creator , dc_subject , dc_publisher ;
private final String latlon ; // a comma-separated tuple as "<latitude>,<longitude>" where the coordinates are given as WGS84 spatial coordinates in decimal degrees
public Components (
final String urlRaw ,
final byte [ ] urlhash ,
@ -560,7 +559,7 @@ public class URIMetadataRow implements URIMetadata {
this . dc_publisher = publisher ;
this . latlon = latlon ;
}
public boolean matches ( Pattern matcher ) {
public boolean matches ( final Pattern matcher ) {
if ( this . urlRaw ! = null ) return matcher . matcher ( this . urlRaw . toLowerCase ( ) ) . matches ( ) ;
if ( this . url ! = null ) return matcher . matcher ( this . url . toNormalform ( true , true ) . toLowerCase ( ) ) . matches ( ) ;
return false ;
@ -569,7 +568,7 @@ public class URIMetadataRow implements URIMetadata {
if ( this . url = = null ) {
try {
this . url = new DigestURI ( this . urlRaw , this . urlHash ) ;
} catch ( MalformedURLException e ) {
} catch ( final MalformedURLException e ) {
this . url = null ;
}
this . urlRaw = null ;
@ -582,14 +581,14 @@ public class URIMetadataRow implements URIMetadata {
/**
 * Returns the publisher component.
 *
 * @return the content of the {@code dc_publisher} field
 */
public String dc_publisher() {
    return this.dc_publisher;
}
/**
 * Returns the subject component.
 *
 * @return the content of the {@code dc_subject} field
 */
public String dc_subject() {
    return this.dc_subject;
}
public float lat ( ) {
if ( latlon = = null | | latlon. length ( ) = = 0 ) return 0.0f ;
int p = latlon. indexOf ( ',' ) ;
return p < 0 ? 0.0f : Float . parseFloat ( latlon. substring ( 0 , p ) ) ;
if ( this . latlon = = null | | this . latlon. length ( ) = = 0 ) return 0.0f ;
final int p = this . latlon. indexOf ( ',' ) ;
return p < 0 ? 0.0f : Float . parseFloat ( this . latlon. substring ( 0 , p ) ) ;
}
public float lon ( ) {
if ( latlon = = null | | latlon. length ( ) = = 0 ) return 0.0f ;
int p = latlon. indexOf ( ',' ) ;
return p < 0 ? 0.0f : Float . parseFloat ( latlon. substring ( p + 1 ) ) ;
if ( this . latlon = = null | | this . latlon. length ( ) = = 0 ) return 0.0f ;
final int p = this . latlon. indexOf ( ',' ) ;
return p < 0 ? 0.0f : Float . parseFloat ( this . latlon. substring ( p + 1 ) ) ;
}
}
}