@ -61,14 +61,11 @@ import java.util.HashSet;
import java.util.Iterator ;
import java.util.LinkedList ;
import java.util.Locale ;
import java.util.Properties ;
import de.anomic.http.httpc ;
import de.anomic.http.httpc.response ;
import de.anomic.index.indexEntry ;
import de.anomic.index.indexURL ;
import de.anomic.index.indexURLEntry ;
import de.anomic.kelondro.kelondroBase64Order ;
import de.anomic.kelondro.kelondroNaturalOrder ;
import de.anomic.kelondro.kelondroRAMIndex ;
import de.anomic.kelondro.kelondroRow ;
@ -78,7 +75,6 @@ import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCodings ;
import de.anomic.server.serverObjects ;
import de.anomic.server.logging.serverLog ;
import de.anomic.tools.crypt ;
import de.anomic.tools.nxTools ;
import de.anomic.yacy.yacyCore ;
import de.anomic.yacy.yacySeed ;
@ -95,31 +91,15 @@ public final class plasmaCrawlLURL extends indexURL {
private final LinkedList lcrawlResultStack ; // 5 - local index: result of local crawling
private final LinkedList gcrawlResultStack ; // 6 - local index: triggered external
//public static Set damagedURLS = Collections.synchronizedSet(new HashSet());
public plasmaCrawlLURL ( File cachePath , int bufferkb , long preloadTime , boolean newdb ) {
super ( ) ;
kelondroRow rowdef = new kelondroRow (
"String urlhash-" + urlHashLength + ", " + // the url's hash
"String urlstring-" + urlStringLength + ", " + // the url as string
"String urldescr-" + urlDescrLength + ", " + // the description of the url
"Cardinal moddate-" + urlDateLength + " {b64e}, " + // last-modified from the httpd
"Cardinal loaddate-" + urlDateLength + " {b64e}, " + // time when the url was loaded
"String refhash-" + urlHashLength + ", " + // the url's referrer hash
"Cardinal copycount-" + urlCopyCountLength + " {b64e}, " + //
"byte[] flags-" + urlFlagLength + ", " + // flags
"Cardinal quality-" + urlQualityLength + " {b64e}, " + //
"String language-" + urlLanguageLength + ", " + //
"byte[] doctype-" + urlDoctypeLength + ", " + //
"Cardinal size-" + urlSizeLength + " {b64e}, " + // size of file in bytes
"Cardinal wc-" + urlWordCountLength + " {b64e}" ) ; // word count
File cacheFile = new File ( cachePath , "urlHash.db" ) ;
cacheFile . getParentFile ( ) . mkdirs ( ) ;
try {
urlIndexFile = new kelondroTree ( cacheFile , bufferkb * 0x400 , preloadTime , kelondroTree . defaultObjectCachePercent , rowdef) ;
urlIndexCache = new kelondroRAMIndex ( kelondroNaturalOrder . naturalOrder , rowdef) ;
urlIndexFile = new kelondroTree ( cacheFile , bufferkb * 0x400 , preloadTime , kelondroTree . defaultObjectCachePercent , plasmaCrawlLURLOldEntry . rowdef ) ;
urlIndexCache = new kelondroRAMIndex ( kelondroNaturalOrder . naturalOrder , plasmaCrawlLURLOldEntry . rowdef ) ;
} catch ( IOException e ) {
e . printStackTrace ( ) ;
System . exit ( - 1 ) ;
@ -134,19 +114,19 @@ public final class plasmaCrawlLURL extends indexURL {
gcrawlResultStack = new LinkedList ( ) ;
public synchronized void stack ( Entry e , String initiatorHash , String executorHash , int stackType ) {
public synchronized void stack ( plasmaCrawlLURL Entry e , String initiatorHash , String executorHash , int stackType ) {
if ( e = = null ) { return ; }
try {
if ( initiatorHash = = null ) { initiatorHash = dummyHash ; }
if ( executorHash = = null ) { executorHash = dummyHash ; }
switch ( stackType ) {
case 0 : break ;
case 1 : externResultStack . add ( e . urlHash + initiatorHash + executorHash ) ; break ;
case 2 : searchResultStack . add ( e . urlHash + initiatorHash + executorHash ) ; break ;
case 3 : transfResultStack . add ( e . urlHash + initiatorHash + executorHash ) ; break ;
case 4 : proxyResultStack . add ( e . urlHash + initiatorHash + executorHash ) ; break ;
case 5 : lcrawlResultStack . add ( e . urlHash + initiatorHash + executorHash ) ; break ;
case 6 : gcrawlResultStack . add ( e . urlHash + initiatorHash + executorHash ) ; break ;
case 1 : externResultStack . add ( e . hash( ) + initiatorHash + executorHash ) ; break ;
case 2 : searchResultStack . add ( e . hash( ) + initiatorHash + executorHash ) ; break ;
case 3 : transfResultStack . add ( e . hash( ) + initiatorHash + executorHash ) ; break ;
case 4 : proxyResultStack . add ( e . hash( ) + initiatorHash + executorHash ) ; break ;
case 5 : lcrawlResultStack . add ( e . hash( ) + initiatorHash + executorHash ) ; break ;
case 6 : gcrawlResultStack . add ( e . hash( ) + initiatorHash + executorHash ) ; break ;
return ;
} catch ( Exception ex ) {
@ -159,7 +139,7 @@ public final class plasmaCrawlLURL extends indexURL {
gcrawlResultStack . add ( urlHash + initiatorHash + executorHash ) ;
public Entry load ( String urlHash , indexEntry searchedWord ) {
public plasmaCrawlLURL Entry load ( String urlHash , indexEntry searchedWord ) {
// generates a plasmaLURLEntry using the url hash
// to speed up the access, the url-hashes are buffered
// in the hash cache.
@ -171,19 +151,18 @@ public final class plasmaCrawlLURL extends indexURL {
try {
if ( entry = = null ) entry = urlIndexFile . get ( urlHash . getBytes ( ) ) ;
if ( entry = = null ) return null ;
return new Entry( entry , searchedWord ) ;
return new plasmaCrawlLURLOld Entry( entry , searchedWord ) ;
} catch ( IOException e ) {
return null ;
public void store ( Entry entry , boolean cached ) throws IOException {
public void store ( plasmaCrawlLURL Entry entry , boolean cached ) throws IOException {
// Check if there is a more recent Entry already in the DB
if ( entry . stored ) return ;
Entry oldEntry ;
plasmaCrawlLURLEntry oldEntry ;
try {
if ( exists ( entry . urlHash ) ) {
oldEntry = load ( entry . urlHash , null ) ;
if ( exists ( entry . hash( ) ) ) {
oldEntry = load ( entry . hash( ) , null ) ;
} else {
oldEntry = null ;
@ -194,40 +173,32 @@ public final class plasmaCrawlLURL extends indexURL {
// the fetched oldEntry is better, so return its properties instead of the new ones
// this.urlHash = oldEntry.urlHash; // unnecessary, should be the same
// this.url = oldEntry.url; // unnecessary, should be the same
entry . descr = oldEntry . descr ;
entry . moddate = oldEntry . moddate ;
entry . loaddate = oldEntry . loaddate ;
entry . referrerHash = oldEntry . referrerHash ;
entry . copyCount = oldEntry . copyCount ;
entry . flags = oldEntry . flags ;
entry . quality = oldEntry . quality ;
entry . language = oldEntry . language ;
entry . doctype = oldEntry . doctype ;
entry . size = oldEntry . size ;
entry . wordCount = oldEntry . wordCount ;
// this.snippet // not read from db
// this.word // not read from db
entry . stored = true ;
entry = oldEntry ;
return ; // this did not need to be stored, but is updated
super . store ( entry . toRowEntry ( ) , cached ) ;
entry . stored = true ;
if ( ( cached ) & & ( urlIndexCache ! = null ) ) {
synchronized ( urlIndexCache ) {
urlIndexCache . put ( entry . toRowEntry ( ) ) ;
} else {
urlIndexFile . put ( entry . toRowEntry ( ) , entry . loaddate ( ) ) ;
// Creates an entry from its property-list string form.
// The string must be wrapped in curly braces ("{...}"); the braces are cut off and
// serverCodings.s2p turns the inner text into the properties handed to the entry constructor,
// together with the setGlobal switch.
// Returns null when propStr is not brace-delimited.
// NOTE(review): a null propStr is not guarded and would fail on startsWith - confirm callers never pass null.
public synchronized Entry newEntry ( String propStr , boolean setGlobal ) {
public synchronized plasmaCrawlLURLEntry newEntry ( String propStr , boolean setGlobal ) {
if ( propStr . startsWith ( "{" ) & & propStr . endsWith ( "}" ) ) {
return new Entry ( serverCodings . s2p ( propStr . substring ( 1 , propStr . length ( ) - 1 ) ) , setGlobal ) ;
return new plasmaCrawlLURLOld Entry( serverCodings . s2p ( propStr . substring ( 1 , propStr . length ( ) - 1 ) ) , setGlobal ) ;
} else {
return null ;
// Creates an entry from individual attribute values.
// All twelve arguments are passed through unchanged, in the same order, to the entry constructor;
// this method only constructs and returns the object, it does not write it to the index.
public synchronized Entry newEntry ( URL url , String descr , Date moddate , Date loaddate ,
public synchronized plasmaCrawlLURL Entry newEntry ( URL url , String descr , Date moddate , Date loaddate ,
String referrerHash , int copyCount , boolean localNeed ,
int quality , String language , char doctype ,
int size , int wordCount ) {
Entry e = new Entry( url , descr , moddate , loaddate , referrerHash , copyCount , localNeed , quality , language , doctype , size , wordCount ) ;
plasmaCrawlLURL Entry e = new plasmaCrawlLURLOld Entry( url , descr , moddate , loaddate , referrerHash , copyCount , localNeed , quality , language , doctype , size , wordCount ) ;
return e ;
@ -365,7 +336,7 @@ public final class plasmaCrawlLURL extends indexURL {
String urlHash , initiatorHash , executorHash ;
String cachepath , urlstr , urltxt ;
yacySeed initiatorSeed , executorSeed ;
plasmaCrawlLURL . Entry urle ;
plasmaCrawlLURL Entry urle ;
URL url ;
// needed for getCachePath(url)
@ -412,317 +383,6 @@ public final class plasmaCrawlLURL extends indexURL {
return prop ;
public class Entry {
private URL url ;
private String descr ;
private Date moddate ;
private Date loaddate ;
private String urlHash ;
private String referrerHash ;
private int copyCount ;
private String flags ;
private int quality ;
private String language ;
private char doctype ;
private int size ;
private int wordCount ;
private String snippet ;
private indexEntry word ; // this is only used if the url is transported via remote search requests
private boolean stored ;
// more needed attributes:
// - author / copyright owner
// - keywords
// - phrasecount, total number of phrases
// - boolean: URL attributes (see Word-Entity definition)
// - boolean: appearance of bold and/or italics
// - ETag: for re-crawl decision upon HEAD request
// - int: # of outlinks to same domain
// - int: # of outlinks to outside domain
// - int: # of keywords
// - int: # der auf der Seite vorhandenen Links zu image, audio, video, applications
public Entry ( URL url , String descr , Date moddate , Date loaddate , String referrerHash , int copyCount , boolean localNeed , int quality , String language , char doctype , int size , int wordCount ) {
// create new entry and store it into database
this . urlHash = urlHash ( url ) ;
this . url = url ;
this . descr = ( descr = = null ) ? this . url . toString ( ) : descr ;
this . moddate = moddate ;
this . loaddate = loaddate ;
this . referrerHash = ( referrerHash = = null ) ? dummyHash : referrerHash ;
this . copyCount = copyCount ; // the number of remote (global) copies of this object without this one
this . flags = ( localNeed ) ? "L " : " " ;
this . quality = quality ;
this . language = ( language = = null ) ? "uk" : language ;
this . doctype = doctype ;
this . size = size ;
this . wordCount = wordCount ;
this . snippet = null ;
this . word = null ;
this . stored = false ;
public Entry ( kelondroRow . Entry entry , indexEntry searchedWord ) throws IOException {
try {
this . urlHash = entry . getColString ( 0 , null ) ;
this . url = new URL ( entry . getColString ( 1 , "UTF-8" ) . trim ( ) ) ;
this . descr = ( entry . empty ( 2 ) ) ? this . url . toString ( ) : entry . getColString ( 2 , "UTF-8" ) . trim ( ) ;
this . moddate = new Date ( 86400000 * entry . getColLong ( 3 ) ) ;
this . loaddate = new Date ( 86400000 * entry . getColLong ( 4 ) ) ;
this . referrerHash = ( entry . empty ( 5 ) ) ? dummyHash : entry . getColString ( 5 , "UTF-8" ) ;
this . copyCount = ( int ) entry . getColLong ( 6 ) ;
this . flags = entry . getColString ( 7 , "UTF-8" ) ;
this . quality = ( int ) entry . getColLong ( 8 ) ;
this . language = entry . getColString ( 9 , "UTF-8" ) ;
this . doctype = ( char ) entry . getColByte ( 10 ) ;
this . size = ( int ) entry . getColLong ( 11 ) ;
this . wordCount = ( int ) entry . getColLong ( 12 ) ;
this . snippet = null ;
this . word = searchedWord ;
this . stored = false ;
return ;
} catch ( Exception e ) {
serverLog . logSevere ( "PLASMA" , "INTERNAL ERROR in plasmaLURL.entry/1: " + e . toString ( ) , e ) ;
throw new IOException ( "plasmaLURL.entry/1: " + e . toString ( ) ) ;
public Entry ( Properties prop , boolean setGlobal ) {
// generates an plasmaLURLEntry using the properties from the argument
// the property names must correspond to the one from toString
//System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
this . urlHash = prop . getProperty ( "hash" , dummyHash ) ;
try {
//byte[][] entry = urlHashCache.get(urlHash.getBytes());
//if (entry == null) {
this . referrerHash = prop . getProperty ( "referrer" , dummyHash ) ;
this . moddate = shortDayFormatter . parse ( prop . getProperty ( "mod" , "20000101" ) ) ;
//System.out.println("DEBUG: moddate = " + moddate + ", prop=" + prop.getProperty("mod"));
this . loaddate = shortDayFormatter . parse ( prop . getProperty ( "load" , "20000101" ) ) ;
this . copyCount = Integer . parseInt ( prop . getProperty ( "cc" , "0" ) ) ;
this . flags = ( ( prop . getProperty ( "local" , "true" ) . equals ( "true" ) ) ? "L " : " " ) ;
if ( setGlobal ) this . flags = "G " ;
this . url = new URL ( crypt . simpleDecode ( prop . getProperty ( "url" , "" ) , null ) ) ;
this . descr = crypt . simpleDecode ( prop . getProperty ( "descr" , "" ) , null ) ;
if ( this . descr = = null ) this . descr = this . url . toString ( ) ;
this . quality = ( int ) kelondroBase64Order . enhancedCoder . decodeLong ( prop . getProperty ( "q" , "" ) ) ;
this . language = prop . getProperty ( "lang" , "uk" ) ;
this . doctype = prop . getProperty ( "dt" , "t" ) . charAt ( 0 ) ;
this . size = Integer . parseInt ( prop . getProperty ( "size" , "0" ) ) ;
this . wordCount = Integer . parseInt ( prop . getProperty ( "wc" , "0" ) ) ;
this . snippet = prop . getProperty ( "snippet" , "" ) ;
if ( snippet . length ( ) = = 0 ) snippet = null ; else snippet = crypt . simpleDecode ( snippet , null ) ;
this . word = ( prop . containsKey ( "word" ) ) ? new indexURLEntry ( kelondroBase64Order . enhancedCoder . decodeString ( prop . getProperty ( "word" , "" ) ) ) : null ;
this . stored = false ;
} catch ( Exception e ) {
serverLog . logSevere ( "PLASMA" , "INTERNAL ERROR in plasmaLURL.entry/2:" +
"\nProperties: " + ( ( prop = = null ) ? null : prop . toString ( ) ) +
( ( prop . containsKey ( "word" ) ) ? "\nWord: " + kelondroBase64Order . enhancedCoder . decodeString ( prop . getProperty ( "word" , "" ) ) : "" ) +
"\nErrorMsg: " + e . toString ( ) , e ) ;
public kelondroRow . Entry toRowEntry ( ) throws IOException {
final String moddatestr = kelondroBase64Order . enhancedCoder . encodeLong ( moddate . getTime ( ) / 86400000 , urlDateLength ) ;
final String loaddatestr = kelondroBase64Order . enhancedCoder . encodeLong ( loaddate . getTime ( ) / 86400000 , urlDateLength ) ;
final byte [ ] [ ] entry = new byte [ ] [ ] {
urlHash . getBytes ( ) ,
url . toString ( ) . getBytes ( ) ,
descr . getBytes ( ) , // null?
moddatestr . getBytes ( ) ,
loaddatestr . getBytes ( ) ,
referrerHash . getBytes ( ) ,
kelondroBase64Order . enhancedCoder . encodeLong ( copyCount , urlCopyCountLength ) . getBytes ( ) ,
flags . getBytes ( ) ,
kelondroBase64Order . enhancedCoder . encodeLong ( quality , urlQualityLength ) . getBytes ( ) ,
language . getBytes ( ) ,
new byte [ ] { ( byte ) doctype } ,
kelondroBase64Order . enhancedCoder . encodeLong ( size , urlSizeLength ) . getBytes ( ) ,
kelondroBase64Order . enhancedCoder . encodeLong ( wordCount , urlWordCountLength ) . getBytes ( ) ,
} ;
return urlIndexFile . row ( ) . newEntry ( entry ) ;
public String hash ( ) {
// return a url-hash, based on the md5 algorithm
// the result is a String of 12 bytes within a 72-bit space
// (each byte has an 6-bit range)
// that should be enough for all web pages on the world
return this . urlHash ;
public URL url ( ) {
return url ;
public String descr ( ) {
return descr ;
public Date moddate ( ) {
return moddate ;
public Date loaddate ( ) {
return loaddate ;
public String referrerHash ( ) {
// return the creator's hash
return referrerHash ;
public char doctype ( ) {
return doctype ;
public int copyCount ( ) {
// return number of copies of this object in the global index
return copyCount ;
public boolean local ( ) {
// returns true if the url was created locally and is needed for own word index
if ( flags = = null ) return false ;
return flags . charAt ( 0 ) = = 'L' ;
public int quality ( ) {
return quality ;
public String language ( ) {
return language ;
public int size ( ) {
return size ;
public int wordCount ( ) {
return wordCount ;
public String snippet ( ) {
// the snippet may appear here if the url was transported in a remote search
// it will not be saved anywhere, but can only be requested here
return snippet ;
public indexEntry word ( ) {
return word ;
public boolean isOlder ( Entry other ) {
if ( other = = null ) return false ;
if ( moddate . before ( other . moddate ( ) ) ) return true ;
if ( moddate . equals ( other . moddate ( ) ) ) {
if ( loaddate . before ( other . loaddate ( ) ) ) return true ;
if ( loaddate . equals ( other . loaddate ( ) ) ) {
if ( quality < other . quality ( ) ) return true ;
return false ;
private StringBuffer corePropList ( ) {
// generate a parseable string; this is a simple property-list
final StringBuffer corePropStr = new StringBuffer ( 300 ) ;
try {
. append ( "hash=" ) . append ( urlHash )
. append ( ",referrer=" ) . append ( referrerHash )
. append ( ",mod=" ) . append ( shortDayFormatter . format ( moddate ) )
. append ( ",load=" ) . append ( shortDayFormatter . format ( loaddate ) )
. append ( ",size=" ) . append ( size )
. append ( ",wc=" ) . append ( wordCount )
. append ( ",cc=" ) . append ( copyCount )
. append ( ",local=" ) . append ( ( ( local ( ) ) ? "true" : "false" ) )
. append ( ",q=" ) . append ( kelondroBase64Order . enhancedCoder . encodeLong ( quality , urlQualityLength ) )
. append ( ",dt=" ) . append ( doctype )
. append ( ",lang=" ) . append ( language )
. append ( ",url=" ) . append ( crypt . simpleEncode ( url . toString ( ) ) )
. append ( ",descr=" ) . append ( crypt . simpleEncode ( descr ) ) ;
if ( this . word ! = null ) {
// append also word properties
corePropStr . append ( ",word=" ) . append ( kelondroBase64Order . enhancedCoder . encodeString ( word . toPropertyForm ( false ) ) ) ;
return corePropStr ;
} catch ( Exception e ) {
// serverLog.logFailure("plasmaLURL.corePropList", e.getMessage());
// if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null");
// if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null");
// e.printStackTrace();
return null ;
/ *
public String toString ( int posintext , int posinphrase , int posofphrase ) {
// add information needed for remote transport
final StringBuffer core = corePropList ( ) ;
if ( core = = null ) return null ;
core . ensureCapacity ( core . length ( ) + 200 ) ;
core . insert ( 0 , "{" )
. append ( ",posintext=" ) . append ( posintext )
. append ( ",posinphrase=" ) . append ( posinphrase )
. append ( ",posofphraseint=" ) . append ( posofphrase )
. append ( "}" ) ;
return core . toString ( ) ;
* /
public String toString ( String snippet ) {
// add information needed for remote transport
final StringBuffer core = corePropList ( ) ;
if ( core = = null ) return null ;
core . ensureCapacity ( core . length ( ) + snippet . length ( ) * 2 ) ;
core . insert ( 0 , "{" ) ;
core . append ( ",snippet=" ) . append ( crypt . simpleEncode ( snippet ) ) ;
core . append ( "}" ) ;
return core . toString ( ) ;
//return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}";
/ * *
* Returns this object as String . < br >
* This e . g . looks like this :
* < pre > { hash = jmqfMk7Y3NKw , referrer = - - - - - - - - - - - - , mod = 20050610 , load = 20051003 , size = 51666 , wc = 1392 , cc = 0 , local = true , q = AEn , dt = h , lang = uk , url = b | aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv , descr = b | S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz } < / pre >
* /
public String toString ( ) {
final StringBuffer core = corePropList ( ) ;
if ( core = = null ) return null ;
core . insert ( 0 , "{" ) ;
core . append ( "}" ) ;
return core . toString ( ) ;
//return "{" + core + "}";
public void print ( ) {
System . out . println ( "URL : " + url ) ;
System . out . println ( "Description : " + descr ) ;
System . out . println ( "Modified : " + httpc . dateString ( moddate ) ) ;
System . out . println ( "Loaded : " + httpc . dateString ( loaddate ) ) ;
System . out . println ( "Size : " + size + " bytes, " + wordCount + " words" ) ;
System . out . println ( "Referrer Hash : " + referrerHash ) ;
System . out . println ( "Quality : " + quality ) ;
System . out . println ( "Language : " + language ) ;
System . out . println ( "DocType : " + doctype ) ;
System . out . println ( ) ;
} // class Entry
public class kiter implements Iterator {
// enumerates entry elements
Iterator i ;
@ -742,7 +402,7 @@ public final class plasmaCrawlLURL extends indexURL {
kelondroRow . Entry e = ( kelondroRow . Entry ) i . next ( ) ;
if ( e = = null ) return null ;
try {
return new Entry( e , null ) ;
return new plasmaCrawlLURLOld Entry( e , null ) ;
} catch ( IOException ex ) {
throw new RuntimeException ( "error '" + ex . getMessage ( ) + "' for hash " + e . getColString ( 0 , null ) ) ;
@ -873,7 +533,7 @@ public final class plasmaCrawlLURL extends indexURL {
plasmaCrawlLURL . Entry entry = ( plasmaCrawlLURL . Entry) eiter . next ( ) ;
plasmaCrawlLURL Entry entry = ( plasmaCrawlLURL Entry) eiter . next ( ) ;
totalSearchedUrls + + ;
if ( plasmaSwitchboard . urlBlacklist . isListed ( plasmaURLPattern . BLACKLIST_CRAWLER , entry . url ( ) ) | |
plasmaSwitchboard . urlBlacklist . isListed ( plasmaURLPattern . BLACKLIST_DHT , entry . url ( ) ) ) {
@ -944,7 +604,7 @@ public final class plasmaCrawlLURL extends indexURL {
final plasmaCrawlLURL urls = new plasmaCrawlLURL ( new File ( args [ 1 ] ) , 1 , 0 , false ) ;
final Iterator enu = urls . entries ( true , false , null ) ;
while ( enu . hasNext ( ) ) {
( ( Entry) enu . next ( ) ) . print ( ) ;
( ( plasmaCrawlLURL Entry) enu . next ( ) ) . print ( ) ;
} catch ( Exception e ) {
e . printStackTrace ( ) ;