@ -22,16 +22,21 @@ package net.yacy.crawler.data;
import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.solr.common.SolrDocument;

import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.Html2Image;
import net.yacy.search.index.Fulltext;
import net.yacy.search.schema.CollectionSchema;
@ -56,8 +61,42 @@ public class Snapshots {
// root directory of the snapshot file hierarchy
private File storageLocation;

// in-memory index of all stored snapshots:
// host directory name -> (depth -> set of "<date>.<urlhash>" keys)
// the TreeSet sorts the keys by date to provide a recent view on the documents
private Map<String, TreeMap<Integer, TreeSet<String>>> directory;

/**
 * Initialize the snapshot store and build the in-memory directory index by
 * scanning the storage location for previously written pdf snapshots.
 * Expected layout on disk: &lt;location&gt;/&lt;host.port&gt;/&lt;depth&gt;/&lt;shard&gt;/&lt;urlhash&gt;.&lt;date&gt;.pdf
 * @param location the root directory where snapshots are stored
 */
public Snapshots(File location) {
    this.storageLocation = location;
    // scan the location to fill the directory
    this.directory = new HashMap<>();
    String[] domains = location.list();
    // File.list() returns null when the location does not exist (yet) or is not
    // a directory; guard against a NullPointerException on the first start
    if (domains == null) return;
    for (String domain: domains) {
        TreeMap<Integer, TreeSet<String>> domaindepth = new TreeMap<>();
        this.directory.put(domain, domaindepth);
        File domaindir = new File(location, domain);
        if (domaindir.isDirectory()) domainscan: for (String depth: domaindir.list()) {
            TreeSet<String> dateid = new TreeSet<>();
            Integer depthi = -1;
            try {
                depthi = Integer.parseInt(depth);
            } catch (NumberFormatException e) {
                // skip entries that are not a depth directory
                continue domainscan;
            }
            domaindepth.put(depthi, dateid);
            File sharddir = new File(domaindir, depth);
            if (sharddir.isDirectory()) for (String shard: sharddir.list()) {
                File snapshotdir = new File(sharddir, shard);
                if (snapshotdir.isDirectory()) for (String snapshotfile: snapshotdir.list()) {
                    if (snapshotfile.endsWith(".pdf")) {
                        String s = snapshotfile.substring(0, snapshotfile.length() - 4);
                        int p = s.indexOf('.');
                        assert p == 12; // file name shape: <12-char urlhash>.<date>
                        if (p > 0) {
                            // store as "<date>.<urlhash>" so the TreeSet sorts by date
                            String key = s.substring(p + 1) + '.' + s.substring(0, p);
                            dateid.add(key);
                        }
                    }
                }
            }
        }
    }
}
/**
@ -75,10 +114,14 @@ public class Snapshots {
if ( replaceOld ) {
if ( replaceOld ) {
for ( File oldPath : oldPaths ) oldPath . delete ( ) ;
for ( File oldPath : oldPaths ) oldPath . delete ( ) ;
}
}
File path = definePath ( url , "pdf" , depth , date ) ;
File path = definePath ( url , depth , date , "pdf" ) ;
path . getParentFile ( ) . mkdirs ( ) ;
path . getParentFile ( ) . mkdirs ( ) ;
boolean success = Html2Image . writeWkhtmltopdf ( url . toNormalform ( true ) , proxy , userAgent , path ) ;
boolean success = Html2Image . writeWkhtmltopdf ( url . toNormalform ( true ) , proxy , userAgent , path ) ;
return success ? path : null ;
if ( success ) {
announceStorage ( url , depth , date ) ;
return path ;
}
return null ;
}
}
/ * *
/ * *
@ -90,13 +133,91 @@ public class Snapshots {
* @param date
* @param date
* @return a file to the snapshot
* @return a file to the snapshot
* /
* /
public File definePath ( final DigestURL url , final String ext , final int depth , final Date date ) {
public File definePath ( final DigestURL url , final int depth , final Date date , final String ext ) {
String id = ASCII . String ( url . hash ( ) ) ;
String id = ASCII . String ( url . hash ( ) ) ;
String ds = GenericFormatter . SHORT_ DAY _FORMATTER. format ( date ) ;
String ds = GenericFormatter . SHORT_ MINUTE _FORMATTER. format ( date ) ;
File path = new File ( pathToShard ( url , depth ) , id + "." + ds + "." + ext ) ;
File path = new File ( pathToShard ( url , depth ) , id + "." + ds + "." + ext ) ;
return path ;
return path ;
}
}
private void announceStorage ( final DigestURL url , final int depth , final Date date ) {
String id = ASCII . String ( url . hash ( ) ) ;
String ds = GenericFormatter . SHORT_MINUTE_FORMATTER . format ( date ) ;
TreeMap < Integer , TreeSet < String > > domaindepth = this . directory . get ( pathToHostDir ( url ) ) ;
if ( domaindepth = = null ) { domaindepth = new TreeMap < Integer , TreeSet < String > > ( ) ; this . directory . put ( pathToHostDir ( url ) , domaindepth ) ; }
TreeSet < String > dateid = domaindepth . get ( depth ) ;
if ( dateid = = null ) { dateid = new TreeSet < String > ( ) ; domaindepth . put ( depth , dateid ) ; }
dateid . add ( ds + '.' + id ) ;
}
// result ordering used by select(): by generation date upwards (OLDESTFIRST),
// downwards (LATESTFIRST), or without any specific order (ANY)
public static enum Order {
    ANY, OLDESTFIRST, LATESTFIRST;
}
/ * *
* select a set of urlhashes from the snapshot directory . The selection either ordered
* by generation date ( upwards = = OLDESTFIRST or downwards = = LATESTFIRST ) or with any
* order . The result set can be selected either with a given host or a depth
* @param host selected host or null for all hosts
* @param depth selected depth or null for all depths
* @param order Order . ANY , Order . OLDESTFIRST or Order . LATESTFIRST
* @param maxcount the maximum number of hosthashes . If unlimited , submit Integer . MAX_VALUE
* @return a map of hosthashes with the associated creation date
* /
public Map < String , Date > select ( String host , Integer depth , final Order order , int maxcount ) {
TreeSet < String > dateIdResult = new TreeSet < > ( ) ;
if ( host = = null & & depth = = null ) {
loop : for ( TreeMap < Integer , TreeSet < String > > domaindepth : this . directory . values ( ) ) {
for ( TreeSet < String > keys : domaindepth . values ( ) ) {
dateIdResult . addAll ( keys ) ;
if ( order = = Order . ANY & & dateIdResult . size ( ) > = maxcount ) break loop ;
}
}
}
if ( host = = null & & depth ! = null ) {
loop : for ( TreeMap < Integer , TreeSet < String > > domaindepth : this . directory . values ( ) ) {
TreeSet < String > keys = domaindepth . get ( depth ) ;
if ( keys ! = null ) dateIdResult . addAll ( keys ) ;
if ( order = = Order . ANY & & dateIdResult . size ( ) > = maxcount ) break loop ;
}
}
if ( host ! = null & & depth = = null ) {
TreeMap < Integer , TreeSet < String > > domaindepth = this . directory . get ( pathToHostDir ( host , 80 ) ) ;
if ( domaindepth ! = null ) loop : for ( TreeSet < String > keys : domaindepth . values ( ) ) {
dateIdResult . addAll ( keys ) ;
if ( order = = Order . ANY & & dateIdResult . size ( ) > = maxcount ) break loop ;
}
}
if ( host ! = null & & depth ! = null ) {
TreeMap < Integer , TreeSet < String > > domaindepth = this . directory . get ( pathToHostDir ( host , 80 ) ) ;
if ( domaindepth ! = null ) {
TreeSet < String > keys = domaindepth . get ( depth ) ;
if ( keys ! = null ) dateIdResult . addAll ( keys ) ;
}
}
Map < String , Date > result = new HashMap < > ( ) ;
Iterator < String > i = order = = Order . LATESTFIRST ? dateIdResult . descendingIterator ( ) : dateIdResult . iterator ( ) ;
while ( i . hasNext ( ) & & result . size ( ) < maxcount ) {
String di = i . next ( ) ;
int p = di . indexOf ( '.' ) ;
assert p > = 0 ;
String d = di . substring ( 0 , p ) ;
Date date ;
try {
date = GenericFormatter . SHORT_MINUTE_FORMATTER . parse ( d ) ;
} catch ( ParseException e ) {
try {
date = GenericFormatter . SHORT_DAY_FORMATTER . parse ( d ) ;
} catch ( ParseException ee ) {
date = new Date ( ) ;
}
}
result . put ( di . substring ( p + 1 ) , date ) ;
}
return result ;
}
/**
 * get the depth to a document, helper method for definePath to determine the depth value
 * @param url
@ -157,10 +278,26 @@ public class Snapshots {
private File pathToShard ( final DigestURL url , final int depth ) {
private File pathToShard ( final DigestURL url , final int depth ) {
String id = ASCII . String ( url . hash ( ) ) ;
String id = ASCII . String ( url . hash ( ) ) ;
File pathToHostDir = new File ( storageLocation , url. getHost ( ) + "." + url . getPort ( ) ) ;
File pathToHostDir = new File ( storageLocation , pathToHostDir( url ) ) ;
File pathToDepthDir = new File ( pathToHostDir , depth < 10 ? "0" + depth : Integer . toString ( depth ) ) ;
File pathToDepthDir = new File ( pathToHostDir , pathToDepthDir ( depth ) ) ;
File pathToShard = new File ( pathToDepthDir , id. substring ( 0 , 2 ) ) ;
File pathToShard = new File ( pathToDepthDir , pathToShard( id ) ) ;
return pathToShard ;
return pathToShard ;
}
}
/**
 * Name of the per-host directory for the given url: "&lt;host&gt;.&lt;port&gt;".
 * @param url the document url
 * @return the host directory name
 */
private String pathToHostDir(final DigestURL url) {
    final String host = url.getHost();
    final int port = url.getPort();
    return pathToHostDir(host, port);
}
/**
 * Build the per-host directory name from host and port: "&lt;host&gt;.&lt;port&gt;".
 * @param host the host name
 * @param port the port number
 * @return the host directory name
 */
private String pathToHostDir(final String host, final int port) {
    return new StringBuilder(host).append('.').append(port).toString();
}
/**
 * Directory name for a crawl depth, zero-padded to two digits so that
 * lexicographic directory order equals numeric depth order.
 * @param depth the crawl depth
 * @return the depth directory name
 */
private String pathToDepthDir(final int depth) {
    if (depth < 10) {
        return "0" + depth;
    }
    return Integer.toString(depth);
}
/**
 * Shard directory name for a url hash: its first two characters. Sharding
 * limits the number of snapshot files per directory.
 * @param urlhash the url hash (assumes length &gt;= 2 — TODO confirm at callers)
 * @return the shard directory name
 */
private String pathToShard(final String urlhash) {
    final String shard = urlhash.substring(0, 2);
    return shard;
}
}
}