@ -192,7 +192,6 @@ public class plasmaSnippetCache {
try {
try {
// trying to load the resource from the cache
// trying to load the resource from the cache
resource = this . cacheManager . loadResourceContent ( url ) ;
resource = this . cacheManager . loadResourceContent ( url ) ;
docInfo = this . cacheManager . loadResourceInfo ( url ) ;
// if not found try to download it
// if not found try to download it
if ( ( resource = = null ) & & ( fetchOnline ) ) {
if ( ( resource = = null ) & & ( fetchOnline ) ) {
@ -200,23 +199,22 @@ public class plasmaSnippetCache {
plasmaHTCache . Entry entry = loadResourceFromWeb ( url , 5000 ) ;
plasmaHTCache . Entry entry = loadResourceFromWeb ( url , 5000 ) ;
// getting resource metadata (e.g. the http headers for http resources)
// getting resource metadata (e.g. the http headers for http resources)
if ( entry ! = null ) {
if ( entry ! = null ) docInfo = entry . getDocumentInfo ( ) ;
docInfo = entry . getDocumentInfo ( ) ;
}
// now the resource should be stored in the cache, load body
// read resource body
resource = this . cacheManager . loadResourceContent ( url ) ;
resource = entry . cacheArray ( ) ;
if ( resource = = null ) {
if ( resource = = null ) {
//System.out.println("cannot load document for URL " + url);
return new Snippet ( null , ERROR_RESOURCE_LOADING , "error loading resource, plasmaHTCache.Entry cache is NULL" ) ;
return new Snippet ( null , ERROR_RESOURCE_LOADING , "error loading resource from web, cacheManager returned NULL" ) ;
}
}
source = SOURCE_WEB ;
source = SOURCE_WEB ;
}
}
} catch ( Exception e ) {
} catch ( Exception e ) {
if ( ! ( e instanceof plasmaCrawlerException ) ) e . printStackTrace ( ) ;
if ( ! ( e instanceof plasmaCrawlerException ) ) e . printStackTrace ( ) ;
return new Snippet ( null , ERROR_SOURCE_LOADING , "error loading resource from web : " + e . getMessage ( ) ) ;
return new Snippet ( null , ERROR_SOURCE_LOADING , "error loading resource : " + e . getMessage ( ) ) ;
}
}
if ( resource = = null ) return new Snippet ( null , ERROR_SOURCE_LOADING , "no resource available" ) ;
/ * = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
/ * = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
* PARSING RESOURCE
* PARSING RESOURCE
* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = * /
* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = * /
@ -460,10 +458,11 @@ public class plasmaSnippetCache {
} catch ( Exception e ) {
} catch ( Exception e ) {
// ignore this. resource info loading failed
// ignore this. resource info loading failed
}
}
}
// TODO: we need a better solution here
// TODO: we need a better solution here
// encapsulate this in the crawlLoader class
// encapsulate this in the crawlLoader class
if ( url . getProtocol ( ) . startsWith ( "http" ) ) {
if ( ( docInfo = = null ) & & ( url . getProtocol ( ) . startsWith ( "http" ) ) ) {
// getting URL mimeType
// getting URL mimeType
try {
try {
httpHeader header = httpc . whead ( url , url . getHost ( ) , 10000 , null , null , this . sb . remoteProxyConfig ) ;
httpHeader header = httpc . whead ( url , url . getHost ( ) , 10000 , null , null , this . sb . remoteProxyConfig ) ;
@ -473,8 +472,6 @@ public class plasmaSnippetCache {
}
}
}
}
}
if ( docInfo = = null ) {
if ( docInfo = = null ) {
String filename = this . cacheManager . getCachePath ( url ) . getName ( ) ;
String filename = this . cacheManager . getCachePath ( url ) . getName ( ) ;
int p = filename . lastIndexOf ( '.' ) ;
int p = filename . lastIndexOf ( '.' ) ;