@@ -25,6 +25,7 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
@@ -207,7 +208,7 @@ public class Crawler_p {
            prop.putHTML("info-queue_message", "pause reason: " + queuemessage);
        }
        if (post != null && post.containsKey("terminate")) try {
            final String handle = post.get("handle", "");
            // termination of a crawl: shift the crawl from active to passive
            final CrawlProfile p = sb.crawler.getActive(handle.getBytes());
@@ -226,11 +227,11 @@ public class Crawler_p {
            prop.put("info", "3");
        } else {
            if (post.getBoolean("cleanSearchCache")) {
                // clean up all search events
                SearchEventCache.cleanupEvents(true);
                sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings
            }
            // remove crawlingFileContent before we record the call
            String crawlingFileName = post.get("crawlingFile");
@@ -267,6 +268,7 @@ public class Crawler_p {
        Set<DigestURL> rootURLs = new HashSet<DigestURL>();
        String crawlName = "";
        if (crawlingFile == null) for (String crawlingStart : rootURLs0) {
            StringBuilder crawlNameBuilder = new StringBuilder(); // for large crawl queues this can be pretty large
            if (crawlingStart == null || crawlingStart.length() == 0) continue;
            // add the prefix http:// if necessary
            int pos = crawlingStart.indexOf("://", 0);
@@ -276,14 +278,14 @@ public class Crawler_p {
            try {
                DigestURL crawlingStartURL = new DigestURL(crawlingStart);
                rootURLs.add(crawlingStartURL);
                crawlName += ((crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true) : crawlingStartURL.getHost()) + ',';
                crawlNameBuilder.append((crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true) : crawlingStartURL.getHost()).append(',');
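                // crawls that start from local file or SMB resources are not stored in the HTCache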
                if (crawlingStartURL != null && (crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false;
            } catch (final MalformedURLException e) {
                ConcurrentLog.warn("Crawler_p", "crawl start url invalid: " + e.getMessage());
            }
            crawlName = crawlNameBuilder.toString();
        } else {
            crawlName = crawlingFile.getName();
        }
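        // strip the trailing comma from the generated crawl name and shorten names that grew too long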
        if (crawlName.endsWith(",")) crawlName = crawlName.substring(0, crawlName.length() - 1);
        if (crawlName.length() > 64) {
@@ -415,13 +417,15 @@ public class Crawler_p {
        // delete all error urls for that domain
        // and all urls for that host from the crawl queue
        List<String> deleteIDs = new ArrayList<>();
        Set<String> hosthashes = new HashSet<String>();
        boolean anysmbftporpdf = false;
        for (DigestURL u : rootURLs) {
            sb.index.fulltext().remove(u.hash());
            deleteIDs.add(new String(u.hash()));
            hosthashes.add(u.hosthash());
            if ("smb.ftp".indexOf(u.getProtocol()) >= 0 || "pdf".equals(MultiProtocolURL.getFileExtension(u.getFileName()))) anysmbftporpdf = true;
        }
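        // remove the collected documents from the fulltext index and their hosts from the crawl queue in batched calls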
        sb.index.fulltext().remove(deleteIDs);
        sb.crawlQueues.removeHosts(hosthashes);
        sb.index.fulltext().commit(true);
@@ -458,8 +462,10 @@ public class Crawler_p {
        // check if the crawl filter works correctly
        try {
            Pattern mmp = Pattern.compile(newcrawlingMustMatch);
            int maxcheck = 100;
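            // verify the filter against roughly the first 100 root URLs only, so huge start lists do not slow down the crawl start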
            for (DigestURL u : rootURLs) {
                assert mmp.matcher(u.toNormalform(true)).matches() : "pattern " + mmp.toString() + " does not match url " + u.toNormalform(true);
                if (maxcheck-- <= 0) break;
            }
        } catch (final PatternSyntaxException e) {
            prop.put("info", "4"); // crawlfilter does not match url
@@ -474,7 +480,7 @@ public class Crawler_p {
            prop.put("info", "5"); // Crawling failed
            prop.putHTML("info_crawlingURL", "(no url given)");
            prop.putHTML("info_reasonString", "you must submit at least one crawl url");
            hasCrawlstartDataOK = false;
        }
    }
@@ -487,10 +493,10 @@ public class Crawler_p {
        String ignoreclassname_s = post.get("ignoreclassname");
        Set<String> ignoreclassname = new HashSet<>();
        if (ignoreclassname_s != null) {
            String[] ignoreclassname_a = ignoreclassname_s.trim().split(",");
            for (int i = 0; i < ignoreclassname_a.length; i++) {
                ignoreclassname.add(ignoreclassname_a[i].trim());
            }
        }
        // get vocabulary scraper info
@@ -528,13 +534,13 @@ public class Crawler_p {
        try {
            if (newcrawlingdepth > 0) {
                if (fullDomain) {
                    /* Crawl is restricted to start domains or sub-paths: we have to get all the start links now.
                     * Otherwise we can get them asynchronously later, thus allowing to handle more efficiently large start crawlingFiles */
                    hyperlinks_from_file = crawlingFileStart(crawlingFile, timezoneOffset, crawlingFileContent);
                    newcrawlingMustMatch = CrawlProfile.siteFilter(hyperlinks_from_file);
                } else if (subPath) {
                    /* Crawl is restricted to start domains or sub-paths: we have to get all the start links now.
                     * Otherwise we can get them asynchronously later, thus allowing to handle more efficiently large start crawlingFiles */
                    hyperlinks_from_file = crawlingFileStart(crawlingFile, timezoneOffset, crawlingFileContent);
                    newcrawlingMustMatch = CrawlProfile.subpathFilter(hyperlinks_from_file);
                }
@@ -550,50 +556,47 @@ public class Crawler_p {
        }
        /* If a solr query filter is defined, verify now its syntax and that the embedded Solr schema is available */
        final String solrQueryMustMatch = post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY).trim();
        final String solrQueryMustNotMatch = post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY).trim();
        if (!(solrQueryMustMatch.isEmpty() || CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(solrQueryMustMatch)) || !CrawlProfile.SOLR_EMPTY_QUERY.equals(solrQueryMustNotMatch)) {
            final EmbeddedInstance embeddedSolr = sb.index.fulltext().getEmbeddedInstance();
            final SolrCore embeddedCore = embeddedSolr != null ? embeddedSolr.getDefaultCore() : null;
            final boolean embeddedSolrConnected = embeddedSolr != null && embeddedCore != null;
            prop.put("noEmbeddedSolr", !embeddedSolrConnected);
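            // without a connected embedded Solr core the query filters cannot be parsed, so the crawl start is rejected below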
            if (embeddedSolrConnected) {
                if (!(solrQueryMustMatch.isEmpty() || CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(solrQueryMustMatch))) {
                    try {
                        SingleDocumentMatcher.toLuceneQuery(solrQueryMustMatch, embeddedCore);
                    } catch (final SyntaxError | SolrException e) {
                        hasCrawlstartDataOK = false;
                        prop.put("info", "10");
                        prop.put("info_solrQuery", solrQueryMustMatch);
                    } catch (final RuntimeException e) {
                        hasCrawlstartDataOK = false;
                        prop.put("info", "11");
                        prop.put("info_solrQuery", solrQueryMustMatch);
                    }
                }
                if (!CrawlProfile.SOLR_EMPTY_QUERY.equals(solrQueryMustNotMatch)) {
                    try {
                        SingleDocumentMatcher.toLuceneQuery(solrQueryMustNotMatch, embeddedCore);
                    } catch (final SyntaxError | SolrException e) {
                        hasCrawlstartDataOK = false;
                        prop.put("info", "10");
                        prop.put("info_solrQuery", solrQueryMustNotMatch);
                    } catch (final RuntimeException e) {
                        hasCrawlstartDataOK = false;
                        prop.put("info", "11");
                        prop.put("info_solrQuery", solrQueryMustNotMatch);
                    }
                }
            } else {
                hasCrawlstartDataOK = false;
                prop.put("info", "9");
            }
        }
        // prepare a new crawling profile
        final CrawlProfile profile;
@@ -632,19 +635,18 @@ public class Crawler_p {
                new VocabularyScraper(vocabulary_scraper),
                timezoneOffset);
        profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key,
                post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
        profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, post
                .get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
        profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key,
                post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
        profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, post
                .get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
        profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, solrQueryMustMatch);
        profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, solrQueryMustNotMatch);
        profile.put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key,
                post.getBoolean("crawlerAlwaysCheckMediaType"));
        handle = ASCII.getBytes(profile.handle());
@@ -659,13 +661,11 @@ public class Crawler_p {
            handle = null;
        }
        // start the crawl
        if (hasCrawlstartDataOK) {
            final boolean wontReceiptRemoteRsults = crawlOrder && !sb.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false);
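            // remember whether a remote crawl order was requested while the remote crawl job is disabled locally; the flag is reported back to the user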
            if ("url".equals(crawlingMode)) {
                // stack requests
                sb.crawler.putActive(handle, profile);
                final Set<DigestURL> successurls = new HashSet<DigestURL>();
@@ -709,59 +709,58 @@ public class Crawler_p {
                    prop.putHTML("info_reasonString", fr.toString());
                }
                if (successurls.size() > 0) {
                    sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                    prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults);
                }
            } else if ("sitemap".equals(crawlingMode)) {
                try {
                    final DigestURL sitemapURL = sitemapURLStr.indexOf("//") > 0 ? new DigestURL(sitemapURLStr) : new DigestURL(rootURLs.iterator().next(), sitemapURLStr); // fix for relative paths which should not exist but are used anyway
                    sb.crawler.putActive(handle, profile);
                    final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, profile);
                    importer.start();
                    sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                    prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults);
                } catch (final Exception e) {
                    // mist
                    prop.put("info", "6"); // Error with url
                    prop.putHTML("info_crawlingStart", sitemapURLStr);
                    prop.putHTML("info_error", e.getMessage());
                    ConcurrentLog.logException(e);
                }
            } else if ("file".equals(crawlingMode)) {
                if (post.containsKey("crawlingFile") && crawlingFile != null) {
                    try {
                        if (newcrawlingdepth > 0 && (fullDomain || subPath)) {
                            /* All links must have already been loaded because they are the part of the newcrawlingMustMatch filter */
                            if (hyperlinks_from_file != null) {
                                sb.crawler.putActive(handle, profile);
                                sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks_from_file, profile.timezoneOffset());
                            }
                        } else {
                            /* No restriction on domains or subpath : we scrape now links and asynchronously push them to the crawlStacker */
                            final String crawlingFileContent = post.get("crawlingFile$file", "");
                            final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000,
                                    new HashSet<String>(), new VocabularyScraper(), profile.timezoneOffset());
                            FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile,
                                    sb.crawlStacker, sb.peers.mySeed().hash.getBytes());
                            sb.crawler.putActive(handle, profile);
                            crawlStarterTask.start();
                        }
                    } catch (final PatternSyntaxException e) {
                        prop.put("info", "4"); // crawlfilter does not match url
                        prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                        prop.putHTML("info_error", e.getMessage());
                    } catch (final Exception e) {
                        // mist
                        prop.put("info", "7"); // Error with file
                        prop.putHTML("info_crawlingStart", crawlingFileName);
                        prop.putHTML("info_error", e.getMessage());
                        ConcurrentLog.logException(e);
                    }
                    sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
                    prop.put("wontReceiptRemoteResults", wontReceiptRemoteRsults);
                }
            }
}
}
}
@@ -826,7 +825,7 @@ public class Crawler_p {
        String hosts = "";
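        // list the active crawl profiles, skipping the built-in default profiles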
        for (final byte[] h : sb.crawler.getActive()) {
            profile = sb.crawler.getActive(h);
            if (CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) continue;
            profile.putProfileEntry("crawlProfilesShow_list_", prop, true, dark, count, domlistlength);
            prop.put("crawlProfilesShow_list_" + count + "_debug", debug ? 1 : 0);
            if (debug) {
@@ -877,47 +876,47 @@ public class Crawler_p {
     * @throws IOException
     * @throws FileNotFoundException
     */
    private static List<AnchorURL> crawlingFileStart(final File crawlingFile, int timezoneOffset,
            final String crawlingFileContent) throws MalformedURLException, IOException, FileNotFoundException {
        List<AnchorURL> hyperlinks_from_file;
        // check if the crawl filter works correctly
        final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, new HashSet<String>(), new VocabularyScraper(), timezoneOffset);
        final Writer writer = new TransformerWriter(null, null, scraper, false);
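        // everything written to the TransformerWriter is piped through the ContentScraper so that its anchors can be collected below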
        if ((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) {
            /* Let's report here detailed error to help user when he selected a wrong file */
            if (!crawlingFile.exists()) {
                throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " does not exists");
            }
            if (!crawlingFile.isFile()) {
                throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " exists but is not a regular file");
            }
            if (!crawlingFile.canRead()) {
                throw new IOException("Can not read : " + crawlingFile.getAbsolutePath());
            }
        }
        if (crawlingFile != null) {
            FileInputStream inStream = null;
            try {
                inStream = new FileInputStream(crawlingFile);
                FileUtils.copy(inStream, writer);
            } finally {
                if (inStream != null) {
                    try {
                        inStream.close();
                    } catch (IOException ignoredException) {
                        ConcurrentLog.info("Crawler_p", "Could not close crawlingFile : " + crawlingFile.getAbsolutePath());
                    }
                }
            }
        } else {
            FileUtils.copy(crawlingFileContent, writer);
        }
        writer.close();
        // get links and generate filter
        hyperlinks_from_file = scraper.getAnchors();
        return hyperlinks_from_file;
    }
    private static Date timeParser(final boolean recrawlIfOlderCheck, final int number, final String unit) {
        if (!recrawlIfOlderCheck) return null;