@@ -43,29 +43,13 @@
// javac -classpath .:../classes IndexCreate_p.java
// if the shell's current path is HTROOT
import java.io.File;
import java.io.IOException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import de.anomic.data.wikiCode;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.plasma.plasmaURL;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.serverThread;
@@ -81,230 +65,10 @@ public class IndexCreate_p {
        plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
        serverObjects prop = new serverObjects();
        prop.put("error", 0);
        prop.put("info", 0);
        prop.put("refreshbutton", 0);
        if (post != null) {
            if (post.containsKey("crawlingstart")) {
                // init crawl
                if (yacyCore.seedDB == null) {
                    prop.put("error", 3);
                } else {
                    // set new properties
                    String newcrawlingfilter = post.get("crawlingFilter", ".*");
                    env.setConfig("crawlingFilter", newcrawlingfilter);
                    int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "0"));
                    env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
                    boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on");
                    int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1"));
                    String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit", "year");
                    int crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
                    env.setConfig("crawlingIfOlder", crawlingIfOlder);
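                    // Note: crawlingIfOlder is the re-crawl age threshold in minutes, as computed
                    // by recrawlIfOlderC at the end of this file; -1 means "no re-crawl".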
                    boolean crawlingDomFilterCheck = post.get("crawlingDomFilterCheck", "off").equals("on");
                    int crawlingDomFilterDepth = (crawlingDomFilterCheck) ? Integer.parseInt(post.get("crawlingDomFilterDepth", "-1")) : -1;
                    env.setConfig("crawlingDomFilterDepth", Integer.toString(crawlingDomFilterDepth));
                    boolean crawlingDomMaxCheck = post.get("crawlingDomMaxCheck", "off").equals("on");
                    int crawlingDomMaxPages = (crawlingDomMaxCheck) ? Integer.parseInt(post.get("crawlingDomMaxPages", "-1")) : -1;
                    env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages));
                    boolean crawlingQ = post.get("crawlingQ", "off").equals("on");
                    env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false");
                    boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
                    env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
                    boolean localIndexing = post.get("localIndexing", "off").equals("on");
                    env.setConfig("localIndexing", (localIndexing) ? "true" : "false");
                    boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
                    env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
                    boolean xsstopw = post.get("xsstopw", "off").equals("on");
                    env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
                    boolean xdstopw = post.get("xdstopw", "off").equals("on");
                    env.setConfig("xdstopw", (xdstopw) ? "true" : "false");
                    boolean xpstopw = post.get("xpstopw", "off").equals("on");
                    env.setConfig("xpstopw", (xpstopw) ? "true" : "false");
                    String crawlingMode = post.get("crawlingMode", "url");
                    if (crawlingMode.equals("url")) {
                        // getting the crawljob start url
                        String crawlingStart = post.get("crawlingURL", "");
                        crawlingStart = crawlingStart.trim();
                        // adding the prefix http:// if necessary
                        int pos = crawlingStart.indexOf("://");
                        if (pos == -1) crawlingStart = "http://" + crawlingStart;
                        // normalizing URL
                        try { crawlingStart = new URL(crawlingStart).toNormalform(); } catch (MalformedURLException e1) { }
                        // check if url is proper
                        URL crawlingStartURL = null;
                        try {
                            crawlingStartURL = new URL(crawlingStart);
                        } catch (MalformedURLException e) {
                            crawlingStartURL = null;
                        }
                        // check if pattern matches
                        if ((crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
                            // print error message
                            prop.put("error", 4); // crawl filter does not match url
                            prop.put("error_newcrawlingfilter", newcrawlingfilter);
                            prop.put("error_crawlingStart", crawlingStart);
                        } else try {
                            // check if the crawl filter works correctly
                            Pattern.compile(newcrawlingfilter);
                            // stack request
                            // first delete old entry, if exists
                            String urlhash = plasmaURL.urlHash(crawlingStart);
                            switchboard.wordIndex.loadedURL.remove(urlhash);
                            switchboard.noticeURL.remove(urlhash);
                            switchboard.errorURL.remove(urlhash);
                            // stack url
                            plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
                            String reasonString = switchboard.sbStackCrawlThread.stackCrawl(crawlingStart, null, yacyCore.seedDB.mySeed.hash, "CRAWLING-ROOT", new Date(), 0, pe);
                            if (reasonString == null) {
                                // liftoff!
                                prop.put("info", 2); // start msg
                                prop.put("info_crawlingURL", ((String) post.get("crawlingURL")));
                                // generate a YaCyNews if the global flag was set
                                if (crawlOrder) {
                                    Map m = new HashMap(pe.map()); // must be cloned
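                                    // Assumption: the keys removed below are peer-local settings
                                    // (indexing and caching flags, stopword switches, filters) that are
                                    // stripped from the cloned profile map before it is broadcast to other peers.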
                                    m.remove("specificDepth");
                                    m.remove("localIndexing");
                                    m.remove("remoteIndexing");
                                    m.remove("xsstopw");
                                    m.remove("xpstopw");
                                    m.remove("xdstopw");
                                    m.remove("storeTXCache");
                                    m.remove("storeHTCache");
                                    m.remove("generalFilter");
                                    m.remove("specificFilter");
                                    m.put("intention", post.get("intention", "").replace(',', '/'));
                                    yacyCore.newsPool.publishMyNews(new yacyNewsRecord("crwlstrt", m));
                                }
                            } else {
                                prop.put("error", 5); // crawling failed
                                prop.put("error_crawlingURL", wikiCode.replaceHTML(((String) post.get("crawlingURL"))));
                                prop.put("error_reasonString", reasonString);
                                plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
                                        crawlingStartURL.getHost(), reasonString, new kelondroBitfield());
                                ee.store();
                                switchboard.errorURL.stackPushEntry(ee);
                            }
                        } catch (PatternSyntaxException e) {
                            prop.put("error", 8); // invalid crawl filter pattern
                            prop.put("error_newcrawlingfilter", newcrawlingfilter);
                            prop.put("error_error", e.getMessage());
                        } catch (Exception e) {
                            // something went wrong
                            prop.put("error", 6); // error with url
                            prop.put("error_crawlingStart", crawlingStart);
                            prop.put("error_error", e.getMessage());
                            e.printStackTrace();
                        }
                    } else if (crawlingMode.equals("file")) {
                        if (post.containsKey("crawlingFile")) {
                            // getting the name of the uploaded file
                            String fileName = (String) post.get("crawlingFile");
                            try {
                                // check if the crawl filter works correctly
                                Pattern.compile(newcrawlingfilter);
                                // loading the file content
                                File file = new File(fileName);
                                // getting the content of the bookmark file
                                byte[] fileContent = (byte[]) post.get("crawlingFile$file");
                                // TODO: determine the real charset here ....
                                String fileString = new String(fileContent, "UTF-8");
                                // parsing the bookmark file and fetching the headline and contained links
                                htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL(file));
                                //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
                                Writer writer = new htmlFilterWriter(null, null, scraper, null, false);
                                serverFileUtils.write(fileString, writer);
                                writer.close();
                                //String headline = scraper.getHeadline();
                                HashMap hyperlinks = (HashMap) scraper.getAnchors();
                                // creating a crawler profile
                                plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
                                // loop through the contained links
                                Iterator iterator = hyperlinks.entrySet().iterator();
                                int c = 0;
                                while (iterator.hasNext()) {
                                    Map.Entry e = (Map.Entry) iterator.next();
                                    String nexturlstring = (String) e.getKey();
                                    if (nexturlstring == null) continue;
                                    nexturlstring = nexturlstring.trim();
                                    // normalizing URL
                                    nexturlstring = new URL(nexturlstring).toNormalform();
                                    // generating an url object
                                    URL nexturlURL = null;
                                    try {
                                        nexturlURL = new URL(nexturlstring);
                                    } catch (MalformedURLException ex) {
                                        nexturlURL = null;
                                        c++;
                                        continue;
                                    }
                                    // enqueuing the url for crawling
                                    String rejectReason = switchboard.sbStackCrawlThread.stackCrawl(nexturlstring, null, yacyCore.seedDB.mySeed.hash, (String) e.getValue(), new Date(), 1, profile);
                                    // if something failed add the url into the errorURL list
                                    if (rejectReason == null) {
                                        c++;
                                    } else {
                                        plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
                                                (String) e.getValue(), rejectReason, new kelondroBitfield());
                                        ee.store();
                                        switchboard.errorURL.stackPushEntry(ee);
                                    }
                                }
                            } catch (PatternSyntaxException e) {
                                // print error message
                                prop.put("error", 8); // invalid crawl filter pattern
                                prop.put("error_newcrawlingfilter", newcrawlingfilter);
                                prop.put("error_error", e.getMessage());
                            } catch (Exception e) {
                                // something went wrong
                                prop.put("error", 7); // error with file
                                prop.put("error_crawlingStart", fileName);
                                prop.put("error_error", e.getMessage());
                                e.printStackTrace();
                            }
                        }
                    }
                }
            }
            if (post.containsKey("distributedcrawling")) {
                long newBusySleep = Integer.parseInt(env.getConfig("62_remotetriggeredcrawl_busysleep", "100"));
                if (post.get("dcr", "").equals("acceptCrawlMax")) {
@@ -328,18 +92,14 @@ public class IndexCreate_p {
            if (post.containsKey("pausecrawlqueue")) {
                switchboard.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL);
                prop.put("info", 4); // crawling paused
                prop.put("info", 1); // crawling paused
            }
            if (post.containsKey("continuecrawlqueue")) {
                switchboard.continueCrawlJob(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL);
                prop.put("info", 5); // crawling continued
                prop.put("info", 2); // crawling continued
            }
            if (post.containsKey("deleteprofile")) {
                String handle = (String) post.get("handle");
                if (handle != null) switchboard.profiles.removeEntry(handle);
            }
        }
        // define visible variables
@@ -420,43 +180,7 @@ public class IndexCreate_p {
        }
        // create prefetch table
        boolean dark;
        // set crawl profiles
        int count = 0;
        int domlistlength = (post == null) ? 160 : post.getInt("domlistlength", 160);
        //try{
        Iterator it = switchboard.profiles.profiles(true);
        plasmaCrawlProfile.entry profile;
        dark = true;
        while (it.hasNext()) {
            profile = (plasmaCrawlProfile.entry) it.next();
            //table += profile.map().toString() + "<br>";
            prop.put("crawlProfiles_" + count + "_dark", ((dark) ? 1 : 0));
            prop.put("crawlProfiles_" + count + "_name", wikiCode.replaceHTML(profile.name()));
            prop.put("crawlProfiles_" + count + "_startURL", wikiCode.replaceHTML(profile.startURL()));
            prop.put("crawlProfiles_" + count + "_handle", wikiCode.replaceHTML(profile.handle()));
            prop.put("crawlProfiles_" + count + "_depth", profile.generalDepth());
            prop.put("crawlProfiles_" + count + "_filter", profile.generalFilter());
            prop.put("crawlProfiles_" + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == Long.MAX_VALUE) ? "no re-crawl" : "" + profile.recrawlIfOlder());
            prop.put("crawlProfiles_" + count + "_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : "" + profile.domFilterDepth());
            prop.put("crawlProfiles_" + count + "_crawlingDomFilterContent", profile.domNames(true, domlistlength));
            prop.put("crawlProfiles_" + count + "_crawlingDomMaxPages", (profile.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : "" + profile.domMaxPages());
            prop.put("crawlProfiles_" + count + "_withQuery", ((profile.crawlingQ()) ? 1 : 0));
            prop.put("crawlProfiles_" + count + "_storeCache", ((profile.storeHTCache()) ? 1 : 0));
            prop.put("crawlProfiles_" + count + "_localIndexing", ((profile.localIndexing()) ? 1 : 0));
            prop.put("crawlProfiles_" + count + "_remoteIndexing", ((profile.remoteIndexing()) ? 1 : 0));
            prop.put("crawlProfiles_" + count + "_deleteButton", (((profile.name().equals("remote")) ||
                    (profile.name().equals("proxy")) ||
                    (profile.name().equals("snippet"))) ? 0 : 1));
            prop.put("crawlProfiles_" + count + "_deleteButton_handle", profile.handle());
            dark = !dark;
            count++;
        }
        //}catch(IOException e){};
        prop.put("crawlProfiles", count);
        boolean dark = true;
        // create other peer crawl table using YaCyNews
        int availableNews = yacyCore.newsPool.size(yacyNewsPool.INCOMING_DB);
@@ -513,9 +237,8 @@ public class IndexCreate_p {
        // remote crawl peers
        if (yacyCore.seedDB == null) {
            //table += "Sorry, cannot show any crawl output now because the system is not completely initialised. Please re-try.";
            prop.put("error", 3);
        if (yacyCore.seedDB != null) {
            prop.put("remoteCrawlPeers", 0);
        } else {
            Enumeration crawlavail = yacyCore.dhtAgent.getAcceptRemoteCrawlSeeds(plasmaURL.dummyHash, true);
            Enumeration crawlpendi = yacyCore.dhtAgent.getAcceptRemoteCrawlSeeds(plasmaURL.dummyHash, false);
@@ -546,22 +269,12 @@ public class IndexCreate_p {
        }
        prop.put("crawler-paused", (switchboard.crawlJobIsPaused(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL)) ? 0 : 1);
        // return rewrite properties
        return prop;
    }
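    // Converts the "re-crawl if older than" form input (number plus unit) into minutes;
    // returns -1 when the checkbox is unchecked or the unit is unknown. For illustration:
    // recrawlIfOlderC(true, 7, "day") == 7 * 60 * 24 == 10080, recrawlIfOlderC(false, 7, "day") == -1.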
    private static int recrawlIfOlderC(boolean recrawlIfOlderCheck, int recrawlIfOlderNumber, String crawlingIfOlderUnit) {
        if (!recrawlIfOlderCheck) return -1;
        if (crawlingIfOlderUnit.equals("year")) return recrawlIfOlderNumber * 60 * 24 * 356;
        if (crawlingIfOlderUnit.equals("month")) return recrawlIfOlderNumber * 60 * 24 * 30;
        if (crawlingIfOlderUnit.equals("day")) return recrawlIfOlderNumber * 60 * 24;
        if (crawlingIfOlderUnit.equals("hour")) return recrawlIfOlderNumber * 60;
        if (crawlingIfOlderUnit.equals("minute")) return recrawlIfOlderNumber;
        return -1;
    }
}