@@ -113,9 +113,12 @@ public class WatchCrawler_p {
                     newcrawlingfilter = ".*" + (new yacyURL(post.get("crawlingURL",""), null)).getHost() + ".*";
                 } catch (MalformedURLException e) {}
                 
+                boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
+                env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
+                
                 int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "8"));
                 env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
-                if ((fullDomain) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
+                if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
                 
                 boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on");
                 int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1"));
@@ -143,9 +146,6 @@ public class WatchCrawler_p {
                 boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
                 env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
                 
-                boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
-                env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
-                
                 boolean xsstopw = post.get("xsstopw", "off").equals("on");
                 env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
                 
@@ -207,7 +207,7 @@ public class WatchCrawler_p {
                 // generate a YaCyNews if the global flag was set
                 if (crawlOrder) {
-                    Map m = new HashMap(pe.map()); // must be cloned
+                    Map<String, String> m = new HashMap<String, String>(pe.map()); // must be cloned
                     m.remove("specificDepth");
                     m.remove("indexText");
                     m.remove("indexMedia");
@@ -266,7 +266,7 @@ public class WatchCrawler_p {
                     writer.close();
                     //String headline = scraper.getHeadline();
-                    HashMap hyperlinks = (HashMap) scraper.getAnchors();
+                    Map<yacyURL, String> hyperlinks = scraper.getAnchors();
                     
                     // creating a crawler profile
                     yacyURL crawlURL = new yacyURL("file://" + file.toString(), null);
@@ -276,30 +276,16 @@ public class WatchCrawler_p {
                     switchboard.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL);
                     
                     // loop through the contained links
-                    Iterator linkiterator = hyperlinks.entrySet().iterator();
+                    Iterator<Map.Entry<yacyURL, String>> linkiterator = hyperlinks.entrySet().iterator();
+                    yacyURL nexturl;
                     while (linkiterator.hasNext()) {
-                        Map.Entry e = (Map.Entry) linkiterator.next();
-                        String nexturlstring = (String) e.getKey();
-                        if (nexturlstring == null) continue;
-                        nexturlstring = nexturlstring.trim();
-                        
-                        // normalizing URL
-                        nexturlstring = new yacyURL(nexturlstring, null).toNormalform(true, true);
-                        
-                        // generating an url object
-                        yacyURL nexturlURL = null;
-                        try {
-                            nexturlURL = new yacyURL(nexturlstring, null);
-                        } catch (MalformedURLException ex) {
-                            nexturlURL = null;
-                            continue;
-                        }
+                        Map.Entry<yacyURL, String> e = linkiterator.next();
+                        nexturl = e.getKey();
+                        if (nexturl == null) continue;
                         
                         // enqueuing the url for crawling
                         switchboard.crawlStacker.enqueueEntry(
-                            nexturlURL,
+                            nexturl,
                             null,
                             yacyCore.seedDB.mySeed().hash,
                             (String) e.getValue(),
@@ -341,9 +327,9 @@ public class WatchCrawler_p {
                     // create a new sitemap importer
                     dbImporter importerThread = switchboard.dbImportManager.getNewImporter("sitemap");
                     if (importerThread != null) {
-                        HashMap initParams = new HashMap();
-                        initParams.put("sitemapURL",sitemapURLStr);
-                        initParams.put("crawlingProfile",pe.handle());
+                        HashMap<String, String> initParams = new HashMap<String, String>();
+                        initParams.put("sitemapURL", sitemapURLStr);
+                        initParams.put("crawlingProfile", pe.handle());
                         
                         importerThread.init(initParams);
                         importerThread.startIt();
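
Note on the link-loop hunk (@@ -276,30 +276,16 @@): the old code iterated String keys and had to trim, normalize, and re-parse each one into a yacyURL inside its own try/catch before enqueueing, while the new code receives already-parsed yacyURL keys from the typed anchor map. A minimal stand-alone sketch of the same idiom, assuming nothing beyond the JDK (java.net.URL stands in here for the YaCy-specific yacyURL; this is an illustration, not YaCy code):

// Hypothetical sketch of the typed anchor-map iteration, with java.net.URL
// standing in for yacyURL. Keys arrive already parsed, so the per-entry
// trim / normalize / re-parse / try-catch of the old loop is not needed.
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;

public class AnchorLoopSketch {
    public static void main(String[] args) throws MalformedURLException {
        // anchor map: URL -> link text (analogous to scraper.getAnchors())
        Map<URL, String> hyperlinks = new HashMap<URL, String>();
        hyperlinks.put(new URL("http://example.org/a.html"), "page a");
        hyperlinks.put(new URL("http://example.org/b.html"), "page b");

        // typed iteration: no casts, no second parse, no MalformedURLException here
        for (Map.Entry<URL, String> e : hyperlinks.entrySet()) {
            URL nexturl = e.getKey();
            if (nexturl == null) continue;
            System.out.println("would enqueue: " + nexturl + " (" + e.getValue() + ")");
        }
    }
}

The other hunks apply the same Java 5 generics pattern to the cloned crawl-profile map and to the sitemap importer parameters, removing the unchecked casts.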