orbiter 17 years ago
parent 4ffbcd54a4
commit 6eb8321cb0

@ -113,9 +113,12 @@ public class WatchCrawler_p {
newcrawlingfilter = ".*" + (new yacyURL(post.get("crawlingURL",""), null)).getHost() + ".*"; newcrawlingfilter = ".*" + (new yacyURL(post.get("crawlingURL",""), null)).getHost() + ".*";
} catch (MalformedURLException e) {} } catch (MalformedURLException e) {}
boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "8")); int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "8"));
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth)); env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
if ((fullDomain) && (newcrawlingdepth > 8)) newcrawlingdepth = 8; if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on"); boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on");
int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1")); int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1"));
@ -143,9 +146,6 @@ public class WatchCrawler_p {
boolean storeHTCache = post.get("storeHTCache", "off").equals("on"); boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false"); env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
boolean xsstopw = post.get("xsstopw", "off").equals("on"); boolean xsstopw = post.get("xsstopw", "off").equals("on");
env.setConfig("xsstopw", (xsstopw) ? "true" : "false"); env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
@ -207,7 +207,7 @@ public class WatchCrawler_p {
// generate a YaCyNews if the global flag was set // generate a YaCyNews if the global flag was set
if (crawlOrder) { if (crawlOrder) {
Map m = new HashMap(pe.map()); // must be cloned Map<String, String> m = new HashMap<String, String>(pe.map()); // must be cloned
m.remove("specificDepth"); m.remove("specificDepth");
m.remove("indexText"); m.remove("indexText");
m.remove("indexMedia"); m.remove("indexMedia");
@ -266,7 +266,7 @@ public class WatchCrawler_p {
writer.close(); writer.close();
//String headline = scraper.getHeadline(); //String headline = scraper.getHeadline();
HashMap hyperlinks = (HashMap) scraper.getAnchors(); Map<yacyURL, String> hyperlinks = scraper.getAnchors();
// creating a crawler profile // creating a crawler profile
yacyURL crawlURL = new yacyURL("file://" + file.toString(), null); yacyURL crawlURL = new yacyURL("file://" + file.toString(), null);
@ -276,30 +276,16 @@ public class WatchCrawler_p {
switchboard.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL); switchboard.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL);
// loop through the contained links // loop through the contained links
Iterator linkiterator = hyperlinks.entrySet().iterator(); Iterator<Map.Entry<yacyURL, String>> linkiterator = hyperlinks.entrySet().iterator();
yacyURL nexturl;
while (linkiterator.hasNext()) { while (linkiterator.hasNext()) {
Map.Entry e = (Map.Entry) linkiterator.next(); Map.Entry<yacyURL, String> e = linkiterator.next();
String nexturlstring = (String) e.getKey(); nexturl = e.getKey();
if (nexturl == null) continue;
if (nexturlstring == null) continue;
nexturlstring = nexturlstring.trim();
// normalizing URL
nexturlstring = new yacyURL(nexturlstring, null).toNormalform(true, true);
// generating an url object
yacyURL nexturlURL = null;
try {
nexturlURL = new yacyURL(nexturlstring, null);
} catch (MalformedURLException ex) {
nexturlURL = null;
continue;
}
// enqueuing the url for crawling // enqueuing the url for crawling
switchboard.crawlStacker.enqueueEntry( switchboard.crawlStacker.enqueueEntry(
nexturlURL, nexturl,
null, null,
yacyCore.seedDB.mySeed().hash, yacyCore.seedDB.mySeed().hash,
(String) e.getValue(), (String) e.getValue(),
@ -341,9 +327,9 @@ public class WatchCrawler_p {
// create a new sitemap importer // create a new sitemap importer
dbImporter importerThread = switchboard.dbImportManager.getNewImporter("sitemap"); dbImporter importerThread = switchboard.dbImportManager.getNewImporter("sitemap");
if (importerThread != null) { if (importerThread != null) {
HashMap initParams = new HashMap(); HashMap<String, String> initParams = new HashMap<String, String>();
initParams.put("sitemapURL",sitemapURLStr); initParams.put("sitemapURL", sitemapURLStr);
initParams.put("crawlingProfile",pe.handle()); initParams.put("crawlingProfile", pe.handle());
importerThread.init(initParams); importerThread.init(initParams);
importerThread.startIt(); importerThread.startIt();

Loading…
Cancel
Save