diff --git a/htroot/IndexCreate_p.html b/htroot/IndexCreate_p.html
index c2d3dcce3..59cd6f976 100644
--- a/htroot/IndexCreate_p.html
+++ b/htroot/IndexCreate_p.html
@@ -102,13 +102,26 @@ You can define URLs as start points for Web page crawling and start that crawlin
 -->
-Start Point:
-Existing start URL's are re-crawled.
+Starting Point:
+From File:
+From URL:
+Existing start URL's are re-crawled. Other already visited URL's are sorted out as 'double'. A complete re-crawl will be available soon.
@@ -130,19 +143,21 @@ Your peer can search and index for other peers and they can search for you.
-#(error)#
-::
+#(error)#
+::
 Error with profile management. Please stop yacy, delete the File DATA/PLASMADB/crawlProfiles0.db and restart.
-::
+::
 Error: #[errmsg]#
-::
+::
 Application not yet initialized. Sorry. Please wait some seconds and repeat the request.
-::
+::
 ERROR: Crawl filter "#[newcrawlingfilter]#" does not match with crawl root "#[crawlingStart]#". Please try again with different filter


-::
+::
 Crawling of "#[crawlingURL]#" failed. Reason: #[reasonString]#
-::
+::
 Error with url input "#[crawlingStart]#": #[error]#
+::
+Error with file input "#[crawlingStart]#": #[error]#
 #(/error)#
 #(info)#
diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java
index 123ca6970..eddd2d5c4 100644
--- a/htroot/IndexCreate_p.java
+++ b/htroot/IndexCreate_p.java
@@ -43,20 +43,30 @@
 // javac -classpath .:../classes IndexCreate_p.java
 // if the shell's current path is HTROOT
 
+import java.io.File;
+import java.io.OutputStream;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.text.SimpleDateFormat;
 import java.util.Date;
 import java.util.Enumeration;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Locale;
+import java.util.Map;
+import java.util.Properties;
 
+import de.anomic.htmlFilter.htmlFilterContentScraper;
+import de.anomic.htmlFilter.htmlFilterOutputStream;
 import de.anomic.http.httpHeader;
+import de.anomic.plasma.plasmaCrawlNURL;
 import de.anomic.plasma.plasmaCrawlProfile;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.plasma.plasmaURL;
+import de.anomic.server.serverFileUtils;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
+import de.anomic.tools.bitfield;
 import de.anomic.yacy.yacyCore;
 import de.anomic.yacy.yacySeed;
 
@@ -104,49 +114,112 @@ public class IndexCreate_p {
             boolean xpstopw = ((String) post.get("xpstopw", "")).equals("on");
             env.setConfig("xpstopw", (xpstopw) ? "true" : "false");
 
-            String crawlingStart = (String) post.get("crawlingURL");
-            if (!(crawlingStart.startsWith("http"))) crawlingStart = "http://" + crawlingStart;
-
-            // check if url is proper
-            URL crawlingStartURL = null;
-            try {
-                crawlingStartURL = new URL(crawlingStart);
-            } catch (MalformedURLException e) {
-                crawlingStartURL = null;
-            }
-
-            // check if pattern matches
-            if ((crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
-                // print error message
-                prop.put("error", 4); //crawlfilter does not match url
-                prop.put("error_newcrawlingfilter", newcrawlingfilter);
-                prop.put("error_crawlingStart", crawlingStart);
-            } else try {
-                // stack request
-                // first delete old entry, if exists
-                String urlhash = plasmaURL.urlHash(crawlingStart);
-                switchboard.urlPool.loadedURL.remove(urlhash);
-                switchboard.urlPool.noticeURL.remove(urlhash);
+            String crawlingMode = post.get("crawlingMode","url");
+            if (crawlingMode.equals("url")) {
+                String crawlingStart = (String) post.get("crawlingURL");
+                if (!(crawlingStart.startsWith("http"))) crawlingStart = "http://" + crawlingStart;
 
-                // stack url
-                String reasonString = switchboard.stackCrawl(crawlingStart, null, yacyCore.seedDB.mySeed.hash, "CRAWLING-ROOT", new Date(), 0,
-                    switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw));
-
-                if (reasonString == null) {
-                    // liftoff!
-                    prop.put("info", 2);//start msg
-                    prop.put("info_crawlingURL", ((String) post.get("crawlingURL")));
-                } else {
-                    prop.put("error", 5); //Crawling failed
-                    prop.put("error_crawlingURL", ((String) post.get("crawlingURL")));
-                    prop.put("error_reasonString", reasonString);
+                // check if url is proper
+                URL crawlingStartURL = null;
+                try {
+                    crawlingStartURL = new URL(crawlingStart);
+                } catch (MalformedURLException e) {
+                    crawlingStartURL = null;
                 }
-            } catch (Exception e) {
-                // mist
-                prop.put("error", 6);//Error with url
-                prop.put("error_crawlingStart", crawlingStart);
-                prop.put("error_error", e.getMessage());
-                e.printStackTrace();
+
+                // check if pattern matches
+                if ((crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
+                    // print error message
+                    prop.put("error", 4); //crawlfilter does not match url
+                    prop.put("error_newcrawlingfilter", newcrawlingfilter);
+                    prop.put("error_crawlingStart", crawlingStart);
+                } else try {
+                    // stack request
+                    // first delete old entry, if exists
+                    String urlhash = plasmaURL.urlHash(crawlingStart);
+                    switchboard.urlPool.loadedURL.remove(urlhash);
+                    switchboard.urlPool.noticeURL.remove(urlhash);
+
+                    // stack url
+                    String reasonString = switchboard.stackCrawl(crawlingStart, null, yacyCore.seedDB.mySeed.hash, "CRAWLING-ROOT", new Date(), 0,
+                        switchboard.profiles.newEntry(crawlingStartURL.getHost(), crawlingStart, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw));
+
+                    if (reasonString == null) {
+                        // liftoff!
+                        prop.put("info", 2);//start msg
+                        prop.put("info_crawlingURL", ((String) post.get("crawlingURL")));
+                    } else {
+                        prop.put("error", 5); //Crawling failed
+                        prop.put("error_crawlingURL", ((String) post.get("crawlingURL")));
+                        prop.put("error_reasonString", reasonString);
+                    }
+                } catch (Exception e) {
+                    // mist
+                    prop.put("error", 6);//Error with url
+                    prop.put("error_crawlingStart", crawlingStart);
+                    prop.put("error_error", e.getMessage());
+                    e.printStackTrace();
+                }
+
+            } else if (crawlingMode.equals("file")) {
+                if (post.containsKey("crawlingFile")) {
+                    // getting the name of the uploaded file
+                    String fileName = (String) post.get("crawlingFile");
+                    try {
+                        File file = new File(fileName);
+
+                        // getting the content of the bookmark file
+                        byte[] fileContent = (byte[]) post.get("crawlingFile$file");
+
+                        // parsing the bookmark file and fetching the headline and contained links
+                        htmlFilterContentScraper scraper = new htmlFilterContentScraper(file.toURL());
+                        OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
+                        serverFileUtils.write(fileContent,os);
+                        os.close();
+
+                        String headline = scraper.getHeadline();
+                        HashMap hyperlinks = (HashMap) scraper.getAnchors();
+
+                        // creating a crawler profile
+                        plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
+
+                        // loop through the contained links
+                        Iterator interator = hyperlinks.entrySet().iterator();
+                        int c = 0;
+                        while (interator.hasNext()) {
+                            Map.Entry e = (Map.Entry) interator.next();
+                            String nexturlstring = (String) e.getKey();
+
+                            // generating an url object
+                            URL nexturlURL = null;
+                            try {
+                                nexturlURL = new URL(nexturlstring);
+                            } catch (MalformedURLException ex) {
+                                nexturlURL = null;
+                                c++;
+                                continue;
+                            }
+
+                            // enqueuing the url for crawling
+                            String rejectReason = switchboard.stackCrawl(nexturlstring, null, yacyCore.seedDB.mySeed.hash, (String)e.getValue(), new Date(), 1, profile);
+
+                            // if something failed add the url into the errorURL list
+                            if (rejectReason == null) {
+                                c++;
+                            } else {
+                                switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
+                                    (String) e.getValue(), rejectReason, new bitfield(plasmaURL.urlFlagLength), false);
+                            }
+                        }
+
+                    } catch (Exception e) {
+                        // mist
+                        prop.put("error", 7);//Error with file
+                        prop.put("error_crawlingStart", fileName);
+                        prop.put("error_error", e.getMessage());
+                        e.printStackTrace();
+                    }
+                }
             }
         }
     }
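
Note on the form fields: only the label texts of the new "Starting Point" form are visible in the HTML hunk above; the table and input markup is not preserved. Judging from the parameter names the servlet code reads (crawlingMode with the values "url" and "file", crawlingURL, crawlingFile, and the uploaded bytes under crawlingFile$file), the form presumably amounts to two radio buttons, a text field and a file upload inside a multipart form. The following is only a rough sketch under that assumption, not the actual markup of IndexCreate_p.html:

    <!-- sketch only: the real table markup, labels and attributes are not preserved in the hunk above -->
    <form action="IndexCreate_p.html" method="post" enctype="multipart/form-data">
        Starting Point:
        <input type="radio" name="crawlingMode" value="url" checked />From URL:
        <input type="text" name="crawlingURL" />
        <input type="radio" name="crawlingMode" value="file" />From File:
        <!-- the servlet receives the file name as "crawlingFile" and its content as "crawlingFile$file" -->
        <input type="file" name="crawlingFile" />
    </form>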