diff --git a/source/de/anomic/plasma/cache/ftp/ResourceInfo.java b/source/de/anomic/plasma/cache/ftp/ResourceInfo.java
new file mode 100644
index 000000000..5b5db4404
--- /dev/null
+++ b/source/de/anomic/plasma/cache/ftp/ResourceInfo.java
@@ -0,0 +1,176 @@
+package de.anomic.plasma.cache.ftp;
+
+import java.net.MalformedURLException;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+
+import de.anomic.index.indexURL;
+import de.anomic.net.URL;
+import de.anomic.plasma.cache.IResourceInfo;
+import de.anomic.plasma.cache.ResourceInfoFactory;
+
+/**
+ * Cache metadata of a resource that was loaded via FTP.
+ * <p>
+ * The referer, the mime type and the modification date are kept as entries of a
+ * property map, which is also handed out by {@link #getMap()}.
+ */
+public class ResourceInfo implements IResourceInfo {
+
+    /** Property key of the mime type. */
+    public static final String MIMETYPE = "mimetype";
+    /** Property key of the modification date (milliseconds since the epoch as decimal string). */
+    public static final String MODIFICATION_DATE = "modificationDate";
+    /** Property key of the referer URL string. */
+    public static final String REFERER = "referer";
+
+    private final URL url;
+    private final String urlHash;
+    private final HashMap propertyMap;
+
+    /**
+     * Constructor used by the {@link ResourceInfoFactory}
+     * @param objectURL the URL of the resource, must not be <code>null</code>
+     * @param objectInfo the properties of the resource, must not be <code>null</code>.
+     *        The entries are copied into a new map.
+     * @throws NullPointerException if one of the arguments is <code>null</code>
+     */
+    public ResourceInfo(URL objectURL, Map objectInfo) {
+        if (objectURL == null) throw new NullPointerException("objectURL must not be null");
+        if (objectInfo == null) throw new NullPointerException("objectInfo must not be null");
+
+        // generating the url hash
+        this.url = objectURL;
+        this.urlHash = indexURL.urlHash(this.url.toNormalform());
+
+        // copy the passed properties into our own map
+        this.propertyMap = new HashMap(objectInfo);
+    }
+
+    /**
+     * Creates the resource info from the single properties. Properties that are
+     * <code>null</code> are not stored in the property map.
+     * @param objectURL the URL of the resource, must not be <code>null</code>
+     * @param refererUrl the referer URL string or <code>null</code>
+     * @param mimeType the mime type of the resource or <code>null</code>
+     * @param fileDate the modification date of the resource or <code>null</code>
+     * @throws NullPointerException if <code>objectURL</code> is <code>null</code>
+     */
+    public ResourceInfo(URL objectURL, String refererUrl, String mimeType, Date fileDate) {
+        if (objectURL == null) throw new NullPointerException("objectURL must not be null");
+
+        // generating the url hash
+        this.url = objectURL;
+        this.urlHash = indexURL.urlHash(this.url.toNormalform());
+
+        // store all properties that are available
+        this.propertyMap = new HashMap();
+        if (refererUrl != null) this.propertyMap.put(REFERER, refererUrl);
+        if (mimeType != null) this.propertyMap.put(MIMETYPE, mimeType);
+        if (fileDate != null) this.propertyMap.put(MODIFICATION_DATE, Long.toString(fileDate.getTime()));
+    }
+
+    /**
+     * @return the internal property map of this object (not a copy)
+     */
+    public Map getMap() {
+        return this.propertyMap;
+    }
+
+    /**
+     * @return the stored mime type or <code>null</code> if no mime type is stored
+     */
+    public String getMimeType() {
+        return (String) this.propertyMap.get(MIMETYPE);
+    }
+
+    /**
+     * @return the stored modification date. If no date is stored or the stored
+     *         value is not a valid number, the current time is returned.
+     */
+    public Date getModificationDate() {
+        Object storedDate = this.propertyMap.get(MODIFICATION_DATE);
+        if (storedDate == null) return new Date();
+        try {
+            return new Date(Long.parseLong(storedDate.toString().trim()));
+        } catch (NumberFormatException e) {
+            // a map passed to the Map constructor may contain an unparsable value;
+            // treat it like a missing date instead of failing
+            return new Date();
+        }
+    }
+
+    /**
+     * @return the stored referer as URL or <code>null</code> if no referer is
+     *         stored or the stored string is not a valid URL
+     */
+    public URL getRefererUrl() {
+        // without this check a missing referer would be passed as null to the URL constructor
+        Object referer = this.propertyMap.get(REFERER);
+        if (referer == null) return null;
+        try {
+            return new URL(referer.toString());
+        } catch (MalformedURLException e) {
+            return null;
+        }
+    }
+
+    /**
+     * @return the URL of the resource
+     */
+    public URL getUrl() {
+        return this.url;
+    }
+
+    /**
+     * @return the hash that was computed for the URL of the resource
+     */
+    public String getUrlHash() {
+        return this.urlHash;
+    }
+
+    /** @return always <code>null</code> */
+    public Date ifModifiedSince() {
+        return null;
+    }
+
+    /** @return always <code>false</code> */
+    public boolean requestProhibitsIndexing() {
+        return false;
+    }
+
+    /** @return always <code>false</code> */
+    public boolean requestWithCookie() {
+        return false;
+    }
+
+    /** @return always <code>null</code> */
+    public String shallIndexCacheForCrawler() {
+        return null;
+    }
+
+    /** @return always <code>null</code> */
+    public String shallIndexCacheForProxy() {
+        return null;
+    }
+
+    /** @return always <code>null</code> */
+    public String shallStoreCacheForProxy() {
+        return null;
+    }
+
+    /** @return always <code>false</code> */
+    public boolean shallUseCacheForProxy() {
+        return false;
+    }
+
+    /**
+     * @param responseStatus the status to check, may be <code>null</code>
+     * @return <code>true</code> if the status equals "OK" (case-insensitive)
+     */
+    public boolean validResponseStatus(String responseStatus) {
+        return "OK".equalsIgnoreCase(responseStatus);
+    }
+
+}
diff --git a/source/de/anomic/plasma/crawler/ftp/CrawlWorker.java b/source/de/anomic/plasma/crawler/ftp/CrawlWorker.java index f83e6d5d4..9b335c223 100644 --- a/source/de/anomic/plasma/crawler/ftp/CrawlWorker.java +++ b/source/de/anomic/plasma/crawler/ftp/CrawlWorker.java @@ -53,19 +53,25 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintStream; import java.io.PrintWriter; +import java.util.Date; +import de.anomic.http.httpHeader; +import de.anomic.http.httpc; import de.anomic.net.URL; import de.anomic.net.ftpc; +import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaHTCache; +import de.anomic.plasma.plasmaParser; import 
de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.cache.IResourceInfo; +import de.anomic.plasma.cache.ftp.ResourceInfo; import de.anomic.plasma.crawler.AbstractCrawlWorker; import de.anomic.plasma.crawler.plasmaCrawlWorker; import de.anomic.plasma.crawler.plasmaCrawlerPool; import de.anomic.plasma.plasmaHTCache.Entry; import de.anomic.server.logging.serverLog; -public class CrawlWorker extends AbstractCrawlWorker implements - plasmaCrawlWorker { +public class CrawlWorker extends AbstractCrawlWorker implements plasmaCrawlWorker { public CrawlWorker(ThreadGroup theTG, plasmaCrawlerPool thePool, plasmaSwitchboard theSb, plasmaHTCache theCacheManager, serverLog theLog) { super(theTG, thePool, theSb, theCacheManager, theLog); @@ -75,52 +81,87 @@ public class CrawlWorker extends AbstractCrawlWorker implements } public void close() { - // TODO Auto-generated method stub - + // TODO: abort a currently established connection } public void init() { - // TODO Auto-generated method stub + // nothing todo here } + protected plasmaHTCache.Entry createCacheEntry(String mimeType, Date fileDate) { + IResourceInfo resourceInfo = new ResourceInfo( + this.url, + this.refererURLString, + mimeType, + fileDate + ); + + return this.cacheManager.newEntry( + new Date(), + this.depth, + this.url, + this.name, + "OK", + resourceInfo, + this.initiator, + this.profile + ); + } + public Entry load() throws IOException { - - ByteArrayOutputStream bout = new ByteArrayOutputStream(); PrintStream out = new PrintStream(bout); ByteArrayOutputStream berr = new ByteArrayOutputStream(); PrintStream err = new PrintStream(berr); + // create a new ftp client ftpc ftpClient = new ftpc(System.in, out, err); + // get username and password String userInfo = this.url.getUserInfo(); - String userName, userPwd; + String userName = "anonymous", userPwd = "anonymous"; if (userInfo != null) { int pos = userInfo.indexOf(":"); - userName = userInfo.substring(0,pos); - userPwd = userInfo.substring(pos+1); - } 
else { - userName = "anonymous"; - userPwd = "anonymous"; - } + if (pos != -1) { + userName = userInfo.substring(0,pos); + userPwd = userInfo.substring(pos+1); + } + } + // get server name, port and file path String host = this.url.getHost(); String fullPath = this.url.getPath(); int port = this.url.getPort(); + // open a connection to the ftp server if (port == -1) { - ftpClient.exec("open " + this.url.getHost(), false); + ftpClient.exec("open " + host, false); } else { - ftpClient.exec("open " + this.url.getHost() + " " + port, false); + ftpClient.exec("open " + host + " " + port, false); } - ftpClient.exec("user " + userName + " " + userPwd, false); - ftpClient.exec("binary", false); + if (berr.size() > 0) { + this.log.logInfo("Unable to connect to ftp server " + this.url.getHost() + " hosting URL " + this.url.toString() + "\nErrorlog: " + berr.toString()); + addURLtoErrorDB(plasmaCrawlEURL.DENIED_CONNECTION_ERROR); + } + + // login to the server + ftpClient.exec("user " + userName + " " + userPwd, false); + if (berr.size() > 0) { + this.log.logInfo("Unable to login to ftp server " + this.url.getHost() + " hosting URL " + this.url.toString() + "\nErrorlog: " + berr.toString()); + addURLtoErrorDB(plasmaCrawlEURL.DENIED_SERVER_LOGIN_FAILED); + } - // cd - String file, path; + // change transfer mode to binary + ftpClient.exec("binary", false); + if (berr.size() > 0) { + this.log.logInfo("Unable to set the file transfer mode to binary for URL " + this.url.toString() + "\nErrorlog: " + berr.toString()); + addURLtoErrorDB(plasmaCrawlEURL.DENIED_SERVER_TRASFER_MODE_PROBLEM); + } + // determine filename and path + String file, path; if (fullPath.endsWith("/")) { file = ""; path = fullPath; @@ -135,6 +176,7 @@ public class CrawlWorker extends AbstractCrawlWorker implements } } + // testing if the specified file is a directory if (file.length() > 0) { ftpClient.exec("cd \"" + path + "\"", false); @@ -150,12 +192,20 @@ public class CrawlWorker extends AbstractCrawlWorker 
implements // creating a cache file object File cacheFile = this.cacheManager.getCachePath(this.url); cacheFile.getParentFile().mkdirs(); - - ftpClient.exec("cd \"" + fullPath + "\"", false); - if (file.length() == 0) { - + + String mimeType; + Date fileDate; + plasmaHTCache.Entry htCache = null; + if (file.length() == 0) { // getting the dirlist - StringBuffer dirList = ftpClient.dirhtml(host, (port==-1)?21:port, fullPath, userName, userPwd); + mimeType = "text/html"; + fileDate = new Date(); + + // create a htcache entry + htCache = createCacheEntry(mimeType,fileDate); + + // generate the dirlist + StringBuffer dirList = ftpClient.dirhtml(fullPath); // write it into a file PrintWriter writer = new PrintWriter(new FileOutputStream(cacheFile),false); @@ -163,18 +213,55 @@ public class CrawlWorker extends AbstractCrawlWorker implements writer.flush(); writer.close(); } else { - // download the remote file - ftpClient.exec("get \"" + file + "\" \"" + cacheFile.getAbsolutePath() + "\"", false); + // determine the mimetype of the resource + String extension = plasmaParser.getFileExt(this.url); + mimeType = plasmaParser.getMimeTypeByFileExt(extension); + + // if the mimetype and file extension is supported we start to download the file + if ((this.acceptAllContent) || (plasmaParser.supportedContent(plasmaParser.PARSER_MODE_CRAWLER,this.url,mimeType))) { + + // TODO: determine the real file date + fileDate = new Date(); + + // create a htcache entry + htCache = createCacheEntry(mimeType,fileDate); + + // change into working directory + ftpClient.exec("cd \"" + fullPath + "\"", false); + + // download the remote file + ftpClient.exec("get \"" + file + "\" \"" + cacheFile.getAbsolutePath() + "\"", false); + } else { + // if the response has not the right file type then reject file + this.log.logInfo("REJECTED WRONG MIME/EXT TYPE " + mimeType + " for URL " + this.url.toString()); + addURLtoErrorDB(plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT); + } } + // closing connection 
ftpClient.exec("close", false); ftpClient.exec("exit", false); - // TODO: do mimetype detection based of file extension - - // TODO: create a new htCache entry .... + // pass the downloaded resource to the cache manager + if (berr.size() > 0 || htCache == null) { + // if the response has not the right file type then reject file + this.log.logInfo("Unable to download URL " + this.url.toString() + "\nErrorlog: " + berr.toString()); + addURLtoErrorDB(plasmaCrawlEURL.DENIED_SERVER_DOWNLOAD_ERROR); + + // an error has occured. cleanup + if (cacheFile.exists()) cacheFile.delete(); + } else { + // announce the file + this.cacheManager.writeFileAnnouncement(cacheFile); + + // enQueue new entry with response header + if (this.profile != null) { + this.cacheManager.push(htCache); + } + } - return null; + return htCache; } + } diff --git a/source/de/anomic/plasma/crawler/http/CrawlWorker.java b/source/de/anomic/plasma/crawler/http/CrawlWorker.java index dcddcdc1f..84de054a5 100644 --- a/source/de/anomic/plasma/crawler/http/CrawlWorker.java +++ b/source/de/anomic/plasma/crawler/http/CrawlWorker.java @@ -223,7 +223,7 @@ public final class CrawlWorker extends AbstractCrawlWorker { // request has been placed and result has been returned. 
work off response File cacheFile = this.cacheManager.getCachePath(this.url); try { - if ((acceptAllContent) || (plasmaParser.supportedContent(plasmaParser.PARSER_MODE_CRAWLER,this.url,res.responseHeader.mime()))) { + if ((this.acceptAllContent) || (plasmaParser.supportedContent(plasmaParser.PARSER_MODE_CRAWLER,this.url,res.responseHeader.mime()))) { if (cacheFile.isFile()) { this.cacheManager.deleteFile(this.url); } diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java index 084fd4b7f..e0a2e68c9 100644 --- a/source/de/anomic/plasma/plasmaCrawlEURL.java +++ b/source/de/anomic/plasma/plasmaCrawlEURL.java @@ -95,10 +95,10 @@ public class plasmaCrawlEURL extends indexURL { // network errors public static final String DENIED_UNKNOWN_HOST = "denied_(unknown_host)"; public static final String DENIED_NO_ROUTE_TO_HOST = "denied_(no_route_to_host)"; - public static final String DENIED_NETWORK_IS_UNREACHABLE = "denied_(Network_is_unreachable)"; + public static final String DENIED_NETWORK_IS_UNREACHABLE = "denied_(Network_is_unreachable)"; // connection errors - public static final String DENIED_CONNECTION_ERROR = "denied_(socket_connection_error)"; + public static final String DENIED_CONNECTION_ERROR = "denied_(connection_error)"; public static final String DENIED_CONNECTION_BIND_EXCEPTION = "denied_(connection_bind_exception)"; public static final String DENIED_CONNECTION_TIMEOUT = "denied_(connection_timeout)"; public static final String DENIED_CONNECTION_REFUSED = "denied_(connection_refused)"; @@ -110,6 +110,9 @@ public class plasmaCrawlEURL extends indexURL { // server errors public static final String DENIED_OUT_OF_DISK_SPACE = "denied_(out_of_disk_space)"; public static final String DENIED_SERVER_SHUTDOWN = "denied_(server_shutdown)"; + public static final String DENIED_SERVER_LOGIN_FAILED = "denied_(server_login_failed)"; + public static final String DENIED_SERVER_TRASFER_MODE_PROBLEM = 
"denied_(server_transfermode_problem)"; + public static final String DENIED_SERVER_DOWNLOAD_ERROR = "denied_(server_download_error)"; // Parser errors public static final String DENIED_PARSER_ERROR = "denied_(parser_error)";