|
|
|
@ -133,6 +133,8 @@ public class CrawlWorker extends AbstractCrawlWorker implements plasmaCrawlWorke
|
|
|
|
|
String fullPath = this.url.getPath();
|
|
|
|
|
int port = this.url.getPort();
|
|
|
|
|
|
|
|
|
|
plasmaHTCache.Entry htCache = null;
|
|
|
|
|
try {
|
|
|
|
|
// open a connection to the ftp server
|
|
|
|
|
if (port == -1) {
|
|
|
|
|
ftpClient.exec("open " + host, false);
|
|
|
|
@ -140,22 +142,25 @@ public class CrawlWorker extends AbstractCrawlWorker implements plasmaCrawlWorke
|
|
|
|
|
ftpClient.exec("open " + host + " " + port, false);
|
|
|
|
|
}
|
|
|
|
|
if (berr.size() > 0) {
|
|
|
|
|
this.log.logInfo("Unable to connect to ftp server " + this.url.getHost() + " hosting URL " + this.url.toString() + "\nErrorlog: " + berr.toString());
|
|
|
|
|
this.log.logWarning("Unable to connect to ftp server " + this.url.getHost() + " hosting URL " + this.url.toString() + "\nErrorlog: " + berr.toString());
|
|
|
|
|
addURLtoErrorDB(plasmaCrawlEURL.DENIED_CONNECTION_ERROR);
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// login to the server
|
|
|
|
|
ftpClient.exec("user " + userName + " " + userPwd, false);
|
|
|
|
|
if (berr.size() > 0) {
|
|
|
|
|
this.log.logInfo("Unable to login to ftp server " + this.url.getHost() + " hosting URL " + this.url.toString() + "\nErrorlog: " + berr.toString());
|
|
|
|
|
this.log.logWarning("Unable to login to ftp server " + this.url.getHost() + " hosting URL " + this.url.toString() + "\nErrorlog: " + berr.toString());
|
|
|
|
|
addURLtoErrorDB(plasmaCrawlEURL.DENIED_SERVER_LOGIN_FAILED);
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// change transfer mode to binary
|
|
|
|
|
ftpClient.exec("binary", false);
|
|
|
|
|
if (berr.size() > 0) {
|
|
|
|
|
this.log.logInfo("Unable to set the file transfer mode to binary for URL " + this.url.toString() + "\nErrorlog: " + berr.toString());
|
|
|
|
|
this.log.logWarning("Unable to set the file transfer mode to binary for URL " + this.url.toString() + "\nErrorlog: " + berr.toString());
|
|
|
|
|
addURLtoErrorDB(plasmaCrawlEURL.DENIED_SERVER_TRASFER_MODE_PROBLEM);
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// determine filename and path
|
|
|
|
@ -205,7 +210,6 @@ public class CrawlWorker extends AbstractCrawlWorker implements plasmaCrawlWorke
|
|
|
|
|
|
|
|
|
|
String mimeType;
|
|
|
|
|
Date fileDate;
|
|
|
|
|
plasmaHTCache.Entry htCache = null;
|
|
|
|
|
if (file.length() == 0) {
|
|
|
|
|
// getting the dirlist
|
|
|
|
|
mimeType = "text/html";
|
|
|
|
@ -217,11 +221,16 @@ public class CrawlWorker extends AbstractCrawlWorker implements plasmaCrawlWorke
|
|
|
|
|
// generate the dirlist
|
|
|
|
|
StringBuffer dirList = ftpClient.dirhtml(fullPath);
|
|
|
|
|
|
|
|
|
|
if (dirList != null && dirList.length() > 0) try {
|
|
|
|
|
// write it into a file
|
|
|
|
|
PrintWriter writer = new PrintWriter(new FileOutputStream(cacheFile),false);
|
|
|
|
|
writer.write(dirList.toString());
|
|
|
|
|
writer.flush();
|
|
|
|
|
writer.close();
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
this.log.logInfo("Unable to write dirlist for URL " + this.url.toString());
|
|
|
|
|
htCache = null;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
// determine the mimetype of the resource
|
|
|
|
|
String extension = plasmaParser.getFileExt(this.url);
|
|
|
|
@ -245,17 +254,14 @@ public class CrawlWorker extends AbstractCrawlWorker implements plasmaCrawlWorke
|
|
|
|
|
// if the response has not the right file type then reject file
|
|
|
|
|
this.log.logInfo("REJECTED WRONG MIME/EXT TYPE " + mimeType + " for URL " + this.url.toString());
|
|
|
|
|
addURLtoErrorDB(plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT);
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// closing connection
|
|
|
|
|
ftpClient.exec("close", false);
|
|
|
|
|
ftpClient.exec("exit", false);
|
|
|
|
|
|
|
|
|
|
// pass the downloaded resource to the cache manager
|
|
|
|
|
if (berr.size() > 0 || htCache == null) {
|
|
|
|
|
// the ftp client reported an error or no cache entry was created: treat the download as failed
|
|
|
|
|
this.log.logInfo("Unable to download URL " + this.url.toString() + "\nErrorlog: " + berr.toString());
|
|
|
|
|
this.log.logWarning("Unable to download URL " + this.url.toString() + "\nErrorlog: " + berr.toString());
|
|
|
|
|
addURLtoErrorDB(plasmaCrawlEURL.DENIED_SERVER_DOWNLOAD_ERROR);
|
|
|
|
|
|
|
|
|
|
// an error has occurred. cleanup
|
|
|
|
@ -271,6 +277,11 @@ public class CrawlWorker extends AbstractCrawlWorker implements plasmaCrawlWorke
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return htCache;
|
|
|
|
|
} finally {
|
|
|
|
|
// closing connection
|
|
|
|
|
ftpClient.exec("close", false);
|
|
|
|
|
ftpClient.exec("exit", false);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|