diff --git a/htroot/BlacklistTest_p.java b/htroot/BlacklistTest_p.java
index 54414bea3..1f5b3be37 100644
--- a/htroot/BlacklistTest_p.java
+++ b/htroot/BlacklistTest_p.java
@@ -56,10 +56,11 @@ public class BlacklistTest_p {
         if(post != null && post.containsKey("testList")) {
             prop.put("testlist", "1");
             String urlstring = post.get("testurl", "");
-            if(!urlstring.startsWith("http://") &&
-               !urlstring.startsWith("https://")&&
-               !urlstring.startsWith("ftp://")
-               ) urlstring = "http://"+urlstring;
+            if (!urlstring.startsWith("http://") &&
+                !urlstring.startsWith("https://") &&
+                !urlstring.startsWith("ftp://") &&
+                !urlstring.startsWith("smb://") &&
+                !urlstring.startsWith("file://")) urlstring = "http://" + urlstring;
             DigestURI testurl = null;
             try {
                 testurl = new DigestURI(urlstring, null);
diff --git a/htroot/Blacklist_p.java b/htroot/Blacklist_p.java
index 583c6b570..c3c76a968 100644
--- a/htroot/Blacklist_p.java
+++ b/htroot/Blacklist_p.java
@@ -84,7 +84,11 @@ public class Blacklist_p {
         if(post.containsKey("testList")) {
             prop.put("testlist", "1");
             String urlstring = post.get("testurl", "");
-            if(!urlstring.startsWith("http://")) urlstring = "http://"+urlstring;
+            if(!urlstring.startsWith("http://") &&
+               !urlstring.startsWith("https://") &&
+               !urlstring.startsWith("ftp://") &&
+               !urlstring.startsWith("smb://") &&
+               !urlstring.startsWith("file://")) urlstring = "http://"+urlstring;
             DigestURI testurl = null;
             try {
                 testurl = new DigestURI(urlstring, null);
diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java
index 1f1542837..896db3c62 100644
--- a/htroot/IndexControlURLs_p.java
+++ b/htroot/IndexControlURLs_p.java
@@ -130,7 +130,10 @@ public class IndexControlURLs_p {
         String urlhash = post.get("urlhash", "").trim();
 
         if (!urlstring.startsWith("http://") &&
-            !urlstring.startsWith("https://")) { urlstring = "http://" + urlstring; }
+            !urlstring.startsWith("https://") &&
+            !urlstring.startsWith("ftp://") &&
+            !urlstring.startsWith("smb://") &&
+            !urlstring.startsWith("file://")) { urlstring = "http://" + urlstring; }
 
         prop.putHTML("urlstring", urlstring);
         prop.putHTML("urlhash", urlhash);
@@ -180,14 +183,15 @@ public class IndexControlURLs_p {
             prop.put("urlhash", urlhash);
             final URIMetadataRow entry = segment.urlMetadata().load(urlhash, null, 0);
             if (entry == null) {
-                prop.putHTML("urlstring", "unknown url: " + urlstring);
+                prop.putHTML("result", "No Entry for URL " + url.toNormalform(true, true));
+                prop.putHTML("urlstring", urlstring);
                 prop.put("urlhash", "");
             } else {
                 prop.putAll(genUrlProfile(segment, entry, urlhash));
                 prop.put("statistics", 0);
             }
         } catch (final MalformedURLException e) {
-            prop.putHTML("urlstring", "bad url: " + urlstring);
+            prop.putHTML("result", "bad url: " + urlstring);
             prop.put("urlhash", "");
         }
         prop.put("lurlexport", 0);
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index 4bceb94df..4d3a6118b 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -110,7 +110,7 @@ public class ViewFile {
             prop.putHTML("error_words", "");
         }
 
-        final String viewMode = post.get("viewMode","sentences");
+        final String viewMode = post.get("viewMode","parsed");
         prop.put("error_vMode-" + viewMode, "1");
 
         DigestURI url = null;
diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java
index aea04309b..eb9f076b7 100755
--- a/htroot/api/util/getpageinfo_p.java
+++ b/htroot/api/util/getpageinfo_p.java
@@ -38,7 +38,11 @@ public class getpageinfo_p {
             prop.put("robots-allowed", "1");
             prop.putXML("title", "FTP: "+url);
             return prop;
-        } else if (!(url.toLowerCase().startsWith("http://") || url.toLowerCase().startsWith("https://"))) {
+        } else if (!url.startsWith("http://") &&
+                   !url.startsWith("https://") &&
+                   !url.startsWith("ftp://") &&
+                   !url.startsWith("smb://") &&
+                   !url.startsWith("file://")) {
             url = "http://" + url;
         }
         if (actions.indexOf("title")>=0) {
diff --git a/source/de/anomic/crawler/retrieval/FTPLoader.java b/source/de/anomic/crawler/retrieval/FTPLoader.java
index 565ad5e0c..ea570819c 100644
--- a/source/de/anomic/crawler/retrieval/FTPLoader.java
+++ b/source/de/anomic/crawler/retrieval/FTPLoader.java
@@ -63,7 +63,7 @@ public class FTPLoader {
      * @param request
      * @return
      */
-    public Response load(final Request request) throws IOException {
+    public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
 
         long start = System.currentTimeMillis();
         final DigestURI entryUrl = request.url();
@@ -91,55 +91,58 @@ public class FTPLoader {
         // stream for ftp-client errors
         final ByteArrayOutputStream berr = new ByteArrayOutputStream();
 
-        final ftpc ftpClient = createFTPClient(berr);
+        // create new ftp client
+        final PrintStream err = new PrintStream(berr);
+        final ftpc ftpClient = new ftpc(System.in, null, err);
+        ftpClient.setDataTimeoutByMaxFilesize(maxFileSize);
+
+        // get a connection
         if (openConnection(ftpClient, entryUrl)) {
-            // ftp stuff
-            //try {
-                // testing if the specified file is a directory
-                if (file.length() > 0) {
-                    ftpClient.exec("cd \"" + path + "\"", false);
-
-                    final boolean isFolder = ftpClient.isFolder(file);
-                    if (isFolder) {
-                        path = fullPath + "/";
-                        file = "";
-                    }
+            // test if the specified file is a directory
+            if (file.length() > 0) {
+                ftpClient.exec("cd \"" + path + "\"", false);
+
+                final boolean isFolder = ftpClient.isFolder(file);
+                if (isFolder) {
+                    path = fullPath + "/";
+                    file = "";
                 }
+            }
 
-                if (file.length() == 0) {
-                    // directory -> get list of files
-                    RequestHeader requestHeader = new RequestHeader();
-                    if (request.referrerhash() != null) {
-                        DigestURI u = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
-                        if (u != null) requestHeader.put(RequestHeader.REFERER, u.toNormalform(true, false));
-                    }
-
-                    byte[] dirList = generateDirlist(ftpClient, request, path);
+            if (file.length() == 0) {
+                // directory -> get list of files
+                RequestHeader requestHeader = new RequestHeader();
+                if (request.referrerhash() != null) {
+                    DigestURI u = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
+                    if (u != null) requestHeader.put(RequestHeader.REFERER, u.toNormalform(true, false));
+                }
+
+                StringBuilder dirList = ftpClient.dirhtml(path);
 
-                    if (dirList == null) {
-                        response = null;
-                    } else {
-                        ResponseHeader responseHeader = new ResponseHeader();
-                        responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date()));
-                        responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
-                        response = new Response(
-                                request,
-                                requestHeader,
-                                responseHeader,
-                                "OK",
-                                sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
-                                dirList);
-                    }
+                if (dirList == null) {
+                    response = null;
                 } else {
-                    // file -> download
-                    try {
-                        response = getFile(ftpClient, request);
-                    } catch (final Exception e) {
-                        // add message to errorLog
-                        (new PrintStream(berr)).print(e.getMessage());
-                    }
+                    ResponseHeader responseHeader = new ResponseHeader();
+                    responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date()));
+                    responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
+                    response = new Response(
+                            request,
+                            requestHeader,
+                            responseHeader,
+                            "200",
+                            sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
+                            dirList.toString().getBytes());
+                }
+            } else {
+                // file -> download
+                try {
+                    response = getFile(ftpClient, request, acceptOnlyParseable);
+                } catch (final Exception e) {
+                    // add message to errorLog
+                    (new PrintStream(berr)).print(e.getMessage());
                 }
+            }
 
             closeConnection(ftpClient);
         }
@@ -166,11 +169,6 @@ public class FTPLoader {
 
     /**
      * establish a connection to the ftp server (open, login, set transfer mode)
-     *
-     * @param ftpClient
-     * @param hostname
-     * @param port
-     * @return success
      */
     private boolean openConnection(final ftpc ftpClient, final DigestURI entryUrl) {
         // get username and password
@@ -209,61 +207,62 @@ public class FTPLoader {
         return true;
     }
 
-    /**
-     * @param ftpClient
-     * @param request
-     * @param htCache
-     * @param cacheFile
-     * @return
-     * @throws Exception
-     */
-    private Response getFile(final ftpc ftpClient, final Request request) throws Exception {
+    private Response getFile(final ftpc ftpClient, final Request request, boolean acceptOnlyParseable) throws Exception {
         // determine the mimetype of the resource
-        final DigestURI entryUrl = request.url();
-        final String mimeType = TextParser.mimeOf(entryUrl);
-        final String path = getPath(entryUrl);
+        final DigestURI url = request.url();
+        final String mime = TextParser.mimeOf(url);
+        final String path = getPath(url);
 
-        // if the mimetype and file extension is supported we start to download
-        // the file
-        Response response = null;
-        String supportError = TextParser.supports(entryUrl, mimeType);
-        if (supportError != null) {
-            // reject file
-            log.logInfo("PARSER REJECTED URL " + request.url().toString() + ": " + supportError);
-            sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash, new Date(), 1, supportError);
-            throw new Exception(supportError);
-        } else {
-            // abort the download if content is too long
-            final int size = ftpClient.fileSize(path);
-            if (size <= maxFileSize || maxFileSize == -1) {
-                // timeout for download
-                ftpClient.setDataTimeoutByMaxFilesize(size);
-
-                // determine the file date
-                final Date fileDate = ftpClient.entryDate(path);
-
-                // download the remote file
-                byte[] b = ftpClient.get(path);
-
-                // create a cache entry
-                RequestHeader requestHeader = new RequestHeader();
-                if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()).toNormalform(true, false));
-                ResponseHeader responseHeader = new ResponseHeader();
-                responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(fileDate));
-                responseHeader.put(HeaderFramework.CONTENT_TYPE, mimeType);
-                response = new Response(
-                        request,
-                        requestHeader,
-                        responseHeader,
-                        "OK",
-                        sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
-                        b);
+        // determine the file date
+        final Date fileDate = ftpClient.entryDate(path);
+
+        // create response header
+        RequestHeader requestHeader = new RequestHeader();
+        if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()).toNormalform(true, false));
+        ResponseHeader responseHeader = new ResponseHeader();
+        responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(fileDate));
+        responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
+
+        // if the mimetype and file extension is supported we start to download the file
+        final int size = ftpClient.fileSize(path);
+        String parserError = null;
+        if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
+            (size > maxFileSize && maxFileSize >= 0)) {
+            // we know that we cannot process that file before loading
+            // only the metadata is returned
+
+            if (parserError != null) {
+                log.logInfo("No parser available in FTP crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
             } else {
-                log.logInfo("REJECTED TOO BIG FILE with size " + size + " Bytes for URL " + request.url().toString());
-                sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
-                throw new Exception("file size exceeds limit");
+                log.logInfo("Too big file in FTP crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
             }
+
+            // create response with metadata only
+            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
+            Response response = new Response(
+                    request,
+                    requestHeader,
+                    responseHeader,
+                    "200",
+                    sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
+                    url.toNormalform(true, true).getBytes());
+            return response;
         }
+
+        // timeout for download
+        ftpClient.setDataTimeoutByMaxFilesize(size);
+
+        // download the remote file
+        byte[] b = ftpClient.get(path);
+
+        // create a response
+        Response response = new Response(
+                request,
+                requestHeader,
+                responseHeader,
+                "200",
+                sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
+                b);
         return response;
     }
 
@@ -277,45 +276,4 @@ public class FTPLoader {
         return DigestURI.unescape(entryUrl.getPath()).replace("\"", "\"\"");
     }
 
-    /**
-     * @param ftpClient
-     * @param entry
-     * @param cacheFile
-     * @return
-     */
-    private byte[] generateDirlist(final ftpc ftpClient, final Request entry, final String path) {
-        // getting the dirlist
-        final DigestURI entryUrl = entry.url();
-
-        // generate the dirlist
-        final StringBuilder dirList = ftpClient.dirhtml(path);
-
-        if (dirList != null && dirList.length() > 0) {
-            try {
-                return dirList.toString().getBytes();
-            } catch (final Exception e) {
-                log.logInfo("Unable to write dirlist for URL " + entryUrl.toString());
-            }
-        }
-        return null;
-    }
-
-    /**
-     * create a new ftp client
-     *
-     * @param berr
-     * @return
-     */
-    private ftpc createFTPClient(final ByteArrayOutputStream berr) {
-        // error
-        final PrintStream err = new PrintStream(berr);
-
-        final ftpc ftpClient = new ftpc(System.in, null, err);
-
-        // set timeout
-        ftpClient.setDataTimeoutByMaxFilesize(maxFileSize);
-
-        return ftpClient;
-    }
-
 }
diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java
index d42ccf55c..11825b59e 100644
--- a/source/de/anomic/crawler/retrieval/HTTPLoader.java
+++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java
@@ -1,27 +1,26 @@
-//HTTPLoader.java
-//------------------------
-//part of YaCy
-//(C) by Michael Peter Christen; mc@yacy.net
-//first published on http://www.anomic.de
-//Frankfurt, Germany, 2006
+// HTTPLoader.java
+// ---------------
+// (C) by Michael Peter Christen; mc@yacy.net
+// first published on http://yacy.net
+// Frankfurt, Germany, 2006
 //
 // $LastChangedDate$
 // $LastChangedRevision$
 // $LastChangedBy$
 //
-//This program is free software; you can redistribute it and/or modify
-//it under the terms of the GNU General Public License as published by
-//the Free Software Foundation; either version 2 of the License, or
-//(at your option) any later version.
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
 //
-//This program is distributed in the hope that it will be useful,
-//but WITHOUT ANY WARRANTY; without even the implied warranty of
-//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-//GNU General Public License for more details.
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
 //
-//You should have received a copy of the GNU General Public License
-//along with this program; if not, write to the Free Software
-//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 
 package de.anomic.crawler.retrieval;
diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java
index c7ba1e739..21d53460e 100755
--- a/source/de/anomic/crawler/retrieval/Response.java
+++ b/source/de/anomic/crawler/retrieval/Response.java
@@ -336,7 +336,7 @@ public class Response {
 
         // check status code
         if (!validResponseStatus()) {
-            return "bad_status_" + this.responseStatus.substring(0, 3);
+            return "bad_status_" + this.responseStatus;
         }
 
         if (requestHeader != null) {
@@ -737,7 +737,9 @@ public class Response {
     public DigestURI referrerURL() {
         if (requestHeader == null) return null;
         try {
-            return new DigestURI(requestHeader.get(RequestHeader.REFERER, ""), null);
+            String r = requestHeader.get(RequestHeader.REFERER, null);
+            if (r == null) return null;
+            return new DigestURI(r, null);
         } catch (final Exception e) {
             return null;
         }
diff --git a/source/de/anomic/crawler/retrieval/SMBLoader.java b/source/de/anomic/crawler/retrieval/SMBLoader.java
index a6d242636..c9321f3d1 100644
--- a/source/de/anomic/crawler/retrieval/SMBLoader.java
+++ b/source/de/anomic/crawler/retrieval/SMBLoader.java
@@ -70,7 +70,7 @@ public class SMBLoader {
             if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false));
         }
 
-        
+        // process directories: transform them to html with meta robots=noindex (using the ftpc lib)
         if (url.isDirectory()) {
             List list = new ArrayList();
             String u = url.toNormalform(true, true);
@@ -92,29 +92,44 @@ public class SMBLoader {
                     request,
                     requestHeader,
                     responseHeader,
-                    "OK",
+                    "200",
                     sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
                     content.toString().getBytes());
             return response;
         }
 
-        // check mime type and availability of parsers
+        // create response header
         String mime = MimeTable.ext2mime(url.getFileExtension());
-        String parserError = null;
-        if (acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) {
-            // we know that we cannot process that file before loading
-            log.logInfo("no parser available (" + parserError + ") for url = " + request.url().toString());
-            sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash, new Date(), 1, "no parser available (" + parserError + ") for url = " + request.url().toString());
-            throw new IOException("no parser available (" + parserError + ") for url = " + request.url().toString());
-        }
+        ResponseHeader responseHeader = new ResponseHeader();
+        responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date(url.lastModified())));
+        responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
 
-        // check resource size
+        // check mime type and availability of parsers
+        // and also check resource size and limitation of the size
        long size = url.length();
-        if (size > maxFileSize && maxFileSize >= 0) {
-            log.logInfo("REJECTED TOO BIG FILE with size " + size + " Bytes for URL " + request.url().toString());
-            sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
-            throw new IOException("file size = " + size + " exceeds limit");
+        String parserError = null;
+        if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
+            (size > maxFileSize && maxFileSize >= 0)) {
+            // we know that we cannot process that file before loading
+            // only the metadata is returned
+
+            if (parserError != null) {
+                log.logInfo("No parser available in SMB crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
+            } else {
+                log.logInfo("Too big file in SMB crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
+            }
+
+            // create response with metadata only
+            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
+            Response response = new Response(
+                    request,
+                    requestHeader,
+                    responseHeader,
+                    "200",
+                    sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
+                    url.toNormalform(true, true).getBytes());
+            return response;
         }
 
         // load the resource
@@ -122,15 +137,12 @@ public class SMBLoader {
         byte[] b = FileUtils.read(is);
         is.close();
 
-        // create response object
-        ResponseHeader responseHeader = new ResponseHeader();
-        responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date(url.lastModified())));
-        responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
+        // create response with loaded content
         Response response = new Response(
                 request,
                 requestHeader,
                 responseHeader,
-                "OK",
+                "200",
                 sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
                 b);
         return response;
diff --git a/source/de/anomic/net/ftpc.java b/source/de/anomic/net/ftpc.java
index 48d710206..729800b07 100644
--- a/source/de/anomic/net/ftpc.java
+++ b/source/de/anomic/net/ftpc.java
@@ -1037,7 +1037,7 @@ public class ftpc {
          * analogous to the "list" command except that data shall be
          * transferred over the control connection.
          */
-        send("STAT \"path\"");
+        send("STAT " + path);
 
         final String reply = receive();
         if (isNotPositiveCompletion(reply)) {
diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java
index 61434ece6..1df0fbe7b 100644
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -1727,13 +1727,13 @@ public final class Switchboard extends serverSwitch {
 
         if (condenser == null || document.indexingDenied()) {
             if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
-            addURLtoErrorDB(queueEntry.url(), referrerURL.hash(), queueEntry.initiator(), dc_title, "unknown indexing process case" + processCase);
+            addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? "" : referrerURL.hash(), queueEntry.initiator(), dc_title, "unknown indexing process case" + processCase);
             return;
         }
 
         if (!queueEntry.profile().indexText() && !queueEntry.profile().indexMedia()) {
             if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase);
-            addURLtoErrorDB(queueEntry.url(), referrerURL.hash(), queueEntry.initiator(), dc_title, "unknown indexing process case" + processCase);
+            addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? "" : referrerURL.hash(), queueEntry.initiator(), dc_title, "unknown indexing process case" + processCase);
             return;
         }
 
@@ -1753,7 +1753,7 @@ public final class Switchboard extends serverSwitch {
             RSSFeed.channels((queueEntry.initiator().equals(peers.mySeed().hash)) ? RSSFeed.LOCALINDEXING : RSSFeed.REMOTEINDEXING).addMessage(new RSSMessage("Indexed web page", dc_title, queueEntry.url().toNormalform(true, false)));
         } catch (final IOException e) {
             if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase);
-            addURLtoErrorDB(queueEntry.url(), referrerURL.hash(), queueEntry.initiator(), dc_title, "error storing url: " + e.getMessage());
+            addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? "" : referrerURL.hash(), queueEntry.initiator(), dc_title, "error storing url: " + e.getMessage());
             return;
         }
 
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index fdc3d550a..1d25feaae 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -256,7 +256,7 @@ public final class LoaderDispatcher {
         // load resource from the internet
         Response response = null;
         if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable);
-        if (protocol.equals("ftp")) response = ftpLoader.load(request);
+        if (protocol.equals("ftp")) response = ftpLoader.load(request, true);
         if (protocol.equals("smb")) response = smbLoader.load(request, true);
         if (response != null) {
             // we got something. Now check if we want to store that to the cache