diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index 9706e255e..feec79c2f 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -228,10 +228,10 @@ public final class CrawlStacker { // delete old entry, if exists to force a re-load of the url (thats wanted here) DigestURI url = null; try { - if (protocol.equals("ftp")) url = new DigestURI("ftp://" + host + (port == 21 ? "" : ":" + port) + entry.name); - else if (protocol.equals("smb")) url = new DigestURI("smb://" + host + entry.name); - else if (protocol.equals("http")) url = new DigestURI("http://" + host + (port == 80 ? "" : ":" + port) + entry.name); - else if (protocol.equals("https")) url = new DigestURI("https://" + host + (port == 443 ? "" : ":" + port) + entry.name); + if (protocol.equals("ftp")) url = new DigestURI("ftp://" + host + (port == 21 ? "" : ":" + port) + MultiProtocolURI.escape(entry.name)); + else if (protocol.equals("smb")) url = new DigestURI("smb://" + host + MultiProtocolURI.escape(entry.name)); + else if (protocol.equals("http")) url = new DigestURI("http://" + host + (port == 80 ? "" : ":" + port) + MultiProtocolURI.escape(entry.name)); + else if (protocol.equals("https")) url = new DigestURI("https://" + host + (port == 443 ? "" : ":" + port) + MultiProtocolURI.escape(entry.name)); } catch (MalformedURLException e) { continue; } @@ -247,7 +247,7 @@ public final class CrawlStacker { initiator, url, null, - entry.name, + MultiProtocolURI.unescape(entry.name), entry.date, profileHandle, 0, diff --git a/source/de/anomic/crawler/retrieval/FTPLoader.java b/source/de/anomic/crawler/retrieval/FTPLoader.java index cff12bd18..4ae8dd763 100644 --- a/source/de/anomic/crawler/retrieval/FTPLoader.java +++ b/source/de/anomic/crawler/retrieval/FTPLoader.java @@ -278,7 +278,7 @@ public class FTPLoader { * @return */ private String getPath(final MultiProtocolURI entryUrl) { - return MultiProtocolURI.unescape(entryUrl.getPath()).replace("\"", "\"\""); + return entryUrl.getPath().replace("\"", "\"\""); } } diff --git a/source/de/anomic/search/RankingProcess.java b/source/de/anomic/search/RankingProcess.java index 0c90000c7..f5e2c79c4 100644 --- a/source/de/anomic/search/RankingProcess.java +++ b/source/de/anomic/search/RankingProcess.java @@ -41,6 +41,7 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.TimeUnit; import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.protocol.Scanner; import net.yacy.cora.storage.DynamicScore; import net.yacy.cora.storage.ScoreCluster; import net.yacy.cora.storage.StaticScore; @@ -475,6 +476,11 @@ public final class RankingProcess extends Thread { } } + // check Scanner + if (!Scanner.acceptURL(metadata.url())) { + continue; + } + // accept url return page; } diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index b327ff4ff..bf0348e21 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -56,7 +56,7 @@ public class MultiProtocolURI implements Serializable, Comparable scancacheValidUntilTime) return true; + //if (System.currentTimeMillis() > scancacheValidUntilTime) return true; InetAddress a = Domains.dnsResolve(url.getHost()); if (a == null) return true; InetAddress n = normalize(a); diff --git a/source/net/yacy/cora/protocol/ftp/FTPClient.java b/source/net/yacy/cora/protocol/ftp/FTPClient.java index 229a6e6de..16b432794 100644 --- a/source/net/yacy/cora/protocol/ftp/FTPClient.java +++ b/source/net/yacy/cora/protocol/ftp/FTPClient.java @@ -1338,6 +1338,7 @@ public class FTPClient { } public List list(final String path, final boolean extended) throws IOException { + createDataSocket(); // send command to the control port @@ -2253,8 +2254,9 @@ public class FTPClient { } } + public byte[] get(final String fileName) throws IOException { - + createDataSocket(); // set type of the transfer @@ -2541,17 +2543,28 @@ public class FTPClient { } if (!path.endsWith("/")) path += "/"; entryInfo info; + // first find all files and add them to the crawl list for (final String line : list) { info = parseListData(line); - if (info != null && info.type == filetype.file) { - if (!info.name.startsWith("/")) info.name = path + info.name; + if (info != null && info.type == filetype.file && !info.name.endsWith(".") && !info.name.startsWith(".")) { + if (!info.name.startsWith("/")) info.name = path + MultiProtocolURI.escape(info.name); queue.add(info); } } + // then find all directories and add them recursively for (final String line : list) { info = parseListData(line); - if (info != null && info.type == filetype.directory) { - sitelist(ftpClient, path + info.name, queue); + if (info != null && !info.name.endsWith(".") && !info.name.startsWith(".")) { + if (info.type == filetype.directory) { + sitelist(ftpClient, path + MultiProtocolURI.escape(info.name), queue); + } + if (info.type == filetype.link) { + int q = info.name.indexOf("->"); + if (q >= 0) { + info.name = info.name.substring(0, q).trim(); + sitelist(ftpClient, path + MultiProtocolURI.escape(info.name), queue); + } + } } } } diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 9c0b536f1..7ac57b21b 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -125,7 +125,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { String b = cleanLine(super.stripAll(newtext)); if ((insideTag != null) && (!(insideTag.equals("a")))) { // texts inside tags sometimes have no punctuation at the line end - // this is bad for the text sematics, because it is not possible for the + // this is bad for the text semantics, because it is not possible for the // condenser to distinguish headlines from text beginnings. // to make it easier for the condenser, a dot ('.') is appended in case that // no punctuation is part of the newtext line @@ -141,6 +141,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (p == Integer.MAX_VALUE) break; q = b.indexOf(" ", p + 1); u = b.substring(p, q < 0 ? b.length() : q); + if (u.endsWith(".")) u = u.substring(0, u.length() - 1); // remove the '.' that was appended above s = p + 1; try { url = new MultiProtocolURI(u); @@ -351,11 +352,8 @@ public class ContentScraper extends AbstractScraper implements Scraper { s = getDescription(); if (s.length() > 0) return s; - // extract headline from content - if (content.length() > 80) { - return cleanLine(new String(content.getChars(), 0, 80)); - } - return cleanLine(content.trim().toString()); + // extract headline from file name + return MultiProtocolURI.unescape(root.getFileName()); } public String[] getHeadlines(final int i) {