- fixed numerous bugs

- better document names
- fixed problem with ftp crawling
- added automatic removal of search results from services that are not online according to the latest network scan: this does not delete the index, it merely hides those results. After the next network scan, when the server is available again, the results are shown again.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7385 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 7bdb13bf7f
commit 9b25a33fd9

@ -228,10 +228,10 @@ public final class CrawlStacker {
// delete old entry, if exists to force a re-load of the url (thats wanted here)
DigestURI url = null;
try {
if (protocol.equals("ftp")) url = new DigestURI("ftp://" + host + (port == 21 ? "" : ":" + port) + entry.name);
else if (protocol.equals("smb")) url = new DigestURI("smb://" + host + entry.name);
else if (protocol.equals("http")) url = new DigestURI("http://" + host + (port == 80 ? "" : ":" + port) + entry.name);
else if (protocol.equals("https")) url = new DigestURI("https://" + host + (port == 443 ? "" : ":" + port) + entry.name);
if (protocol.equals("ftp")) url = new DigestURI("ftp://" + host + (port == 21 ? "" : ":" + port) + MultiProtocolURI.escape(entry.name));
else if (protocol.equals("smb")) url = new DigestURI("smb://" + host + MultiProtocolURI.escape(entry.name));
else if (protocol.equals("http")) url = new DigestURI("http://" + host + (port == 80 ? "" : ":" + port) + MultiProtocolURI.escape(entry.name));
else if (protocol.equals("https")) url = new DigestURI("https://" + host + (port == 443 ? "" : ":" + port) + MultiProtocolURI.escape(entry.name));
} catch (MalformedURLException e) {
continue;
}
@ -247,7 +247,7 @@ public final class CrawlStacker {
initiator,
url,
null,
entry.name,
MultiProtocolURI.unescape(entry.name),
entry.date,
profileHandle,
0,

@ -278,7 +278,7 @@ public class FTPLoader {
* @return
*/
private String getPath(final MultiProtocolURI entryUrl) {
return MultiProtocolURI.unescape(entryUrl.getPath()).replace("\"", "\"\"");
return entryUrl.getPath().replace("\"", "\"\"");
}
}

@ -41,6 +41,7 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Scanner;
import net.yacy.cora.storage.DynamicScore;
import net.yacy.cora.storage.ScoreCluster;
import net.yacy.cora.storage.StaticScore;
@ -475,6 +476,11 @@ public final class RankingProcess extends Thread {
}
}
// check Scanner
if (!Scanner.acceptURL(metadata.url())) {
continue;
}
// accept url
return page;
}

@ -56,7 +56,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
private static final long serialVersionUID = -1173233022912141884L;
private static final long SMB_TIMEOUT = 1500;
private static final long SMB_TIMEOUT = 5000;
public static final int TLD_any_zone_filter = 255; // from TLD zones can be filtered during search; this is the catch-all filter
private static final Pattern backPathPattern = Pattern.compile("(/[^/]+(?<!/\\.{1,2})/)[.]{2}(?=/|$)|/\\.(?=/)|/(?=/)");
@ -774,7 +774,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
}
public String toTokens() {
return toTokens(this.toNormalform(true, true));
return toTokens(unescape(this.toNormalform(true, true)));
}
/**
@ -782,9 +782,9 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
* resulting words are not ordered by appearance, but all
* @return
*/
public static String toTokens(String s) {
private static String toTokens(String s) {
// unesape string
String t = unescape(s);
String t = s;
// remove all non-character & non-number
StringBuilder sb = new StringBuilder(t.length());

@ -90,7 +90,7 @@ public class Scanner extends Thread {
public static boolean acceptURL(MultiProtocolURI url) {
if (scancacheScanrange == null || scancacheScanrange.size() == 0) return true;
if (System.currentTimeMillis() > scancacheValidUntilTime) return true;
//if (System.currentTimeMillis() > scancacheValidUntilTime) return true;
InetAddress a = Domains.dnsResolve(url.getHost());
if (a == null) return true;
InetAddress n = normalize(a);

@ -1338,6 +1338,7 @@ public class FTPClient {
}
public List<String> list(final String path, final boolean extended) throws IOException {
createDataSocket();
// send command to the control port
@ -2253,8 +2254,9 @@ public class FTPClient {
}
}
public byte[] get(final String fileName) throws IOException {
createDataSocket();
// set type of the transfer
@ -2541,17 +2543,28 @@ public class FTPClient {
}
if (!path.endsWith("/")) path += "/";
entryInfo info;
// first find all files and add them to the crawl list
for (final String line : list) {
info = parseListData(line);
if (info != null && info.type == filetype.file) {
if (!info.name.startsWith("/")) info.name = path + info.name;
if (info != null && info.type == filetype.file && !info.name.endsWith(".") && !info.name.startsWith(".")) {
if (!info.name.startsWith("/")) info.name = path + MultiProtocolURI.escape(info.name);
queue.add(info);
}
}
// then find all directories and add them recursively
for (final String line : list) {
info = parseListData(line);
if (info != null && info.type == filetype.directory) {
sitelist(ftpClient, path + info.name, queue);
if (info != null && !info.name.endsWith(".") && !info.name.startsWith(".")) {
if (info.type == filetype.directory) {
sitelist(ftpClient, path + MultiProtocolURI.escape(info.name), queue);
}
if (info.type == filetype.link) {
int q = info.name.indexOf("->");
if (q >= 0) {
info.name = info.name.substring(0, q).trim();
sitelist(ftpClient, path + MultiProtocolURI.escape(info.name), queue);
}
}
}
}
}

@ -125,7 +125,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
String b = cleanLine(super.stripAll(newtext));
if ((insideTag != null) && (!(insideTag.equals("a")))) {
// texts inside tags sometimes have no punctuation at the line end
// this is bad for the text sematics, because it is not possible for the
// this is bad for the text semantics, because it is not possible for the
// condenser to distinguish headlines from text beginnings.
// to make it easier for the condenser, a dot ('.') is appended in case that
// no punctuation is part of the newtext line
@ -141,6 +141,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (p == Integer.MAX_VALUE) break;
q = b.indexOf(" ", p + 1);
u = b.substring(p, q < 0 ? b.length() : q);
if (u.endsWith(".")) u = u.substring(0, u.length() - 1); // remove the '.' that was appended above
s = p + 1;
try {
url = new MultiProtocolURI(u);
@ -351,11 +352,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
s = getDescription();
if (s.length() > 0) return s;
// extract headline from content
if (content.length() > 80) {
return cleanLine(new String(content.getChars(), 0, 80));
}
return cleanLine(content.trim().toString());
// extract headline from file name
return MultiProtocolURI.unescape(root.getFileName());
}
public String[] getHeadlines(final int i) {

Loading…
Cancel
Save