more fixes to smb crawling: better file names

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7384 6c8d7289-2bf4-0310-a012-ef5d649a1542
15 years ago · 7bdb13bf7f
parent 94c48500cc
commit 7bdb13bf7f
7 changed files with 10 additions and 11 deletions
--- a/source/de/anomic/crawler/retrieval/SMBLoader.java
+++ b/source/de/anomic/crawler/retrieval/SMBLoader.java
@@ -45,6 +45,7 @@ import de.anomic.search.Switchboard;
 import de.anomic.crawler.CrawlProfile;
 import de.anomic.data.MimeTable;

+import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.protocol.ResponseHeader;
@@ -95,6 +96,7 @@ public class SMBLoader {
            List<String> list = new ArrayList<String>();
            for (String s: l) {
                if (s.startsWith(".")) continue;
+                s = MultiProtocolURI.escape(s).toString();
                if (!s.endsWith("/") && !s.endsWith("\\")) {
                    // check if this is a directory
                    SmbFile sf = new SmbFile(u + s);
--- a/source/net/yacy/cora/document/MultiProtocolURI.java
+++ b/source/net/yacy/cora/document/MultiProtocolURI.java
@@ -777,18 +777,14 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
        return toTokens(this.toNormalform(true, true));
    }
    
-    private final static String[] replacementStrings = {"%20", "%2B", "%2b"};
-    
    /**
     * create word tokens for parser. Find CamelCases and separate these words
     * resulting words are not ordered by appearance, but all
     * @return
     */
    public static String toTokens(String s) {
-        String t = new String(s);
-        
-        // remove all replacement strings
-        for (String r: replacementStrings) t = t.replaceAll(r, " ");
+        // unescape string
+        String t = unescape(s);

        // remove all non-character & non-number
        StringBuilder sb = new StringBuilder(t.length());
--- a/source/net/yacy/cora/protocol/ftp/FTPClient.java
+++ b/source/net/yacy/cora/protocol/ftp/FTPClient.java
@@ -65,6 +65,7 @@ import java.util.concurrent.LinkedBlockingQueue;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

+import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.Domains;

 import org.apache.log4j.Logger;
@@ -2589,7 +2590,7 @@ public class FTPClient {
            final boolean metaRobotNoindex) {
        // this creates the html output from collected strings
        final StringBuilder page = new StringBuilder(1024);
-        final String title = "Index of " + base;
+        final String title = "Index of " + MultiProtocolURI.unescape(base);

        page.append("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 3.2 Final//EN\">\n");
        page.append("<html><head>\n");
--- a/source/net/yacy/document/parser/genericParser.java
+++ b/source/net/yacy/document/parser/genericParser.java
@@ -46,7 +46,7 @@ public class genericParser extends AbstractParser implements Parser {
                charset,
                null,
                null,
-                location.getFileName(), // title
+                MultiProtocolURI.unescape(location.getFileName()), // title
                "", // author 
                location.getHost(),
                null,
--- a/source/net/yacy/document/parser/images/genericImageParser.java
+++ b/source/net/yacy/document/parser/images/genericImageParser.java
@@ -185,7 +185,7 @@ public class genericImageParser extends AbstractParser implements Parser {
        String infoString = ii.info.toString();
        images.put(ii.location, new ImageEntry(location, "", ii.width, ii.height, -1));
        
-        if (title == null || title.length() == 0) title = location.getFileName();
+        if (title == null || title.length() == 0) title = MultiProtocolURI.unescape(location.getFileName());
        
        return new Document[]{new Document(
             location,
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@@ -118,7 +118,7 @@ public class pdfParser extends AbstractParser implements Parser {
            // info.getModificationDate();
        }
        
-        if (docTitle == null || docTitle.length() == 0) docTitle = location.getFileName();
+        if (docTitle == null || docTitle.length() == 0) docTitle = MultiProtocolURI.unescape(location.getFileName());
        CharBuffer writer = null;
        try {
            // create a writer for output
--- a/source/net/yacy/document/parser/torrentParser.java
+++ b/source/net/yacy/document/parser/torrentParser.java
@@ -84,7 +84,7 @@ public class torrentParser extends AbstractParser implements Parser {
            BObject nameo = info.get("name");
            if (nameo != null) title = new String(nameo.getString());
        }
-        if (title == null || title.length() == 0) title = location.getFileName();
+        if (title == null || title.length() == 0) title = MultiProtocolURI.unescape(location.getFileName());
        try {
            return new Document[]{new Document(
                    location,