more fixes to smb crawling: better file names

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7384 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 94c48500cc
commit 7bdb13bf7f

@ -45,6 +45,7 @@ import de.anomic.search.Switchboard;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.MimeTable;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
@ -95,6 +96,7 @@ public class SMBLoader {
List<String> list = new ArrayList<String>();
for (String s: l) {
if (s.startsWith(".")) continue;
s = MultiProtocolURI.escape(s).toString();
if (!s.endsWith("/") && !s.endsWith("\\")) {
// check if this is a directory
SmbFile sf = new SmbFile(u + s);

@ -777,18 +777,14 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
return toTokens(this.toNormalform(true, true));
}
private final static String[] replacementStrings = {"%20", "%2B", "%2b"};
/**
* create word tokens for parser. Find CamelCases and separate these words
* resulting words are not ordered by appearance, but all
* @return the word tokens created from the given string, as a String
*/
public static String toTokens(String s) {
String t = new String(s);
// remove all replacement strings
for (String r: replacementStrings) t = t.replaceAll(r, " ");
// unescape string
String t = unescape(s);
// remove all non-character & non-number
StringBuilder sb = new StringBuilder(t.length());

@ -65,6 +65,7 @@ import java.util.concurrent.LinkedBlockingQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Domains;
import org.apache.log4j.Logger;
@ -2589,7 +2590,7 @@ public class FTPClient {
final boolean metaRobotNoindex) {
// this creates the html output from collected strings
final StringBuilder page = new StringBuilder(1024);
final String title = "Index of " + base;
final String title = "Index of " + MultiProtocolURI.unescape(base);
page.append("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 3.2 Final//EN\">\n");
page.append("<html><head>\n");

@ -46,7 +46,7 @@ public class genericParser extends AbstractParser implements Parser {
charset,
null,
null,
location.getFileName(), // title
MultiProtocolURI.unescape(location.getFileName()), // title
"", // author
location.getHost(),
null,

@ -185,7 +185,7 @@ public class genericImageParser extends AbstractParser implements Parser {
String infoString = ii.info.toString();
images.put(ii.location, new ImageEntry(location, "", ii.width, ii.height, -1));
if (title == null || title.length() == 0) title = location.getFileName();
if (title == null || title.length() == 0) title = MultiProtocolURI.unescape(location.getFileName());
return new Document[]{new Document(
location,

@ -118,7 +118,7 @@ public class pdfParser extends AbstractParser implements Parser {
// info.getModificationDate();
}
if (docTitle == null || docTitle.length() == 0) docTitle = location.getFileName();
if (docTitle == null || docTitle.length() == 0) docTitle = MultiProtocolURI.unescape(location.getFileName());
CharBuffer writer = null;
try {
// create a writer for output

@ -84,7 +84,7 @@ public class torrentParser extends AbstractParser implements Parser {
BObject nameo = info.get("name");
if (nameo != null) title = new String(nameo.getString());
}
if (title == null || title.length() == 0) title = location.getFileName();
if (title == null || title.length() == 0) title = MultiProtocolURI.unescape(location.getFileName());
try {
return new Document[]{new Document(
location,

Loading…
Cancel
Save