- fixed numerous bugs

- better document names
- fixed problem with ftp crawling
- added automatic removal of search results from services that are not online according to the latest network scan: this does not delete the index, it merely hides those results. After the next network scan, when the server is available again, the results are shown again.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7385 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 7bdb13bf7f
commit 9b25a33fd9

@ -228,10 +228,10 @@ public final class CrawlStacker {
// delete old entry, if exists to force a re-load of the url (thats wanted here)
DigestURI url = null;
try {
if (protocol.equals("ftp")) url = new DigestURI("ftp://" + host + (port == 21 ? "" : ":" + port) + entry.name);
else if (protocol.equals("smb")) url = new DigestURI("smb://" + host + entry.name);
else if (protocol.equals("http")) url = new DigestURI("http://" + host + (port == 80 ? "" : ":" + port) + entry.name);
else if (protocol.equals("https")) url = new DigestURI("https://" + host + (port == 443 ? "" : ":" + port) + entry.name);
if (protocol.equals("ftp")) url = new DigestURI("ftp://" + host + (port == 21 ? "" : ":" + port) + MultiProtocolURI.escape(entry.name));
else if (protocol.equals("smb")) url = new DigestURI("smb://" + host + MultiProtocolURI.escape(entry.name));
else if (protocol.equals("http")) url = new DigestURI("http://" + host + (port == 80 ? "" : ":" + port) + MultiProtocolURI.escape(entry.name));
else if (protocol.equals("https")) url = new DigestURI("https://" + host + (port == 443 ? "" : ":" + port) + MultiProtocolURI.escape(entry.name));
} catch (MalformedURLException e) {
continue;
}
@ -247,7 +247,7 @@ public final class CrawlStacker {
initiator,
url,
null,
entry.name,
MultiProtocolURI.unescape(entry.name),
entry.date,
profileHandle,
0,

@ -278,7 +278,7 @@ public class FTPLoader {
* @return
*/
private String getPath(final MultiProtocolURI entryUrl) {
return MultiProtocolURI.unescape(entryUrl.getPath()).replace("\"", "\"\"");
return entryUrl.getPath().replace("\"", "\"\"");
}
}

@ -41,6 +41,7 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Scanner;
import net.yacy.cora.storage.DynamicScore;
import net.yacy.cora.storage.ScoreCluster;
import net.yacy.cora.storage.StaticScore;
@ -475,6 +476,11 @@ public final class RankingProcess extends Thread {
}
}
// check Scanner
if (!Scanner.acceptURL(metadata.url())) {
continue;
}
// accept url
return page;
}

@ -56,7 +56,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
private static final long serialVersionUID = -1173233022912141884L;
private static final long SMB_TIMEOUT = 1500;
private static final long SMB_TIMEOUT = 5000;
public static final int TLD_any_zone_filter = 255; // from TLD zones can be filtered during search; this is the catch-all filter
private static final Pattern backPathPattern = Pattern.compile("(/[^/]+(?<!/\\.{1,2})/)[.]{2}(?=/|$)|/\\.(?=/)|/(?=/)");
@ -774,7 +774,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
}
public String toTokens() {
return toTokens(this.toNormalform(true, true));
return toTokens(unescape(this.toNormalform(true, true)));
}
/**
@ -782,9 +782,9 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
* resulting words are not ordered by appearance, but all
* @return
*/
public static String toTokens(String s) {
private static String toTokens(String s) {
// unesape string
String t = unescape(s);
String t = s;
// remove all non-character & non-number
StringBuilder sb = new StringBuilder(t.length());

@ -90,7 +90,7 @@ public class Scanner extends Thread {
public static boolean acceptURL(MultiProtocolURI url) {
if (scancacheScanrange == null || scancacheScanrange.size() == 0) return true;
if (System.currentTimeMillis() > scancacheValidUntilTime) return true;
//if (System.currentTimeMillis() > scancacheValidUntilTime) return true;
InetAddress a = Domains.dnsResolve(url.getHost());
if (a == null) return true;
InetAddress n = normalize(a);

@ -1338,6 +1338,7 @@ public class FTPClient {
}
public List<String> list(final String path, final boolean extended) throws IOException {
createDataSocket();
// send command to the control port
@ -2253,8 +2254,9 @@ public class FTPClient {
}
}
public byte[] get(final String fileName) throws IOException {
createDataSocket();
// set type of the transfer
@ -2541,17 +2543,28 @@ public class FTPClient {
}
if (!path.endsWith("/")) path += "/";
entryInfo info;
// first find all files and add them to the crawl list
for (final String line : list) {
info = parseListData(line);
if (info != null && info.type == filetype.file) {
if (!info.name.startsWith("/")) info.name = path + info.name;
if (info != null && info.type == filetype.file && !info.name.endsWith(".") && !info.name.startsWith(".")) {
if (!info.name.startsWith("/")) info.name = path + MultiProtocolURI.escape(info.name);
queue.add(info);
}
}
// then find all directories and add them recursively
for (final String line : list) {
info = parseListData(line);
if (info != null && info.type == filetype.directory) {
sitelist(ftpClient, path + info.name, queue);
if (info != null && !info.name.endsWith(".") && !info.name.startsWith(".")) {
if (info.type == filetype.directory) {
sitelist(ftpClient, path + MultiProtocolURI.escape(info.name), queue);
}
if (info.type == filetype.link) {
int q = info.name.indexOf("->");
if (q >= 0) {
info.name = info.name.substring(0, q).trim();
sitelist(ftpClient, path + MultiProtocolURI.escape(info.name), queue);
}
}
}
}
}

@ -125,7 +125,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
String b = cleanLine(super.stripAll(newtext));
if ((insideTag != null) && (!(insideTag.equals("a")))) {
// texts inside tags sometimes have no punctuation at the line end
// this is bad for the text sematics, because it is not possible for the
// this is bad for the text semantics, because it is not possible for the
// condenser to distinguish headlines from text beginnings.
// to make it easier for the condenser, a dot ('.') is appended in case that
// no punctuation is part of the newtext line
@ -141,6 +141,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (p == Integer.MAX_VALUE) break;
q = b.indexOf(" ", p + 1);
u = b.substring(p, q < 0 ? b.length() : q);
if (u.endsWith(".")) u = u.substring(0, u.length() - 1); // remove the '.' that was appended above
s = p + 1;
try {
url = new MultiProtocolURI(u);
@ -351,11 +352,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
s = getDescription();
if (s.length() > 0) return s;
// extract headline from content
if (content.length() > 80) {
return cleanLine(new String(content.getChars(), 0, 80));
}
return cleanLine(content.trim().toString());
// extract headline from file name
return MultiProtocolURI.unescape(root.getFileName());
}
public String[] getHeadlines(final int i) {

Loading…
Cancel
Save