- fixed numerous bugs

- better document names
- fixed problem with ftp crawling
- added automatic removal of search results from services that are not online according to the latest network scan: this does not delete anything from the index, the results are just not shown. After the next network scan, when the server is available again, the results are shown again.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7385 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 7bdb13bf7f
commit 9b25a33fd9

@ -228,10 +228,10 @@ public final class CrawlStacker {
// delete old entry, if exists to force a re-load of the url (thats wanted here) // delete old entry, if exists to force a re-load of the url (thats wanted here)
DigestURI url = null; DigestURI url = null;
try { try {
if (protocol.equals("ftp")) url = new DigestURI("ftp://" + host + (port == 21 ? "" : ":" + port) + entry.name); if (protocol.equals("ftp")) url = new DigestURI("ftp://" + host + (port == 21 ? "" : ":" + port) + MultiProtocolURI.escape(entry.name));
else if (protocol.equals("smb")) url = new DigestURI("smb://" + host + entry.name); else if (protocol.equals("smb")) url = new DigestURI("smb://" + host + MultiProtocolURI.escape(entry.name));
else if (protocol.equals("http")) url = new DigestURI("http://" + host + (port == 80 ? "" : ":" + port) + entry.name); else if (protocol.equals("http")) url = new DigestURI("http://" + host + (port == 80 ? "" : ":" + port) + MultiProtocolURI.escape(entry.name));
else if (protocol.equals("https")) url = new DigestURI("https://" + host + (port == 443 ? "" : ":" + port) + entry.name); else if (protocol.equals("https")) url = new DigestURI("https://" + host + (port == 443 ? "" : ":" + port) + MultiProtocolURI.escape(entry.name));
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
continue; continue;
} }
@ -247,7 +247,7 @@ public final class CrawlStacker {
initiator, initiator,
url, url,
null, null,
entry.name, MultiProtocolURI.unescape(entry.name),
entry.date, entry.date,
profileHandle, profileHandle,
0, 0,

@ -278,7 +278,7 @@ public class FTPLoader {
* @return * @return
*/ */
private String getPath(final MultiProtocolURI entryUrl) { private String getPath(final MultiProtocolURI entryUrl) {
return MultiProtocolURI.unescape(entryUrl.getPath()).replace("\"", "\"\""); return entryUrl.getPath().replace("\"", "\"\"");
} }
} }

@ -41,6 +41,7 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Scanner;
import net.yacy.cora.storage.DynamicScore; import net.yacy.cora.storage.DynamicScore;
import net.yacy.cora.storage.ScoreCluster; import net.yacy.cora.storage.ScoreCluster;
import net.yacy.cora.storage.StaticScore; import net.yacy.cora.storage.StaticScore;
@ -475,6 +476,11 @@ public final class RankingProcess extends Thread {
} }
} }
// check Scanner
if (!Scanner.acceptURL(metadata.url())) {
continue;
}
// accept url // accept url
return page; return page;
} }

@ -56,7 +56,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
private static final long serialVersionUID = -1173233022912141884L; private static final long serialVersionUID = -1173233022912141884L;
private static final long SMB_TIMEOUT = 1500; private static final long SMB_TIMEOUT = 5000;
public static final int TLD_any_zone_filter = 255; // from TLD zones can be filtered during search; this is the catch-all filter public static final int TLD_any_zone_filter = 255; // from TLD zones can be filtered during search; this is the catch-all filter
private static final Pattern backPathPattern = Pattern.compile("(/[^/]+(?<!/\\.{1,2})/)[.]{2}(?=/|$)|/\\.(?=/)|/(?=/)"); private static final Pattern backPathPattern = Pattern.compile("(/[^/]+(?<!/\\.{1,2})/)[.]{2}(?=/|$)|/\\.(?=/)|/(?=/)");
@ -774,7 +774,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
} }
public String toTokens() { public String toTokens() {
return toTokens(this.toNormalform(true, true)); return toTokens(unescape(this.toNormalform(true, true)));
} }
/** /**
@ -782,9 +782,9 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
* resulting words are not ordered by appearance, but all * resulting words are not ordered by appearance, but all
* @return * @return
*/ */
public static String toTokens(String s) { private static String toTokens(String s) {
// unesape string // unesape string
String t = unescape(s); String t = s;
// remove all non-character & non-number // remove all non-character & non-number
StringBuilder sb = new StringBuilder(t.length()); StringBuilder sb = new StringBuilder(t.length());

@ -90,7 +90,7 @@ public class Scanner extends Thread {
public static boolean acceptURL(MultiProtocolURI url) { public static boolean acceptURL(MultiProtocolURI url) {
if (scancacheScanrange == null || scancacheScanrange.size() == 0) return true; if (scancacheScanrange == null || scancacheScanrange.size() == 0) return true;
if (System.currentTimeMillis() > scancacheValidUntilTime) return true; //if (System.currentTimeMillis() > scancacheValidUntilTime) return true;
InetAddress a = Domains.dnsResolve(url.getHost()); InetAddress a = Domains.dnsResolve(url.getHost());
if (a == null) return true; if (a == null) return true;
InetAddress n = normalize(a); InetAddress n = normalize(a);

@ -1338,6 +1338,7 @@ public class FTPClient {
} }
public List<String> list(final String path, final boolean extended) throws IOException { public List<String> list(final String path, final boolean extended) throws IOException {
createDataSocket(); createDataSocket();
// send command to the control port // send command to the control port
@ -2253,8 +2254,9 @@ public class FTPClient {
} }
} }
public byte[] get(final String fileName) throws IOException { public byte[] get(final String fileName) throws IOException {
createDataSocket(); createDataSocket();
// set type of the transfer // set type of the transfer
@ -2541,17 +2543,28 @@ public class FTPClient {
} }
if (!path.endsWith("/")) path += "/"; if (!path.endsWith("/")) path += "/";
entryInfo info; entryInfo info;
// first find all files and add them to the crawl list
for (final String line : list) { for (final String line : list) {
info = parseListData(line); info = parseListData(line);
if (info != null && info.type == filetype.file) { if (info != null && info.type == filetype.file && !info.name.endsWith(".") && !info.name.startsWith(".")) {
if (!info.name.startsWith("/")) info.name = path + info.name; if (!info.name.startsWith("/")) info.name = path + MultiProtocolURI.escape(info.name);
queue.add(info); queue.add(info);
} }
} }
// then find all directories and add them recursively
for (final String line : list) { for (final String line : list) {
info = parseListData(line); info = parseListData(line);
if (info != null && info.type == filetype.directory) { if (info != null && !info.name.endsWith(".") && !info.name.startsWith(".")) {
sitelist(ftpClient, path + info.name, queue); if (info.type == filetype.directory) {
sitelist(ftpClient, path + MultiProtocolURI.escape(info.name), queue);
}
if (info.type == filetype.link) {
int q = info.name.indexOf("->");
if (q >= 0) {
info.name = info.name.substring(0, q).trim();
sitelist(ftpClient, path + MultiProtocolURI.escape(info.name), queue);
}
}
} }
} }
} }

@ -125,7 +125,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
String b = cleanLine(super.stripAll(newtext)); String b = cleanLine(super.stripAll(newtext));
if ((insideTag != null) && (!(insideTag.equals("a")))) { if ((insideTag != null) && (!(insideTag.equals("a")))) {
// texts inside tags sometimes have no punctuation at the line end // texts inside tags sometimes have no punctuation at the line end
// this is bad for the text sematics, because it is not possible for the // this is bad for the text semantics, because it is not possible for the
// condenser to distinguish headlines from text beginnings. // condenser to distinguish headlines from text beginnings.
// to make it easier for the condenser, a dot ('.') is appended in case that // to make it easier for the condenser, a dot ('.') is appended in case that
// no punctuation is part of the newtext line // no punctuation is part of the newtext line
@ -141,6 +141,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (p == Integer.MAX_VALUE) break; if (p == Integer.MAX_VALUE) break;
q = b.indexOf(" ", p + 1); q = b.indexOf(" ", p + 1);
u = b.substring(p, q < 0 ? b.length() : q); u = b.substring(p, q < 0 ? b.length() : q);
if (u.endsWith(".")) u = u.substring(0, u.length() - 1); // remove the '.' that was appended above
s = p + 1; s = p + 1;
try { try {
url = new MultiProtocolURI(u); url = new MultiProtocolURI(u);
@ -351,11 +352,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
s = getDescription(); s = getDescription();
if (s.length() > 0) return s; if (s.length() > 0) return s;
// extract headline from content // extract headline from file name
if (content.length() > 80) { return MultiProtocolURI.unescape(root.getFileName());
return cleanLine(new String(content.getChars(), 0, 80));
}
return cleanLine(content.trim().toString());
} }
public String[] getHeadlines(final int i) { public String[] getHeadlines(final int i) {

Loading…
Cancel
Save