enhanced content scraper (should discover urls much faster in case of very large plain texts)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7787 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 996f0a8764
commit 9706fc55aa

@ -39,6 +39,7 @@ import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.event.EventListenerList;
@ -243,10 +244,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
String u;
MultiProtocolURI url;
while (s < b.length()) {
p = find(b, "://", s);
p = find(b, dpssp, s);
if (p == Integer.MAX_VALUE) break;
s = Math.max(0, p - 5);
p = Math.min(find(b, "smb://", s), Math.min(find(b, "ftp://", s), Math.min(find(b, "http://", s), find(b, "https://", s))));
p = find(b, protp, s);
if (p == Integer.MAX_VALUE) break;
q = b.indexOf(" ", p + 1);
u = b.substring(p, q < 0 ? b.length() : q);
@ -262,8 +263,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (b.length() != 0) this.content.append(b).append(32);
}
private static final int find(final String s, final String m, final int start) {
final int p = s.indexOf(m, start);
// precompiled pattern for the literal "://" separator; compiled once and reused by find()
private final static Pattern dpssp = Pattern.compile("://");
// precompiled pattern matching any of the prefixes "smb://", "ftp://", "http://" or "https://"
private final static Pattern protp = Pattern.compile("smb://|ftp://|http://|https://");
/**
 * Finds the first match of a pattern in a string at or after a given offset.
 *
 * The search is restricted to the region [start, s.length()) of the original
 * string instead of matching against a sub-sequence, so no copy of the tail of
 * the text is made per call. With the matcher's default region bounds, anchors
 * and lookaround behave exactly as if the region were a separate string.
 *
 * @param s the text to search in
 * @param m the precompiled pattern to search for
 * @param start the offset at which the search begins; negative values are
 *        treated as 0, as String.indexOf does
 * @return the absolute index in s where the first match starts, or
 *         Integer.MAX_VALUE if there is no match or start lies beyond the end of s
 */
private static final int find(final String s, final Pattern m, final int start) {
    final int from = Math.max(0, start);
    // an offset past the end cannot match anything; region() would throw for it
    if (from > s.length()) return Integer.MAX_VALUE;
    final Matcher mm = m.matcher(s);
    mm.region(from, s.length());
    // start() is already an index into s, so no offset correction is needed
    return mm.find() ? mm.start() : Integer.MAX_VALUE;
}

Loading…
Cancel
Save