Refactored plain-text URLs detection implementation.

For faster processing (measured about 2 times faster on many real-world examples) and more advanced detection (previous algorithm detected only URLs separated from the rest of the text by a space character).
7 years ago · 9b1bb2545e
parent 8da3174867
commit 9b1bb2545e
2 changed files with 233 additions and 22 deletions
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -362,9 +362,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        }
    }
    /** A regular expression pattern matching the "://" separator that follows a URL scheme name */
    private final static Pattern dpssp = Pattern.compile("://");
    /** A regular expression pattern matching the beginning (scheme and "://" separator) of smb, ftp, http and https URLs */
    private final static Pattern protp = Pattern.compile("smb://|ftp://|http://|https://");
    /** A regular expression pattern matching any whitespace character */
    private final static Pattern WHITESPACE_PATTERN = Pattern.compile("\\s");
    /**
     * Try to detect and parse absolute URLs in text, then update the urls collection and fire anchorAdded event on listeners. Any parameter can be null.
     * @param text the text to parse
@ -375,23 +377,33 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        if(text == null) {
        	return;
        }
-        int schemePosition, spacePosition, offset = 0;
+        int schemePosition, offset = 0;
        boolean hasWhiteSpace;
        String urlString;
        AnchorURL url;
        final Matcher urlSchemeMatcher = protp.matcher(text);
        final Matcher whiteSpaceMatcher = WHITESPACE_PATTERN.matcher(text);
        while (offset < text.length()) {
-            schemePosition = find(text, dpssp, offset);
+            if(!urlSchemeMatcher.find(offset)) {
            if (schemePosition == Integer.MAX_VALUE) {
            	break;
            }
-            offset = Math.max(0, schemePosition - 5);
+            schemePosition = urlSchemeMatcher.start();
-            schemePosition = find(text, protp, offset);
+            
-            if (schemePosition == Integer.MAX_VALUE) {
+            hasWhiteSpace = whiteSpaceMatcher.find(urlSchemeMatcher.end());
-            	break;
+            urlString = text.substring(schemePosition, hasWhiteSpace ? whiteSpaceMatcher.start() : text.length());
            if (urlString.endsWith(".")) {
            	urlString = urlString.substring(0, urlString.length() - 1); // remove the '.' that was appended above
            }
-            spacePosition = text.indexOf(" ", schemePosition + 1);
+            /* URLs can contain brackets, furthermore as they can even be reserved characters in the URI syntax (see https://tools.ietf.org/html/rfc3986#section-2.2)
-            urlString = text.substring(schemePosition, spacePosition < 0 ? text.length() : spacePosition);
+             * But when unpaired, in most cases this is that the unpaired bracket is not part of the URL, but rather used to wrap it in the text*/
-            if (urlString.endsWith(".")) urlString = urlString.substring(0, urlString.length() - 1); // remove the '.' that was appended above
+            urlString = removeUnpairedBrackets(urlString, '(', ')');
-            offset = schemePosition + 6;
+            urlString = removeUnpairedBrackets(urlString, '{', '}');
           	urlString = removeUnpairedBrackets(urlString, '[', ']');
            offset = schemePosition + urlString.length();
            try {
            	url = new AnchorURL(urlString);
            	if(urls != null) {
@ -406,12 +418,58 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        }
    }
-    private static final int find(final String s, final Pattern m, final int start) {
+	/**
-        final Matcher mm = m.matcher(s.subSequence(start, s.length()));
+	 * Analyze bracket pairs found in the string and eventually
-        if (!mm.find()) return Integer.MAX_VALUE;
+	 * return a truncated version of that string when one or more pairs are incomplete
-        final int p = mm.start() + start;
+	 * 
-        //final int p = s.indexOf(m, start);
+	 * @param str
-        return (p < 0) ? Integer.MAX_VALUE : p;
+	 *            the string to analyze
 	 * @param openingMark
 	 *            the opening bracket character (example : '{')
 	 * @param closingMark
 	 *            the closing bracket character (example : '}')
 	 * @return the original string or a truncated copy
 	 */
 	protected static String removeUnpairedBrackets(final String str, final char openingMark,
 			final char closingMark) {
 		/* Nothing to analyze on a null string */
 		if (str == null) {
 			return null;
 		}
 		/* Current nesting level of opening marks not yet closed */
 		int nesting = 0;
 		/* Position of the opening mark that started the current outermost level */
 		int outermostOpeningPos = -1;
 		final int length = str.length();
 		for (int pos = 0; pos < length; pos++) {
 			final char current = str.charAt(pos);
 			if (current == openingMark) {
 				if (nesting == 0) {
 					/* Entering a new outermost level : remember where it starts */
 					outermostOpeningPos = pos;
 				}
 				nesting++;
 				continue;
 			}
 			if (current != closingMark) {
 				/* Regular character : nothing to track */
 				continue;
 			}
 			if (nesting == 0) {
 				/* Closing mark without any pending opening mark : keep only what precedes it */
 				return str.substring(0, pos);
 			}
 			nesting--;
 		}
 		if (nesting > 0) {
 			/* At least one opening mark was never closed : cut at the start of the outermost unclosed level */
 			return str.substring(0, outermostOpeningPos);
 		}
 		/* All marks are correctly paired (or there is none) : the string is returned as is */
 		return str;
 	}
    /**
--- a/test/java/net/yacy/document/parser/html/ContentScraperTest.java
+++ b/test/java/net/yacy/document/parser/html/ContentScraperTest.java
@ -25,17 +25,21 @@ import java.io.IOException;
 import java.io.StringReader;
 import java.io.Writer;
 import java.net.MalformedURLException;
 import java.util.ArrayList;
 import java.util.Calendar;
 import java.util.Collection;
 import java.util.Date;
 import java.util.List;
 import java.util.Set;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.kelondro.util.FileUtils;
 import org.junit.Assert;
 import org.junit.Test;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.kelondro.util.FileUtils;
 /**
 * Unit tests for ContentScraper class.
 * @author luc
@ -159,4 +163,153 @@ public class ContentScraperTest {
        scraper.close();
    }
    /**
     * Test absolute URLs detection in plain text : URLs separated by various
     * white space characters, URLs wrapped in parentheses, square or curly
     * brackets, URLs legitimately containing brackets, texts without any valid
     * URL, and empty or null texts.
     * 
     * @throws MalformedURLException should not happen
     */
    @Test
    public void testFindAbsoluteURLs() throws MalformedURLException {
 		final String[] urlStrings = { "http://yacy.net", "http://forum.yacy.de", "https://en.wikipedia.org" };
 		final List<AnchorURL> urls = new ArrayList<>();
 		for (String urlString : urlStrings) {
 			urls.add(new AnchorURL(urlString));
 		}
 		/* Test with various white space separators */
 		String[] separators = { " ", "\n", "\t", "\r" };
 		for (String separator : separators) {
 			StringBuilder text = new StringBuilder();
 			for (String urlString : urlStrings) {
 				if (text.length() > 0) {
 					text.append(separator);
 				}
 				text.append(urlString);
 			}
 			Collection<AnchorURL> detectedURLs = new ArrayList<>();
 			ContentScraper.findAbsoluteURLs(text.toString(), detectedURLs, null);
 			Assert.assertEquals(urls.size(), detectedURLs.size());
 			Assert.assertTrue(urls.containsAll(detectedURLs));
 		}
 		/* URLs surrounded with parenthesis */
 		String[] texts = { "(http://yacy.net)", "YaCy home page (http://yacy.net)",
 				"Nested parentheses (YaCy home page (http://yacy.net))",
 				"Text in parenthesis (example : http://yacy.net)", "A markdown link [YaCy home page](http://yacy.net)",
 				"A markdown [example](http://yacy.net \"YaCy home page\") inline link" };
 		for (String text : texts) {
 			Collection<AnchorURL> detectedURLs = new ArrayList<>();
 			ContentScraper.findAbsoluteURLs(text, detectedURLs, null);
 			Assert.assertEquals(1, detectedURLs.size());
 			Assert.assertEquals(new AnchorURL("http://yacy.net"), detectedURLs.iterator().next());
 		}
 		/* URLs surrounded with square brackets */ 
 		/* Note : square brackets can also be a legitimate part of a URL, as in an IPV6 host such as
 		 * http://[abcd:ef01:2345:6789:abcd:ef01:2345:6789]/ (see the IPV6 host case below) */
 		String[] squareBracketsTexts = { "[http://yacy.net]", "YaCy home page [http://yacy.net]",
 				"Nested brackets [YaCy home page [http://yacy.net]]",
 				"A mediawiki external link with different label [http://yacy.net YaCy home page]" };
 		for(String text : squareBracketsTexts) {
 			Collection<AnchorURL> detectedURLs = new ArrayList<>();
 			ContentScraper.findAbsoluteURLs(text, detectedURLs, null);
 			Assert.assertEquals(1, detectedURLs.size());
 			Assert.assertEquals(new AnchorURL("http://yacy.net"), detectedURLs.iterator().next());
 		}
 		/* URLs surrounded with curly brackets */ 
 		String[] curlyBracketsTexts = { "{http://yacy.net}", "YaCy home page {http://yacy.net}",
 				"Nested brackets {YaCy home page {http://yacy.net}}",
 				"Text in brackets {example : http://yacy.net}" };
 		for(String text : curlyBracketsTexts) {
 			Collection<AnchorURL> detectedURLs = new ArrayList<>();
 			ContentScraper.findAbsoluteURLs(text, detectedURLs, null);
 			Assert.assertEquals(1, detectedURLs.size());
 			Assert.assertEquals(new AnchorURL("http://yacy.net"), detectedURLs.iterator().next());
 		}
 		/* URL with parenthesis */
 		String text = "Example: https://en.wikipedia.org/wiki/Firefox_(disambiguation)";
 		Collection<AnchorURL> detectedURLs = new ArrayList<>();
 		ContentScraper.findAbsoluteURLs(text, detectedURLs, null);
 		Assert.assertEquals(1, detectedURLs.size());
 		Assert.assertEquals(new AnchorURL("https://en.wikipedia.org/wiki/Firefox_(disambiguation)"), detectedURLs.iterator().next());
 		/* IPV6 host */
 		text = "URL with IPV6 host : http://[abcd:ef01:2345:6789:abcd:ef01:2345:6789]";
 		detectedURLs = new ArrayList<>();
 		ContentScraper.findAbsoluteURLs(text, detectedURLs, null);
 		Assert.assertEquals(1, detectedURLs.size());
 		Assert.assertEquals(new AnchorURL("http://[abcd:ef01:2345:6789:abcd:ef01:2345:6789]"), detectedURLs.iterator().next());
 		/* Text containing only the '://' pattern */
 		detectedURLs = new ArrayList<>();
 		ContentScraper.findAbsoluteURLs("An absolute URL should contain the '://' pattern", detectedURLs, null);
 		Assert.assertEquals(0, detectedURLs.size());
 		/* Text containing only the 'http://' and 'https://' patterns */
 		detectedURLs = new ArrayList<>();
 		ContentScraper.findAbsoluteURLs("An absolute HTTP URL should start with 'http://' or 'https://'", detectedURLs, null);
 		Assert.assertEquals(0, detectedURLs.size());
 		/* Text containing a malformed URL */
 		detectedURLs = new ArrayList<>();
 		ContentScraper.findAbsoluteURLs("The URL https://example.com:demo is malformed", detectedURLs, null);
 		Assert.assertEquals(0, detectedURLs.size());
 		/* Empty text */
 		detectedURLs = new ArrayList<>();
 		ContentScraper.findAbsoluteURLs("", detectedURLs, null);
 		Assert.assertEquals(0, detectedURLs.size());
 		/* Null text : must be silently ignored (the explicit cast keeps the call unambiguous) */
 		detectedURLs = new ArrayList<>();
 		ContentScraper.findAbsoluteURLs((String) null, detectedURLs, null);
 		Assert.assertEquals(0, detectedURLs.size());
    }
    /**
     * Test unpaired brackets cleaning : each sample string is processed with
     * curly brackets as marks and the result is compared to the expected
     * (possibly truncated) string.
     */
    @Test
    public void testRemoveUnpairedBrackets() {
    	/* Null String */
    	Assert.assertNull(ContentScraper.removeUnpairedBrackets(null, '{', '}'));

    	/* Each entry is a pair : { string to analyze, expected result } */
    	final String[][] samples = {
    			/* Empty string */
    			{ "", "" },
    			/* No bracket at all */
    			{ "abc", "abc" },
    			/* Missing one or more opening mark */
    			{ "}", "" },
    			{ "abc}", "abc" },
    			{ "abc}def", "abc" },
    			{ "abc}}", "abc" },
    			{ "abc}def}", "abc" },
    			{ "{abc}}", "{abc}" },
    			{ "abc}{def}}", "abc" },
    			{ "abc}{def}", "abc" },
    			{ "{abc}def}", "{abc}def" },
    			{ "{abc}def}hij}", "{abc}def" },
    			{ "{{abc}{def}}}", "{{abc}{def}}" },
    			/* Missing both opening and closing */
    			{ "abc}de{f", "abc" },
    			/* Missing one or more closing mark */
    			{ "{", "" },
    			{ "{abc", "" },
    			{ "abc{def", "abc" },
    			{ "abc{{", "abc" },
    			{ "abc{def{", "abc" },
    			{ "{{abc}", "" },
    			{ "{abc{def}", "" },
    			{ "{{abc}{def}}{", "{{abc}{def}}" },
    			/* Correctly paired marks */
    			{ "abc{}", "abc{}" },
    			{ "{abc}", "{abc}" },
    			{ "{abc}{def}", "{abc}{def}" },
    			{ "{{abc}{def}}", "{{abc}{def}}" } };

    	for (final String[] sample : samples) {
    		final String cleaned = ContentScraper.removeUnpairedBrackets(sample[0], '{', '}');
    		Assert.assertEquals(sample[1], cleaned);
    	}
    }
 }