Refactored plain-text URLs detection implementation.

This provides faster processing (measured at about twice as fast on many real-world examples) and more advanced detection (the previous algorithm only detected URLs separated from the rest of the text by a space character).
7 years ago · 9b1bb2545e
parent 8da3174867
commit 9b1bb2545e
2 changed files with 233 additions and 22 deletions
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -362,9 +362,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        }
    }

-    private final static Pattern dpssp = Pattern.compile("://");
    private final static Pattern protp = Pattern.compile("smb://|ftp://|http://|https://");
    
+    /** A regular expression pattern matching any whitespace character */
+    private final static Pattern WHITESPACE_PATTERN = Pattern.compile("\\s");
+    
    /**
     * Try to detect and parse absolute URLs in text, then update the urls collection and fire anchorAdded event on listeners. Any parameter can be null.
     * @param text the text to parse
@ -375,23 +377,33 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        if(text == null) {
        	return;
        }
-        int schemePosition, spacePosition, offset = 0;
+        int schemePosition, offset = 0;
+        boolean hasWhiteSpace;
        String urlString;
        AnchorURL url;
+        final Matcher urlSchemeMatcher = protp.matcher(text);
+        final Matcher whiteSpaceMatcher = WHITESPACE_PATTERN.matcher(text);
+        
+        
        while (offset < text.length()) {
-            schemePosition = find(text, dpssp, offset);
-            if (schemePosition == Integer.MAX_VALUE) {
+            if(!urlSchemeMatcher.find(offset)) {
            	break;
            }
-            offset = Math.max(0, schemePosition - 5);
-            schemePosition = find(text, protp, offset);
-            if (schemePosition == Integer.MAX_VALUE) {
-            	break;
+            schemePosition = urlSchemeMatcher.start();
+            
+            hasWhiteSpace = whiteSpaceMatcher.find(urlSchemeMatcher.end());
+            urlString = text.substring(schemePosition, hasWhiteSpace ? whiteSpaceMatcher.start() : text.length());
+            
+            if (urlString.endsWith(".")) {
+            	urlString = urlString.substring(0, urlString.length() - 1); // remove the '.' that was appended above
            }
-            spacePosition = text.indexOf(" ", schemePosition + 1);
-            urlString = text.substring(schemePosition, spacePosition < 0 ? text.length() : spacePosition);
-            if (urlString.endsWith(".")) urlString = urlString.substring(0, urlString.length() - 1); // remove the '.' that was appended above
-            offset = schemePosition + 6;
+            /* URLs can contain brackets, and brackets are even reserved characters in the URI syntax (see https://tools.ietf.org/html/rfc3986#section-2.2).
+             * But when a bracket is unpaired, in most cases it is not part of the URL but is rather used to wrap the URL in the text */
+            urlString = removeUnpairedBrackets(urlString, '(', ')');
+            urlString = removeUnpairedBrackets(urlString, '{', '}');
+           	urlString = removeUnpairedBrackets(urlString, '[', ']');
+            
+            offset = schemePosition + urlString.length();
            try {
            	url = new AnchorURL(urlString);
            	if(urls != null) {
@ -406,13 +418,59 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        }
    }

-    private static final int find(final String s, final Pattern m, final int start) {
-        final Matcher mm = m.matcher(s.subSequence(start, s.length()));
-        if (!mm.find()) return Integer.MAX_VALUE;
-        final int p = mm.start() + start;
-        //final int p = s.indexOf(m, start);
-        return (p < 0) ? Integer.MAX_VALUE : p;
-    }
+	/**
+	 * Analyze the pairing of one kind of bracket in the string and, when one or
+	 * more pairs are incomplete, return a truncated copy of that string.
+	 * <ul>
+	 * <li>When a closing mark has no matching opening mark, the result is cut
+	 * just before the first such closing mark (example : "abc}def" gives "abc").</li>
+	 * <li>When the end of the string is reached with opening marks still
+	 * unclosed, the result is cut just before the opening mark that started the
+	 * still open top-level group (example : "{abc{def}" gives "").</li>
+	 * </ul>
+	 * 
+	 * @param str
+	 *            the string to analyze. May be null.
+	 * @param openingMark
+	 *            the opening bracket character (example : '{')
+	 * @param closingMark
+	 *            the closing bracket character (example : '}')
+	 * @return the original string when all marks are paired, a truncated copy
+	 *         otherwise, or null when str is null
+	 */
+	protected static String removeUnpairedBrackets(final String str, final char openingMark,
+			final char closingMark) {
+		if(str == null) {
+			return null;
+		}
+		String result = str;
+		char ch;
+		/* depth : current nesting level ; lastUnpairedOpeningIndex : position of the opening mark
+		 * that started the currently open top-level group, or -1 when no group is open */
+		int depth = 0, index = 0, lastUnpairedOpeningIndex = -1;
+		/* Loop on all characters of the string */
+		for(; index < str.length(); index++) {
+			ch = str.charAt(index);
+			if(ch == openingMark) {
+				if(depth == 0) {
+					/* Entering a new top-level group : remember where it starts */
+					lastUnpairedOpeningIndex = index;
+				}
+				depth++;
+			} else if(ch == closingMark) {
+				depth--;
+				if(depth == 0) {
+					/* The top-level group is now complete */
+					lastUnpairedOpeningIndex = -1;
+				}
+			}
+			if(depth < 0) {
+				/* Unpaired closing mark : stop the loop here, index keeps the position of that mark */
+				break;
+			}
+		}
+		
+		if (depth > 0) {
+			/* One or more unpaired opening marks : truncate at the first opening level */
+			/* Defensive check : the index is always set once depth has become positive */
+			if(lastUnpairedOpeningIndex >= 0) {
+				result = str.substring(0, lastUnpairedOpeningIndex);
+			}
+		} else if (depth < 0) {
+			/* One or more unpaired closing marks : truncate at the current index as the loop should have been exited with a break */
+			/* Defensive check : index starts at 0 and is only ever incremented */
+			if(index >= 0) {
+				result = str.substring(0, index);
+			}
+		}
+		return result;
+	}

    /**
     * @param relativePath relative path to this document base URL
--- a/test/java/net/yacy/document/parser/html/ContentScraperTest.java
+++ b/test/java/net/yacy/document/parser/html/ContentScraperTest.java
@ -25,17 +25,21 @@ import java.io.IOException;
 import java.io.StringReader;
 import java.io.Writer;
 import java.net.MalformedURLException;
+import java.util.ArrayList;
 import java.util.Calendar;
+import java.util.Collection;
 import java.util.Date;
 import java.util.List;
 import java.util.Set;
-import net.yacy.cora.document.id.DigestURL;
-import net.yacy.document.VocabularyScraper;
-import net.yacy.kelondro.util.FileUtils;

 import org.junit.Assert;
 import org.junit.Test;

+import net.yacy.cora.document.id.AnchorURL;
+import net.yacy.cora.document.id.DigestURL;
+import net.yacy.document.VocabularyScraper;
+import net.yacy.kelondro.util.FileUtils;
+
 /**
 * Unit tests for ContentScraper class.
 * @author luc
@ -158,5 +162,154 @@ public class ContentScraperTest {
        }
        scraper.close();
    }
+    
+    /**
+     * Test absolute URLs detection in plain text : URLs separated by various
+     * white space characters, URLs wrapped in parentheses, square or curly
+     * brackets, URLs legitimately containing brackets, and texts from which no
+     * URL must be extracted (scheme only, malformed URL, empty or null text).
+     * @throws MalformedURLException should not happen
+     */
+    @Test
+    public void testFindAbsoluteURLs() throws MalformedURLException {
+		final String[] urlStrings = { "http://yacy.net", "http://forum.yacy.de", "https://en.wikipedia.org" };
+		final List<AnchorURL> urls = new ArrayList<>();
+		for (String urlString : urlStrings) {
+			urls.add(new AnchorURL(urlString));
+		}
+
+		/* Test with various white space separators */
+		String[] separators = { " ", "\n", "\t", "\r" };
+		for (String separator : separators) {
+			StringBuilder text = new StringBuilder();
+			for (String urlString : urlStrings) {
+				if (text.length() > 0) {
+					text.append(separator);
+				}
+				text.append(urlString);
+			}
+			Collection<AnchorURL> detectedURLs = new ArrayList<>();
+			ContentScraper.findAbsoluteURLs(text.toString(), detectedURLs, null);
+			Assert.assertEquals(urls.size(), detectedURLs.size());
+			Assert.assertTrue(urls.containsAll(detectedURLs));
+		}
+		
+		/* URLs surrounded with parentheses */
+		String[] texts = { "(http://yacy.net)", "YaCy home page (http://yacy.net)",
+				"Nested parentheses (YaCy home page (http://yacy.net))",
+				"Text in parenthesis (example : http://yacy.net)", "A markdown link [YaCy home page](http://yacy.net)",
+				"A markdown [example](http://yacy.net \"YaCy home page\") inline link" };
+		for (String text : texts) {
+			Collection<AnchorURL> detectedURLs = new ArrayList<>();
+			ContentScraper.findAbsoluteURLs(text, detectedURLs, null);
+			Assert.assertEquals(1, detectedURLs.size());
+			Assert.assertEquals(new AnchorURL("http://yacy.net"), detectedURLs.iterator().next());
+		}
+		
+		/* URLs surrounded with square brackets.
+		 * Square brackets can also be a legitimate part of a URL, as in the IPV6 host
+		 * http://[abcd:ef01:2345:6789:abcd:ef01:2345:6789]/ (that case is checked further below) */
+		String[] squareBracketsTexts = { "[http://yacy.net]", "YaCy home page [http://yacy.net]",
+				"Nested brackets [YaCy home page [http://yacy.net]]",
+				"A mediawiki external link with different label [http://yacy.net YaCy home page]" };
+		for(String text : squareBracketsTexts) {
+			Collection<AnchorURL> detectedURLs = new ArrayList<>();
+			ContentScraper.findAbsoluteURLs(text, detectedURLs, null);
+			Assert.assertEquals(1, detectedURLs.size());
+			Assert.assertEquals(new AnchorURL("http://yacy.net"), detectedURLs.iterator().next());
+		}
+		
+		/* URLs surrounded with curly brackets */
+		String[] curlyBracketsTexts = { "{http://yacy.net}", "YaCy home page {http://yacy.net}",
+				"Nested brackets {YaCy home page {http://yacy.net}}",
+				"Text in brackets {example : http://yacy.net}" };
+		for(String text : curlyBracketsTexts) {
+			Collection<AnchorURL> detectedURLs = new ArrayList<>();
+			ContentScraper.findAbsoluteURLs(text, detectedURLs, null);
+			Assert.assertEquals(1, detectedURLs.size());
+			Assert.assertEquals(new AnchorURL("http://yacy.net"), detectedURLs.iterator().next());
+		}
+		
+		/* URL with parentheses that are part of the URL itself : they must be kept */
+		String text = "Example: https://en.wikipedia.org/wiki/Firefox_(disambiguation)";
+		Collection<AnchorURL> detectedURLs = new ArrayList<>();
+		ContentScraper.findAbsoluteURLs(text, detectedURLs, null);
+		Assert.assertEquals(1, detectedURLs.size());
+		Assert.assertEquals(new AnchorURL("https://en.wikipedia.org/wiki/Firefox_(disambiguation)"), detectedURLs.iterator().next());
+		
+		/* IPV6 host : the paired square brackets are part of the URL and must be kept */
+		text = "URL with IPV6 host : http://[abcd:ef01:2345:6789:abcd:ef01:2345:6789]";
+		detectedURLs = new ArrayList<>();
+		ContentScraper.findAbsoluteURLs(text, detectedURLs, null);
+		Assert.assertEquals(1, detectedURLs.size());
+		Assert.assertEquals(new AnchorURL("http://[abcd:ef01:2345:6789:abcd:ef01:2345:6789]"), detectedURLs.iterator().next());
+		
+		/* Text containing only the '://' pattern */
+		detectedURLs = new ArrayList<>();
+		ContentScraper.findAbsoluteURLs("An absolute URL should contain the '://' pattern", detectedURLs, null);
+		Assert.assertEquals(0, detectedURLs.size());
+		
+		/* Text containing only the 'http://' and 'https://' patterns */
+		detectedURLs = new ArrayList<>();
+		ContentScraper.findAbsoluteURLs("An absolute HTTP URL should start with 'http://' or 'https://'", detectedURLs, null);
+		Assert.assertEquals(0, detectedURLs.size());
+		
+		/* Text containing a malformed URL */
+		detectedURLs = new ArrayList<>();
+		ContentScraper.findAbsoluteURLs("The URL https://example.com:demo is malformed", detectedURLs, null);
+		Assert.assertEquals(0, detectedURLs.size());
+		
+		/* Empty text */
+		detectedURLs = new ArrayList<>();
+		ContentScraper.findAbsoluteURLs("", detectedURLs, null);
+		Assert.assertEquals(0, detectedURLs.size());
+		
+		/* Null text : must be silently ignored, without any exception */
+		detectedURLs = new ArrayList<>();
+		ContentScraper.findAbsoluteURLs(null, detectedURLs, null);
+		Assert.assertEquals(0, detectedURLs.size());
+    }
+    
+    /**
+     * Test unpaired brackets cleaning : the string must be returned unchanged
+     * when all marks are paired, and truncated before the first unpaired
+     * closing mark or before the still open top-level opening mark otherwise.
+     */
+    @Test
+    public void testRemoveUnpairedBrackets() {
+    	/* Null String */
+    	Assert.assertEquals(null, ContentScraper.removeUnpairedBrackets(null, '{', '}'));
+    	/* Empty string */
+    	Assert.assertEquals("", ContentScraper.removeUnpairedBrackets("", '{', '}'));
+    	/* No bracket at all */
+    	Assert.assertEquals("abc", ContentScraper.removeUnpairedBrackets("abc", '{', '}'));
+    	
+    	/* Missing one or more opening mark : everything from the first unpaired closing mark is dropped */
+    	Assert.assertEquals("", ContentScraper.removeUnpairedBrackets("}", '{', '}'));
+    	Assert.assertEquals("abc", ContentScraper.removeUnpairedBrackets("abc}", '{', '}'));
+    	Assert.assertEquals("abc", ContentScraper.removeUnpairedBrackets("abc}def", '{', '}'));
+    	Assert.assertEquals("abc", ContentScraper.removeUnpairedBrackets("abc}}", '{', '}'));
+    	Assert.assertEquals("abc", ContentScraper.removeUnpairedBrackets("abc}def}", '{', '}'));
+    	Assert.assertEquals("{abc}", ContentScraper.removeUnpairedBrackets("{abc}}", '{', '}'));
+    	Assert.assertEquals("abc", ContentScraper.removeUnpairedBrackets("abc}{def}}", '{', '}'));
+    	Assert.assertEquals("abc", ContentScraper.removeUnpairedBrackets("abc}{def}", '{', '}'));
+    	Assert.assertEquals("{abc}def", ContentScraper.removeUnpairedBrackets("{abc}def}", '{', '}'));
+    	Assert.assertEquals("{abc}def", ContentScraper.removeUnpairedBrackets("{abc}def}hij}", '{', '}'));
+    	Assert.assertEquals("{{abc}{def}}", ContentScraper.removeUnpairedBrackets("{{abc}{def}}}", '{', '}'));
+    	
+    	/* Missing both opening and closing : the unpaired closing mark comes first, so the string is cut there */
+    	Assert.assertEquals("abc", ContentScraper.removeUnpairedBrackets("abc}de{f", '{', '}'));
+    	
+    	/* Missing one or more closing mark : everything from the still open top-level opening mark is dropped */
+    	Assert.assertEquals("", ContentScraper.removeUnpairedBrackets("{", '{', '}'));
+    	Assert.assertEquals("", ContentScraper.removeUnpairedBrackets("{abc", '{', '}'));
+    	Assert.assertEquals("abc", ContentScraper.removeUnpairedBrackets("abc{def", '{', '}'));
+    	Assert.assertEquals("abc", ContentScraper.removeUnpairedBrackets("abc{{", '{', '}'));
+    	Assert.assertEquals("abc", ContentScraper.removeUnpairedBrackets("abc{def{", '{', '}'));
+    	Assert.assertEquals("", ContentScraper.removeUnpairedBrackets("{{abc}", '{', '}'));
+    	Assert.assertEquals("", ContentScraper.removeUnpairedBrackets("{abc{def}", '{', '}'));
+    	Assert.assertEquals("{{abc}{def}}", ContentScraper.removeUnpairedBrackets("{{abc}{def}}{", '{', '}'));
+    	
+    	/* Correctly paired marks : the string is returned unchanged */
+    	Assert.assertEquals("abc{}", ContentScraper.removeUnpairedBrackets("abc{}", '{', '}'));
+    	Assert.assertEquals("{abc}", ContentScraper.removeUnpairedBrackets("{abc}", '{', '}'));
+    	Assert.assertEquals("{abc}{def}", ContentScraper.removeUnpairedBrackets("{abc}{def}", '{', '}'));
+    	Assert.assertEquals("{{abc}{def}}", ContentScraper.removeUnpairedBrackets("{{abc}{def}}", '{', '}'));
+    }

 }