@ -25,17 +25,21 @@ import java.io.IOException;
import java.io.StringReader ;
import java.io.Writer ;
import java.net.MalformedURLException ;
import java.util.ArrayList ;
import java.util.Calendar ;
import java.util.Collection ;
import java.util.Date ;
import java.util.List ;
import java.util.Set ;
import net.yacy.cora.document.id.DigestURL ;
import net.yacy.document.VocabularyScraper ;
import net.yacy.kelondro.util.FileUtils ;
import org.junit.Assert ;
import org.junit.Test ;
import net.yacy.cora.document.id.AnchorURL ;
import net.yacy.cora.document.id.DigestURL ;
import net.yacy.document.VocabularyScraper ;
import net.yacy.kelondro.util.FileUtils ;
/ * *
* Unit tests for the ContentScraper class.
* @author luc
@ -158,5 +162,154 @@ public class ContentScraperTest {
}
scraper . close ( ) ;
}
/ * *
* Test absolute URLs detection in plain text
* @throws MalformedURLException should not happen
* /
@Test
public void testFindAbsoluteURLs ( ) throws MalformedURLException {
final String [ ] urlStrings = { "http://yacy.net" , "http://forum.yacy.de" , "https://en.wikipedia.org" } ;
final List < AnchorURL > urls = new ArrayList < > ( ) ;
for ( String urlString : urlStrings ) {
urls . add ( new AnchorURL ( urlString ) ) ;
}
/* Test with various white space separators */
String [ ] separators = { " " , "\n" , "\t" , "\r" } ;
for ( String separator : separators ) {
StringBuilder text = new StringBuilder ( ) ;
for ( String urlString : urlStrings ) {
if ( text . length ( ) > 0 ) {
text . append ( separator ) ;
}
text . append ( urlString ) ;
}
Collection < AnchorURL > detectedURLs = new ArrayList < > ( ) ;
ContentScraper . findAbsoluteURLs ( text . toString ( ) , detectedURLs , null ) ;
Assert . assertEquals ( urls . size ( ) , detectedURLs . size ( ) ) ;
Assert . assertTrue ( urls . containsAll ( detectedURLs ) ) ;
}
/* URLs surrounded with parenthesis */
String [ ] texts = { "(http://yacy.net)" , "YaCy home page (http://yacy.net)" ,
"Nested parentheses (YaCy home page (http://yacy.net))" ,
"Text in parenthesis (example : http://yacy.net)" , "A markdown link [YaCy home page](http://yacy.net)" ,
"A markdown [example](http://yacy.net \"YaCy home page\") inline link" } ;
for ( String text : texts ) {
Collection < AnchorURL > detectedURLs = new ArrayList < > ( ) ;
ContentScraper . findAbsoluteURLs ( text , detectedURLs , null ) ;
Assert . assertEquals ( 1 , detectedURLs . size ( ) ) ;
Assert . assertEquals ( new AnchorURL ( "http://yacy.net" ) , detectedURLs . iterator ( ) . next ( ) ) ;
}
/* URLs surrounded with square brackets */
//http://[abcd:ef01:2345:6789:abcd:ef01:2345:6789]/
String [ ] squareBracketsTexts = { "[http://yacy.net]" , "YaCy home page [http://yacy.net]" ,
"Nested brackets [YaCy home page [http://yacy.net]]" ,
"A mediawiki external link with different label [http://yacy.net YaCy home page]" } ;
for ( String text : squareBracketsTexts ) {
Collection < AnchorURL > detectedURLs = new ArrayList < > ( ) ;
ContentScraper . findAbsoluteURLs ( text , detectedURLs , null ) ;
Assert . assertEquals ( 1 , detectedURLs . size ( ) ) ;
Assert . assertEquals ( new AnchorURL ( "http://yacy.net" ) , detectedURLs . iterator ( ) . next ( ) ) ;
}
/* URLs surrounded with curly brackets */
//http://[abcd:ef01:2345:6789:abcd:ef01:2345:6789]/
String [ ] curlyBracketsTexts = { "{http://yacy.net}" , "YaCy home page {http://yacy.net}" ,
"Nested brackets {YaCy home page {http://yacy.net}}" ,
"Text in brackets {example : http://yacy.net}" } ;
for ( String text : curlyBracketsTexts ) {
Collection < AnchorURL > detectedURLs = new ArrayList < > ( ) ;
ContentScraper . findAbsoluteURLs ( text , detectedURLs , null ) ;
Assert . assertEquals ( 1 , detectedURLs . size ( ) ) ;
Assert . assertEquals ( new AnchorURL ( "http://yacy.net" ) , detectedURLs . iterator ( ) . next ( ) ) ;
}
/* URL with parenthesis */
String text = "Example: https://en.wikipedia.org/wiki/Firefox_(disambiguation)" ;
Collection < AnchorURL > detectedURLs = new ArrayList < > ( ) ;
ContentScraper . findAbsoluteURLs ( text , detectedURLs , null ) ;
Assert . assertEquals ( 1 , detectedURLs . size ( ) ) ;
Assert . assertEquals ( new AnchorURL ( "https://en.wikipedia.org/wiki/Firefox_(disambiguation)" ) , detectedURLs . iterator ( ) . next ( ) ) ;
/* IPV6 host */
text = "URL with IPV6 host : http://[abcd:ef01:2345:6789:abcd:ef01:2345:6789]" ;
detectedURLs = new ArrayList < > ( ) ;
ContentScraper . findAbsoluteURLs ( text , detectedURLs , null ) ;
Assert . assertEquals ( 1 , detectedURLs . size ( ) ) ;
Assert . assertEquals ( new AnchorURL ( "http://[abcd:ef01:2345:6789:abcd:ef01:2345:6789]" ) , detectedURLs . iterator ( ) . next ( ) ) ;
/* Text containing only the '://' pattern */
detectedURLs = new ArrayList < > ( ) ;
ContentScraper . findAbsoluteURLs ( "An absolute URL should contain the '://' pattern" , detectedURLs , null ) ;
Assert . assertEquals ( 0 , detectedURLs . size ( ) ) ;
/* Text containing only the 'http://' and 'https://' patterns */
detectedURLs = new ArrayList < > ( ) ;
ContentScraper . findAbsoluteURLs ( "An absolute HTTP URL should start with 'http://' or 'https://'" , detectedURLs , null ) ;
Assert . assertEquals ( 0 , detectedURLs . size ( ) ) ;
/* Text containing a malformed URL */
detectedURLs = new ArrayList < > ( ) ;
ContentScraper . findAbsoluteURLs ( "The URL https://example.com:demo is malformed" , detectedURLs , null ) ;
Assert . assertEquals ( 0 , detectedURLs . size ( ) ) ;
/* Empty text */
detectedURLs = new ArrayList < > ( ) ;
ContentScraper . findAbsoluteURLs ( "" , detectedURLs , null ) ;
Assert . assertEquals ( 0 , detectedURLs . size ( ) ) ;
/* Null text */
detectedURLs = new ArrayList < > ( ) ;
ContentScraper . findAbsoluteURLs ( "" , detectedURLs , null ) ;
Assert . assertEquals ( 0 , detectedURLs . size ( ) ) ;
}
/ * *
* Test unpaired brackets cleaning
* /
@Test
public void testRemoveUnpairedBrackets ( ) {
/* Null String */
Assert . assertEquals ( null , ContentScraper . removeUnpairedBrackets ( null , '{' , '}' ) ) ;
/* Empty string */
Assert . assertEquals ( "" , ContentScraper . removeUnpairedBrackets ( "" , '{' , '}' ) ) ;
/* No bracket at all */
Assert . assertEquals ( "abc" , ContentScraper . removeUnpairedBrackets ( "abc" , '{' , '}' ) ) ;
/* Missing one or more opening mark */
Assert . assertEquals ( "" , ContentScraper . removeUnpairedBrackets ( "}" , '{' , '}' ) ) ;
Assert . assertEquals ( "abc" , ContentScraper . removeUnpairedBrackets ( "abc}" , '{' , '}' ) ) ;
Assert . assertEquals ( "abc" , ContentScraper . removeUnpairedBrackets ( "abc}def" , '{' , '}' ) ) ;
Assert . assertEquals ( "abc" , ContentScraper . removeUnpairedBrackets ( "abc}}" , '{' , '}' ) ) ;
Assert . assertEquals ( "abc" , ContentScraper . removeUnpairedBrackets ( "abc}def}" , '{' , '}' ) ) ;
Assert . assertEquals ( "{abc}" , ContentScraper . removeUnpairedBrackets ( "{abc}}" , '{' , '}' ) ) ;
Assert . assertEquals ( "abc" , ContentScraper . removeUnpairedBrackets ( "abc}{def}}" , '{' , '}' ) ) ;
Assert . assertEquals ( "abc" , ContentScraper . removeUnpairedBrackets ( "abc}{def}" , '{' , '}' ) ) ;
Assert . assertEquals ( "{abc}def" , ContentScraper . removeUnpairedBrackets ( "{abc}def}" , '{' , '}' ) ) ;
Assert . assertEquals ( "{abc}def" , ContentScraper . removeUnpairedBrackets ( "{abc}def}hij}" , '{' , '}' ) ) ;
Assert . assertEquals ( "{{abc}{def}}" , ContentScraper . removeUnpairedBrackets ( "{{abc}{def}}}" , '{' , '}' ) ) ;
/* Missing both opening and closing */
Assert . assertEquals ( "abc" , ContentScraper . removeUnpairedBrackets ( "abc}de{f" , '{' , '}' ) ) ;
/* Missing one or more closing mark */
Assert . assertEquals ( "" , ContentScraper . removeUnpairedBrackets ( "{" , '{' , '}' ) ) ;
Assert . assertEquals ( "" , ContentScraper . removeUnpairedBrackets ( "{abc" , '{' , '}' ) ) ;
Assert . assertEquals ( "abc" , ContentScraper . removeUnpairedBrackets ( "abc{def" , '{' , '}' ) ) ;
Assert . assertEquals ( "abc" , ContentScraper . removeUnpairedBrackets ( "abc{{" , '{' , '}' ) ) ;
Assert . assertEquals ( "abc" , ContentScraper . removeUnpairedBrackets ( "abc{def{" , '{' , '}' ) ) ;
Assert . assertEquals ( "" , ContentScraper . removeUnpairedBrackets ( "{{abc}" , '{' , '}' ) ) ;
Assert . assertEquals ( "" , ContentScraper . removeUnpairedBrackets ( "{abc{def}" , '{' , '}' ) ) ;
Assert . assertEquals ( "{{abc}{def}}" , ContentScraper . removeUnpairedBrackets ( "{{abc}{def}}{" , '{' , '}' ) ) ;
/* Correctly paired marks */
Assert . assertEquals ( "abc{}" , ContentScraper . removeUnpairedBrackets ( "abc{}" , '{' , '}' ) ) ;
Assert . assertEquals ( "{abc}" , ContentScraper . removeUnpairedBrackets ( "{abc}" , '{' , '}' ) ) ;
Assert . assertEquals ( "{abc}{def}" , ContentScraper . removeUnpairedBrackets ( "{abc}{def}" , '{' , '}' ) ) ;
Assert . assertEquals ( "{{abc}{def}}" , ContentScraper . removeUnpairedBrackets ( "{{abc}{def}}" , '{' , '}' ) ) ;
}
}