sixcooler 9 years ago
commit de01b25805

@ -57,7 +57,7 @@ public class DateDetection {
// to assign names for days and months, we must know what language is used to express that time
public static enum Language {
GERMAN, ENGLISH, FRENCH, SPANISH, ITALIAN;
GERMAN, ENGLISH, FRENCH, SPANISH, ITALIAN, PORTUGUESE;
}
static {
@ -73,6 +73,7 @@ public class DateDetection {
Months.put(Language.FRENCH, new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"});
Months.put(Language.SPANISH, new String[]{"enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"});
Months.put(Language.ITALIAN, new String[]{"gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"});
Months.put(Language.PORTUGUESE,new String[]{"janeiro", "fevereiro", "março", "abril", "maio", "junho", "julho", "agosto", "setembro", "outubro", "novembro", "dezembro"});
}
@ -336,7 +337,7 @@ public class DateDetection {
private final static LanguageRecognition GERMAN_LANGUAGE = new LanguageRecognition(new Language[]{Language.GERMAN});
private final static LanguageRecognition FRENCH_LANGUAGE = new LanguageRecognition(new Language[]{Language.FRENCH});
private final static LanguageRecognition ENGLISH_GERMAN_LANGUAGE = new LanguageRecognition(new Language[]{Language.GERMAN, Language.ENGLISH});
private final static LanguageRecognition ENGLISH_GERMAN_FRENCH_SPANISH_ITALIAN_LANGUAGE = new LanguageRecognition(new Language[]{Language.GERMAN, Language.ENGLISH, Language.FRENCH, Language.SPANISH, Language.ITALIAN});
private final static LanguageRecognition ENGLISH_GERMAN_FRENCH_SPANISH_ITALIAN_LANGUAGE = new LanguageRecognition(new Language[]{Language.GERMAN, Language.ENGLISH, Language.FRENCH, Language.SPANISH, Language.ITALIAN, Language.PORTUGUESE});
public static interface StyleParser {
/**
@ -597,7 +598,11 @@ public class DateDetection {
"on october 20 every year",
" on october 20 every year",
"on September 29,",
"am Karfreitag um 15:00 Uhr"
"am Karfreitag um 15:00 Uhr",
"11 fevereiro 2001", // portuguese
"12. fevereiro 2002", // portuguese
"13 de fevereiro 2003", // portuguese
"Fevereiro 14, 2004" // portuguese
};
long t = System.currentTimeMillis();
for (String s: test) {

@ -115,7 +115,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
script(TagType.pair),
span(TagType.pair),
div(TagType.pair),
article(TagType.pair);
article(TagType.pair),
// tags used to capture tag content
// TODO: consider using </head> or <body> as the trigger to scrape for text content
style(TagType.pair); // embedded css (if not declared as tag content is parsed as text)
public TagType type;
private TagName(final TagType type) {

@ -118,4 +118,26 @@ public class htmlParserTest extends TestCase {
ImageEntry img = scraper.getImages().get(1);
assertEquals(550,img.width());
}
/**
 * Test of the parseToScraper method of class htmlParser:
 * scraping of text content from tags. Special case verifying that the
 * content of an embedded &lt;style&gt; element is not counted as text.
 *
 * @throws Exception if building the URL or parsing the test document fails
 */
@Test
public void testParseToScraper_TagTest() throws Exception {
    final AnchorURL url = new AnchorURL("http://localhost/");
    final String mimetype = "text/html";
    final String textSource = "test text";
    // the only text expected from this document is the paragraph content;
    // the css rule inside <style> must not show up in the scraped text
    final String testhtml = "<html>"
            + "<head><style type=\"text/css\"> h1 { color: #ffffff; }</style></head>"
            + "<body>"
            + "<p>" + textSource + "</p>"
            + "</body></html>";
    ContentScraper scraper = parseToScraper(url, mimetype, new VocabularyScraper(), 0, testhtml, 10);
    String txt = scraper.getText();
    System.out.println("ScraperTagTest: [" + textSource + "] = [" + txt + "]");
    // JUnit argument order is (expected, actual); the reversed order would
    // produce a misleading failure message
    assertEquals(textSource, txt);
}
}

Loading…
Cancel
Save