sixcooler 10 years ago
commit de01b25805

@ -57,7 +57,7 @@ public class DateDetection {
// to assign names for days and months, we must know what language is used to express that time // to assign names for days and months, we must know what language is used to express that time
public static enum Language { public static enum Language {
GERMAN, ENGLISH, FRENCH, SPANISH, ITALIAN; GERMAN, ENGLISH, FRENCH, SPANISH, ITALIAN, PORTUGUESE;
} }
static { static {
@ -73,6 +73,7 @@ public class DateDetection {
Months.put(Language.FRENCH, new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"}); Months.put(Language.FRENCH, new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"});
Months.put(Language.SPANISH, new String[]{"enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"}); Months.put(Language.SPANISH, new String[]{"enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"});
Months.put(Language.ITALIAN, new String[]{"gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"}); Months.put(Language.ITALIAN, new String[]{"gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"});
Months.put(Language.PORTUGUESE,new String[]{"janeiro", "fevereiro", "março", "abril", "maio", "junho", "julho", "agosto", "setembro", "outubro", "novembro", "dezembro"});
} }
@ -336,7 +337,7 @@ public class DateDetection {
private final static LanguageRecognition GERMAN_LANGUAGE = new LanguageRecognition(new Language[]{Language.GERMAN}); private final static LanguageRecognition GERMAN_LANGUAGE = new LanguageRecognition(new Language[]{Language.GERMAN});
private final static LanguageRecognition FRENCH_LANGUAGE = new LanguageRecognition(new Language[]{Language.FRENCH}); private final static LanguageRecognition FRENCH_LANGUAGE = new LanguageRecognition(new Language[]{Language.FRENCH});
private final static LanguageRecognition ENGLISH_GERMAN_LANGUAGE = new LanguageRecognition(new Language[]{Language.GERMAN, Language.ENGLISH}); private final static LanguageRecognition ENGLISH_GERMAN_LANGUAGE = new LanguageRecognition(new Language[]{Language.GERMAN, Language.ENGLISH});
private final static LanguageRecognition ENGLISH_GERMAN_FRENCH_SPANISH_ITALIAN_LANGUAGE = new LanguageRecognition(new Language[]{Language.GERMAN, Language.ENGLISH, Language.FRENCH, Language.SPANISH, Language.ITALIAN}); private final static LanguageRecognition ENGLISH_GERMAN_FRENCH_SPANISH_ITALIAN_LANGUAGE = new LanguageRecognition(new Language[]{Language.GERMAN, Language.ENGLISH, Language.FRENCH, Language.SPANISH, Language.ITALIAN, Language.PORTUGUESE});
public static interface StyleParser { public static interface StyleParser {
/** /**
@ -597,7 +598,11 @@ public class DateDetection {
"on october 20 every year", "on october 20 every year",
" on october 20 every year", " on october 20 every year",
"on September 29,", "on September 29,",
"am Karfreitag um 15:00 Uhr" "am Karfreitag um 15:00 Uhr",
"11 fevereiro 2001", // portuguese
"12. fevereiro 2002", // portuguese
"13 de fevereiro 2003", // portuguese
"Fevereiro 14, 2004" // portuguese
}; };
long t = System.currentTimeMillis(); long t = System.currentTimeMillis();
for (String s: test) { for (String s: test) {

@ -115,7 +115,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
script(TagType.pair), script(TagType.pair),
span(TagType.pair), span(TagType.pair),
div(TagType.pair), div(TagType.pair),
article(TagType.pair); article(TagType.pair),
// tags used to capture tag content
// TODO: consider using </head> or <body> as trigger to scrape for text content
style(TagType.pair); // embedded css (if not declared as tag content is parsed as text)
public TagType type; public TagType type;
private TagName(final TagType type) { private TagName(final TagType type) {

@ -118,4 +118,26 @@ public class htmlParserTest extends TestCase {
ImageEntry img = scraper.getImages().get(1); ImageEntry img = scraper.getImages().get(1);
assertEquals(550,img.width()); assertEquals(550,img.width());
} }
/**
 * Test of parseToScraper method, of class htmlParser, for scraping tag content
 * from text: a special test to verify that the content of an embedded
 * {@code <style>} element is not counted as text.
 *
 * @throws Exception if building the test URL or parsing the test document throws
 */
@Test
public void testParseToScraper_TagTest() throws Exception {
    final AnchorURL url = new AnchorURL("http://localhost/");
    final String mimetype = "text/html";
    final String textSource = "test text";
    // Minimal document: CSS inside <head>, and the only visible text inside <body>.
    final String testhtml = "<html>"
            + "<head><style type=\"text/css\"> h1 { color: #ffffff; }</style></head>"
            + "<body>"
            + "<p>" + textSource + "</p>"
            + "</body></html>";

    final ContentScraper scraper = parseToScraper(url, mimetype, new VocabularyScraper(), 0, testhtml, 10);
    final String txt = scraper.getText();

    // Expected value goes first so a failure reports expected/actual the right way round;
    // the message replaces the former debug println.
    assertEquals("ScraperTagTest: scraped text must not contain <style> content", textSource, txt);
}
} }

Loading…
Cancel
Save