diff --git a/source/net/yacy/document/DateDetection.java b/source/net/yacy/document/DateDetection.java index 73662ac56..296eaf4fd 100644 --- a/source/net/yacy/document/DateDetection.java +++ b/source/net/yacy/document/DateDetection.java @@ -57,7 +57,7 @@ public class DateDetection { // to assign names for days and months, we must know what language is used to express that time public static enum Language { - GERMAN, ENGLISH, FRENCH, SPANISH, ITALIAN; + GERMAN, ENGLISH, FRENCH, SPANISH, ITALIAN, PORTUGUESE; } static { @@ -73,6 +73,7 @@ public class DateDetection { Months.put(Language.FRENCH, new String[]{"janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre"}); Months.put(Language.SPANISH, new String[]{"enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre"}); Months.put(Language.ITALIAN, new String[]{"gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto", "settembre", "ottobre", "novembre", "dicembre"}); + Months.put(Language.PORTUGUESE,new String[]{"janeiro", "fevereiro", "março", "abril", "maio", "junho", "julho", "agosto", "setembro", "outubro", "novembro", "dezembro"}); } @@ -336,7 +337,7 @@ public class DateDetection { private final static LanguageRecognition GERMAN_LANGUAGE = new LanguageRecognition(new Language[]{Language.GERMAN}); private final static LanguageRecognition FRENCH_LANGUAGE = new LanguageRecognition(new Language[]{Language.FRENCH}); private final static LanguageRecognition ENGLISH_GERMAN_LANGUAGE = new LanguageRecognition(new Language[]{Language.GERMAN, Language.ENGLISH}); - private final static LanguageRecognition ENGLISH_GERMAN_FRENCH_SPANISH_ITALIAN_LANGUAGE = new LanguageRecognition(new Language[]{Language.GERMAN, Language.ENGLISH, Language.FRENCH, Language.SPANISH, Language.ITALIAN}); + private final static LanguageRecognition ENGLISH_GERMAN_FRENCH_SPANISH_ITALIAN_LANGUAGE = new 
LanguageRecognition(new Language[]{Language.GERMAN, Language.ENGLISH, Language.FRENCH, Language.SPANISH, Language.ITALIAN, Language.PORTUGUESE}); public static interface StyleParser { /** @@ -597,7 +598,11 @@ public class DateDetection { "on october 20 every year", " on october 20 every year", "on September 29,", - "am Karfreitag um 15:00 Uhr" + "am Karfreitag um 15:00 Uhr", + "11 fevereiro 2001", // portuguese + "12. fevereiro 2002", // portuguese + "13 de fevereiro 2003", // portuguese + "Fevereiro 14, 2004" // portuguese }; long t = System.currentTimeMillis(); for (String s: test) { diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 17f9362c7..77b59ad79 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -115,7 +115,10 @@ public class ContentScraper extends AbstractScraper implements Scraper { script(TagType.pair), span(TagType.pair), div(TagType.pair), - article(TagType.pair); + article(TagType.pair), + // tags used to capture tag content + // TODO: consider to use or as trigger to scrape for text content + style(TagType.pair); // embedded css (if not declared as tag content is parsed as text) public TagType type; private TagName(final TagType type) { diff --git a/test/net/yacy/document/parser/htmlParserTest.java b/test/net/yacy/document/parser/htmlParserTest.java index 1ab31f9bf..2d7ad5b73 100644 --- a/test/net/yacy/document/parser/htmlParserTest.java +++ b/test/net/yacy/document/parser/htmlParserTest.java @@ -118,4 +118,26 @@ public class htmlParserTest extends TestCase { ImageEntry img = scraper.getImages().get(1); assertEquals(550,img.width()); } + + /** + * Test of parseToScraper method, of class htmlParser + * for scraping tag content from text (special test to verify " + + "" + + "

" + textSource + "

" + + ""; + + ContentScraper scraper = parseToScraper(url, mimetype, new VocabularyScraper(), 0, testhtml, 10); + + String txt = scraper.getText(); + System.out.println("ScraperTagTest: [" + textSource + "] = [" + txt + "]"); + assertEquals(txt, textSource); + } }