diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index 9015f00b3..d2e8eb9b0 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -68,6 +68,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     private static final Set linkTags0 = new HashSet(12,0.99f);
     private static final Set linkTags1 = new HashSet(15,0.99f);
 
+    private static final Pattern LB = Pattern.compile("\n");
+
     public enum TagType {
         singleton, pair;
     }
@@ -167,7 +169,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         this.frames = new SizeLimitedSet(maxLinks);
         this.iframes = new SizeLimitedSet(maxLinks);
         this.metas = new SizeLimitedMap(maxLinks);
-        this.script = new HashSet();
+        this.script = new SizeLimitedSet(maxLinks);
         this.title = EMPTY_STRING;
         this.headlines = new ArrayList[6];
         for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList();
@@ -498,7 +500,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                 this.script.add(absolutePath(src));
                 this.evaluationScores.match(Element.scriptpath, src);
             } else {
-                this.evaluationScores.match(Element.scriptcode, text);
+                this.evaluationScores.match(Element.scriptcode, LB.matcher(new String(text)).replaceAll(" "));
             }
         }
 
@@ -509,7 +511,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
 
     @Override
     public void scrapeComment(final char[] comment) {
-        this.evaluationScores.match(Element.comment, comment);
+        this.evaluationScores.match(Element.comment, LB.matcher(new String(comment)).replaceAll(" "));
     }
 
     private String recursiveParse(final char[] inlineHtml) {