fix for pattern matcher in html parser

pull/1/head
Michael Peter Christen 13 years ago
parent 8a6edc0031
commit b1e7c11fba

@@ -68,6 +68,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private static final Set<String> linkTags0 = new HashSet<String>(12,0.99f);
private static final Set<String> linkTags1 = new HashSet<String>(15,0.99f);
+private static final Pattern LB = Pattern.compile("\n");
public enum TagType {
singleton, pair;
}
@@ -167,7 +169,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.frames = new SizeLimitedSet<MultiProtocolURI>(maxLinks);
this.iframes = new SizeLimitedSet<MultiProtocolURI>(maxLinks);
this.metas = new SizeLimitedMap<String, String>(maxLinks);
-this.script = new HashSet<MultiProtocolURI>();
+this.script = new SizeLimitedSet<MultiProtocolURI>(maxLinks);
this.title = EMPTY_STRING;
this.headlines = new ArrayList[6];
for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList<String>();
@@ -498,7 +500,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.script.add(absolutePath(src));
this.evaluationScores.match(Element.scriptpath, src);
} else {
-this.evaluationScores.match(Element.scriptcode, text);
+this.evaluationScores.match(Element.scriptcode, LB.matcher(new String(text)).replaceAll(" "));
}
}
@@ -509,7 +511,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
@Override
public void scrapeComment(final char[] comment) {
-this.evaluationScores.match(Element.comment, comment);
+this.evaluationScores.match(Element.comment, LB.matcher(new String(comment)).replaceAll(" "));
}
private String recursiveParse(final char[] inlineHtml) {

Loading…
Cancel
Save