diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index de91810f7..c1b2000bf 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -57,6 +57,7 @@ import net.yacy.cora.protocol.TimeoutRequest; import net.yacy.cora.protocol.ftp.FTPClient; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.util.CommonPattern; +import net.yacy.document.parser.html.CharacterCoding; /** * MultiProtocolURI provides a URL object for multiple protocols like http, https, ftp, smb and file @@ -66,7 +67,6 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU public static final MultiProtocolURL POISON = new MultiProtocolURL(); // poison pill for concurrent link generators - private static final Pattern ampPattern = Pattern.compile(Pattern.quote("&")); private static final long serialVersionUID = -1173233022912141884L; private static final long SMB_TIMEOUT = 5000; @@ -636,7 +636,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU } else { this.searchpart = this.path.substring(r + 1); // strip & - Matcher matcher = ampPattern.matcher(this.searchpart); + Matcher matcher = CharacterCoding.ampPattern.matcher(this.searchpart); while (matcher.find()) { this.searchpart = matcher.replaceAll("&"); matcher.reset(this.searchpart); diff --git a/source/net/yacy/document/parser/html/CharacterCoding.java b/source/net/yacy/document/parser/html/CharacterCoding.java index 213c975b0..f93300cbd 100644 --- a/source/net/yacy/document/parser/html/CharacterCoding.java +++ b/source/net/yacy/document/parser/html/CharacterCoding.java @@ -26,12 +26,15 @@ package net.yacy.document.parser.html; import java.util.HashMap; import java.util.Map; +import java.util.regex.Pattern; /** * Contains methods to convert between Unicode and XML/HTML encoding. */ public final class CharacterCoding { + /** Ampersand pattern */ + public final static Pattern ampPattern = Pattern.compile(Pattern.quote("&")); /** Ampersand character in unicode encoding. */ private static final char AMP_UNICODE = "\u0026".charAt(0); /** Ampersand character in HTML encoding. */ @@ -276,14 +279,15 @@ public final class CharacterCoding { } return sb.toString(); } - + /** * Replaces HTML-encoded characters with unicode representation. * @param text text with character to replace * @return text with replaced characters */ - public static String html2unicode(final String text) { + public static String html2unicode(String text) { if (text == null) return null; + text = ampPattern.matcher(text).replaceAll("&"); // sometimes a double-replacement is necessary. int p = 0, p1, q; final StringBuilder sb = new StringBuilder(text.length()); String s; diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 00f396bf8..0ba77481f 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -204,11 +204,12 @@ public class ContentScraper extends AbstractScraper implements Scraper { } @Override - public void scrapeText(final char[] newtext, final String insideTag) { + public void scrapeText(final char[] newtext0, final String insideTag) { // System.out.println("SCRAPE: " + UTF8.String(newtext)); if (insideTag != null && ("script".equals(insideTag) || "style".equals(insideTag))) return; int p, pl, q, s = 0; - + char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray(); + // match evaluation pattern this.evaluationScores.match(Element.text, newtext);