fix in html parser

pull/1/head
Michael Peter Christen 12 years ago
parent e1c1e57877
commit 60187a4ec2

@ -57,6 +57,7 @@ import net.yacy.cora.protocol.TimeoutRequest;
import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.CommonPattern;
import net.yacy.document.parser.html.CharacterCoding;
/**
* MultiProtocolURL provides a URL object for multiple protocols like http, https, ftp, smb and file
@ -66,7 +67,6 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
public static final MultiProtocolURL POISON = new MultiProtocolURL(); // poison pill for concurrent link generators
private static final Pattern ampPattern = Pattern.compile(Pattern.quote("&amp;"));
private static final long serialVersionUID = -1173233022912141884L;
private static final long SMB_TIMEOUT = 5000;
@ -636,7 +636,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
} else {
this.searchpart = this.path.substring(r + 1);
// strip &amp;
Matcher matcher = ampPattern.matcher(this.searchpart);
Matcher matcher = CharacterCoding.ampPattern.matcher(this.searchpart);
while (matcher.find()) {
this.searchpart = matcher.replaceAll("&");
matcher.reset(this.searchpart);

@ -26,12 +26,15 @@ package net.yacy.document.parser.html;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
/**
* Contains methods to convert between Unicode and XML/HTML encoding.
*/
public final class CharacterCoding {
/** Pattern matching the literal HTML-encoded ampersand entity {@code &amp;} */
public final static Pattern ampPattern = Pattern.compile(Pattern.quote("&amp;"));
/** Ampersand character in unicode encoding. */
private static final char AMP_UNICODE = "\u0026".charAt(0);
/** Ampersand character in HTML encoding. */
@ -276,14 +279,15 @@ public final class CharacterCoding {
}
return sb.toString();
}
/**
* Replaces HTML-encoded characters with their unicode representation.
* @param text text with characters to replace; may be null
* @return text with replaced characters, or null if text is null
*/
public static String html2unicode(final String text) {
public static String html2unicode(String text) {
if (text == null) return null;
text = ampPattern.matcher(text).replaceAll("&"); // sometimes a double-replacement is necessary.
int p = 0, p1, q;
final StringBuilder sb = new StringBuilder(text.length());
String s;

@ -204,11 +204,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
@Override
public void scrapeText(final char[] newtext, final String insideTag) {
public void scrapeText(final char[] newtext0, final String insideTag) {
// System.out.println("SCRAPE: " + UTF8.String(newtext));
if (insideTag != null && ("script".equals(insideTag) || "style".equals(insideTag))) return;
int p, pl, q, s = 0;
char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray();
// match evaluation pattern
this.evaluationScores.match(Element.text, newtext);

Loading…
Cancel
Save