From 10bddc2c2d52f7718f7bde70c08db24d0847aac1 Mon Sep 17 00:00:00 2001 From: jfhs Date: Tue, 30 Mar 2021 21:30:52 +0200 Subject: [PATCH] Decode HTML entities in all property values by default --- source/net/yacy/document/parser/html/ContentScraper.java | 7 +++---- source/net/yacy/kelondro/io/CharBuffer.java | 9 ++++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 5dcc13e1f..b4c532b0e 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -769,18 +769,18 @@ public class ContentScraper extends AbstractScraper implements Scraper { final String content = tag.opts.getProperty("content", EMPTY_STRING); String name = tag.opts.getProperty("name", EMPTY_STRING); if (name.length() > 0) { - this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content)); + this.metas.put(name.toLowerCase(), content); if (name.toLowerCase().equals("generator")) { this.evaluationScores.match(Element.metagenerator, content); } } name = tag.opts.getProperty("http-equiv", EMPTY_STRING); if (name.length() > 0) { - this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content)); + this.metas.put(name.toLowerCase(), content); } name = tag.opts.getProperty("property", EMPTY_STRING); if (name.length() > 0) { - this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content)); + this.metas.put(name.toLowerCase(), content); } } else if (tag.name.equalsIgnoreCase("area")) { final String areatitle = cleanLine(tag.opts.getProperty("title", EMPTY_STRING)); @@ -904,7 +904,6 @@ public class ContentScraper extends AbstractScraper implements Scraper { // System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text)); if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) { String href = tag.opts.getProperty("href", EMPTY_STRING); - href = CharacterCoding.html2unicode(href); AnchorURL url; if ((href.length() > 0) && ((url = absolutePath(href)) != null)) { if (followDenied()) { diff --git a/source/net/yacy/kelondro/io/CharBuffer.java b/source/net/yacy/kelondro/io/CharBuffer.java index 35330d333..ad22d9300 100644 --- a/source/net/yacy/kelondro/io/CharBuffer.java +++ b/source/net/yacy/kelondro/io/CharBuffer.java @@ -32,6 +32,7 @@ import java.io.Writer; import java.util.Properties; import net.yacy.cora.document.encoding.UTF8; +import net.yacy.document.parser.html.CharacterCoding; public final class CharBuffer extends Writer { @@ -444,6 +445,7 @@ public final class CharBuffer extends Writer { while ((pos < this.length) && (this.buffer[pos] <= 32)) pos++; // doublequotes are obligatory. However, we want to be fuzzy if they // are ommittet + String value = null; if (pos >= this.length) { // error case: input ended too early break; @@ -453,7 +455,7 @@ public final class CharBuffer extends Writer { start = pos; while ((pos < this.length) && (this.buffer[pos] != doublequote)) pos++; if (pos >= this.length) break; // this is the case if we found no parent doublequote - p.setProperty(key, new String(this.buffer, start, pos - start).trim()); + value = new String(this.buffer, start, pos - start).trim(); pos++; } else if (this.buffer[pos] == singlequote) { // search next singlequote @@ -461,14 +463,15 @@ public final class CharBuffer extends Writer { start = pos; while ((pos < this.length) && (this.buffer[pos] != singlequote)) pos++; if (pos >= this.length) break; // this is the case if we found no parent singlequote - p.setProperty(key, new String(this.buffer, start, pos - start).trim()); + value = new String(this.buffer, start, pos - start).trim(); pos++; } else { // search next whitespace start = pos; while ((pos < this.length) && (this.buffer[pos] > 32)) pos++; - p.setProperty(key, new String(this.buffer, start, pos - start).trim()); + value = new String(this.buffer, start, pos - start).trim(); } + p.setProperty(key, CharacterCoding.html2unicode(value)); // pos should point now to a whitespace: eat up spaces while ((pos < this.length) && (this.buffer[pos] <= 32)) pos++; // go on with next loop