Added missing Unicode transformation of href link contents during parsing
12 years ago · a8253ca49c
parent 0cf9e9580b
commit a8253ca49c
1 changed file with 2 additions and 1 deletion
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -467,7 +467,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
    public void scrapeTag1(final String tagname, final Properties tagopts, char[] text) {
        // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text));
        if (tagname.equalsIgnoreCase("a") && text.length < 2048) {
-            final String href = tagopts.getProperty("href", EMPTY_STRING);
+            String href = tagopts.getProperty("href", EMPTY_STRING);
+            href = CharacterCoding.html2unicode(href);
            AnchorURL url;
            if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
                final String ext = MultiProtocolURL.getFileExtension(url.getFileName());