From 67beef657f82e92f48dd8425073ad81896a2ff4b Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 10 Apr 2014 18:58:03 +0200 Subject: [PATCH] strong redesign of the HTML parser: object recursion is now done using a stack of HTML tag objects instead of a recursive parse-again method, which could cause bad performance and huge memory allocation. The new method also produces better-parsed image objects with exact anchor text references. --- .../document/parser/html/AbstractScraper.java | 5 +- .../parser/html/AbstractTransformer.java | 9 +- .../document/parser/html/ContentScraper.java | 254 ++++++++++-------- .../parser/html/ContentTransformer.java | 25 +- .../yacy/document/parser/html/Scraper.java | 6 +- .../document/parser/html/Transformer.java | 6 +- .../parser/html/TransformerWriter.java | 246 +++++++++-------- .../net/yacy/document/parser/htmlParser.java | 56 ++-- .../parser/images/genericImageParser.java | 2 + .../yacy/search/schema/HyperlinkGraph.java | 2 +- .../search/schema/WebgraphConfiguration.java | 25 +- 11 files changed, 356 insertions(+), 280 deletions(-) diff --git a/source/net/yacy/document/parser/html/AbstractScraper.java b/source/net/yacy/document/parser/html/AbstractScraper.java index d3bc2ffeb..e0980c21b 100644 --- a/source/net/yacy/document/parser/html/AbstractScraper.java +++ b/source/net/yacy/document/parser/html/AbstractScraper.java @@ -29,7 +29,6 @@ package net.yacy.document.parser.html; -import java.util.Properties; import java.util.Set; import net.yacy.kelondro.util.MemoryControl; @@ -72,10 +71,10 @@ public abstract class AbstractScraper implements Scraper { // the other methods must take into account to construct the return value correctly @Override - public abstract void scrapeTag0(String tagname, Properties tagopts); + public abstract void scrapeTag0(ContentScraper.Tag tag); @Override - public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text); + public abstract void scrapeTag1(ContentScraper.Tag tag); public 
static String stripAllTags(final char[] s) { if (s.length > 80 && !MemoryControl.request(s.length * 2, false)) return ""; diff --git a/source/net/yacy/document/parser/html/AbstractTransformer.java b/source/net/yacy/document/parser/html/AbstractTransformer.java index d812606c9..370d277e6 100644 --- a/source/net/yacy/document/parser/html/AbstractTransformer.java +++ b/source/net/yacy/document/parser/html/AbstractTransformer.java @@ -24,7 +24,6 @@ package net.yacy.document.parser.html; -import java.util.Properties; import java.util.TreeSet; public abstract class AbstractTransformer implements Transformer { @@ -58,13 +57,13 @@ public abstract class AbstractTransformer implements Transformer { // the other methods must take into account to construct the return value correctly @Override - public char[] transformTag0(final String tagname, final Properties tagopts, final char quotechar) { - return TransformerWriter.genTag0(tagname, tagopts, quotechar); + public char[] transformTag0(final ContentScraper.Tag tag, final char quotechar) { + return TransformerWriter.genTag0(tag.name, tag.opts, quotechar); } @Override - public char[] transformTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) { - return TransformerWriter.genTag1(tagname, tagopts, text, quotechar); + public char[] transformTag1(final ContentScraper.Tag tag, final char quotechar) { + return TransformerWriter.genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar); } @Override diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 285cf26a1..0b4770c6e 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -59,6 +59,7 @@ import net.yacy.cora.util.NumberTools; import net.yacy.document.SentenceReader; import net.yacy.document.parser.htmlParser; import net.yacy.document.parser.html.Evaluation.Element; +import 
net.yacy.document.parser.images.genericImageParser; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.ISO639; @@ -80,7 +81,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { singleton, pair; } - public enum Tag { + public enum TagName { html(TagType.singleton), // scraped as singleton to get attached properties like 'lang' body(TagType.singleton), // scraped as singleton to get attached properties like 'class' div(TagType.singleton), // scraped as singleton to get attached properties like 'id' @@ -111,14 +112,49 @@ public class ContentScraper extends AbstractScraper implements Scraper { style(TagType.pair); public TagType type; - private Tag(final TagType type) { + private TagName(final TagType type) { this.type = type; } } + public static class Tag { + public String name; + public Properties opts; + public CharBuffer content; + public Tag(final String name) { + this.name = name; + this.opts = new Properties(); + this.content = new CharBuffer(100); + } + public Tag(final String name, final Properties opts) { + this.name = name; + this.opts = opts; + this.content = new CharBuffer(100); + } + public Tag(final String name, final Properties opts, final CharBuffer content) { + this.name = name; + this.opts = opts; + this.content = content; + } + public void close() { + this.name = null; + this.opts = null; + if (this.content != null) this.content.close(); + this.content = null; + } + @Override + public void finalize() { + this.close(); + } + @Override + public String toString() { + return "<" + name + " " + opts + ">" + content + ""; + } + } + // all these tags must be given in lowercase, because the tags from the files are compared in lowercase static { - for (final Tag tag: Tag.values()) { + for (final TagName tag: TagName.values()) { if (tag.type == TagType.singleton) linkTags0.add(tag.name()); if (tag.type == TagType.pair) linkTags1.add(tag.name()); } @@ -321,88 +357,88 @@ public 
class ContentScraper extends AbstractScraper implements Scraper { } @Override - public void scrapeTag0(final String tagname, final Properties tagopts) { - if (tagname.equalsIgnoreCase("img")) { - final String src = tagopts.getProperty("src", EMPTY_STRING); + public void scrapeTag0(Tag tag) { + if (tag.name.equalsIgnoreCase("img")) { + final String src = tag.opts.getProperty("src", EMPTY_STRING); try { if (src.length() > 0) { final AnchorURL url = absolutePath(src); if (url != null) { - final int width = Integer.parseInt(tagopts.getProperty("width", "-1")); - final int height = Integer.parseInt(tagopts.getProperty("height", "-1")); - final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", EMPTY_STRING), width, height, -1); + final int width = Integer.parseInt(tag.opts.getProperty("width", "-1")); + final int height = Integer.parseInt(tag.opts.getProperty("height", "-1")); + final ImageEntry ie = new ImageEntry(url, tag.opts.getProperty("alt", EMPTY_STRING), width, height, -1); this.images.add(ie); } } } catch (final NumberFormatException e) {} this.evaluationScores.match(Element.imgpath, src); - } else if(tagname.equalsIgnoreCase("base")) { + } else if(tag.name.equalsIgnoreCase("base")) { try { - this.root = new DigestURL(tagopts.getProperty("href", EMPTY_STRING)); + this.root = new DigestURL(tag.opts.getProperty("href", EMPTY_STRING)); } catch (final MalformedURLException e) {} - } else if (tagname.equalsIgnoreCase("frame")) { - final AnchorURL src = absolutePath(tagopts.getProperty("src", EMPTY_STRING)); - tagopts.put("src", src.toNormalform(true)); - src.setAll(tagopts); + } else if (tag.name.equalsIgnoreCase("frame")) { + final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING)); + tag.opts.put("src", src.toNormalform(true)); + src.setAll(tag.opts); this.anchors.add(src); this.frames.add(src); this.evaluationScores.match(Element.framepath, src.toNormalform(true)); - } else if (tagname.equalsIgnoreCase("body")) { - final String c 
= tagopts.getProperty("class", EMPTY_STRING); + } else if (tag.name.equalsIgnoreCase("body")) { + final String c = tag.opts.getProperty("class", EMPTY_STRING); this.evaluationScores.match(Element.bodyclass, c); - } else if (tagname.equalsIgnoreCase("div")) { - final String id = tagopts.getProperty("id", EMPTY_STRING); + } else if (tag.name.equalsIgnoreCase("div")) { + final String id = tag.opts.getProperty("id", EMPTY_STRING); this.evaluationScores.match(Element.divid, id); - final String itemtype = tagopts.getProperty("itemtype", EMPTY_STRING); + final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING); if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) { breadcrumbs++; } - } else if (tagname.equalsIgnoreCase("meta")) { - final String content = tagopts.getProperty("content", EMPTY_STRING); - String name = tagopts.getProperty("name", EMPTY_STRING); + } else if (tag.name.equalsIgnoreCase("meta")) { + final String content = tag.opts.getProperty("content", EMPTY_STRING); + String name = tag.opts.getProperty("name", EMPTY_STRING); if (name.length() > 0) { this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content)); if (name.toLowerCase().equals("generator")) { this.evaluationScores.match(Element.metagenerator, content); } } - name = tagopts.getProperty("http-equiv", EMPTY_STRING); + name = tag.opts.getProperty("http-equiv", EMPTY_STRING); if (name.length() > 0) { this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content)); } - name = tagopts.getProperty("property", EMPTY_STRING); + name = tag.opts.getProperty("property", EMPTY_STRING); if (name.length() > 0) { this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content)); } - } else if (tagname.equalsIgnoreCase("area")) { - final String areatitle = cleanLine(tagopts.getProperty("title", EMPTY_STRING)); - //String alt = tagopts.getProperty("alt",EMPTY_STRING); - final String href = tagopts.getProperty("href", EMPTY_STRING); + } else if 
(tag.name.equalsIgnoreCase("area")) { + final String areatitle = cleanLine(tag.opts.getProperty("title", EMPTY_STRING)); + //String alt = tag.opts.getProperty("alt",EMPTY_STRING); + final String href = tag.opts.getProperty("href", EMPTY_STRING); if (href.length() > 0) { - tagopts.put("name", areatitle); + tag.opts.put("name", areatitle); AnchorURL url = absolutePath(href); - tagopts.put("href", url.toNormalform(true)); - url.setAll(tagopts); + tag.opts.put("href", url.toNormalform(true)); + url.setAll(tag.opts); this.anchors.add(url); } - } else if (tagname.equalsIgnoreCase("link")) { - final String href = tagopts.getProperty("href", EMPTY_STRING); + } else if (tag.name.equalsIgnoreCase("link")) { + final String href = tag.opts.getProperty("href", EMPTY_STRING); final AnchorURL newLink = absolutePath(href); if (newLink != null) { - tagopts.put("href", newLink.toNormalform(true)); - String rel = tagopts.getProperty("rel", EMPTY_STRING); - final String linktitle = tagopts.getProperty("title", EMPTY_STRING); - final String type = tagopts.getProperty("type", EMPTY_STRING); - final String hreflang = tagopts.getProperty("hreflang", EMPTY_STRING); + tag.opts.put("href", newLink.toNormalform(true)); + String rel = tag.opts.getProperty("rel", EMPTY_STRING); + final String linktitle = tag.opts.getProperty("title", EMPTY_STRING); + final String type = tag.opts.getProperty("type", EMPTY_STRING); + final String hreflang = tag.opts.getProperty("hreflang", EMPTY_STRING); if (rel.equalsIgnoreCase("shortcut icon")) { final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1); this.images.add(ie); this.favicon = newLink; } else if (rel.equalsIgnoreCase("canonical")) { - tagopts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next()); - newLink.setAll(tagopts); + tag.opts.put("name", this.titles.size() == 0 ? 
"" : this.titles.iterator().next()); + newLink.setAll(tag.opts); this.anchors.add(newLink); this.canonical = newLink; } else if (rel.equalsIgnoreCase("publisher")) { @@ -417,130 +453,130 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.css.put(newLink, rel); this.evaluationScores.match(Element.csspath, href); } else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) { - tagopts.put("name", linktitle); - newLink.setAll(tagopts); + tag.opts.put("name", linktitle); + newLink.setAll(tag.opts); this.anchors.add(newLink); } } - } else if(tagname.equalsIgnoreCase("embed")) { - final String src = tagopts.getProperty("src", EMPTY_STRING); + } else if(tag.name.equalsIgnoreCase("embed")) { + final String src = tag.opts.getProperty("src", EMPTY_STRING); try { if (src.length() > 0) { final AnchorURL url = absolutePath(src); if (url != null) { - final int width = Integer.parseInt(tagopts.getProperty("width", "-1")); - final int height = Integer.parseInt(tagopts.getProperty("height", "-1")); - tagopts.put("src", url.toNormalform(true)); - final EmbedEntry ie = new EmbedEntry(url, width, height, tagopts.getProperty("type", EMPTY_STRING), tagopts.getProperty("pluginspage", EMPTY_STRING)); + final int width = Integer.parseInt(tag.opts.getProperty("width", "-1")); + final int height = Integer.parseInt(tag.opts.getProperty("height", "-1")); + tag.opts.put("src", url.toNormalform(true)); + final EmbedEntry ie = new EmbedEntry(url, width, height, tag.opts.getProperty("type", EMPTY_STRING), tag.opts.getProperty("pluginspage", EMPTY_STRING)); this.embeds.put(url, ie); - url.setAll(tagopts); + url.setAll(tag.opts); this.anchors.add(url); } } } catch (final NumberFormatException e) {} - } else if(tagname.equalsIgnoreCase("param")) { - final String name = tagopts.getProperty("name", EMPTY_STRING); + } else if(tag.name.equalsIgnoreCase("param")) { + final String name = tag.opts.getProperty("name", EMPTY_STRING); if 
(name.equalsIgnoreCase("movie")) { - AnchorURL url = absolutePath(tagopts.getProperty("value", EMPTY_STRING)); - tagopts.put("value", url.toNormalform(true)); - url.setAll(tagopts); + AnchorURL url = absolutePath(tag.opts.getProperty("value", EMPTY_STRING)); + tag.opts.put("value", url.toNormalform(true)); + url.setAll(tag.opts); this.anchors.add(url); } - } else if (tagname.equalsIgnoreCase("iframe")) { - final AnchorURL src = absolutePath(tagopts.getProperty("src", EMPTY_STRING)); - tagopts.put("src", src.toNormalform(true)); - src.setAll(tagopts); + } else if (tag.name.equalsIgnoreCase("iframe")) { + final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING)); + tag.opts.put("src", src.toNormalform(true)); + src.setAll(tag.opts); this.anchors.add(src); this.iframes.add(src); this.evaluationScores.match(Element.iframepath, src.toNormalform(true)); - } else if (tagname.equalsIgnoreCase("html")) { - final String lang = tagopts.getProperty("lang", EMPTY_STRING); + } else if (tag.name.equalsIgnoreCase("html")) { + final String lang = tag.opts.getProperty("lang", EMPTY_STRING); if (!lang.isEmpty()) // fake a language meta to preserv detection from this.metas.put("dc.language",lang.substring(0,2)); // fix found entries like "hu-hu" } // fire event - fireScrapeTag0(tagname, tagopts); + fireScrapeTag0(tag.name, tag.opts); } @Override - public void scrapeTag1(final String tagname, final Properties tagopts, char[] text) { - // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text)); - if (tagname.equalsIgnoreCase("a") && text.length < 2048) { - String href = tagopts.getProperty("href", EMPTY_STRING); + public void scrapeTag1(Tag tag) { + // System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text)); + if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) { + String href = tag.opts.getProperty("href", EMPTY_STRING); 
href = CharacterCoding.html2unicode(href); AnchorURL url; if ((href.length() > 0) && ((url = absolutePath(href)) != null)) { final String ext = MultiProtocolURL.getFileExtension(url.getFileName()); - if (ext.equals("png") || ext.equals("gif") || ext.equals("jpg") || ext.equals("jpeg") || ext.equals("tiff") || ext.equals("tif")) { + if (genericImageParser.SUPPORTED_EXTENSIONS.contains(ext)) { // special handling of such urls: put them to the image urls - final ImageEntry ie = new ImageEntry(url, recursiveParse(url, text), -1, -1, -1); + final ImageEntry ie = new ImageEntry(url, recursiveParse(url, tag.content.getChars()), -1, -1, -1); this.images.add(ie); } else { if (followDenied()) { - String rel = tagopts.getProperty("rel", EMPTY_STRING); + String rel = tag.opts.getProperty("rel", EMPTY_STRING); if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow"; - tagopts.put("rel", rel); + tag.opts.put("rel", rel); } - tagopts.put("text", new String(text)); - tagopts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute - url.setAll(tagopts); - recursiveParse(url, text); + tag.opts.put("text", new String(tag.content.getChars())); + tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute + url.setAll(tag.opts); + recursiveParse(url, tag.content.getChars()); this.anchors.add(url); } } this.evaluationScores.match(Element.apath, href); } final String h; - if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) { - h = recursiveParse(null, text); + if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) { + h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); if (h.length() > 0) this.headlines[0].add(h); - } else if((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) { - h = recursiveParse(null, text); + } else 
if((tag.name.equalsIgnoreCase("h2")) && (tag.content.length() < 1024)) { + h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); if (h.length() > 0) this.headlines[1].add(h); - } else if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) { - h = recursiveParse(null, text); + } else if ((tag.name.equalsIgnoreCase("h3")) && (tag.content.length() < 1024)) { + h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); if (h.length() > 0) this.headlines[2].add(h); - } else if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) { - h = recursiveParse(null, text); + } else if ((tag.name.equalsIgnoreCase("h4")) && (tag.content.length() < 1024)) { + h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); if (h.length() > 0) this.headlines[3].add(h); - } else if ((tagname.equalsIgnoreCase("h5")) && (text.length < 1024)) { - h = recursiveParse(null, text); + } else if ((tag.name.equalsIgnoreCase("h5")) && (tag.content.length() < 1024)) { + h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); if (h.length() > 0) this.headlines[4].add(h); - } else if ((tagname.equalsIgnoreCase("h6")) && (text.length < 1024)) { - h = recursiveParse(null, text); + } else if ((tag.name.equalsIgnoreCase("h6")) && (tag.content.length() < 1024)) { + h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); if (h.length() > 0) this.headlines[5].add(h); - } else if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) { - String t = recursiveParse(null, text); - this.titles.add(t); - this.evaluationScores.match(Element.title, t); - } else if ((tagname.equalsIgnoreCase("b")) && (text.length < 1024)) { - h = recursiveParse(null, text); + } else if ((tag.name.equalsIgnoreCase("title")) && (tag.content.length() < 1024)) { + h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); + this.titles.add(h); + 
this.evaluationScores.match(Element.title, h); + } else if ((tag.name.equalsIgnoreCase("b")) && (tag.content.length() < 1024)) { + h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); if (h.length() > 0) this.bold.inc(h); - } else if ((tagname.equalsIgnoreCase("strong")) && (text.length < 1024)) { - h = recursiveParse(null, text); + } else if ((tag.name.equalsIgnoreCase("strong")) && (tag.content.length() < 1024)) { + h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); if (h.length() > 0) this.bold.inc(h); - } else if ((tagname.equalsIgnoreCase("i")) && (text.length < 1024)) { - h = recursiveParse(null, text); + } else if ((tag.name.equalsIgnoreCase("i")) && (tag.content.length() < 1024)) { + h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); if (h.length() > 0) this.italic.inc(h); - } else if ((tagname.equalsIgnoreCase("u")) && (text.length < 1024)) { - h = recursiveParse(null, text); + } else if ((tag.name.equalsIgnoreCase("u")) && (tag.content.length() < 1024)) { + h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); if (h.length() > 0) this.underline.inc(h); - } else if ((tagname.equalsIgnoreCase("li")) && (text.length < 1024)) { - h = recursiveParse(null, text); + } else if ((tag.name.equalsIgnoreCase("li")) && (tag.content.length() < 1024)) { + h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); if (h.length() > 0) this.li.add(h); - } else if (tagname.equalsIgnoreCase("script")) { - final String src = tagopts.getProperty("src", EMPTY_STRING); + } else if (tag.name.equalsIgnoreCase("script")) { + final String src = tag.opts.getProperty("src", EMPTY_STRING); if (src.length() > 0) { this.script.add(absolutePath(src)); this.evaluationScores.match(Element.scriptpath, src); } else { - this.evaluationScores.match(Element.scriptcode, LB.matcher(new String(text)).replaceAll(" ")); + 
this.evaluationScores.match(Element.scriptcode, LB.matcher(new String(tag.content.getChars())).replaceAll(" ")); } } // fire event - fireScrapeTag1(tagname, tagopts, text); + fireScrapeTag1(tag.name, tag.opts, tag.content.getChars()); } @@ -570,15 +606,20 @@ public class ContentScraper extends AbstractScraper implements Scraper { for (final AnchorURL entry: scraper.getAnchors()) { this.anchors.add(entry); } + String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars()))); for (ImageEntry ie: scraper.images) { if (linkurl != null) { ie.setLinkurl(linkurl); - ie.setAnchortext(new String(inlineHtml)); + ie.setAnchortext(line); + } + // this image may have been added recently from the same location (as this is a recursive parse) + // we want to keep only one of them, check if they are equal + if (this.images.size() > 0 && this.images.get(this.images.size() - 1).url().equals(ie.url())) { + this.images.remove(this.images.size() - 1); } this.images.add(ie); } - String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars()))); scraper.close(); return line; } @@ -681,6 +722,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } public String getText() { + this.content.trim(); try { return this.content.toString(); } catch (final OutOfMemoryError e) { diff --git a/source/net/yacy/document/parser/html/ContentTransformer.java b/source/net/yacy/document/parser/html/ContentTransformer.java index e4dbf6238..5b6fd8252 100644 --- a/source/net/yacy/document/parser/html/ContentTransformer.java +++ b/source/net/yacy/document/parser/html/ContentTransformer.java @@ -29,7 +29,6 @@ import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; -import java.util.Properties; import java.util.TreeSet; import net.yacy.cora.document.encoding.ASCII; @@ -115,27 +114,27 @@ public class ContentTransformer extends AbstractTransformer implements Transform } @Override - 
public char[] transformTag0(final String tagname, final Properties tagopts, final char quotechar) { - if (tagname.equals("img")) { + public char[] transformTag0(final ContentScraper.Tag tag, final char quotechar) { + if (tag.name.equals("img")) { // check bluelist - if (bluelistHit(tagopts.getProperty("src", "").toCharArray())) return genBlueLetters(5); - if (bluelistHit(tagopts.getProperty("alt", "").toCharArray())) return genBlueLetters(5); + if (bluelistHit(tag.opts.getProperty("src", "").toCharArray())) return genBlueLetters(5); + if (bluelistHit(tag.opts.getProperty("alt", "").toCharArray())) return genBlueLetters(5); // replace image alternative name - tagopts.setProperty("alt", new String(transformText(tagopts.getProperty("alt", "").toCharArray()))); + tag.opts.setProperty("alt", new String(transformText(tag.opts.getProperty("alt", "").toCharArray()))); } - if (tagname.equals("input") && (tagopts.getProperty("type") != null && tagopts.getProperty("type").equals("submit"))) { + if (tag.name.equals("input") && (tag.opts.getProperty("type") != null && tag.opts.getProperty("type").equals("submit"))) { // rewrite button name - tagopts.setProperty("value", new String(transformText(tagopts.getProperty("value", "").toCharArray()))); + tag.opts.setProperty("value", new String(transformText(tag.opts.getProperty("value", "").toCharArray()))); } - return TransformerWriter.genTag0(tagname, tagopts, quotechar); + return TransformerWriter.genTag0(tag.name, tag.opts, quotechar); } @Override - public char[] transformTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) { - if (bluelistHit(tagopts.getProperty("href","").toCharArray())) return genBlueLetters(text.length); - if (bluelistHit(text)) return genBlueLetters(text.length); - return TransformerWriter.genTag1(tagname, tagopts, text, quotechar); + public char[] transformTag1(final ContentScraper.Tag tag, final char quotechar) { + if 
(bluelistHit(tag.opts.getProperty("href","").toCharArray())) return genBlueLetters(tag.content.length()); + if (bluelistHit(tag.content.getChars())) return genBlueLetters(tag.content.length()); + return TransformerWriter.genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar); } @Override diff --git a/source/net/yacy/document/parser/html/Scraper.java b/source/net/yacy/document/parser/html/Scraper.java index e1dfe73e1..dc8e3e964 100644 --- a/source/net/yacy/document/parser/html/Scraper.java +++ b/source/net/yacy/document/parser/html/Scraper.java @@ -24,8 +24,6 @@ package net.yacy.document.parser.html; -import java.util.Properties; - public interface Scraper { public boolean isTag0(String tag); @@ -34,9 +32,9 @@ public interface Scraper { public void scrapeText(char[] text, String insideTag); - public void scrapeTag0(String tagname, Properties tagopts); + public void scrapeTag0(ContentScraper.Tag tag); - public void scrapeTag1(String tagname, Properties tagopts, char[] text); + public void scrapeTag1(ContentScraper.Tag tag); public void scrapeComment(final char[] comment); diff --git a/source/net/yacy/document/parser/html/Transformer.java b/source/net/yacy/document/parser/html/Transformer.java index 2aedfa120..9b605340e 100644 --- a/source/net/yacy/document/parser/html/Transformer.java +++ b/source/net/yacy/document/parser/html/Transformer.java @@ -24,8 +24,6 @@ package net.yacy.document.parser.html; -import java.util.Properties; - public interface Transformer { // the init method is used to initialize the transformer with some values @@ -52,10 +50,10 @@ public interface Transformer { public char[] transformText(char[] text); // method that is called when a body-less tag occurs - public char[] transformTag0(String tagname, Properties tagopts, char quotechar); + public char[] transformTag0(ContentScraper.Tag tag, char quotechar); // method that is called when a body-containing text occurs - public char[] transformTag1(String tagname, Properties tagopts, char[] 
text, char quotechar); + public char[] transformTag1(ContentScraper.Tag tag, char quotechar); public void close(); } diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java index 53d8e91de..408322dcb 100644 --- a/source/net/yacy/document/parser/html/TransformerWriter.java +++ b/source/net/yacy/document/parser/html/TransformerWriter.java @@ -43,6 +43,7 @@ import java.net.MalformedURLException; import java.nio.charset.Charset; import java.util.Enumeration; import java.util.Properties; +import java.util.Stack; import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.DigestURL; @@ -62,9 +63,7 @@ public final class TransformerWriter extends Writer { private final OutputStream outStream; private OutputStreamWriter out; private CharBuffer buffer; - private String filterTag; - private Properties filterOpts; - private CharBuffer filterCont; + private Stack tagStack; private final Scraper scraper; private final Transformer transformer; private boolean inSingleQuote; @@ -72,7 +71,7 @@ public final class TransformerWriter extends Writer { private boolean inComment; private boolean binaryUnsuspect; private final boolean passbyIfBinarySuspect; - + public TransformerWriter( final OutputStream outStream, final Charset charSet, @@ -95,9 +94,7 @@ public final class TransformerWriter extends Writer { this.scraper = scraper; this.transformer = transformer; this.buffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, initialBufferSize); - this.filterTag = null; - this.filterOpts = null; - this.filterCont = null; + this.tagStack = new Stack(); this.inSingleQuote = false; this.inDoubleQuote = false; this.inComment = false; @@ -186,63 +183,105 @@ public final class TransformerWriter extends Writer { return result; } - private char[] filterTag(final String tag, final boolean opening, final char[] content, final char quotechar) { - //System.out.println("filterTag: filterTag=" + ((this.filterTag 
== null) ? "null" : this.filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + new String(content)); // debug - // distinguish the following cases: - // - (1) not collecting data for a tag and getting no tag (not opener and not close) - // - (2) not collecting data for a tag and getting a tag opener - // - (3) not collecting data for a tag and getting a tag close - // - (4) collecting data for a tag and getting no tag (not opener and not close) - // - (5) collecting data for a tag and getting a new/different tag opener without closing the previous tag - // - (6) collecting data for a tag and getting a tag close for the wrong tag (a different than the opener) - // - (7) collecting data for a tag and getting the correct close tag for that collecting tag - - if (this.filterTag == null) { + /** + * the token processor distinguishes three different types of input: opening tag, closing tag, text content + * @param in - the token to be processed + * @param quotechar + * @return a processed version of the token + */ + private char[] tokenProcessor(final char[] in, final char quotechar) { + if (in.length == 0) return in; + + // scan the string and parse structure + if (in.length <= 2 || in[0] != lb) return filterTag(in); // this is a text + + // this is a tag + String tag; + int tagend; + if (in[1] == '/') { + // a closing tag + tagend = tagEnd(in, 2); + tag = new String(in, 2, tagend - 2).toLowerCase(); + final char[] text = new char[in.length - tagend - 1]; + System.arraycopy(in, tagend, text, 0, in.length - tagend - 1); + return filterTag(text, quotechar, tag, false); + } + + // an opening tag + tagend = tagEnd(in, 1); + tag = new String(in, 1, tagend - 1).toLowerCase(); + final char[] text = new char[in.length - tagend - 1]; + System.arraycopy(in, tagend, text, 0, in.length - tagend - 1); + return filterTag(text, quotechar, tag, true); + } + + // distinguish the following cases: + // - (1) not collecting data for a tag and getting 
no tag (not opener and not close) + // - (2) not collecting data for a tag and getting a tag opener + // - (3) not collecting data for a tag and getting a tag close + // - (4) collecting data for a tag and getting no tag (not opener and not close) + // - (5) collecting data for a tag and getting a new/different tag opener without closing the previous tag + // - (6) collecting data for a tag and getting a tag close for the wrong tag (a different than the opener) + // - (7) collecting data for a tag and getting the correct close tag for that collecting tag + + /** + * + * @param content + * @return + */ + private char[] filterTag(final char[] content) { + if (this.tagStack.size() == 0) { // we are not collection tag text -> case (1) - (3) + // case (1): this is not a tag opener/closer + if (this.scraper != null && content.length > 0) this.scraper.scrapeText(content, null); + if (this.transformer != null) return this.transformer.transformText(content); + return content; + } - if (tag == null) { - // case (1): this is not a tag opener/closer - if (this.scraper != null && content.length > 0) this.scraper.scrapeText(content, null); - if (this.transformer != null) return this.transformer.transformText(content); - return content; - } + // we are collection tag text for the tag 'filterTag' -> case (4) - (7) + // case (4): getting no tag, go on collecting content + if (this.scraper != null) { + this.scraper.scrapeText(content, this.tagStack.lastElement().name); + } + if (this.transformer != null) { + this.tagStack.lastElement().content.append(this.transformer.transformText(content)); + } else { + this.tagStack.lastElement().content.append(content); + } + return new char[0]; + } + + private char[] filterTag(final char[] content, final char quotechar, final String tagname, final boolean opening) { + assert tagname != null; + + if (this.tagStack.size() == 0) { + // we are not collection tag text -> case (1) - (3) // we have a new tag if (opening) { // case (2): - return 
filterTagOpening(tag, content, quotechar); + return filterTagOpening(tagname, content, quotechar); } - // its a close tag + // its a close tag where no should be // case (3): we ignore that thing and return it again - return genTag0raw(tag, false, content); + return genTag0raw(tagname, false, content); } // we are collection tag text for the tag 'filterTag' -> case (4) - (7) - if (tag == null || tag.equals("!")) { - // case (4): getting no tag, go on collecting content - if (this.scraper != null) { - this.scraper.scrapeText(content, this.filterTag); - } - if (this.transformer != null) { - this.filterCont.append(this.transformer.transformText(content)); - } else { - this.filterCont.append(content); - } - return new char[0]; - } + if (tagname.equals("!")) filterTag(content); // it's a tag! which one? if (opening) { // case (5): the opening should not be here. But we keep the order anyway - this.filterCont.append(filterTagOpening(tag, content, quotechar)); - return filterTagCloseing(quotechar); + this.tagStack.lastElement().content.append(filterTagOpening(tagname, content, quotechar)); + return new char[0]; } - if (!tag.equalsIgnoreCase(this.filterTag)) { + if (!tagname.equalsIgnoreCase(this.tagStack.lastElement().name)) { // case (6): its a closing tag, but the wrong one. just add it. 
- this.filterCont.append(genTag0raw(tag, opening, content)); + this.tagStack.lastElement().content.append(genTag0raw(tagname, opening, content)); return new char[0]; } @@ -250,101 +289,66 @@ public final class TransformerWriter extends Writer { return filterTagCloseing(quotechar); } - private char[] filterTagOpening(final String tag, final char[] content, final char quotechar) { - if (this.scraper != null && this.scraper.isTag0(tag)) { + private char[] filterTagOpening(final String tagname, final char[] content, final char quotechar) { + final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content); + ContentScraper.Tag tag = new ContentScraper.Tag(tagname, charBuffer.propParser()); + charBuffer.close(); + if (this.scraper != null && this.scraper.isTag0(tagname)) { // this single tag is collected at once here - final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content); - this.scraper.scrapeTag0(tag, charBuffer.propParser()); - charBuffer.close(); + this.scraper.scrapeTag0(tag); } - if (this.transformer != null && this.transformer.isTag0(tag)) { + if (this.transformer != null && this.transformer.isTag0(tagname)) { // this single tag is collected at once here - final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content); char[] b = new char[0]; - try { - b = this.transformer.transformTag0(tag, scb.propParser(), quotechar); - } finally { - scb.close(); - } + b = this.transformer.transformTag0(tag, quotechar); return b; - } else if ((this.scraper != null && this.scraper.isTag1(tag)) || - (this.transformer != null && this.transformer.isTag1(tag))) { - // ok, start collecting - this.filterTag = tag; - final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content); - this.filterOpts = scb.propParser(); - scb.close(); - if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset(); + } else if ((this.scraper != null 
&& this.scraper.isTag1(tagname)) || + (this.transformer != null && this.transformer.isTag1(tagname))) { + // ok, start collecting; we don't push this here to the scraper or transformer; we do that when the tag is closed. + this.tagStack.push(tag); return new char[0]; } else { // we ignore that thing and return it again - return genTag0raw(tag, true, content); + return genTag0raw(tagname, true, content); } } private char[] filterTagCloseing(final char quotechar) { char[] ret; - if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars()); + ContentScraper.Tag tag = this.tagStack.lastElement(); + if (this.scraper != null) this.scraper.scrapeTag1(tag); if (this.transformer != null) { - ret = this.transformer.transformTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar); + ret = this.transformer.transformTag1(tag, quotechar); } else { - ret = genTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar); + ret = genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar); + } + if ((this.scraper != null && this.scraper.isTag1(tag.name)) || + (this.transformer != null && this.transformer.isTag1(tag.name))) { + // remove the tag from the stack as soon as the tag is processed + this.tagStack.pop(); + // at this point the characters from the recently processed tag must be attached to the previous tag + if (this.tagStack.size() > 0) this.tagStack.lastElement().content.append(ret); } - this.filterTag = null; - this.filterOpts = null; - this.filterCont = null; return ret; } private char[] filterFinalize(final char quotechar) { - if (this.filterTag == null) { + if (this.tagStack.size() == 0) { return new char[0]; } // it's our closing tag! return complete result. 
char[] ret; - if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars()); + if (this.scraper != null) this.scraper.scrapeTag1(this.tagStack.lastElement()); if (this.transformer != null) { - ret = this.transformer.transformTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar); + ret = this.transformer.transformTag1(this.tagStack.lastElement(), quotechar); } else { - ret = genTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar); + ret = genTag1(this.tagStack.lastElement().name, this.tagStack.lastElement().opts, this.tagStack.lastElement().content.getChars(), quotechar); } - this.filterTag = null; - this.filterOpts = null; - this.filterCont = null; + this.tagStack.pop(); return ret; } - private char[] filterSentence(final char[] in, final char quotechar) { - if (in.length == 0) return in; - //System.out.println("filterSentence, quotechar = \"" + quotechar + "\": " + new String(in)); // debug - // scan the string and parse structure - if (in.length > 2 && in[0] == lb) { - - // a tag - String tag; - int tagend; - if (in[1] == '/') { - // a closing tag - tagend = tagEnd(in, 2); - tag = new String(in, 2, tagend - 2).toLowerCase(); - final char[] text = new char[in.length - tagend - 1]; - System.arraycopy(in, tagend, text, 0, in.length - tagend - 1); - return filterTag(tag, false, text, quotechar); - } - - // an opening tag - tagend = tagEnd(in, 1); - tag = new String(in, 1, tagend - 1).toLowerCase(); - final char[] text = new char[in.length - tagend - 1]; - System.arraycopy(in, tagend, text, 0, in.length - tagend - 1); - return filterTag(tag, true, text, quotechar); - } - - // a text - return filterTag(null, true, in, quotechar); - } - private static int tagEnd(final char[] tag, final int start) { char c; for (int i = start; i < tag.length; i++) { @@ -358,6 +362,14 @@ public final class TransformerWriter extends Writer { return tag.length - 1; } + /** + * this is the 
tokenizer of the parser: it splits the input into pieces which are + * - quoted text parts + * - commented text parts + * - tags (opening and closing) + * - text content between all these parts + * The tokens are then parsed with the filterSentence method + */ @Override public void write(final int c) throws IOException { //System.out.println((char) c); @@ -375,7 +387,7 @@ public final class TransformerWriter extends Writer { if ((c == rb) && (this.buffer.length() > 0 && this.buffer.charAt(0) == lb)) { this.inSingleQuote = false; // the tag ends here. after filtering: pass on - filtered = filterSentence(this.buffer.getChars(), singlequote); + filtered = tokenProcessor(this.buffer.getChars(), singlequote); if (this.out != null) { this.out.write(filtered); } // this.buffer = new serverByteBuffer(); this.buffer.reset(); @@ -387,7 +399,7 @@ public final class TransformerWriter extends Writer { if (c == rb && this.buffer.length() > 0 && this.buffer.charAt(0) == lb) { this.inDoubleQuote = false; // the tag ends here. after filtering: pass on - filtered = filterSentence(this.buffer.getChars(), doublequote); + filtered = tokenProcessor(this.buffer.getChars(), doublequote); if (this.out != null) this.out.write(filtered); // this.buffer = new serverByteBuffer(); this.buffer.reset(); @@ -425,7 +437,7 @@ public final class TransformerWriter extends Writer { } else if (c == rb) { this.buffer.append(c); // the tag ends here. 
after filtering: pass on - filtered = filterSentence(this.buffer.getChars(), doublequote); + filtered = tokenProcessor(this.buffer.getChars(), doublequote); if (this.out != null) this.out.write(filtered); // this.buffer = new serverByteBuffer(); this.buffer.reset(); @@ -433,7 +445,7 @@ public final class TransformerWriter extends Writer { // this is an error case // we consider that there is one rb missing if (this.buffer.length() > 0) { - filtered = filterSentence(this.buffer.getChars(), doublequote); + filtered = tokenProcessor(this.buffer.getChars(), doublequote); if (this.out != null) this.out.write(filtered); } // this.buffer = new serverByteBuffer(); @@ -447,7 +459,7 @@ public final class TransformerWriter extends Writer { if (c == lb) { // the text ends here if (this.buffer.length() > 0) { - filtered = filterSentence(this.buffer.getChars(), doublequote); + filtered = tokenProcessor(this.buffer.getChars(), doublequote); if (this.out != null) this.out.write(filtered); } // this.buffer = new serverByteBuffer(); @@ -492,7 +504,7 @@ public final class TransformerWriter extends Writer { final char quotechar = (this.inSingleQuote) ? 
singlequote : doublequote; if (this.buffer != null) { if (this.buffer.length() > 0) { - final char[] filtered = filterSentence(this.buffer.getChars(), quotechar); + final char[] filtered = tokenProcessor(this.buffer.getChars(), quotechar); if (this.out != null) this.out.write(filtered); } this.buffer.close(); @@ -504,10 +516,8 @@ public final class TransformerWriter extends Writer { this.out.flush(); this.out.close(); } - this.filterTag = null; - this.filterOpts = null; - if (this.filterCont != null) this.filterCont.close(); - this.filterCont = null; + this.tagStack.clear(); + this.tagStack = null; if (this.scraper != null) this.scraper.finish(); } diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index 3ed19ebf1..b75d06dd5 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -28,16 +28,18 @@ import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.UnsupportedCharsetException; import java.util.LinkedHashMap; -import java.util.regex.Pattern; +import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; +import net.yacy.cora.util.CommonPattern; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -53,9 +55,7 @@ import com.ibm.icu.text.CharsetDetector; public class htmlParser extends AbstractParser implements Parser { - private static final Pattern patternUnderline = Pattern.compile("_"); - private final int maxLinks = 10000; - private Charset detectedcharset; + private static final int maxLinks = 
10000; public htmlParser() { super("Streaming HTML Parser"); @@ -97,9 +97,10 @@ public class htmlParser extends AbstractParser implements Parser { try { // first get a document from the parsed html - final ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream, maxLinks); + Charset[] detectedcharsetcontainer = new Charset[]{null}; + final ContentScraper scraper = parseToScraper(location, documentCharset, detectedcharsetcontainer, sourceStream, maxLinks); // parseToScraper also detects/corrects/sets charset from html content tag - final Document document = transformScraper(location, mimeType, detectedcharset.name(), scraper); + final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper); return new Document[]{document}; } catch (final IOException e) { @@ -155,9 +156,27 @@ public class htmlParser extends AbstractParser implements Parser { return ppd; } - public ContentScraper parseToScraper( + public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, String input, int maxLinks) throws IOException { + Charset[] detectedcharsetcontainer = new Charset[]{null}; + InputStream sourceStream; + try { + sourceStream = new ByteArrayInputStream(documentCharset == null ? 
UTF8.getBytes(input) : input.getBytes(documentCharset)); + } catch (UnsupportedEncodingException e) { + sourceStream = new ByteArrayInputStream(UTF8.getBytes(input)); + } + ContentScraper scraper; + try { + scraper = parseToScraper(location, documentCharset, detectedcharsetcontainer, sourceStream, maxLinks); + } catch (Failure e) { + throw new IOException(e.getMessage()); + } + return scraper; + } + + public static ContentScraper parseToScraper( final DigestURL location, final String documentCharset, + Charset[] detectedcharsetcontainer, InputStream sourceStream, final int maxLinks) throws Parser.Failure, IOException { @@ -171,13 +190,15 @@ public class htmlParser extends AbstractParser implements Parser { // nothing found: try to find a meta-tag if (charset == null) { + ScraperInputStream htmlFilter = null; try { - final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream, documentCharset, location, null, false, maxLinks); + htmlFilter = new ScraperInputStream(sourceStream, documentCharset, location, null, false, maxLinks); sourceStream = htmlFilter; charset = htmlFilter.detectCharset(); - htmlFilter.close(); } catch (final IOException e1) { throw new Parser.Failure("Charset error:" + e1.getMessage(), location); + } finally { + if (htmlFilter != null) htmlFilter.close(); } } @@ -193,21 +214,22 @@ public class htmlParser extends AbstractParser implements Parser { // wtf? 
still nothing, just take system-standard if (charset == null) { - detectedcharset = Charset.defaultCharset(); + detectedcharsetcontainer[0] = Charset.defaultCharset(); } else { try { - detectedcharset = Charset.forName(charset); + detectedcharsetcontainer[0] = Charset.forName(charset); } catch (final IllegalCharsetNameException e) { - detectedcharset = Charset.defaultCharset(); + detectedcharsetcontainer[0] = Charset.defaultCharset(); } catch (final UnsupportedCharsetException e) { - detectedcharset = Charset.defaultCharset(); + detectedcharsetcontainer[0] = Charset.defaultCharset(); } } + // parsing the content final ContentScraper scraper = new ContentScraper(location, maxLinks); final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available()))); try { - FileUtils.copy(sourceStream, writer, detectedcharset); + FileUtils.copy(sourceStream, writer, detectedcharsetcontainer[0]); } catch (final IOException e) { throw new Parser.Failure("IO error:" + e.getMessage(), location); } finally { @@ -250,7 +272,7 @@ public class htmlParser extends AbstractParser implements Parser { if (encoding.startsWith("MACINTOSH")) encoding = "MacRoman"; // fix wrong fill characters - encoding = patternUnderline.matcher(encoding).replaceAll("-"); + encoding = CommonPattern.UNDERSCORE.matcher(encoding).replaceAll("-"); if (encoding.matches("GB[_-]?2312([-_]80)?")) return "GB2312"; if (encoding.matches(".*UTF[-_]?8.*")) return "UTF-8"; @@ -306,10 +328,9 @@ public class htmlParser extends AbstractParser implements Parser { try { url = new AnchorURL(args[0]); final byte[] content = url.get(ClientIdentification.yacyInternetCrawlerAgent, null, null); - final Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content)); + final Document[] document = new htmlParser().parse(url, "text/html", "utf-8", new ByteArrayInputStream(content)); final String title = document[0].dc_title(); 
System.out.println(title); - System.out.println(CharacterCoding.unicode2html(title, false)); } catch (final MalformedURLException e) { e.printStackTrace(); } catch (final IOException e) { @@ -319,6 +340,7 @@ public class htmlParser extends AbstractParser implements Parser { } catch (final InterruptedException e) { e.printStackTrace(); } + System.exit(0); } } diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java index 420c64417..a6394f66f 100644 --- a/source/net/yacy/document/parser/images/genericImageParser.java +++ b/source/net/yacy/document/parser/images/genericImageParser.java @@ -81,6 +81,8 @@ public class genericImageParser extends AbstractParser implements Parser { SUPPORTED_EXTENSIONS.add("jpeg"); SUPPORTED_EXTENSIONS.add("jpe"); SUPPORTED_EXTENSIONS.add("bmp"); + SUPPORTED_EXTENSIONS.add("tif"); + SUPPORTED_EXTENSIONS.add("tiff"); SUPPORTED_MIME_TYPES.add("image/png"); SUPPORTED_MIME_TYPES.add("image/gif"); SUPPORTED_MIME_TYPES.add("image/jpeg"); diff --git a/source/net/yacy/search/schema/HyperlinkGraph.java b/source/net/yacy/search/schema/HyperlinkGraph.java index 312f70674..a8ce35fe0 100644 --- a/source/net/yacy/search/schema/HyperlinkGraph.java +++ b/source/net/yacy/search/schema/HyperlinkGraph.java @@ -161,7 +161,7 @@ public class HyperlinkGraph implements Iterable { remaining--; } } - if (nodes.size() == 0) ConcurrentLog.warn("HyperlinkGraph", "could not find a root node for " + hostname + " in " + this.edges.size() + " edges"); + if (nodes.size() == 0 && this.edges.size() > 0) ConcurrentLog.warn("HyperlinkGraph", "could not find a root node for " + hostname + " in " + this.edges.size() + " edges"); // recusively step into depth and find next level int depth = 1; diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java index 364bf483b..22ae657d1 100644 --- 
a/source/net/yacy/search/schema/WebgraphConfiguration.java +++ b/source/net/yacy/search/schema/WebgraphConfiguration.java @@ -51,6 +51,8 @@ import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; +import net.yacy.document.parser.htmlParser; +import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; public class WebgraphConfiguration extends SchemaConfiguration implements Serializable { @@ -219,26 +221,31 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source); processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut } + + // parse text to find images and clear text + ContentScraper textContent = null; + try {textContent = htmlParser.parseToScraper(source_url, null, text, 10);} catch (IOException e) {} + String extractedText = textContent.getText(); // add the source attributes about the target if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound); if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : ""); if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : ""); if (allAttr || contains(WebgraphSchema.target_relflags_i)) add(edge, WebgraphSchema.target_relflags_i, relEval(rel.length() > 0 ? rel : "")); - if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, text.length() > 0 ? 
text : ""); - if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, text.length()); - if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0); + if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, extractedText.length() > 0 ? extractedText : ""); + if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, extractedText.length()); + if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, extractedText.length() > 0 ? CommonPattern.SPACE.split(extractedText).length : 0); - ImageEntry ientry = null; - for (ImageEntry ie: images) { - if (ie.linkurl() != null && ie.linkurl().equals(target_url)) {ientry = ie; break;} + StringBuilder alttext = new StringBuilder(textContent == null ? 0 : textContent.getImages().size() * 30); + if (textContent != null) for (ImageEntry ie: textContent.getImages()) { + if (ie.alt().length() > 0) alttext.append(ie.alt()).append(' '); } - String alttext = ientry == null ? "" : ientry.alt(); - if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext); + while (alttext.length() > 0 && alttext.charAt(alttext.length() - 1) == ' ') alttext.setLength(alttext.length() - 1); + if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext.toString()); if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, alttext.length()); if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, alttext.length() > 0 ? 
CommonPattern.SPACE.split(alttext).length : 0); - // add the target attributes + // add the target attributes add(edge, WebgraphSchema.target_id_s, target_id); final String target_url_string = target_url.toNormalform(false); int pr_target = target_url_string.indexOf("://",0);