From 67beef657f82e92f48dd8425073ad81896a2ff4b Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Thu, 10 Apr 2014 18:58:03 +0200
Subject: [PATCH] strong redesign of html parser: object recursion is now made
using a stack on html tag objects, not using a recursive parse-again method
which may cause bad performance and huge memory allocation. The new method
also produced better parsed image objects with exact anchor text references.
---
.../document/parser/html/AbstractScraper.java | 5 +-
.../parser/html/AbstractTransformer.java | 9 +-
.../document/parser/html/ContentScraper.java | 254 ++++++++++--------
.../parser/html/ContentTransformer.java | 25 +-
.../yacy/document/parser/html/Scraper.java | 6 +-
.../document/parser/html/Transformer.java | 6 +-
.../parser/html/TransformerWriter.java | 246 +++++++++--------
.../net/yacy/document/parser/htmlParser.java | 56 ++--
.../parser/images/genericImageParser.java | 2 +
.../yacy/search/schema/HyperlinkGraph.java | 2 +-
.../search/schema/WebgraphConfiguration.java | 25 +-
11 files changed, 356 insertions(+), 280 deletions(-)
diff --git a/source/net/yacy/document/parser/html/AbstractScraper.java b/source/net/yacy/document/parser/html/AbstractScraper.java
index d3bc2ffeb..e0980c21b 100644
--- a/source/net/yacy/document/parser/html/AbstractScraper.java
+++ b/source/net/yacy/document/parser/html/AbstractScraper.java
@@ -29,7 +29,6 @@
package net.yacy.document.parser.html;
-import java.util.Properties;
import java.util.Set;
import net.yacy.kelondro.util.MemoryControl;
@@ -72,10 +71,10 @@ public abstract class AbstractScraper implements Scraper {
// the other methods must take into account to construct the return value correctly
@Override
- public abstract void scrapeTag0(String tagname, Properties tagopts);
+ public abstract void scrapeTag0(ContentScraper.Tag tag);
@Override
- public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text);
+ public abstract void scrapeTag1(ContentScraper.Tag tag);
public static String stripAllTags(final char[] s) {
if (s.length > 80 && !MemoryControl.request(s.length * 2, false)) return "";
diff --git a/source/net/yacy/document/parser/html/AbstractTransformer.java b/source/net/yacy/document/parser/html/AbstractTransformer.java
index d812606c9..370d277e6 100644
--- a/source/net/yacy/document/parser/html/AbstractTransformer.java
+++ b/source/net/yacy/document/parser/html/AbstractTransformer.java
@@ -24,7 +24,6 @@
package net.yacy.document.parser.html;
-import java.util.Properties;
import java.util.TreeSet;
public abstract class AbstractTransformer implements Transformer {
@@ -58,13 +57,13 @@ public abstract class AbstractTransformer implements Transformer {
// the other methods must take into account to construct the return value correctly
@Override
- public char[] transformTag0(final String tagname, final Properties tagopts, final char quotechar) {
- return TransformerWriter.genTag0(tagname, tagopts, quotechar);
+ public char[] transformTag0(final ContentScraper.Tag tag, final char quotechar) {
+ return TransformerWriter.genTag0(tag.name, tag.opts, quotechar);
}
@Override
- public char[] transformTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) {
- return TransformerWriter.genTag1(tagname, tagopts, text, quotechar);
+ public char[] transformTag1(final ContentScraper.Tag tag, final char quotechar) {
+ return TransformerWriter.genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar);
}
@Override
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index 285cf26a1..0b4770c6e 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -59,6 +59,7 @@ import net.yacy.cora.util.NumberTools;
import net.yacy.document.SentenceReader;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.Evaluation.Element;
+import net.yacy.document.parser.images.genericImageParser;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.ISO639;
@@ -80,7 +81,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
singleton, pair;
}
- public enum Tag {
+ public enum TagName {
html(TagType.singleton), // scraped as singleton to get attached properties like 'lang'
body(TagType.singleton), // scraped as singleton to get attached properties like 'class'
div(TagType.singleton), // scraped as singleton to get attached properties like 'id'
@@ -111,14 +112,49 @@ public class ContentScraper extends AbstractScraper implements Scraper {
style(TagType.pair);
public TagType type;
- private Tag(final TagType type) {
+ private TagName(final TagType type) {
this.type = type;
}
}
+ public static class Tag {
+ public String name;
+ public Properties opts;
+ public CharBuffer content;
+ public Tag(final String name) {
+ this.name = name;
+ this.opts = new Properties();
+ this.content = new CharBuffer(100);
+ }
+ public Tag(final String name, final Properties opts) {
+ this.name = name;
+ this.opts = opts;
+ this.content = new CharBuffer(100);
+ }
+ public Tag(final String name, final Properties opts, final CharBuffer content) {
+ this.name = name;
+ this.opts = opts;
+ this.content = content;
+ }
+ public void close() {
+ this.name = null;
+ this.opts = null;
+ if (this.content != null) this.content.close();
+ this.content = null;
+ }
+ @Override
+ public void finalize() {
+ this.close();
+ }
+ @Override
+ public String toString() {
+ return "<" + name + " " + opts + ">" + content + "</" + name + ">";
+ }
+ }
+
// all these tags must be given in lowercase, because the tags from the files are compared in lowercase
static {
- for (final Tag tag: Tag.values()) {
+ for (final TagName tag: TagName.values()) {
if (tag.type == TagType.singleton) linkTags0.add(tag.name());
if (tag.type == TagType.pair) linkTags1.add(tag.name());
}
@@ -321,88 +357,88 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
@Override
- public void scrapeTag0(final String tagname, final Properties tagopts) {
- if (tagname.equalsIgnoreCase("img")) {
- final String src = tagopts.getProperty("src", EMPTY_STRING);
+ public void scrapeTag0(Tag tag) {
+ if (tag.name.equalsIgnoreCase("img")) {
+ final String src = tag.opts.getProperty("src", EMPTY_STRING);
try {
if (src.length() > 0) {
final AnchorURL url = absolutePath(src);
if (url != null) {
- final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
- final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
- final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", EMPTY_STRING), width, height, -1);
+ final int width = Integer.parseInt(tag.opts.getProperty("width", "-1"));
+ final int height = Integer.parseInt(tag.opts.getProperty("height", "-1"));
+ final ImageEntry ie = new ImageEntry(url, tag.opts.getProperty("alt", EMPTY_STRING), width, height, -1);
this.images.add(ie);
}
}
} catch (final NumberFormatException e) {}
this.evaluationScores.match(Element.imgpath, src);
- } else if(tagname.equalsIgnoreCase("base")) {
+ } else if(tag.name.equalsIgnoreCase("base")) {
try {
- this.root = new DigestURL(tagopts.getProperty("href", EMPTY_STRING));
+ this.root = new DigestURL(tag.opts.getProperty("href", EMPTY_STRING));
} catch (final MalformedURLException e) {}
- } else if (tagname.equalsIgnoreCase("frame")) {
- final AnchorURL src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
- tagopts.put("src", src.toNormalform(true));
- src.setAll(tagopts);
+ } else if (tag.name.equalsIgnoreCase("frame")) {
+ final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
+ tag.opts.put("src", src.toNormalform(true));
+ src.setAll(tag.opts);
this.anchors.add(src);
this.frames.add(src);
this.evaluationScores.match(Element.framepath, src.toNormalform(true));
- } else if (tagname.equalsIgnoreCase("body")) {
- final String c = tagopts.getProperty("class", EMPTY_STRING);
+ } else if (tag.name.equalsIgnoreCase("body")) {
+ final String c = tag.opts.getProperty("class", EMPTY_STRING);
this.evaluationScores.match(Element.bodyclass, c);
- } else if (tagname.equalsIgnoreCase("div")) {
- final String id = tagopts.getProperty("id", EMPTY_STRING);
+ } else if (tag.name.equalsIgnoreCase("div")) {
+ final String id = tag.opts.getProperty("id", EMPTY_STRING);
this.evaluationScores.match(Element.divid, id);
- final String itemtype = tagopts.getProperty("itemtype", EMPTY_STRING);
+ final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
breadcrumbs++;
}
- } else if (tagname.equalsIgnoreCase("meta")) {
- final String content = tagopts.getProperty("content", EMPTY_STRING);
- String name = tagopts.getProperty("name", EMPTY_STRING);
+ } else if (tag.name.equalsIgnoreCase("meta")) {
+ final String content = tag.opts.getProperty("content", EMPTY_STRING);
+ String name = tag.opts.getProperty("name", EMPTY_STRING);
if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
if (name.toLowerCase().equals("generator")) {
this.evaluationScores.match(Element.metagenerator, content);
}
}
- name = tagopts.getProperty("http-equiv", EMPTY_STRING);
+ name = tag.opts.getProperty("http-equiv", EMPTY_STRING);
if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
}
- name = tagopts.getProperty("property", EMPTY_STRING);
+ name = tag.opts.getProperty("property", EMPTY_STRING);
if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
}
- } else if (tagname.equalsIgnoreCase("area")) {
- final String areatitle = cleanLine(tagopts.getProperty("title", EMPTY_STRING));
- //String alt = tagopts.getProperty("alt",EMPTY_STRING);
- final String href = tagopts.getProperty("href", EMPTY_STRING);
+ } else if (tag.name.equalsIgnoreCase("area")) {
+ final String areatitle = cleanLine(tag.opts.getProperty("title", EMPTY_STRING));
+ //String alt = tag.opts.getProperty("alt",EMPTY_STRING);
+ final String href = tag.opts.getProperty("href", EMPTY_STRING);
if (href.length() > 0) {
- tagopts.put("name", areatitle);
+ tag.opts.put("name", areatitle);
AnchorURL url = absolutePath(href);
- tagopts.put("href", url.toNormalform(true));
- url.setAll(tagopts);
+ tag.opts.put("href", url.toNormalform(true));
+ url.setAll(tag.opts);
this.anchors.add(url);
}
- } else if (tagname.equalsIgnoreCase("link")) {
- final String href = tagopts.getProperty("href", EMPTY_STRING);
+ } else if (tag.name.equalsIgnoreCase("link")) {
+ final String href = tag.opts.getProperty("href", EMPTY_STRING);
final AnchorURL newLink = absolutePath(href);
if (newLink != null) {
- tagopts.put("href", newLink.toNormalform(true));
- String rel = tagopts.getProperty("rel", EMPTY_STRING);
- final String linktitle = tagopts.getProperty("title", EMPTY_STRING);
- final String type = tagopts.getProperty("type", EMPTY_STRING);
- final String hreflang = tagopts.getProperty("hreflang", EMPTY_STRING);
+ tag.opts.put("href", newLink.toNormalform(true));
+ String rel = tag.opts.getProperty("rel", EMPTY_STRING);
+ final String linktitle = tag.opts.getProperty("title", EMPTY_STRING);
+ final String type = tag.opts.getProperty("type", EMPTY_STRING);
+ final String hreflang = tag.opts.getProperty("hreflang", EMPTY_STRING);
if (rel.equalsIgnoreCase("shortcut icon")) {
final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1);
this.images.add(ie);
this.favicon = newLink;
} else if (rel.equalsIgnoreCase("canonical")) {
- tagopts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next());
- newLink.setAll(tagopts);
+ tag.opts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next());
+ newLink.setAll(tag.opts);
this.anchors.add(newLink);
this.canonical = newLink;
} else if (rel.equalsIgnoreCase("publisher")) {
@@ -417,130 +453,130 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.css.put(newLink, rel);
this.evaluationScores.match(Element.csspath, href);
} else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) {
- tagopts.put("name", linktitle);
- newLink.setAll(tagopts);
+ tag.opts.put("name", linktitle);
+ newLink.setAll(tag.opts);
this.anchors.add(newLink);
}
}
- } else if(tagname.equalsIgnoreCase("embed")) {
- final String src = tagopts.getProperty("src", EMPTY_STRING);
+ } else if(tag.name.equalsIgnoreCase("embed")) {
+ final String src = tag.opts.getProperty("src", EMPTY_STRING);
try {
if (src.length() > 0) {
final AnchorURL url = absolutePath(src);
if (url != null) {
- final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
- final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
- tagopts.put("src", url.toNormalform(true));
- final EmbedEntry ie = new EmbedEntry(url, width, height, tagopts.getProperty("type", EMPTY_STRING), tagopts.getProperty("pluginspage", EMPTY_STRING));
+ final int width = Integer.parseInt(tag.opts.getProperty("width", "-1"));
+ final int height = Integer.parseInt(tag.opts.getProperty("height", "-1"));
+ tag.opts.put("src", url.toNormalform(true));
+ final EmbedEntry ie = new EmbedEntry(url, width, height, tag.opts.getProperty("type", EMPTY_STRING), tag.opts.getProperty("pluginspage", EMPTY_STRING));
this.embeds.put(url, ie);
- url.setAll(tagopts);
+ url.setAll(tag.opts);
this.anchors.add(url);
}
}
} catch (final NumberFormatException e) {}
- } else if(tagname.equalsIgnoreCase("param")) {
- final String name = tagopts.getProperty("name", EMPTY_STRING);
+ } else if(tag.name.equalsIgnoreCase("param")) {
+ final String name = tag.opts.getProperty("name", EMPTY_STRING);
if (name.equalsIgnoreCase("movie")) {
- AnchorURL url = absolutePath(tagopts.getProperty("value", EMPTY_STRING));
- tagopts.put("value", url.toNormalform(true));
- url.setAll(tagopts);
+ AnchorURL url = absolutePath(tag.opts.getProperty("value", EMPTY_STRING));
+ tag.opts.put("value", url.toNormalform(true));
+ url.setAll(tag.opts);
this.anchors.add(url);
}
- } else if (tagname.equalsIgnoreCase("iframe")) {
- final AnchorURL src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
- tagopts.put("src", src.toNormalform(true));
- src.setAll(tagopts);
+ } else if (tag.name.equalsIgnoreCase("iframe")) {
+ final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
+ tag.opts.put("src", src.toNormalform(true));
+ src.setAll(tag.opts);
this.anchors.add(src);
this.iframes.add(src);
this.evaluationScores.match(Element.iframepath, src.toNormalform(true));
- } else if (tagname.equalsIgnoreCase("html")) {
- final String lang = tagopts.getProperty("lang", EMPTY_STRING);
+ } else if (tag.name.equalsIgnoreCase("html")) {
+ final String lang = tag.opts.getProperty("lang", EMPTY_STRING);
if (!lang.isEmpty()) // fake a language meta to preserv detection from
this.metas.put("dc.language",lang.substring(0,2)); // fix found entries like "hu-hu"
}
// fire event
- fireScrapeTag0(tagname, tagopts);
+ fireScrapeTag0(tag.name, tag.opts);
}
@Override
- public void scrapeTag1(final String tagname, final Properties tagopts, char[] text) {
- // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text));
- if (tagname.equalsIgnoreCase("a") && text.length < 2048) {
- String href = tagopts.getProperty("href", EMPTY_STRING);
+ public void scrapeTag1(Tag tag) {
+ // System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text));
+ if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
+ String href = tag.opts.getProperty("href", EMPTY_STRING);
href = CharacterCoding.html2unicode(href);
AnchorURL url;
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
- if (ext.equals("png") || ext.equals("gif") || ext.equals("jpg") || ext.equals("jpeg") || ext.equals("tiff") || ext.equals("tif")) {
+ if (genericImageParser.SUPPORTED_EXTENSIONS.contains(ext)) {
// special handling of such urls: put them to the image urls
- final ImageEntry ie = new ImageEntry(url, recursiveParse(url, text), -1, -1, -1);
+ final ImageEntry ie = new ImageEntry(url, recursiveParse(url, tag.content.getChars()), -1, -1, -1);
this.images.add(ie);
} else {
if (followDenied()) {
- String rel = tagopts.getProperty("rel", EMPTY_STRING);
+ String rel = tag.opts.getProperty("rel", EMPTY_STRING);
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
- tagopts.put("rel", rel);
+ tag.opts.put("rel", rel);
}
- tagopts.put("text", new String(text));
- tagopts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
- url.setAll(tagopts);
- recursiveParse(url, text);
+ tag.opts.put("text", new String(tag.content.getChars()));
+ tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
+ url.setAll(tag.opts);
+ recursiveParse(url, tag.content.getChars());
this.anchors.add(url);
}
}
this.evaluationScores.match(Element.apath, href);
}
final String h;
- if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
- h = recursiveParse(null, text);
+ if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) {
+ h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[0].add(h);
- } else if((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) {
- h = recursiveParse(null, text);
+ } else if((tag.name.equalsIgnoreCase("h2")) && (tag.content.length() < 1024)) {
+ h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[1].add(h);
- } else if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) {
- h = recursiveParse(null, text);
+ } else if ((tag.name.equalsIgnoreCase("h3")) && (tag.content.length() < 1024)) {
+ h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[2].add(h);
- } else if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) {
- h = recursiveParse(null, text);
+ } else if ((tag.name.equalsIgnoreCase("h4")) && (tag.content.length() < 1024)) {
+ h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[3].add(h);
- } else if ((tagname.equalsIgnoreCase("h5")) && (text.length < 1024)) {
- h = recursiveParse(null, text);
+ } else if ((tag.name.equalsIgnoreCase("h5")) && (tag.content.length() < 1024)) {
+ h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[4].add(h);
- } else if ((tagname.equalsIgnoreCase("h6")) && (text.length < 1024)) {
- h = recursiveParse(null, text);
+ } else if ((tag.name.equalsIgnoreCase("h6")) && (tag.content.length() < 1024)) {
+ h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[5].add(h);
- } else if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
- String t = recursiveParse(null, text);
- this.titles.add(t);
- this.evaluationScores.match(Element.title, t);
- } else if ((tagname.equalsIgnoreCase("b")) && (text.length < 1024)) {
- h = recursiveParse(null, text);
+ } else if ((tag.name.equalsIgnoreCase("title")) && (tag.content.length() < 1024)) {
+ h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
+ this.titles.add(h);
+ this.evaluationScores.match(Element.title, h);
+ } else if ((tag.name.equalsIgnoreCase("b")) && (tag.content.length() < 1024)) {
+ h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.bold.inc(h);
- } else if ((tagname.equalsIgnoreCase("strong")) && (text.length < 1024)) {
- h = recursiveParse(null, text);
+ } else if ((tag.name.equalsIgnoreCase("strong")) && (tag.content.length() < 1024)) {
+ h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.bold.inc(h);
- } else if ((tagname.equalsIgnoreCase("i")) && (text.length < 1024)) {
- h = recursiveParse(null, text);
+ } else if ((tag.name.equalsIgnoreCase("i")) && (tag.content.length() < 1024)) {
+ h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.italic.inc(h);
- } else if ((tagname.equalsIgnoreCase("u")) && (text.length < 1024)) {
- h = recursiveParse(null, text);
+ } else if ((tag.name.equalsIgnoreCase("u")) && (tag.content.length() < 1024)) {
+ h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.underline.inc(h);
- } else if ((tagname.equalsIgnoreCase("li")) && (text.length < 1024)) {
- h = recursiveParse(null, text);
+ } else if ((tag.name.equalsIgnoreCase("li")) && (tag.content.length() < 1024)) {
+ h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.li.add(h);
- } else if (tagname.equalsIgnoreCase("script")) {
- final String src = tagopts.getProperty("src", EMPTY_STRING);
+ } else if (tag.name.equalsIgnoreCase("script")) {
+ final String src = tag.opts.getProperty("src", EMPTY_STRING);
if (src.length() > 0) {
this.script.add(absolutePath(src));
this.evaluationScores.match(Element.scriptpath, src);
} else {
- this.evaluationScores.match(Element.scriptcode, LB.matcher(new String(text)).replaceAll(" "));
+ this.evaluationScores.match(Element.scriptcode, LB.matcher(new String(tag.content.getChars())).replaceAll(" "));
}
}
// fire event
- fireScrapeTag1(tagname, tagopts, text);
+ fireScrapeTag1(tag.name, tag.opts, tag.content.getChars());
}
@@ -570,15 +606,20 @@ public class ContentScraper extends AbstractScraper implements Scraper {
for (final AnchorURL entry: scraper.getAnchors()) {
this.anchors.add(entry);
}
+ String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars())));
for (ImageEntry ie: scraper.images) {
if (linkurl != null) {
ie.setLinkurl(linkurl);
- ie.setAnchortext(new String(inlineHtml));
+ ie.setAnchortext(line);
+ }
+ // this image may have been added recently from the same location (as this is a recursive parse)
+ // we want to keep only one of them, check if they are equal
+ if (this.images.size() > 0 && this.images.get(this.images.size() - 1).url().equals(ie.url())) {
+ this.images.remove(this.images.size() - 1);
}
this.images.add(ie);
}
- String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars())));
scraper.close();
return line;
}
@@ -681,6 +722,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
public String getText() {
+ this.content.trim();
try {
return this.content.toString();
} catch (final OutOfMemoryError e) {
diff --git a/source/net/yacy/document/parser/html/ContentTransformer.java b/source/net/yacy/document/parser/html/ContentTransformer.java
index e4dbf6238..5b6fd8252 100644
--- a/source/net/yacy/document/parser/html/ContentTransformer.java
+++ b/source/net/yacy/document/parser/html/ContentTransformer.java
@@ -29,7 +29,6 @@ import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
-import java.util.Properties;
import java.util.TreeSet;
import net.yacy.cora.document.encoding.ASCII;
@@ -115,27 +114,27 @@ public class ContentTransformer extends AbstractTransformer implements Transform
}
@Override
- public char[] transformTag0(final String tagname, final Properties tagopts, final char quotechar) {
- if (tagname.equals("img")) {
+ public char[] transformTag0(final ContentScraper.Tag tag, final char quotechar) {
+ if (tag.name.equals("img")) {
// check bluelist
- if (bluelistHit(tagopts.getProperty("src", "").toCharArray())) return genBlueLetters(5);
- if (bluelistHit(tagopts.getProperty("alt", "").toCharArray())) return genBlueLetters(5);
+ if (bluelistHit(tag.opts.getProperty("src", "").toCharArray())) return genBlueLetters(5);
+ if (bluelistHit(tag.opts.getProperty("alt", "").toCharArray())) return genBlueLetters(5);
// replace image alternative name
- tagopts.setProperty("alt", new String(transformText(tagopts.getProperty("alt", "").toCharArray())));
+ tag.opts.setProperty("alt", new String(transformText(tag.opts.getProperty("alt", "").toCharArray())));
}
- if (tagname.equals("input") && (tagopts.getProperty("type") != null && tagopts.getProperty("type").equals("submit"))) {
+ if (tag.name.equals("input") && (tag.opts.getProperty("type") != null && tag.opts.getProperty("type").equals("submit"))) {
// rewrite button name
- tagopts.setProperty("value", new String(transformText(tagopts.getProperty("value", "").toCharArray())));
+ tag.opts.setProperty("value", new String(transformText(tag.opts.getProperty("value", "").toCharArray())));
}
- return TransformerWriter.genTag0(tagname, tagopts, quotechar);
+ return TransformerWriter.genTag0(tag.name, tag.opts, quotechar);
}
@Override
- public char[] transformTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) {
- if (bluelistHit(tagopts.getProperty("href","").toCharArray())) return genBlueLetters(text.length);
- if (bluelistHit(text)) return genBlueLetters(text.length);
- return TransformerWriter.genTag1(tagname, tagopts, text, quotechar);
+ public char[] transformTag1(final ContentScraper.Tag tag, final char quotechar) {
+ if (bluelistHit(tag.opts.getProperty("href","").toCharArray())) return genBlueLetters(tag.content.length());
+ if (bluelistHit(tag.content.getChars())) return genBlueLetters(tag.content.length());
+ return TransformerWriter.genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar);
}
@Override
diff --git a/source/net/yacy/document/parser/html/Scraper.java b/source/net/yacy/document/parser/html/Scraper.java
index e1dfe73e1..dc8e3e964 100644
--- a/source/net/yacy/document/parser/html/Scraper.java
+++ b/source/net/yacy/document/parser/html/Scraper.java
@@ -24,8 +24,6 @@
package net.yacy.document.parser.html;
-import java.util.Properties;
-
public interface Scraper {
public boolean isTag0(String tag);
@@ -34,9 +32,9 @@ public interface Scraper {
public void scrapeText(char[] text, String insideTag);
- public void scrapeTag0(String tagname, Properties tagopts);
+ public void scrapeTag0(ContentScraper.Tag tag);
- public void scrapeTag1(String tagname, Properties tagopts, char[] text);
+ public void scrapeTag1(ContentScraper.Tag tag);
public void scrapeComment(final char[] comment);
diff --git a/source/net/yacy/document/parser/html/Transformer.java b/source/net/yacy/document/parser/html/Transformer.java
index 2aedfa120..9b605340e 100644
--- a/source/net/yacy/document/parser/html/Transformer.java
+++ b/source/net/yacy/document/parser/html/Transformer.java
@@ -24,8 +24,6 @@
package net.yacy.document.parser.html;
-import java.util.Properties;
-
public interface Transformer {
// the init method is used to initialize the transformer with some values
@@ -52,10 +50,10 @@ public interface Transformer {
public char[] transformText(char[] text);
// method that is called when a body-less tag occurs
- public char[] transformTag0(String tagname, Properties tagopts, char quotechar);
+ public char[] transformTag0(ContentScraper.Tag tag, char quotechar);
// method that is called when a body-containing text occurs
- public char[] transformTag1(String tagname, Properties tagopts, char[] text, char quotechar);
+ public char[] transformTag1(ContentScraper.Tag tag, char quotechar);
public void close();
}
diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java
index 53d8e91de..408322dcb 100644
--- a/source/net/yacy/document/parser/html/TransformerWriter.java
+++ b/source/net/yacy/document/parser/html/TransformerWriter.java
@@ -43,6 +43,7 @@ import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.util.Enumeration;
import java.util.Properties;
+import java.util.Stack;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
@@ -62,9 +63,7 @@ public final class TransformerWriter extends Writer {
private final OutputStream outStream;
private OutputStreamWriter out;
private CharBuffer buffer;
- private String filterTag;
- private Properties filterOpts;
- private CharBuffer filterCont;
+ private Stack<ContentScraper.Tag> tagStack;
private final Scraper scraper;
private final Transformer transformer;
private boolean inSingleQuote;
@@ -72,7 +71,7 @@ public final class TransformerWriter extends Writer {
private boolean inComment;
private boolean binaryUnsuspect;
private final boolean passbyIfBinarySuspect;
-
+
public TransformerWriter(
final OutputStream outStream,
final Charset charSet,
@@ -95,9 +94,7 @@ public final class TransformerWriter extends Writer {
this.scraper = scraper;
this.transformer = transformer;
this.buffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, initialBufferSize);
- this.filterTag = null;
- this.filterOpts = null;
- this.filterCont = null;
+ this.tagStack = new Stack<ContentScraper.Tag>();
this.inSingleQuote = false;
this.inDoubleQuote = false;
this.inComment = false;
@@ -186,63 +183,105 @@ public final class TransformerWriter extends Writer {
return result;
}
- private char[] filterTag(final String tag, final boolean opening, final char[] content, final char quotechar) {
- //System.out.println("filterTag: filterTag=" + ((this.filterTag == null) ? "null" : this.filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + new String(content)); // debug
- // distinguish the following cases:
- // - (1) not collecting data for a tag and getting no tag (not opener and not close)
- // - (2) not collecting data for a tag and getting a tag opener
- // - (3) not collecting data for a tag and getting a tag close
- // - (4) collecting data for a tag and getting no tag (not opener and not close)
- // - (5) collecting data for a tag and getting a new/different tag opener without closing the previous tag
- // - (6) collecting data for a tag and getting a tag close for the wrong tag (a different than the opener)
- // - (7) collecting data for a tag and getting the correct close tag for that collecting tag
-
- if (this.filterTag == null) {
+ /**
+ * the token processor distinguishes three different types of input: opening tag, closing tag, text content
+ * @param in - the token to be processed
+ * @param quotechar
+ * @return a processed version of the token
+ */
+ private char[] tokenProcessor(final char[] in, final char quotechar) {
+ if (in.length == 0) return in;
+
+ // scan the string and parse structure
+ if (in.length <= 2 || in[0] != lb) return filterTag(in); // this is a text
+
+ // this is a tag
+ String tag;
+ int tagend;
+ if (in[1] == '/') {
+ // a closing tag
+ tagend = tagEnd(in, 2);
+ tag = new String(in, 2, tagend - 2).toLowerCase();
+ final char[] text = new char[in.length - tagend - 1];
+ System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
+ return filterTag(text, quotechar, tag, false);
+ }
+
+ // an opening tag
+ tagend = tagEnd(in, 1);
+ tag = new String(in, 1, tagend - 1).toLowerCase();
+ final char[] text = new char[in.length - tagend - 1];
+ System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
+ return filterTag(text, quotechar, tag, true);
+ }
+
+ // distinguish the following cases:
+ // - (1) not collecting data for a tag and getting no tag (not opener and not close)
+ // - (2) not collecting data for a tag and getting a tag opener
+ // - (3) not collecting data for a tag and getting a tag close
+ // - (4) collecting data for a tag and getting no tag (not opener and not close)
+ // - (5) collecting data for a tag and getting a new/different tag opener without closing the previous tag
+ // - (6) collecting data for a tag and getting a tag close for the wrong tag (a different than the opener)
+ // - (7) collecting data for a tag and getting the correct close tag for that collecting tag
+
+ /**
+ * Process a text token (one that is not a tag opener or closer). If no tag is
+ * currently being collected, the text is scraped/transformed directly; otherwise
+ * it is appended to the content buffer of the tag on top of the stack.
+ * @param content the text content to scrape and/or transform
+ * @return the transformed text, or an empty array if the content was collected into an open tag
+ */
+ private char[] filterTag(final char[] content) {
+ if (this.tagStack.size() == 0) {
// we are not collection tag text -> case (1) - (3)
+ // case (1): this is not a tag opener/closer
+ if (this.scraper != null && content.length > 0) this.scraper.scrapeText(content, null);
+ if (this.transformer != null) return this.transformer.transformText(content);
+ return content;
+ }
- if (tag == null) {
- // case (1): this is not a tag opener/closer
- if (this.scraper != null && content.length > 0) this.scraper.scrapeText(content, null);
- if (this.transformer != null) return this.transformer.transformText(content);
- return content;
- }
+ // we are collecting tag text for the tag 'filterTag' -> case (4) - (7)
+ // case (4): getting no tag, go on collecting content
+ if (this.scraper != null) {
+ this.scraper.scrapeText(content, this.tagStack.lastElement().name);
+ }
+ if (this.transformer != null) {
+ this.tagStack.lastElement().content.append(this.transformer.transformText(content));
+ } else {
+ this.tagStack.lastElement().content.append(content);
+ }
+ return new char[0];
+ }
+
+ private char[] filterTag(final char[] content, final char quotechar, final String tagname, final boolean opening) {
+ assert tagname != null;
+
+ if (this.tagStack.size() == 0) {
+ // we are not collecting tag text -> case (1) - (3)
// we have a new tag
if (opening) {
// case (2):
- return filterTagOpening(tag, content, quotechar);
+ return filterTagOpening(tagname, content, quotechar);
}
- // its a close tag
+ // it's a close tag where none should be
// case (3): we ignore that thing and return it again
- return genTag0raw(tag, false, content);
+ return genTag0raw(tagname, false, content);
}
// we are collection tag text for the tag 'filterTag' -> case (4) - (7)
- if (tag == null || tag.equals("!")) {
- // case (4): getting no tag, go on collecting content
- if (this.scraper != null) {
- this.scraper.scrapeText(content, this.filterTag);
- }
- if (this.transformer != null) {
- this.filterCont.append(this.transformer.transformText(content));
- } else {
- this.filterCont.append(content);
- }
- return new char[0];
- }
+ if (tagname.equals("!")) return filterTag(content); // case (4): pseudo-tag content must be collected and consumed here, not fall through to the tag-open/close branches below
// it's a tag! which one?
if (opening) {
// case (5): the opening should not be here. But we keep the order anyway
- this.filterCont.append(filterTagOpening(tag, content, quotechar));
- return filterTagCloseing(quotechar);
+ this.tagStack.lastElement().content.append(filterTagOpening(tagname, content, quotechar));
+ return new char[0];
}
- if (!tag.equalsIgnoreCase(this.filterTag)) {
+ if (!tagname.equalsIgnoreCase(this.tagStack.lastElement().name)) {
// case (6): its a closing tag, but the wrong one. just add it.
- this.filterCont.append(genTag0raw(tag, opening, content));
+ this.tagStack.lastElement().content.append(genTag0raw(tagname, opening, content));
return new char[0];
}
@@ -250,101 +289,66 @@ public final class TransformerWriter extends Writer {
return filterTagCloseing(quotechar);
}
- private char[] filterTagOpening(final String tag, final char[] content, final char quotechar) {
- if (this.scraper != null && this.scraper.isTag0(tag)) {
+ private char[] filterTagOpening(final String tagname, final char[] content, final char quotechar) {
+ final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
+ ContentScraper.Tag tag = new ContentScraper.Tag(tagname, charBuffer.propParser());
+ charBuffer.close();
+ if (this.scraper != null && this.scraper.isTag0(tagname)) {
// this single tag is collected at once here
- final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
- this.scraper.scrapeTag0(tag, charBuffer.propParser());
- charBuffer.close();
+ this.scraper.scrapeTag0(tag);
}
- if (this.transformer != null && this.transformer.isTag0(tag)) {
+ if (this.transformer != null && this.transformer.isTag0(tagname)) {
// this single tag is collected at once here
- final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
char[] b = new char[0];
- try {
- b = this.transformer.transformTag0(tag, scb.propParser(), quotechar);
- } finally {
- scb.close();
- }
+ b = this.transformer.transformTag0(tag, quotechar);
return b;
- } else if ((this.scraper != null && this.scraper.isTag1(tag)) ||
- (this.transformer != null && this.transformer.isTag1(tag))) {
- // ok, start collecting
- this.filterTag = tag;
- final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
- this.filterOpts = scb.propParser();
- scb.close();
- if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset();
+ } else if ((this.scraper != null && this.scraper.isTag1(tagname)) ||
+ (this.transformer != null && this.transformer.isTag1(tagname))) {
+ // ok, start collecting; we don't push this here to the scraper or transformer; we do that when the tag is closed.
+ this.tagStack.push(tag);
return new char[0];
} else {
// we ignore that thing and return it again
- return genTag0raw(tag, true, content);
+ return genTag0raw(tagname, true, content);
}
}
private char[] filterTagCloseing(final char quotechar) {
char[] ret;
- if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
+ ContentScraper.Tag tag = this.tagStack.lastElement();
+ if (this.scraper != null) this.scraper.scrapeTag1(tag);
if (this.transformer != null) {
- ret = this.transformer.transformTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
+ ret = this.transformer.transformTag1(tag, quotechar);
} else {
- ret = genTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
+ ret = genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar);
+ }
+ if ((this.scraper != null && this.scraper.isTag1(tag.name)) ||
+ (this.transformer != null && this.transformer.isTag1(tag.name))) {
+ // remove the tag from the stack as soon as the tag is processed
+ this.tagStack.pop();
+ // at this point the characters from the recently processed tag must be attached to the previous tag
+ if (this.tagStack.size() > 0) this.tagStack.lastElement().content.append(ret);
}
- this.filterTag = null;
- this.filterOpts = null;
- this.filterCont = null;
return ret;
}
private char[] filterFinalize(final char quotechar) {
- if (this.filterTag == null) {
+ if (this.tagStack.size() == 0) {
return new char[0];
}
// it's our closing tag! return complete result.
char[] ret;
- if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
+ if (this.scraper != null) this.scraper.scrapeTag1(this.tagStack.lastElement());
if (this.transformer != null) {
- ret = this.transformer.transformTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
+ ret = this.transformer.transformTag1(this.tagStack.lastElement(), quotechar);
} else {
- ret = genTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
+ ret = genTag1(this.tagStack.lastElement().name, this.tagStack.lastElement().opts, this.tagStack.lastElement().content.getChars(), quotechar);
}
- this.filterTag = null;
- this.filterOpts = null;
- this.filterCont = null;
+ this.tagStack.pop();
return ret;
}
- private char[] filterSentence(final char[] in, final char quotechar) {
- if (in.length == 0) return in;
- //System.out.println("filterSentence, quotechar = \"" + quotechar + "\": " + new String(in)); // debug
- // scan the string and parse structure
- if (in.length > 2 && in[0] == lb) {
-
- // a tag
- String tag;
- int tagend;
- if (in[1] == '/') {
- // a closing tag
- tagend = tagEnd(in, 2);
- tag = new String(in, 2, tagend - 2).toLowerCase();
- final char[] text = new char[in.length - tagend - 1];
- System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
- return filterTag(tag, false, text, quotechar);
- }
-
- // an opening tag
- tagend = tagEnd(in, 1);
- tag = new String(in, 1, tagend - 1).toLowerCase();
- final char[] text = new char[in.length - tagend - 1];
- System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
- return filterTag(tag, true, text, quotechar);
- }
-
- // a text
- return filterTag(null, true, in, quotechar);
- }
-
private static int tagEnd(final char[] tag, final int start) {
char c;
for (int i = start; i < tag.length; i++) {
@@ -358,6 +362,14 @@ public final class TransformerWriter extends Writer {
return tag.length - 1;
}
+ /**
+ * this is the tokenizer of the parser: it splits the input into pieces which are
+ * - quoted text parts
+ * - commented text parts
+ * - tags (opening and closing)
+ * - text content between all these parts
+ * The tokens are then parsed with the tokenProcessor method
+ */
@Override
public void write(final int c) throws IOException {
//System.out.println((char) c);
@@ -375,7 +387,7 @@ public final class TransformerWriter extends Writer {
if ((c == rb) && (this.buffer.length() > 0 && this.buffer.charAt(0) == lb)) {
this.inSingleQuote = false;
// the tag ends here. after filtering: pass on
- filtered = filterSentence(this.buffer.getChars(), singlequote);
+ filtered = tokenProcessor(this.buffer.getChars(), singlequote);
if (this.out != null) { this.out.write(filtered); }
// this.buffer = new serverByteBuffer();
this.buffer.reset();
@@ -387,7 +399,7 @@ public final class TransformerWriter extends Writer {
if (c == rb && this.buffer.length() > 0 && this.buffer.charAt(0) == lb) {
this.inDoubleQuote = false;
// the tag ends here. after filtering: pass on
- filtered = filterSentence(this.buffer.getChars(), doublequote);
+ filtered = tokenProcessor(this.buffer.getChars(), doublequote);
if (this.out != null) this.out.write(filtered);
// this.buffer = new serverByteBuffer();
this.buffer.reset();
@@ -425,7 +437,7 @@ public final class TransformerWriter extends Writer {
} else if (c == rb) {
this.buffer.append(c);
// the tag ends here. after filtering: pass on
- filtered = filterSentence(this.buffer.getChars(), doublequote);
+ filtered = tokenProcessor(this.buffer.getChars(), doublequote);
if (this.out != null) this.out.write(filtered);
// this.buffer = new serverByteBuffer();
this.buffer.reset();
@@ -433,7 +445,7 @@ public final class TransformerWriter extends Writer {
// this is an error case
// we consider that there is one rb missing
if (this.buffer.length() > 0) {
- filtered = filterSentence(this.buffer.getChars(), doublequote);
+ filtered = tokenProcessor(this.buffer.getChars(), doublequote);
if (this.out != null) this.out.write(filtered);
}
// this.buffer = new serverByteBuffer();
@@ -447,7 +459,7 @@ public final class TransformerWriter extends Writer {
if (c == lb) {
// the text ends here
if (this.buffer.length() > 0) {
- filtered = filterSentence(this.buffer.getChars(), doublequote);
+ filtered = tokenProcessor(this.buffer.getChars(), doublequote);
if (this.out != null) this.out.write(filtered);
}
// this.buffer = new serverByteBuffer();
@@ -492,7 +504,7 @@ public final class TransformerWriter extends Writer {
final char quotechar = (this.inSingleQuote) ? singlequote : doublequote;
if (this.buffer != null) {
if (this.buffer.length() > 0) {
- final char[] filtered = filterSentence(this.buffer.getChars(), quotechar);
+ final char[] filtered = tokenProcessor(this.buffer.getChars(), quotechar);
if (this.out != null) this.out.write(filtered);
}
this.buffer.close();
@@ -504,10 +516,8 @@ public final class TransformerWriter extends Writer {
this.out.flush();
this.out.close();
}
- this.filterTag = null;
- this.filterOpts = null;
- if (this.filterCont != null) this.filterCont.close();
- this.filterCont = null;
+ this.tagStack.clear();
+ this.tagStack = null;
if (this.scraper != null) this.scraper.finish();
}
diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java
index 3ed19ebf1..b75d06dd5 100644
--- a/source/net/yacy/document/parser/htmlParser.java
+++ b/source/net/yacy/document/parser/htmlParser.java
@@ -28,16 +28,18 @@ import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.LinkedHashMap;
-import java.util.regex.Pattern;
+import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
+import net.yacy.cora.util.CommonPattern;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@@ -53,9 +55,7 @@ import com.ibm.icu.text.CharsetDetector;
public class htmlParser extends AbstractParser implements Parser {
- private static final Pattern patternUnderline = Pattern.compile("_");
- private final int maxLinks = 10000;
- private Charset detectedcharset;
+ private static final int maxLinks = 10000;
public htmlParser() {
super("Streaming HTML Parser");
@@ -97,9 +97,10 @@ public class htmlParser extends AbstractParser implements Parser {
try {
// first get a document from the parsed html
- final ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream, maxLinks);
+ Charset[] detectedcharsetcontainer = new Charset[]{null};
+ final ContentScraper scraper = parseToScraper(location, documentCharset, detectedcharsetcontainer, sourceStream, maxLinks);
// parseToScraper also detects/corrects/sets charset from html content tag
- final Document document = transformScraper(location, mimeType, detectedcharset.name(), scraper);
+ final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
return new Document[]{document};
} catch (final IOException e) {
@@ -155,9 +156,27 @@ public class htmlParser extends AbstractParser implements Parser {
return ppd;
}
- public ContentScraper parseToScraper(
+ public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, String input, int maxLinks) throws IOException {
+ Charset[] detectedcharsetcontainer = new Charset[]{null};
+ InputStream sourceStream;
+ try {
+ sourceStream = new ByteArrayInputStream(documentCharset == null ? UTF8.getBytes(input) : input.getBytes(documentCharset));
+ } catch (UnsupportedEncodingException e) {
+ sourceStream = new ByteArrayInputStream(UTF8.getBytes(input));
+ }
+ ContentScraper scraper;
+ try {
+ scraper = parseToScraper(location, documentCharset, detectedcharsetcontainer, sourceStream, maxLinks);
+ } catch (Failure e) {
+ throw new IOException(e.getMessage());
+ }
+ return scraper;
+ }
+
+ public static ContentScraper parseToScraper(
final DigestURL location,
final String documentCharset,
+ Charset[] detectedcharsetcontainer,
InputStream sourceStream,
final int maxLinks) throws Parser.Failure, IOException {
@@ -171,13 +190,15 @@ public class htmlParser extends AbstractParser implements Parser {
// nothing found: try to find a meta-tag
if (charset == null) {
+ ScraperInputStream htmlFilter = null;
try {
- final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream, documentCharset, location, null, false, maxLinks);
+ htmlFilter = new ScraperInputStream(sourceStream, documentCharset, location, null, false, maxLinks);
sourceStream = htmlFilter;
charset = htmlFilter.detectCharset();
- htmlFilter.close();
} catch (final IOException e1) {
throw new Parser.Failure("Charset error:" + e1.getMessage(), location);
+ } finally {
+ if (htmlFilter != null) htmlFilter.close();
}
}
@@ -193,21 +214,22 @@ public class htmlParser extends AbstractParser implements Parser {
// wtf? still nothing, just take system-standard
if (charset == null) {
- detectedcharset = Charset.defaultCharset();
+ detectedcharsetcontainer[0] = Charset.defaultCharset();
} else {
try {
- detectedcharset = Charset.forName(charset);
+ detectedcharsetcontainer[0] = Charset.forName(charset);
} catch (final IllegalCharsetNameException e) {
- detectedcharset = Charset.defaultCharset();
+ detectedcharsetcontainer[0] = Charset.defaultCharset();
} catch (final UnsupportedCharsetException e) {
- detectedcharset = Charset.defaultCharset();
+ detectedcharsetcontainer[0] = Charset.defaultCharset();
}
}
+
// parsing the content
final ContentScraper scraper = new ContentScraper(location, maxLinks);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
try {
- FileUtils.copy(sourceStream, writer, detectedcharset);
+ FileUtils.copy(sourceStream, writer, detectedcharsetcontainer[0]);
} catch (final IOException e) {
throw new Parser.Failure("IO error:" + e.getMessage(), location);
} finally {
@@ -250,7 +272,7 @@ public class htmlParser extends AbstractParser implements Parser {
if (encoding.startsWith("MACINTOSH")) encoding = "MacRoman";
// fix wrong fill characters
- encoding = patternUnderline.matcher(encoding).replaceAll("-");
+ encoding = CommonPattern.UNDERSCORE.matcher(encoding).replaceAll("-");
if (encoding.matches("GB[_-]?2312([-_]80)?")) return "GB2312";
if (encoding.matches(".*UTF[-_]?8.*")) return "UTF-8";
@@ -306,10 +328,9 @@ public class htmlParser extends AbstractParser implements Parser {
try {
url = new AnchorURL(args[0]);
final byte[] content = url.get(ClientIdentification.yacyInternetCrawlerAgent, null, null);
- final Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content));
+ final Document[] document = new htmlParser().parse(url, "text/html", "utf-8", new ByteArrayInputStream(content));
final String title = document[0].dc_title();
System.out.println(title);
- System.out.println(CharacterCoding.unicode2html(title, false));
} catch (final MalformedURLException e) {
e.printStackTrace();
} catch (final IOException e) {
@@ -319,6 +340,7 @@ public class htmlParser extends AbstractParser implements Parser {
} catch (final InterruptedException e) {
e.printStackTrace();
}
+ System.exit(0);
}
}
diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java
index 420c64417..a6394f66f 100644
--- a/source/net/yacy/document/parser/images/genericImageParser.java
+++ b/source/net/yacy/document/parser/images/genericImageParser.java
@@ -81,6 +81,8 @@ public class genericImageParser extends AbstractParser implements Parser {
SUPPORTED_EXTENSIONS.add("jpeg");
SUPPORTED_EXTENSIONS.add("jpe");
SUPPORTED_EXTENSIONS.add("bmp");
+ SUPPORTED_EXTENSIONS.add("tif");
+ SUPPORTED_EXTENSIONS.add("tiff");
SUPPORTED_MIME_TYPES.add("image/png");
SUPPORTED_MIME_TYPES.add("image/gif");
SUPPORTED_MIME_TYPES.add("image/jpeg");
diff --git a/source/net/yacy/search/schema/HyperlinkGraph.java b/source/net/yacy/search/schema/HyperlinkGraph.java
index 312f70674..a8ce35fe0 100644
--- a/source/net/yacy/search/schema/HyperlinkGraph.java
+++ b/source/net/yacy/search/schema/HyperlinkGraph.java
@@ -161,7 +161,7 @@ public class HyperlinkGraph implements Iterable {
remaining--;
}
}
- if (nodes.size() == 0) ConcurrentLog.warn("HyperlinkGraph", "could not find a root node for " + hostname + " in " + this.edges.size() + " edges");
+ if (nodes.size() == 0 && this.edges.size() > 0) ConcurrentLog.warn("HyperlinkGraph", "could not find a root node for " + hostname + " in " + this.edges.size() + " edges");
// recusively step into depth and find next level
int depth = 1;
diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java
index 364bf483b..22ae657d1 100644
--- a/source/net/yacy/search/schema/WebgraphConfiguration.java
+++ b/source/net/yacy/search/schema/WebgraphConfiguration.java
@@ -51,6 +51,8 @@ import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.document.parser.htmlParser;
+import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
public class WebgraphConfiguration extends SchemaConfiguration implements Serializable {
@@ -219,26 +221,31 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source);
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
}
+
+ // parse text to find images and clear text
+ ContentScraper textContent = null;
+ try {textContent = htmlParser.parseToScraper(source_url, null, text, 10);} catch (IOException e) {}
+ String extractedText = textContent == null ? "" : textContent.getText(); // parseToScraper may have failed above, leaving textContent null
// add the source attributes about the target
if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound);
if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : "");
if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : "");
if (allAttr || contains(WebgraphSchema.target_relflags_i)) add(edge, WebgraphSchema.target_relflags_i, relEval(rel.length() > 0 ? rel : ""));
- if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, text.length() > 0 ? text : "");
- if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, text.length());
- if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0);
+ if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, extractedText.length() > 0 ? extractedText : "");
+ if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, extractedText.length());
+ if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, extractedText.length() > 0 ? CommonPattern.SPACE.split(extractedText).length : 0);
- ImageEntry ientry = null;
- for (ImageEntry ie: images) {
- if (ie.linkurl() != null && ie.linkurl().equals(target_url)) {ientry = ie; break;}
+ StringBuilder alttext = new StringBuilder(textContent == null ? 0 : textContent.getImages().size() * 30);
+ if (textContent != null) for (ImageEntry ie: textContent.getImages()) {
+ if (ie.alt().length() > 0) alttext.append(ie.alt()).append(' ');
}
- String alttext = ientry == null ? "" : ientry.alt();
- if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext);
+ while (alttext.length() > 0 && alttext.charAt(alttext.length() - 1) == ' ') alttext.setLength(alttext.length() - 1);
+ if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext.toString());
if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, alttext.length());
if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, alttext.length() > 0 ? CommonPattern.SPACE.split(alttext).length : 0);
- // add the target attributes
+ // add the target attributes
add(edge, WebgraphSchema.target_id_s, target_id);
final String target_url_string = target_url.toNormalform(false);
int pr_target = target_url_string.indexOf("://",0);