Merge branch 'master' of git@gitorious.org:yacy/rc1.git

pull/1/head
orbiter 11 years ago
commit 95780eed32

@@ -43,8 +43,10 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.SitemapImporter;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.data.WorkTables;
import net.yacy.document.Document;
import net.yacy.document.parser.html.ContentScraper;
@@ -218,7 +220,11 @@ public class Crawler_p {
if (crawlName.length() == 0 && sitemapURLStr.length() > 0) crawlName = "sitemap loader for " + sitemapURLStr;
// delete old robots entries
for (DigestURL ru: rootURLs) sb.robots.delete(ru);
for (DigestURL ru: rootURLs) {
sb.robots.delete(ru);
try {Cache.delete(RobotsTxt.robotsURL(RobotsTxt.getHostPort(ru)).hash());} catch (IOException e) {}
}
try {sb.robots.clear();} catch (IOException e) {} // to be safe: clear all.
// set the crawl filter
String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING);

@@ -639,7 +639,7 @@ public class CrawlQueues {
} else {
// starting a load from the internet
request.setStatus("worker-loading", WorkflowJob.STATUS_RUNNING);
String result = null;
String error = null;
// load a resource and push queue entry to switchboard queue
// returns null if everything went fine, a fail reason string if a problem occurred
@@ -651,23 +651,29 @@
if (CrawlQueues.log.isFine()) {
CrawlQueues.log.fine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");
}
result = "no content (possibly caused by cache policy)";
error = "no content (possibly caused by cache policy)";
} else {
request.setStatus("loaded", WorkflowJob.STATUS_RUNNING);
final String storedFailMessage = CrawlQueues.this.sb.toIndexer(response);
request.setStatus("enqueued-" + ((storedFailMessage == null) ? "ok" : "fail"), WorkflowJob.STATUS_FINISHED);
result = (storedFailMessage == null) ? null : "not enqueued to indexer: " + storedFailMessage;
error = (storedFailMessage == null) ? null : "not enqueued to indexer: " + storedFailMessage;
}
} catch (final IOException e) {
request.setStatus("error", WorkflowJob.STATUS_FINISHED);
if (CrawlQueues.log.isFine()) {
CrawlQueues.log.fine("problem loading " + request.url().toString() + ": " + e.getMessage());
}
result = "load error - " + e.getMessage();
error = "load error - " + e.getMessage();
}
if (result != null) {
CrawlQueues.this.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "cannot load: " + result, -1);
if (error != null) {
if (error.endsWith("$")) {
// the "$" mark at the end of the error message means that the error was already pushed to the error-db by the reporting method
// thus we only push this message if we don't have that mark
error = error.substring(0, error.length() - 1).trim();
} else {
CrawlQueues.this.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "cannot load: " + error, -1);
}
request.setStatus("worker-error", WorkflowJob.STATUS_FINISHED);
} else {
request.setStatus("worker-processed", WorkflowJob.STATUS_FINISHED);
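A minimal, self-contained sketch of the "$" convention introduced in this hunk, using hypothetical stand-in names (pushToErrorDb, handleLoadResult) rather than the actual YaCy classes: a thrower that has already pushed the failure to the error-db appends '$' to its message (see the HTTPLoader hunks below), and the worker strips the mark instead of reporting the failure twice.

public class ErrorMarkSketch {

    // stand-in for CrawlQueues.this.errorURL.push(...): records a failure in the error-db
    static void pushToErrorDb(final String reason) {
        System.out.println("error-db <- " + reason);
    }

    // mirrors the worker logic above
    static void handleLoadResult(String error) {
        if (error == null) return; // loaded fine
        if (error.endsWith("$")) {
            // the thrower already pushed this failure to the error-db; just strip the mark
            error = error.substring(0, error.length() - 1).trim();
        } else {
            pushToErrorDb("cannot load: " + error);
        }
        System.out.println("worker-error: " + error);
    }

    public static void main(final String[] args) {
        handleLoadResult("load error - connection refused");              // pushed to the error-db here
        handleLoadResult("retry counter exceeded. Processing aborted.$"); // already pushed by the thrower
    }
}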

@@ -78,7 +78,7 @@ public final class HTTPLoader {
if (retryCount < 0) {
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
}
DigestURL url = request.url();
@@ -94,7 +94,7 @@
final String hostlow = host.toLowerCase();
if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
}
// resolve yacy and yacyh domains
@@ -141,7 +141,7 @@
if (redirectionUrlString.isEmpty()) {
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
throw new IOException("REJECTED EMPTY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
throw new IOException("REJECTED EMPTY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
}
// normalize URL
@@ -161,7 +161,7 @@
// if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) {
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.");
throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.$");
}
// retry crawling with new url
@@ -170,11 +170,11 @@
}
// we don't want to follow redirects
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
} else if (responseBody == null) {
// no response, reject file
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
} else if (statusCode == 200 || statusCode == 203) {
// the transfer is ok
@@ -185,7 +185,7 @@
// check length again in case it was not possible to get the length before loading
if (maxFileSize >= 0 && contentLength > maxFileSize) {
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)$");
}
// create a new cache entry
@@ -202,7 +202,7 @@
} else {
// if the response has not the right response type then reject file
this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
}
}

@@ -155,13 +155,7 @@ public class RobotsTxt {
}
// generating the proper url to download the robots txt
DigestURL robotsURL = null;
try {
robotsURL = new DigestURL((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt");
} catch (final MalformedURLException e) {
log.severe("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
robotsURL = null;
}
DigestURL robotsURL = robotsURL(urlHostPort);
Response response = null;
if (robotsURL != null) {
@@ -230,14 +224,8 @@
if (robotsTable.containsKey(robotsTable.encodedKey(urlHostPort))) return;
// generating the proper url to download the robots txt
DigestURL robotsURL = null;
try {
robotsURL = new DigestURL((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt");
} catch (final MalformedURLException e) {
log.severe("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
robotsURL = null;
}
DigestURL robotsURL = robotsURL(urlHostPort);
Response response = null;
if (robotsURL != null) {
if (log.isFine()) log.fine("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
@@ -332,7 +320,7 @@
}
}
static final String getHostPort(final MultiProtocolURL theURL) {
public static final String getHostPort(final MultiProtocolURL theURL) {
int port = theURL.getPort();
if (port == -1) {
if (theURL.getProtocol().equalsIgnoreCase("http")) {
@@ -349,7 +337,18 @@
sb.append(host).append(':').append(Integer.toString(port));
return sb.toString();
}
public static DigestURL robotsURL(final String urlHostPort) {
DigestURL robotsURL = null;
try {
robotsURL = new DigestURL((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt");
} catch (final MalformedURLException e) {
log.severe("Unable to generate robots.txt URL for host:port '" + urlHostPort + "'.", e);
robotsURL = null;
}
return robotsURL;
}
public static class CheckEntry {
public final DigestURL digestURL;
public final RobotsTxtEntry robotsTxtEntry;
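Both formerly duplicated construction blocks above now delegate to the robotsURL(String) helper introduced in the previous hunk. A hedged sketch of what that helper computes, with java.net.URL standing in for YaCy's DigestURL: the robots.txt location for a given host:port string, choosing https only for port 443.

import java.net.MalformedURLException;
import java.net.URL;

public class RobotsUrlSketch {

    // stand-in for RobotsTxt.robotsURL(String); the original returns a DigestURL
    // and logs via log.severe(...) before returning null on malformed input
    static URL robotsURL(final String urlHostPort) {
        try {
            return new URL((urlHostPort.endsWith(":443") ? "https://" : "http://") + urlHostPort + "/robots.txt");
        } catch (final MalformedURLException e) {
            return null;
        }
    }

    public static void main(final String[] args) {
        System.out.println(robotsURL("example.org:80"));  // http://example.org:80/robots.txt
        System.out.println(robotsURL("example.org:443")); // https://example.org:443/robots.txt
    }
}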

@@ -29,7 +29,6 @@
package net.yacy.document.parser.html;
import java.util.Properties;
import java.util.Set;
import net.yacy.kelondro.util.MemoryControl;
@@ -72,10 +71,10 @@ public abstract class AbstractScraper implements Scraper {
// the other methods must take into account to construct the return value correctly
@Override
public abstract void scrapeTag0(String tagname, Properties tagopts);
public abstract void scrapeTag0(ContentScraper.Tag tag);
@Override
public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text);
public abstract void scrapeTag1(ContentScraper.Tag tag);
public static String stripAllTags(final char[] s) {
if (s.length > 80 && !MemoryControl.request(s.length * 2, false)) return "";

@@ -24,7 +24,6 @@
package net.yacy.document.parser.html;
import java.util.Properties;
import java.util.TreeSet;
public abstract class AbstractTransformer implements Transformer {
@@ -58,13 +57,13 @@ public abstract class AbstractTransformer implements Transformer {
// the other methods must take into account to construct the return value correctly
@Override
public char[] transformTag0(final String tagname, final Properties tagopts, final char quotechar) {
return TransformerWriter.genTag0(tagname, tagopts, quotechar);
public char[] transformTag0(final ContentScraper.Tag tag, final char quotechar) {
return TransformerWriter.genTag0(tag.name, tag.opts, quotechar);
}
@Override
public char[] transformTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) {
return TransformerWriter.genTag1(tagname, tagopts, text, quotechar);
public char[] transformTag1(final ContentScraper.Tag tag, final char quotechar) {
return TransformerWriter.genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar);
}
@Override

@@ -59,6 +59,7 @@ import net.yacy.cora.util.NumberTools;
import net.yacy.document.SentenceReader;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.Evaluation.Element;
import net.yacy.document.parser.images.genericImageParser;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.ISO639;
@@ -80,7 +81,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
singleton, pair;
}
public enum Tag {
public enum TagName {
html(TagType.singleton), // scraped as singleton to get attached properties like 'lang'
body(TagType.singleton), // scraped as singleton to get attached properties like 'class'
div(TagType.singleton), // scraped as singleton to get attached properties like 'id'
@@ -111,14 +112,49 @@ public class ContentScraper extends AbstractScraper implements Scraper {
style(TagType.pair);
public TagType type;
private Tag(final TagType type) {
private TagName(final TagType type) {
this.type = type;
}
}
public static class Tag {
public String name;
public Properties opts;
public CharBuffer content;
public Tag(final String name) {
this.name = name;
this.opts = new Properties();
this.content = new CharBuffer(100);
}
public Tag(final String name, final Properties opts) {
this.name = name;
this.opts = opts;
this.content = new CharBuffer(100);
}
public Tag(final String name, final Properties opts, final CharBuffer content) {
this.name = name;
this.opts = opts;
this.content = content;
}
public void close() {
this.name = null;
this.opts = null;
if (this.content != null) this.content.close();
this.content = null;
}
@Override
public void finalize() {
this.close();
}
@Override
public String toString() {
return "<" + name + " " + opts + ">" + content + "</" + name + ">";
}
}
// all these tags must be given in lowercase, because the tags from the files are compared in lowercase
static {
for (final Tag tag: Tag.values()) {
for (final TagName tag: TagName.values()) {
if (tag.type == TagType.singleton) linkTags0.add(tag.name());
if (tag.type == TagType.pair) linkTags1.add(tag.name());
}
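The renamed TagName enum keeps the tag catalogue, while the new Tag class bundles what previously travelled through the scraper interfaces as separate tagname/tagopts/text parameters. A small usage sketch, with java.util.Properties as in the original and StringBuilder standing in for YaCy's CharBuffer:

import java.util.Properties;

public class TagSketch {

    // simplified stand-in for ContentScraper.Tag: name + attributes + collected body text
    static class Tag {
        final String name;
        final Properties opts;
        final StringBuilder content = new StringBuilder(100);

        Tag(final String name, final Properties opts) {
            this.name = name;
            this.opts = opts;
        }

        @Override
        public String toString() {
            return "<" + name + " " + opts + ">" + content + "</" + name + ">";
        }
    }

    public static void main(final String[] args) {
        final Properties opts = new Properties();
        opts.setProperty("href", "http://example.org/");
        final Tag a = new Tag("a", opts);
        a.content.append("example link");
        // a scraper callback can now read name, attributes and body text from one object
        System.out.println(a);
    }
}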
@@ -321,88 +357,88 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
@Override
public void scrapeTag0(final String tagname, final Properties tagopts) {
if (tagname.equalsIgnoreCase("img")) {
final String src = tagopts.getProperty("src", EMPTY_STRING);
public void scrapeTag0(Tag tag) {
if (tag.name.equalsIgnoreCase("img")) {
final String src = tag.opts.getProperty("src", EMPTY_STRING);
try {
if (src.length() > 0) {
final AnchorURL url = absolutePath(src);
if (url != null) {
final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", EMPTY_STRING), width, height, -1);
final int width = Integer.parseInt(tag.opts.getProperty("width", "-1"));
final int height = Integer.parseInt(tag.opts.getProperty("height", "-1"));
final ImageEntry ie = new ImageEntry(url, tag.opts.getProperty("alt", EMPTY_STRING), width, height, -1);
this.images.add(ie);
}
}
} catch (final NumberFormatException e) {}
this.evaluationScores.match(Element.imgpath, src);
} else if(tagname.equalsIgnoreCase("base")) {
} else if(tag.name.equalsIgnoreCase("base")) {
try {
this.root = new DigestURL(tagopts.getProperty("href", EMPTY_STRING));
this.root = new DigestURL(tag.opts.getProperty("href", EMPTY_STRING));
} catch (final MalformedURLException e) {}
} else if (tagname.equalsIgnoreCase("frame")) {
final AnchorURL src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
tagopts.put("src", src.toNormalform(true));
src.setAll(tagopts);
} else if (tag.name.equalsIgnoreCase("frame")) {
final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
tag.opts.put("src", src.toNormalform(true));
src.setAll(tag.opts);
this.anchors.add(src);
this.frames.add(src);
this.evaluationScores.match(Element.framepath, src.toNormalform(true));
} else if (tagname.equalsIgnoreCase("body")) {
final String c = tagopts.getProperty("class", EMPTY_STRING);
} else if (tag.name.equalsIgnoreCase("body")) {
final String c = tag.opts.getProperty("class", EMPTY_STRING);
this.evaluationScores.match(Element.bodyclass, c);
} else if (tagname.equalsIgnoreCase("div")) {
final String id = tagopts.getProperty("id", EMPTY_STRING);
} else if (tag.name.equalsIgnoreCase("div")) {
final String id = tag.opts.getProperty("id", EMPTY_STRING);
this.evaluationScores.match(Element.divid, id);
final String itemtype = tagopts.getProperty("itemtype", EMPTY_STRING);
final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
breadcrumbs++;
}
} else if (tagname.equalsIgnoreCase("meta")) {
final String content = tagopts.getProperty("content", EMPTY_STRING);
String name = tagopts.getProperty("name", EMPTY_STRING);
} else if (tag.name.equalsIgnoreCase("meta")) {
final String content = tag.opts.getProperty("content", EMPTY_STRING);
String name = tag.opts.getProperty("name", EMPTY_STRING);
if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
if (name.toLowerCase().equals("generator")) {
this.evaluationScores.match(Element.metagenerator, content);
}
}
name = tagopts.getProperty("http-equiv", EMPTY_STRING);
name = tag.opts.getProperty("http-equiv", EMPTY_STRING);
if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
}
name = tagopts.getProperty("property", EMPTY_STRING);
name = tag.opts.getProperty("property", EMPTY_STRING);
if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
}
} else if (tagname.equalsIgnoreCase("area")) {
final String areatitle = cleanLine(tagopts.getProperty("title", EMPTY_STRING));
//String alt = tagopts.getProperty("alt",EMPTY_STRING);
final String href = tagopts.getProperty("href", EMPTY_STRING);
} else if (tag.name.equalsIgnoreCase("area")) {
final String areatitle = cleanLine(tag.opts.getProperty("title", EMPTY_STRING));
//String alt = tag.opts.getProperty("alt",EMPTY_STRING);
final String href = tag.opts.getProperty("href", EMPTY_STRING);
if (href.length() > 0) {
tagopts.put("name", areatitle);
tag.opts.put("name", areatitle);
AnchorURL url = absolutePath(href);
tagopts.put("href", url.toNormalform(true));
url.setAll(tagopts);
tag.opts.put("href", url.toNormalform(true));
url.setAll(tag.opts);
this.anchors.add(url);
}
} else if (tagname.equalsIgnoreCase("link")) {
final String href = tagopts.getProperty("href", EMPTY_STRING);
} else if (tag.name.equalsIgnoreCase("link")) {
final String href = tag.opts.getProperty("href", EMPTY_STRING);
final AnchorURL newLink = absolutePath(href);
if (newLink != null) {
tagopts.put("href", newLink.toNormalform(true));
String rel = tagopts.getProperty("rel", EMPTY_STRING);
final String linktitle = tagopts.getProperty("title", EMPTY_STRING);
final String type = tagopts.getProperty("type", EMPTY_STRING);
final String hreflang = tagopts.getProperty("hreflang", EMPTY_STRING);
tag.opts.put("href", newLink.toNormalform(true));
String rel = tag.opts.getProperty("rel", EMPTY_STRING);
final String linktitle = tag.opts.getProperty("title", EMPTY_STRING);
final String type = tag.opts.getProperty("type", EMPTY_STRING);
final String hreflang = tag.opts.getProperty("hreflang", EMPTY_STRING);
if (rel.equalsIgnoreCase("shortcut icon")) {
final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1);
this.images.add(ie);
this.favicon = newLink;
} else if (rel.equalsIgnoreCase("canonical")) {
tagopts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next());
newLink.setAll(tagopts);
tag.opts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next());
newLink.setAll(tag.opts);
this.anchors.add(newLink);
this.canonical = newLink;
} else if (rel.equalsIgnoreCase("publisher")) {
@@ -417,130 +453,130 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.css.put(newLink, rel);
this.evaluationScores.match(Element.csspath, href);
} else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) {
tagopts.put("name", linktitle);
newLink.setAll(tagopts);
tag.opts.put("name", linktitle);
newLink.setAll(tag.opts);
this.anchors.add(newLink);
}
}
} else if(tagname.equalsIgnoreCase("embed")) {
final String src = tagopts.getProperty("src", EMPTY_STRING);
} else if(tag.name.equalsIgnoreCase("embed")) {
final String src = tag.opts.getProperty("src", EMPTY_STRING);
try {
if (src.length() > 0) {
final AnchorURL url = absolutePath(src);
if (url != null) {
final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
tagopts.put("src", url.toNormalform(true));
final EmbedEntry ie = new EmbedEntry(url, width, height, tagopts.getProperty("type", EMPTY_STRING), tagopts.getProperty("pluginspage", EMPTY_STRING));
final int width = Integer.parseInt(tag.opts.getProperty("width", "-1"));
final int height = Integer.parseInt(tag.opts.getProperty("height", "-1"));
tag.opts.put("src", url.toNormalform(true));
final EmbedEntry ie = new EmbedEntry(url, width, height, tag.opts.getProperty("type", EMPTY_STRING), tag.opts.getProperty("pluginspage", EMPTY_STRING));
this.embeds.put(url, ie);
url.setAll(tagopts);
url.setAll(tag.opts);
this.anchors.add(url);
}
}
} catch (final NumberFormatException e) {}
} else if(tagname.equalsIgnoreCase("param")) {
final String name = tagopts.getProperty("name", EMPTY_STRING);
} else if(tag.name.equalsIgnoreCase("param")) {
final String name = tag.opts.getProperty("name", EMPTY_STRING);
if (name.equalsIgnoreCase("movie")) {
AnchorURL url = absolutePath(tagopts.getProperty("value", EMPTY_STRING));
tagopts.put("value", url.toNormalform(true));
url.setAll(tagopts);
AnchorURL url = absolutePath(tag.opts.getProperty("value", EMPTY_STRING));
tag.opts.put("value", url.toNormalform(true));
url.setAll(tag.opts);
this.anchors.add(url);
}
} else if (tagname.equalsIgnoreCase("iframe")) {
final AnchorURL src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
tagopts.put("src", src.toNormalform(true));
src.setAll(tagopts);
} else if (tag.name.equalsIgnoreCase("iframe")) {
final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
tag.opts.put("src", src.toNormalform(true));
src.setAll(tag.opts);
this.anchors.add(src);
this.iframes.add(src);
this.evaluationScores.match(Element.iframepath, src.toNormalform(true));
} else if (tagname.equalsIgnoreCase("html")) {
final String lang = tagopts.getProperty("lang", EMPTY_STRING);
} else if (tag.name.equalsIgnoreCase("html")) {
final String lang = tag.opts.getProperty("lang", EMPTY_STRING);
if (!lang.isEmpty()) // fake a language meta to preserve detection from <html lang="xx" />
this.metas.put("dc.language",lang.substring(0,2)); // fix found entries like "hu-hu"
}
// fire event
fireScrapeTag0(tagname, tagopts);
fireScrapeTag0(tag.name, tag.opts);
}
@Override
public void scrapeTag1(final String tagname, final Properties tagopts, char[] text) {
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text));
if (tagname.equalsIgnoreCase("a") && text.length < 2048) {
String href = tagopts.getProperty("href", EMPTY_STRING);
public void scrapeTag1(Tag tag) {
// System.out.println("ScrapeTag1: tag.name=" + tag.name + ", opts=" + tag.opts.toString() + ", content=" + tag.content.toString());
if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
String href = tag.opts.getProperty("href", EMPTY_STRING);
href = CharacterCoding.html2unicode(href);
AnchorURL url;
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
if (ext.equals("png") || ext.equals("gif") || ext.equals("jpg") || ext.equals("jpeg") || ext.equals("tiff") || ext.equals("tif")) {
if (genericImageParser.SUPPORTED_EXTENSIONS.contains(ext)) {
// special handling of such urls: put them to the image urls
final ImageEntry ie = new ImageEntry(url, recursiveParse(url, text), -1, -1, -1);
final ImageEntry ie = new ImageEntry(url, recursiveParse(url, tag.content.getChars()), -1, -1, -1);
this.images.add(ie);
} else {
if (followDenied()) {
String rel = tagopts.getProperty("rel", EMPTY_STRING);
String rel = tag.opts.getProperty("rel", EMPTY_STRING);
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
tagopts.put("rel", rel);
tag.opts.put("rel", rel);
}
tagopts.put("text", new String(text));
tagopts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
url.setAll(tagopts);
recursiveParse(url, text);
tag.opts.put("text", new String(tag.content.getChars()));
tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
url.setAll(tag.opts);
recursiveParse(url, tag.content.getChars());
this.anchors.add(url);
}
}
this.evaluationScores.match(Element.apath, href);
}
final String h;
if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
h = recursiveParse(null, text);
if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[0].add(h);
} else if((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) {
h = recursiveParse(null, text);
} else if((tag.name.equalsIgnoreCase("h2")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[1].add(h);
} else if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) {
h = recursiveParse(null, text);
} else if ((tag.name.equalsIgnoreCase("h3")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[2].add(h);
} else if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) {
h = recursiveParse(null, text);
} else if ((tag.name.equalsIgnoreCase("h4")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[3].add(h);
} else if ((tagname.equalsIgnoreCase("h5")) && (text.length < 1024)) {
h = recursiveParse(null, text);
} else if ((tag.name.equalsIgnoreCase("h5")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[4].add(h);
} else if ((tagname.equalsIgnoreCase("h6")) && (text.length < 1024)) {
h = recursiveParse(null, text);
} else if ((tag.name.equalsIgnoreCase("h6")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[5].add(h);
} else if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
String t = recursiveParse(null, text);
this.titles.add(t);
this.evaluationScores.match(Element.title, t);
} else if ((tagname.equalsIgnoreCase("b")) && (text.length < 1024)) {
h = recursiveParse(null, text);
} else if ((tag.name.equalsIgnoreCase("title")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
this.titles.add(h);
this.evaluationScores.match(Element.title, h);
} else if ((tag.name.equalsIgnoreCase("b")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.bold.inc(h);
} else if ((tagname.equalsIgnoreCase("strong")) && (text.length < 1024)) {
h = recursiveParse(null, text);
} else if ((tag.name.equalsIgnoreCase("strong")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.bold.inc(h);
} else if ((tagname.equalsIgnoreCase("i")) && (text.length < 1024)) {
h = recursiveParse(null, text);
} else if ((tag.name.equalsIgnoreCase("i")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.italic.inc(h);
} else if ((tagname.equalsIgnoreCase("u")) && (text.length < 1024)) {
h = recursiveParse(null, text);
} else if ((tag.name.equalsIgnoreCase("u")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.underline.inc(h);
} else if ((tagname.equalsIgnoreCase("li")) && (text.length < 1024)) {
h = recursiveParse(null, text);
} else if ((tag.name.equalsIgnoreCase("li")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.li.add(h);
} else if (tagname.equalsIgnoreCase("script")) {
final String src = tagopts.getProperty("src", EMPTY_STRING);
} else if (tag.name.equalsIgnoreCase("script")) {
final String src = tag.opts.getProperty("src", EMPTY_STRING);
if (src.length() > 0) {
this.script.add(absolutePath(src));
this.evaluationScores.match(Element.scriptpath, src);
} else {
this.evaluationScores.match(Element.scriptcode, LB.matcher(new String(text)).replaceAll(" "));
this.evaluationScores.match(Element.scriptcode, LB.matcher(new String(tag.content.getChars())).replaceAll(" "));
}
}
// fire event
fireScrapeTag1(tagname, tagopts, text);
fireScrapeTag1(tag.name, tag.opts, tag.content.getChars());
}
@@ -570,15 +606,20 @@ public class ContentScraper extends AbstractScraper implements Scraper {
for (final AnchorURL entry: scraper.getAnchors()) {
this.anchors.add(entry);
}
String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars())));
for (ImageEntry ie: scraper.images) {
if (linkurl != null) {
ie.setLinkurl(linkurl);
ie.setAnchortext(new String(inlineHtml));
ie.setAnchortext(line);
}
// this image may have been added recently from the same location (as this is a recursive parse)
// we want to keep only one of them, check if they are equal
if (this.images.size() > 0 && this.images.get(this.images.size() - 1).url().equals(ie.url())) {
this.images.remove(this.images.size() - 1);
}
this.images.add(ie);
}
String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars())));
scraper.close();
return line;
}
@@ -681,6 +722,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
public String getText() {
this.content.trim();
try {
return this.content.toString();
} catch (final OutOfMemoryError e) {

@@ -29,7 +29,6 @@ import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Properties;
import java.util.TreeSet;
import net.yacy.cora.document.encoding.ASCII;
@@ -115,27 +114,27 @@ public class ContentTransformer extends AbstractTransformer implements Transform
}
@Override
public char[] transformTag0(final String tagname, final Properties tagopts, final char quotechar) {
if (tagname.equals("img")) {
public char[] transformTag0(final ContentScraper.Tag tag, final char quotechar) {
if (tag.name.equals("img")) {
// check bluelist
if (bluelistHit(tagopts.getProperty("src", "").toCharArray())) return genBlueLetters(5);
if (bluelistHit(tagopts.getProperty("alt", "").toCharArray())) return genBlueLetters(5);
if (bluelistHit(tag.opts.getProperty("src", "").toCharArray())) return genBlueLetters(5);
if (bluelistHit(tag.opts.getProperty("alt", "").toCharArray())) return genBlueLetters(5);
// replace image alternative name
tagopts.setProperty("alt", new String(transformText(tagopts.getProperty("alt", "").toCharArray())));
tag.opts.setProperty("alt", new String(transformText(tag.opts.getProperty("alt", "").toCharArray())));
}
if (tagname.equals("input") && (tagopts.getProperty("type") != null && tagopts.getProperty("type").equals("submit"))) {
if (tag.name.equals("input") && (tag.opts.getProperty("type") != null && tag.opts.getProperty("type").equals("submit"))) {
// rewrite button name
tagopts.setProperty("value", new String(transformText(tagopts.getProperty("value", "").toCharArray())));
tag.opts.setProperty("value", new String(transformText(tag.opts.getProperty("value", "").toCharArray())));
}
return TransformerWriter.genTag0(tagname, tagopts, quotechar);
return TransformerWriter.genTag0(tag.name, tag.opts, quotechar);
}
@Override
public char[] transformTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) {
if (bluelistHit(tagopts.getProperty("href","").toCharArray())) return genBlueLetters(text.length);
if (bluelistHit(text)) return genBlueLetters(text.length);
return TransformerWriter.genTag1(tagname, tagopts, text, quotechar);
public char[] transformTag1(final ContentScraper.Tag tag, final char quotechar) {
if (bluelistHit(tag.opts.getProperty("href","").toCharArray())) return genBlueLetters(tag.content.length());
if (bluelistHit(tag.content.getChars())) return genBlueLetters(tag.content.length());
return TransformerWriter.genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar);
}
@Override

@@ -24,8 +24,6 @@
package net.yacy.document.parser.html;
import java.util.Properties;
public interface Scraper {
public boolean isTag0(String tag);
@@ -34,9 +32,9 @@ public interface Scraper {
public void scrapeText(char[] text, String insideTag);
public void scrapeTag0(String tagname, Properties tagopts);
public void scrapeTag0(ContentScraper.Tag tag);
public void scrapeTag1(String tagname, Properties tagopts, char[] text);
public void scrapeTag1(ContentScraper.Tag tag);
public void scrapeComment(final char[] comment);

@@ -24,8 +24,6 @@
package net.yacy.document.parser.html;
import java.util.Properties;
public interface Transformer {
// the init method is used to initialize the transformer with some values
@@ -52,10 +50,10 @@ public interface Transformer {
public char[] transformText(char[] text);
// method that is called when a body-less tag occurs
public char[] transformTag0(String tagname, Properties tagopts, char quotechar);
public char[] transformTag0(ContentScraper.Tag tag, char quotechar);
// method that is called when a body-containing text occurs
public char[] transformTag1(String tagname, Properties tagopts, char[] text, char quotechar);
public char[] transformTag1(ContentScraper.Tag tag, char quotechar);
public void close();
}

@@ -43,6 +43,7 @@ import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.util.Enumeration;
import java.util.Properties;
import java.util.Stack;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
@@ -62,9 +63,7 @@ public final class TransformerWriter extends Writer {
private final OutputStream outStream;
private OutputStreamWriter out;
private CharBuffer buffer;
private String filterTag;
private Properties filterOpts;
private CharBuffer filterCont;
private Stack<ContentScraper.Tag> tagStack;
private final Scraper scraper;
private final Transformer transformer;
private boolean inSingleQuote;
@@ -72,7 +71,7 @@ public final class TransformerWriter extends Writer {
private boolean inComment;
private boolean binaryUnsuspect;
private final boolean passbyIfBinarySuspect;
public TransformerWriter(
final OutputStream outStream,
final Charset charSet,
@@ -95,9 +94,7 @@ public final class TransformerWriter extends Writer {
this.scraper = scraper;
this.transformer = transformer;
this.buffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, initialBufferSize);
this.filterTag = null;
this.filterOpts = null;
this.filterCont = null;
this.tagStack = new Stack<ContentScraper.Tag>();
this.inSingleQuote = false;
this.inDoubleQuote = false;
this.inComment = false;
@@ -186,63 +183,105 @@ public final class TransformerWriter extends Writer {
return result;
}
private char[] filterTag(final String tag, final boolean opening, final char[] content, final char quotechar) {
//System.out.println("filterTag: filterTag=" + ((this.filterTag == null) ? "null" : this.filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + new String(content)); // debug
// distinguish the following cases:
// - (1) not collecting data for a tag and getting no tag (not opener and not close)
// - (2) not collecting data for a tag and getting a tag opener
// - (3) not collecting data for a tag and getting a tag close
// - (4) collecting data for a tag and getting no tag (not opener and not close)
// - (5) collecting data for a tag and getting a new/different tag opener without closing the previous tag
// - (6) collecting data for a tag and getting a tag close for the wrong tag (a different than the opener)
// - (7) collecting data for a tag and getting the correct close tag for that collecting tag
if (this.filterTag == null) {
/**
* the token processor distinguishes three different types of input: opening tag, closing tag, text content
* @param in - the token to be processed
* @param quotechar
* @return a processed version of the token
*/
private char[] tokenProcessor(final char[] in, final char quotechar) {
if (in.length == 0) return in;
// scan the string and parse structure
if (in.length <= 2 || in[0] != lb) return filterTag(in); // this is a text
// this is a tag
String tag;
int tagend;
if (in[1] == '/') {
// a closing tag
tagend = tagEnd(in, 2);
tag = new String(in, 2, tagend - 2).toLowerCase();
final char[] text = new char[in.length - tagend - 1];
System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
return filterTag(text, quotechar, tag, false);
}
// an opening tag
tagend = tagEnd(in, 1);
tag = new String(in, 1, tagend - 1).toLowerCase();
final char[] text = new char[in.length - tagend - 1];
System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
return filterTag(text, quotechar, tag, true);
}
// distinguish the following cases:
// - (1) not collecting data for a tag and getting no tag (not opener and not close)
// - (2) not collecting data for a tag and getting a tag opener
// - (3) not collecting data for a tag and getting a tag close
// - (4) collecting data for a tag and getting no tag (not opener and not close)
// - (5) collecting data for a tag and getting a new/different tag opener without closing the previous tag
// - (6) collecting data for a tag and getting a tag close for the wrong tag (a different than the opener)
// - (7) collecting data for a tag and getting the correct close tag for that collecting tag
/**
 * handle text content: either hand it to the scraper/transformer directly (case 1, no tag open)
 * or collect it for the tag on top of the stack (case 4)
 * @param content the text content to process
 * @return the filtered content; an empty array if the content was collected on the tag stack
 */
private char[] filterTag(final char[] content) {
if (this.tagStack.size() == 0) {
// we are not collecting tag text -> case (1) - (3)
// case (1): this is not a tag opener/closer
if (this.scraper != null && content.length > 0) this.scraper.scrapeText(content, null);
if (this.transformer != null) return this.transformer.transformText(content);
return content;
}
if (tag == null) {
// case (1): this is not a tag opener/closer
if (this.scraper != null && content.length > 0) this.scraper.scrapeText(content, null);
if (this.transformer != null) return this.transformer.transformText(content);
return content;
}
// we are collecting tag text for the tag on top of the stack -> case (4) - (7)
// case (4): getting no tag, go on collecting content
if (this.scraper != null) {
this.scraper.scrapeText(content, this.tagStack.lastElement().name);
}
if (this.transformer != null) {
this.tagStack.lastElement().content.append(this.transformer.transformText(content));
} else {
this.tagStack.lastElement().content.append(content);
}
return new char[0];
}
private char[] filterTag(final char[] content, final char quotechar, final String tagname, final boolean opening) {
assert tagname != null;
if (this.tagStack.size() == 0) {
// we are not collecting tag text -> case (1) - (3)
// we have a new tag
if (opening) {
// case (2):
return filterTagOpening(tag, content, quotechar);
return filterTagOpening(tagname, content, quotechar);
}
// it's a close tag
// it's a close tag where none should be
// case (3): we ignore that thing and return it again
return genTag0raw(tag, false, content);
return genTag0raw(tagname, false, content);
}
// we are collecting tag text for the tag on top of the stack -> case (4) - (7)
if (tag == null || tag.equals("!")) {
// case (4): getting no tag, go on collecting content
if (this.scraper != null) {
this.scraper.scrapeText(content, this.filterTag);
}
if (this.transformer != null) {
this.filterCont.append(this.transformer.transformText(content));
} else {
this.filterCont.append(content);
}
return new char[0];
}
if (tagname.equals("!")) return filterTag(content); // a '!' pseudo-tag (comment/doctype) is collected as text content
// it's a tag! which one?
if (opening) {
// case (5): the opening should not be here. But we keep the order anyway
this.filterCont.append(filterTagOpening(tag, content, quotechar));
return filterTagCloseing(quotechar);
this.tagStack.lastElement().content.append(filterTagOpening(tagname, content, quotechar));
return new char[0];
}
if (!tag.equalsIgnoreCase(this.filterTag)) {
if (!tagname.equalsIgnoreCase(this.tagStack.lastElement().name)) {
// case (6): its a closing tag, but the wrong one. just add it.
this.filterCont.append(genTag0raw(tag, opening, content));
this.tagStack.lastElement().content.append(genTag0raw(tagname, opening, content));
return new char[0];
}
@@ -250,101 +289,66 @@ public final class TransformerWriter extends Writer {
return filterTagCloseing(quotechar);
}
private char[] filterTagOpening(final String tag, final char[] content, final char quotechar) {
if (this.scraper != null && this.scraper.isTag0(tag)) {
private char[] filterTagOpening(final String tagname, final char[] content, final char quotechar) {
final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
ContentScraper.Tag tag = new ContentScraper.Tag(tagname, charBuffer.propParser());
charBuffer.close();
if (this.scraper != null && this.scraper.isTag0(tagname)) {
// this single tag is collected at once here
final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
this.scraper.scrapeTag0(tag, charBuffer.propParser());
charBuffer.close();
this.scraper.scrapeTag0(tag);
}
if (this.transformer != null && this.transformer.isTag0(tag)) {
if (this.transformer != null && this.transformer.isTag0(tagname)) {
// this single tag is collected at once here
final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
char[] b = new char[0];
try {
b = this.transformer.transformTag0(tag, scb.propParser(), quotechar);
} finally {
scb.close();
}
b = this.transformer.transformTag0(tag, quotechar);
return b;
} else if ((this.scraper != null && this.scraper.isTag1(tag)) ||
(this.transformer != null && this.transformer.isTag1(tag))) {
// ok, start collecting
this.filterTag = tag;
final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
this.filterOpts = scb.propParser();
scb.close();
if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset();
} else if ((this.scraper != null && this.scraper.isTag1(tagname)) ||
(this.transformer != null && this.transformer.isTag1(tagname))) {
// ok, start collecting; we don't push this here to the scraper or transformer; we do that when the tag is closed.
this.tagStack.push(tag);
return new char[0];
} else {
// we ignore that thing and return it again
return genTag0raw(tag, true, content);
return genTag0raw(tagname, true, content);
}
}
private char[] filterTagCloseing(final char quotechar) {
char[] ret;
if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
ContentScraper.Tag tag = this.tagStack.lastElement();
if (this.scraper != null) this.scraper.scrapeTag1(tag);
if (this.transformer != null) {
ret = this.transformer.transformTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
ret = this.transformer.transformTag1(tag, quotechar);
} else {
ret = genTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
ret = genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar);
}
if ((this.scraper != null && this.scraper.isTag1(tag.name)) ||
(this.transformer != null && this.transformer.isTag1(tag.name))) {
// remove the tag from the stack as soon as the tag is processed
this.tagStack.pop();
// at this point the characters from the recently processed tag must be attached to the previous tag
if (this.tagStack.size() > 0) this.tagStack.lastElement().content.append(ret);
}
this.filterTag = null;
this.filterOpts = null;
this.filterCont = null;
return ret;
}
private char[] filterFinalize(final char quotechar) {
if (this.filterTag == null) {
if (this.tagStack.size() == 0) {
return new char[0];
}
// it's our closing tag! return complete result.
char[] ret;
if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
if (this.scraper != null) this.scraper.scrapeTag1(this.tagStack.lastElement());
if (this.transformer != null) {
ret = this.transformer.transformTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
ret = this.transformer.transformTag1(this.tagStack.lastElement(), quotechar);
} else {
ret = genTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
ret = genTag1(this.tagStack.lastElement().name, this.tagStack.lastElement().opts, this.tagStack.lastElement().content.getChars(), quotechar);
}
this.filterTag = null;
this.filterOpts = null;
this.filterCont = null;
this.tagStack.pop();
return ret;
}
private char[] filterSentence(final char[] in, final char quotechar) {
if (in.length == 0) return in;
//System.out.println("filterSentence, quotechar = \"" + quotechar + "\": " + new String(in)); // debug
// scan the string and parse structure
if (in.length > 2 && in[0] == lb) {
// a tag
String tag;
int tagend;
if (in[1] == '/') {
// a closing tag
tagend = tagEnd(in, 2);
tag = new String(in, 2, tagend - 2).toLowerCase();
final char[] text = new char[in.length - tagend - 1];
System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
return filterTag(tag, false, text, quotechar);
}
// an opening tag
tagend = tagEnd(in, 1);
tag = new String(in, 1, tagend - 1).toLowerCase();
final char[] text = new char[in.length - tagend - 1];
System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
return filterTag(tag, true, text, quotechar);
}
// a text
return filterTag(null, true, in, quotechar);
}
private static int tagEnd(final char[] tag, final int start) {
char c;
for (int i = start; i < tag.length; i++) {
@@ -358,6 +362,14 @@ public final class TransformerWriter extends Writer {
return tag.length - 1;
}
/**
* this is the tokenizer of the parser: it splits the input into pieces which are
* - quoted text parts
* - commented text parts
* - tags (opening and closing)
* - text content between all these parts
* The tokens are then parsed with the tokenProcessor method
*/
@Override
public void write(final int c) throws IOException {
//System.out.println((char) c);
@@ -375,7 +387,7 @@ public final class TransformerWriter extends Writer {
if ((c == rb) && (this.buffer.length() > 0 && this.buffer.charAt(0) == lb)) {
this.inSingleQuote = false;
// the tag ends here. after filtering: pass on
filtered = filterSentence(this.buffer.getChars(), singlequote);
filtered = tokenProcessor(this.buffer.getChars(), singlequote);
if (this.out != null) { this.out.write(filtered); }
// this.buffer = new serverByteBuffer();
this.buffer.reset();
@@ -387,7 +399,7 @@ public final class TransformerWriter extends Writer {
if (c == rb && this.buffer.length() > 0 && this.buffer.charAt(0) == lb) {
this.inDoubleQuote = false;
// the tag ends here. after filtering: pass on
filtered = filterSentence(this.buffer.getChars(), doublequote);
filtered = tokenProcessor(this.buffer.getChars(), doublequote);
if (this.out != null) this.out.write(filtered);
// this.buffer = new serverByteBuffer();
this.buffer.reset();
@@ -425,7 +437,7 @@ public final class TransformerWriter extends Writer {
} else if (c == rb) {
this.buffer.append(c);
// the tag ends here. after filtering: pass on
filtered = filterSentence(this.buffer.getChars(), doublequote);
filtered = tokenProcessor(this.buffer.getChars(), doublequote);
if (this.out != null) this.out.write(filtered);
// this.buffer = new serverByteBuffer();
this.buffer.reset();
@@ -433,7 +445,7 @@ public final class TransformerWriter extends Writer {
// this is an error case
// we consider that there is one rb missing
if (this.buffer.length() > 0) {
filtered = filterSentence(this.buffer.getChars(), doublequote);
filtered = tokenProcessor(this.buffer.getChars(), doublequote);
if (this.out != null) this.out.write(filtered);
}
// this.buffer = new serverByteBuffer();
@@ -447,7 +459,7 @@ public final class TransformerWriter extends Writer {
if (c == lb) {
// the text ends here
if (this.buffer.length() > 0) {
filtered = filterSentence(this.buffer.getChars(), doublequote);
filtered = tokenProcessor(this.buffer.getChars(), doublequote);
if (this.out != null) this.out.write(filtered);
}
// this.buffer = new serverByteBuffer();
@@ -492,7 +504,7 @@ public final class TransformerWriter extends Writer {
final char quotechar = (this.inSingleQuote) ? singlequote : doublequote;
if (this.buffer != null) {
if (this.buffer.length() > 0) {
final char[] filtered = filterSentence(this.buffer.getChars(), quotechar);
final char[] filtered = tokenProcessor(this.buffer.getChars(), quotechar);
if (this.out != null) this.out.write(filtered);
}
this.buffer.close();
@@ -504,10 +516,8 @@ public final class TransformerWriter extends Writer {
this.out.flush();
this.out.close();
}
this.filterTag = null;
this.filterOpts = null;
if (this.filterCont != null) this.filterCont.close();
this.filterCont = null;
this.tagStack.clear();
this.tagStack = null;
if (this.scraper != null) this.scraper.finish();
}
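Replacing the filterTag/filterOpts/filterCont triple with a Stack<ContentScraper.Tag> is what lets nested pair tags be collected correctly: when an inner tag closes, its rendered output is appended to the content of the tag below it on the stack. A compressed, self-contained sketch of that flow with hypothetical names and without the scraper/transformer callbacks:

import java.util.Stack;

public class TagStackSketch {

    static class Tag {
        final String name;
        final StringBuilder content = new StringBuilder();
        Tag(final String name) { this.name = name; }
    }

    static final Stack<Tag> tagStack = new Stack<Tag>();
    static final StringBuilder out = new StringBuilder();

    static void openTag(final String name) { tagStack.push(new Tag(name)); }

    static void text(final String t) {
        if (tagStack.isEmpty()) out.append(t);  // case (1): no tag open
        else tagStack.peek().content.append(t); // case (4): collect for the innermost open tag
    }

    static void closeTag() {
        final Tag tag = tagStack.pop();         // case (7): the matching close tag
        final String rendered = "<" + tag.name + ">" + tag.content + "</" + tag.name + ">";
        // attach the rendered inner tag to the enclosing tag, as filterTagCloseing does above
        if (tagStack.isEmpty()) out.append(rendered);
        else tagStack.peek().content.append(rendered);
    }

    public static void main(final String[] args) {
        openTag("b"); text("bold "); openTag("i"); text("and italic"); closeTag(); closeTag();
        System.out.println(out); // <b>bold <i>and italic</i></b>
    }
}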

@@ -28,16 +28,18 @@ import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.LinkedHashMap;
import java.util.regex.Pattern;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.CommonPattern;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@@ -53,9 +55,7 @@ import com.ibm.icu.text.CharsetDetector;
public class htmlParser extends AbstractParser implements Parser {
private static final Pattern patternUnderline = Pattern.compile("_");
private final int maxLinks = 10000;
private Charset detectedcharset;
private static final int maxLinks = 10000;
public htmlParser() {
super("Streaming HTML Parser");
@@ -97,9 +97,10 @@ public class htmlParser extends AbstractParser implements Parser {
try {
// first get a document from the parsed html
final ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream, maxLinks);
Charset[] detectedcharsetcontainer = new Charset[]{null};
final ContentScraper scraper = parseToScraper(location, documentCharset, detectedcharsetcontainer, sourceStream, maxLinks);
// parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharset.name(), scraper);
final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
return new Document[]{document};
} catch (final IOException e) {
@@ -155,9 +156,27 @@ public class htmlParser extends AbstractParser implements Parser {
return ppd;
}
public ContentScraper parseToScraper(
public static ContentScraper parseToScraper(final DigestURL location, final String documentCharset, String input, int maxLinks) throws IOException {
Charset[] detectedcharsetcontainer = new Charset[]{null};
InputStream sourceStream;
try {
sourceStream = new ByteArrayInputStream(documentCharset == null ? UTF8.getBytes(input) : input.getBytes(documentCharset));
} catch (UnsupportedEncodingException e) {
sourceStream = new ByteArrayInputStream(UTF8.getBytes(input));
}
ContentScraper scraper;
try {
scraper = parseToScraper(location, documentCharset, detectedcharsetcontainer, sourceStream, maxLinks);
} catch (Failure e) {
throw new IOException(e.getMessage());
}
return scraper;
}
public static ContentScraper parseToScraper(
final DigestURL location,
final String documentCharset,
Charset[] detectedcharsetcontainer,
InputStream sourceStream,
final int maxLinks) throws Parser.Failure, IOException {
@@ -171,13 +190,15 @@ public class htmlParser extends AbstractParser implements Parser {
// nothing found: try to find a meta-tag
if (charset == null) {
ScraperInputStream htmlFilter = null;
try {
final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream, documentCharset, location, null, false, maxLinks);
htmlFilter = new ScraperInputStream(sourceStream, documentCharset, location, null, false, maxLinks);
sourceStream = htmlFilter;
charset = htmlFilter.detectCharset();
htmlFilter.close();
} catch (final IOException e1) {
throw new Parser.Failure("Charset error:" + e1.getMessage(), location);
} finally {
if (htmlFilter != null) htmlFilter.close();
}
}
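In the hunk below, the former instance field detectedcharset becomes a one-element Charset[] out-parameter, because parseToScraper is static now and can no longer store the detection result on the parser instance. A minimal sketch of the idiom with hypothetical names:

import java.nio.charset.Charset;

public class OutParamSketch {

    // a static method cannot write to an instance field, so the one-element array
    // carries the detected charset back to the caller alongside the return value
    static String parse(final String declaredCharset, final Charset[] detectedcharsetcontainer) {
        try {
            detectedcharsetcontainer[0] = Charset.forName(declaredCharset);
        } catch (final Exception e) { // IllegalCharsetNameException, UnsupportedCharsetException
            detectedcharsetcontainer[0] = Charset.defaultCharset();
        }
        return "document parsed as " + detectedcharsetcontainer[0].name();
    }

    public static void main(final String[] args) {
        final Charset[] container = new Charset[] { null };
        System.out.println(parse("utf-8", container));
        System.out.println(container[0]); // the detected charset is visible to the caller
    }
}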
@@ -193,21 +214,22 @@ public class htmlParser extends AbstractParser implements Parser {
// wtf? still nothing, just take system-standard
if (charset == null) {
detectedcharset = Charset.defaultCharset();
detectedcharsetcontainer[0] = Charset.defaultCharset();
} else {
try {
detectedcharset = Charset.forName(charset);
detectedcharsetcontainer[0] = Charset.forName(charset);
} catch (final IllegalCharsetNameException e) {
detectedcharset = Charset.defaultCharset();
detectedcharsetcontainer[0] = Charset.defaultCharset();
} catch (final UnsupportedCharsetException e) {
detectedcharset = Charset.defaultCharset();
detectedcharsetcontainer[0] = Charset.defaultCharset();
}
}
// parsing the content
final ContentScraper scraper = new ContentScraper(location, maxLinks);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
try {
FileUtils.copy(sourceStream, writer, detectedcharset);
FileUtils.copy(sourceStream, writer, detectedcharsetcontainer[0]);
} catch (final IOException e) {
throw new Parser.Failure("IO error:" + e.getMessage(), location);
} finally {
@@ -250,7 +272,7 @@ public class htmlParser extends AbstractParser implements Parser {
if (encoding.startsWith("MACINTOSH")) encoding = "MacRoman";
// fix wrong fill characters
encoding = patternUnderline.matcher(encoding).replaceAll("-");
encoding = CommonPattern.UNDERSCORE.matcher(encoding).replaceAll("-");
if (encoding.matches("GB[_-]?2312([-_]80)?")) return "GB2312";
if (encoding.matches(".*UTF[-_]?8.*")) return "UTF-8";
@@ -306,10 +328,9 @@ public class htmlParser extends AbstractParser implements Parser {
try {
url = new AnchorURL(args[0]);
final byte[] content = url.get(ClientIdentification.yacyInternetCrawlerAgent, null, null);
final Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content));
final Document[] document = new htmlParser().parse(url, "text/html", "utf-8", new ByteArrayInputStream(content));
final String title = document[0].dc_title();
System.out.println(title);
System.out.println(CharacterCoding.unicode2html(title, false));
} catch (final MalformedURLException e) {
e.printStackTrace();
} catch (final IOException e) {
@@ -319,6 +340,7 @@ public class htmlParser extends AbstractParser implements Parser {
} catch (final InterruptedException e) {
e.printStackTrace();
}
System.exit(0);
}
}
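The new static String overload of parseToScraper makes it possible to scrape an in-memory HTML fragment; WebgraphConfiguration uses exactly that below to clean anchor texts. A hedged usage sketch, assuming the YaCy classes are on the classpath (the DigestURL only serves as the base for resolving relative links):

import java.io.IOException;
import java.net.MalformedURLException;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.htmlParser;

public class ParseToScraperSketch {
    public static void main(final String[] args) throws MalformedURLException, IOException {
        final DigestURL base = new DigestURL("http://example.org/");
        // a null charset means the fragment is read as UTF-8, see the overload above
        final ContentScraper scraper = htmlParser.parseToScraper(
                base, null, "<a href='x.html'><img src='y.png' alt='an image'>linked</a>", 10);
        System.out.println(scraper.getText());   // the tag-stripped text
        System.out.println(scraper.getImages()); // the image entries found in the fragment
    }
}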

@@ -81,6 +81,8 @@ public class genericImageParser extends AbstractParser implements Parser {
SUPPORTED_EXTENSIONS.add("jpeg");
SUPPORTED_EXTENSIONS.add("jpe");
SUPPORTED_EXTENSIONS.add("bmp");
SUPPORTED_EXTENSIONS.add("tif");
SUPPORTED_EXTENSIONS.add("tiff");
SUPPORTED_MIME_TYPES.add("image/png");
SUPPORTED_MIME_TYPES.add("image/gif");
SUPPORTED_MIME_TYPES.add("image/jpeg");
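Registering tif/tiff here backs the ContentScraper change above, where the hard-coded extension check for image anchors was replaced by genericImageParser.SUPPORTED_EXTENSIONS.contains(ext). A trivial sketch of that set-membership test, with the (partial) extension list mirrored locally:

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class ExtensionCheckSketch {

    // mirrors genericImageParser.SUPPORTED_EXTENSIONS after this hunk (subset shown)
    static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>(
            Arrays.asList("png", "gif", "jpg", "jpeg", "jpe", "bmp", "tif", "tiff"));

    public static void main(final String[] args) {
        // the scraper now asks one shared set which anchors point at images
        for (final String ext : new String[] { "tiff", "html" }) {
            System.out.println(ext + " -> " + SUPPORTED_EXTENSIONS.contains(ext));
        }
    }
}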

@@ -163,12 +163,13 @@ public final class LoaderDispatcher {
check = this.loaderSteering.remove(request.url());
if (check != null) check.release(1000);
return response;
} catch (final IOException e) {
} catch (final IOException e) {
throw new IOException(e);
} finally {
// release the semaphore anyway
check = this.loaderSteering.remove(request.url());
if (check != null) check.release(1000);
// Very noisy: ConcurrentLog.logException(e);
throw new IOException(e);
if (check != null) check.release(1000);
// Very noisy: ConcurrentLog.logException(e);
}
}
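The reshuffled hunk above moves the semaphore release into the finally block, so it runs exactly once on both the success path and the IOException path instead of being duplicated before the rethrow. A minimal illustration of the pattern with java.util.concurrent.Semaphore:

import java.io.IOException;
import java.util.concurrent.Semaphore;

public class ReleaseInFinallySketch {

    static String load(final Semaphore gate, final boolean fail) throws IOException, InterruptedException {
        gate.acquire();
        try {
            if (fail) throw new IOException("simulated load failure");
            return "response";
        } finally {
            gate.release(); // runs on both paths, so the permit can never leak
        }
    }

    public static void main(final String[] args) throws Exception {
        final Semaphore gate = new Semaphore(1);
        System.out.println(load(gate, false));
        try {
            load(gate, true);
        } catch (final IOException e) {
            System.out.println("caught: " + e.getMessage());
        }
        System.out.println("permits left: " + gate.availablePermits()); // 1
    }
}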
@@ -190,7 +191,7 @@ public final class LoaderDispatcher {
// check if url is in blacklist
if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
this.sb.crawlQueues.errorURL.push(request.url(), crawlProfile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
}
// check if we have the page in the cache
@@ -244,13 +245,13 @@ public final class LoaderDispatcher {
}
}
// check case where we want results from the cache exclusively, and never from the internet (offline mode)
// check case where we want results from the cache exclusively, and never from the Internet (offline mode)
if (cacheStrategy == CacheStrategy.CACHEONLY) {
// we had a chance to get the content from the cache .. its over. We don't have it.
throw new IOException("cache only strategy");
}
// now forget about the cache, nothing there. Try to load the content from the internet
// now forget about the cache, nothing there. Try to load the content from the Internet
// check access time: this is a double-check (we checked possibly already in the balancer)
// to make sure that we don't DoS the target by mistake
@@ -302,7 +303,7 @@ public final class LoaderDispatcher {
// no caching wanted. That's ok, do not write any message
return response;
}
// second check tells us if the protocoll tells us something about caching
// second check tells us if the protocol tells us something about caching
final String storeError = response.shallStoreCacheForCrawler();
if (storeError == null) {
try {

@@ -114,8 +114,14 @@ public class ErrorCache {
if (this.fulltext.getDefaultConnector() != null && failCategory.store) {
// send the error to solr
try {
SolrInputDocument errorDoc = failDoc.toSolr(this.fulltext.getDefaultConfiguration());
this.fulltext.getDefaultConnector().add(errorDoc);
// do not overwrite error reports with error reports
SolrDocument olddoc = this.fulltext.getDefaultConnector().getDocumentById(ASCII.String(failDoc.getDigestURL().hash()), CollectionSchema.httpstatus_i.getSolrFieldName());
if (olddoc == null ||
olddoc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName()) == null ||
((Integer) olddoc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName())) == 200) {
SolrInputDocument errorDoc = failDoc.toSolr(this.fulltext.getDefaultConfiguration());
this.fulltext.getDefaultConnector().add(errorDoc);
}
} catch (final IOException e) {
ConcurrentLog.warn("SOLR", "failed to send error " + url.toNormalform(true) + " to solr: " + e.getMessage());
}
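The added guard stores a fail document only when Solr has no document for the URL yet, when the stored document carries no HTTP status, or when the stored status is 200; in other words, only success records may be replaced by error records. A self-contained sketch of that decision, with a plain map standing in for the Solr connector:

import java.util.HashMap;
import java.util.Map;

public class ErrorOverwriteSketch {

    // stand-in for the Solr index: url hash -> stored httpstatus_i field (value may be null)
    static final Map<String, Integer> index = new HashMap<String, Integer>();

    static boolean shouldStoreError(final String urlHash) {
        if (!index.containsKey(urlHash)) return true;  // nothing stored yet
        final Integer oldStatus = index.get(urlHash);
        return oldStatus == null || oldStatus == 200;  // only success records may be overwritten
    }

    public static void main(final String[] args) {
        index.put("a", 200); // a previously successful load
        index.put("b", 404); // a previously recorded error
        System.out.println(shouldStoreError("a")); // true  - a success may become an error
        System.out.println(shouldStoreError("b")); // false - do not overwrite an error with an error
        System.out.println(shouldStoreError("c")); // true  - unknown document
    }
}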

@@ -176,7 +176,7 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
remaining--;
}
}
if (nodes.size() == 0) ConcurrentLog.warn("HyperlinkGraph", "could not find a root node for " + hostname + " in " + this.edges.size() + " edges");
if (nodes.size() == 0 && this.edges.size() > 0) ConcurrentLog.warn("HyperlinkGraph", "could not find a root node for " + hostname + " in " + this.edges.size() + " edges");
// recursively step into depth and find next level
int depth = 1;

@@ -51,6 +51,8 @@ import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
public class WebgraphConfiguration extends SchemaConfiguration implements Serializable {
@@ -219,26 +221,31 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source);
processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
}
// parse the anchor text to find embedded images and to get the cleaned plain text
ContentScraper textContent = null;
try {textContent = htmlParser.parseToScraper(source_url, null, text, 10);} catch (IOException e) {}
String extractedText = textContent == null ? text : textContent.getText(); // fall back to the raw text if parsing failed
// add the source attributes about the target
if (allAttr || contains(WebgraphSchema.target_inbound_b)) add(edge, WebgraphSchema.target_inbound_b, inbound);
if (allAttr || contains(WebgraphSchema.target_name_t)) add(edge, WebgraphSchema.target_name_t, name.length() > 0 ? name : "");
if (allAttr || contains(WebgraphSchema.target_rel_s)) add(edge, WebgraphSchema.target_rel_s, rel.length() > 0 ? rel : "");
if (allAttr || contains(WebgraphSchema.target_relflags_i)) add(edge, WebgraphSchema.target_relflags_i, relEval(rel.length() > 0 ? rel : ""));
if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, text.length() > 0 ? text : "");
if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, text.length());
if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, text.length() > 0 ? CommonPattern.SPACE.split(text).length : 0);
if (allAttr || contains(WebgraphSchema.target_linktext_t)) add(edge, WebgraphSchema.target_linktext_t, extractedText.length() > 0 ? extractedText : "");
if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, extractedText.length());
if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, extractedText.length() > 0 ? CommonPattern.SPACE.split(extractedText).length : 0);
ImageEntry ientry = null;
for (ImageEntry ie: images) {
if (ie.linkurl() != null && ie.linkurl().equals(target_url)) {ientry = ie; break;}
StringBuilder alttext = new StringBuilder(textContent == null ? 0 : textContent.getImages().size() * 30);
if (textContent != null) for (ImageEntry ie: textContent.getImages()) {
if (ie.alt().length() > 0) alttext.append(ie.alt()).append(' ');
}
String alttext = ientry == null ? "" : ientry.alt();
if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext);
while (alttext.length() > 0 && alttext.charAt(alttext.length() - 1) == ' ') alttext.setLength(alttext.length() - 1);
if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, alttext.toString());
if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, alttext.length());
if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, alttext.length() > 0 ? CommonPattern.SPACE.split(alttext).length : 0);
// add the target attributes
// add the target attributes
add(edge, WebgraphSchema.target_id_s, target_id);
final String target_url_string = target_url.toNormalform(false);
int pr_target = target_url_string.indexOf("://",0);
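Instead of matching a single ImageEntry against the target URL, the new code scrapes the anchor text itself and concatenates the alt texts of all images found in it, trimming trailing blanks afterwards. A simplified sketch of that aggregation, with plain strings in place of ImageEntry:

import java.util.Arrays;
import java.util.List;

public class AltTextSketch {

    public static void main(final String[] args) {
        // stand-ins for textContent.getImages(): only the alt attribute matters here
        final List<String> altTexts = Arrays.asList("first image", "", "second image");

        final StringBuilder alttext = new StringBuilder(altTexts.size() * 30);
        for (final String alt : altTexts) {
            if (alt.length() > 0) alttext.append(alt).append(' ');
        }
        // trim the trailing blank exactly as the new code above does
        while (alttext.length() > 0 && alttext.charAt(alttext.length() - 1) == ' ') {
            alttext.setLength(alttext.length() - 1);
        }
        System.out.println("'" + alttext + "'"); // 'first image second image'
    }
}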
