From d54c5d310a1f80ad80b42d72db6fd738f39c30d3 Mon Sep 17 00:00:00 2001 From: reger Date: Sat, 10 Oct 2015 23:49:58 +0200 Subject: [PATCH 1/3] add links with image extension not automatically to image links. With the wide spread use e.g. of Wikimedia the url file extension of links with image extension often point to html. --- .../document/parser/html/ContentScraper.java | 26 +++++++------------ .../parser/html/TransformerWriter.java | 3 +-- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 3e2caa3fa..6bd73f704 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -57,7 +57,6 @@ import net.yacy.document.SentenceReader; import net.yacy.document.VocabularyScraper; import net.yacy.document.parser.htmlParser; import net.yacy.document.parser.html.Evaluation.Element; -import net.yacy.document.parser.images.genericImageParser; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.ISO639; @@ -552,23 +551,16 @@ public class ContentScraper extends AbstractScraper implements Scraper { href = CharacterCoding.html2unicode(href); AnchorURL url; if ((href.length() > 0) && ((url = absolutePath(href)) != null)) { - final String ext = MultiProtocolURL.getFileExtension(url.getFileName()); - if (genericImageParser.SUPPORTED_EXTENSIONS.contains(ext)) { - // special handling of such urls: put them to the image urls - final ImageEntry ie = new ImageEntry(url, recursiveParse(url, tag.content.getChars()), -1, -1, -1); - this.images.add(ie); - } else { - if (followDenied()) { - String rel = tag.opts.getProperty("rel", EMPTY_STRING); - if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow"; - tag.opts.put("rel", rel); - } - tag.opts.put("text", stripAllTags(tag.content.getChars())); // strip any inline html in tag text like " test " - tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute - url.setAll(tag.opts); - recursiveParse(url, tag.content.getChars()); - this.anchors.add(url); + if (followDenied()) { + String rel = tag.opts.getProperty("rel", EMPTY_STRING); + if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow"; + tag.opts.put("rel", rel); } + tag.opts.put("text", stripAllTags(tag.content.getChars())); // strip any inline html in tag text like " test " + tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute + url.setAll(tag.opts); + recursiveParse(url, tag.content.getChars()); + this.anchors.add(url); } this.evaluationScores.match(Element.apath, href); } diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java index 2f361395d..18bdb0ba0 100644 --- a/source/net/yacy/document/parser/html/TransformerWriter.java +++ b/source/net/yacy/document/parser/html/TransformerWriter.java @@ -291,8 +291,7 @@ public final class TransformerWriter extends Writer { } if (this.transformer != null && this.transformer.isTag0(tagname)) { // this single tag is collected at once here - char[] b = new char[0]; - b = this.transformer.transformTag0(tag, quotechar); + char[] b = this.transformer.transformTag0(tag, quotechar); return b; } else if ((this.scraper != null && this.scraper.isTag1(tagname)) || (this.transformer != null && this.transformer.isTag1(tagname))) { From 78e8c6f3e5d3f3b0928b7c5ce1253dad934f1345 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 11 Oct 2015 01:23:52 +0200 Subject: [PATCH 2/3] refactor special handling (static override) of SUPPORTED_EXTENSIONS/MIME_TYPES not used for genericImageParser --- .../parser/images/genericImageParser.java | 32 ++++++++----------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java index c6370f00b..74e04f779 100644 --- a/source/net/yacy/document/parser/images/genericImageParser.java +++ b/source/net/yacy/document/parser/images/genericImageParser.java @@ -65,30 +65,25 @@ import com.drew.metadata.Metadata; import com.drew.metadata.Tag; import com.drew.metadata.exif.GpsDirectory; +/** + * Parser for images, bmp and jpeg and all supported by the Java Image I/O API + * by default java ImageIO supports bmp, gif, jpg, jpeg, png, wbmp (tif if jai-imageio is in classpath/registered) + * http://download.java.net/media/jai-imageio/javadoc/1.1/overview-summary.html + */ public class genericImageParser extends AbstractParser implements Parser { - /** - * a list of mime types that are supported by this parser class - * @see #getSupportedMimeTypes() - */ - public static final Set SUPPORTED_MIME_TYPES = new HashSet(); - public static final Set SUPPORTED_EXTENSIONS = new HashSet(); - static { + public genericImageParser() { + super("Generic Image Parser"); + SUPPORTED_EXTENSIONS.add("bmp"); - // by default java ImageIO supports bmp, gif, jpg, jpeg, png, wbmp (tif if jai-imageio is in classpath/registered) - // http://download.java.net/media/jai-imageio/javadoc/1.1/overview-summary.html SUPPORTED_EXTENSIONS.add("jpe"); // not listed in ImageIO extension but sometimes uses for jpeg SUPPORTED_EXTENSIONS.addAll(Arrays.asList(ImageIO.getReaderFileSuffixes())); SUPPORTED_MIME_TYPES.add("image/bmp"); - SUPPORTED_MIME_TYPES.add("image/jpg"); // this is in fact a 'wrong' mime type. We leave it here because that is a common error that occurs in the internet frequently + SUPPORTED_MIME_TYPES.add("image/jpg"); // this is in fact a 'wrong' mime type. We leave it here because that is a common error that occurs in the internet frequently SUPPORTED_MIME_TYPES.addAll(Arrays.asList(ImageIO.getReaderMIMETypes())); } - public genericImageParser() { - super("Generic Image Parser"); - } - @Override public Document[] parse( final AnchorURL location, @@ -130,7 +125,8 @@ public class genericImageParser extends AbstractParser implements Parser { try { b = FileUtils.read(source); // check jpeg file signature (magic number FF D8 FF) - if ((b[0] != (byte) 0xFF) // cast to signed byte (-1) + if (b.length < 3 + || (b[0] != (byte) 0xFF) // cast to signed byte (-1) || (b[1] != (byte) 0xD8) //cast to signed byte (-40) || (b[2] != (byte) 0xFF)) { throw new Parser.Failure("File has no jpeg signature", location); @@ -232,7 +228,7 @@ public class genericImageParser extends AbstractParser implements Parser { return SUPPORTED_EXTENSIONS; } - public static ImageInfo parseJavaImage( + private ImageInfo parseJavaImage( final AnchorURL location, final InputStream sourceStream) throws Parser.Failure { BufferedImage image = null; @@ -247,7 +243,7 @@ public class genericImageParser extends AbstractParser implements Parser { return parseJavaImage(location, image); } - public static ImageInfo parseJavaImage( + private ImageInfo parseJavaImage( final AnchorURL location, final BufferedImage image) { final ImageInfo ii = new ImageInfo(location); @@ -284,7 +280,7 @@ public class genericImageParser extends AbstractParser implements Parser { return ii; } - public static class ImageInfo { + private class ImageInfo { public AnchorURL location; public BufferedImage image; public StringBuilder info; From 10b0eb106fb5321098cdf2a02d8055c59a8fa6a8 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 11 Oct 2015 06:06:40 +0200 Subject: [PATCH 3/3] fix link target on iframe list in CrawlProfileEditor --- htroot/Table_API_p.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/htroot/Table_API_p.html b/htroot/Table_API_p.html index 08e11e82b..720a0f110 100644 --- a/htroot/Table_API_p.html +++ b/htroot/Table_API_p.html @@ -85,7 +85,7 @@ To see a list of all APIs, please visit the #[type]# #(isCrawlerStart)#::

-
::

+ ::

- + #(/isCrawlerStart)# #[comment]#