diff --git a/htroot/Table_API_p.html b/htroot/Table_API_p.html index 08e11e82b..720a0f110 100644 --- a/htroot/Table_API_p.html +++ b/htroot/Table_API_p.html @@ -85,7 +85,7 @@ To see a list of all APIs, please visit the #[type]# #(isCrawlerStart)#::

-
::

+ ::

- + #(/isCrawlerStart)# #[comment]# diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 3e2caa3fa..6bd73f704 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -57,7 +57,6 @@ import net.yacy.document.SentenceReader; import net.yacy.document.VocabularyScraper; import net.yacy.document.parser.htmlParser; import net.yacy.document.parser.html.Evaluation.Element; -import net.yacy.document.parser.images.genericImageParser; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.ISO639; @@ -552,23 +551,16 @@ public class ContentScraper extends AbstractScraper implements Scraper { href = CharacterCoding.html2unicode(href); AnchorURL url; if ((href.length() > 0) && ((url = absolutePath(href)) != null)) { - final String ext = MultiProtocolURL.getFileExtension(url.getFileName()); - if (genericImageParser.SUPPORTED_EXTENSIONS.contains(ext)) { - // special handling of such urls: put them to the image urls - final ImageEntry ie = new ImageEntry(url, recursiveParse(url, tag.content.getChars()), -1, -1, -1); - this.images.add(ie); - } else { - if (followDenied()) { - String rel = tag.opts.getProperty("rel", EMPTY_STRING); - if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow"; - tag.opts.put("rel", rel); - } - tag.opts.put("text", stripAllTags(tag.content.getChars())); // strip any inline html in tag text like " test " - tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute - url.setAll(tag.opts); - recursiveParse(url, tag.content.getChars()); - this.anchors.add(url); + if (followDenied()) { + String rel = tag.opts.getProperty("rel", EMPTY_STRING); + if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow"; + tag.opts.put("rel", rel); } + tag.opts.put("text", stripAllTags(tag.content.getChars())); // strip any inline html in tag text like " test " + tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute + url.setAll(tag.opts); + recursiveParse(url, tag.content.getChars()); + this.anchors.add(url); } this.evaluationScores.match(Element.apath, href); } diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java index 2f361395d..18bdb0ba0 100644 --- a/source/net/yacy/document/parser/html/TransformerWriter.java +++ b/source/net/yacy/document/parser/html/TransformerWriter.java @@ -291,8 +291,7 @@ public final class TransformerWriter extends Writer { } if (this.transformer != null && this.transformer.isTag0(tagname)) { // this single tag is collected at once here - char[] b = new char[0]; - b = this.transformer.transformTag0(tag, quotechar); + char[] b = this.transformer.transformTag0(tag, quotechar); return b; } else if ((this.scraper != null && this.scraper.isTag1(tagname)) || (this.transformer != null && this.transformer.isTag1(tagname))) { diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java index c6370f00b..74e04f779 100644 --- a/source/net/yacy/document/parser/images/genericImageParser.java +++ b/source/net/yacy/document/parser/images/genericImageParser.java @@ -65,30 +65,25 @@ import com.drew.metadata.Metadata; import com.drew.metadata.Tag; import com.drew.metadata.exif.GpsDirectory; +/** + * Parser for images, bmp and jpeg and all supported by the Java Image I/O API + * by default java ImageIO supports bmp, gif, jpg, jpeg, png, wbmp (tif if jai-imageio is in classpath/registered) + * http://download.java.net/media/jai-imageio/javadoc/1.1/overview-summary.html + */ public class genericImageParser extends AbstractParser implements Parser { - /** - * a list of mime types that are supported by this parser class - * @see #getSupportedMimeTypes() - */ - public static final Set SUPPORTED_MIME_TYPES = new HashSet(); - public static final Set SUPPORTED_EXTENSIONS = new HashSet(); - static { + public genericImageParser() { + super("Generic Image Parser"); + SUPPORTED_EXTENSIONS.add("bmp"); - // by default java ImageIO supports bmp, gif, jpg, jpeg, png, wbmp (tif if jai-imageio is in classpath/registered) - // http://download.java.net/media/jai-imageio/javadoc/1.1/overview-summary.html SUPPORTED_EXTENSIONS.add("jpe"); // not listed in ImageIO extension but sometimes uses for jpeg SUPPORTED_EXTENSIONS.addAll(Arrays.asList(ImageIO.getReaderFileSuffixes())); SUPPORTED_MIME_TYPES.add("image/bmp"); - SUPPORTED_MIME_TYPES.add("image/jpg"); // this is in fact a 'wrong' mime type. We leave it here because that is a common error that occurs in the internet frequently + SUPPORTED_MIME_TYPES.add("image/jpg"); // this is in fact a 'wrong' mime type. We leave it here because that is a common error that occurs in the internet frequently SUPPORTED_MIME_TYPES.addAll(Arrays.asList(ImageIO.getReaderMIMETypes())); } - public genericImageParser() { - super("Generic Image Parser"); - } - @Override public Document[] parse( final AnchorURL location, @@ -130,7 +125,8 @@ public class genericImageParser extends AbstractParser implements Parser { try { b = FileUtils.read(source); // check jpeg file signature (magic number FF D8 FF) - if ((b[0] != (byte) 0xFF) // cast to signed byte (-1) + if (b.length < 3 + || (b[0] != (byte) 0xFF) // cast to signed byte (-1) || (b[1] != (byte) 0xD8) //cast to signed byte (-40) || (b[2] != (byte) 0xFF)) { throw new Parser.Failure("File has no jpeg signature", location); @@ -232,7 +228,7 @@ public class genericImageParser extends AbstractParser implements Parser { return SUPPORTED_EXTENSIONS; } - public static ImageInfo parseJavaImage( + private ImageInfo parseJavaImage( final AnchorURL location, final InputStream sourceStream) throws Parser.Failure { BufferedImage image = null; @@ -247,7 +243,7 @@ public class genericImageParser extends AbstractParser implements Parser { return parseJavaImage(location, image); } - public static ImageInfo parseJavaImage( + private ImageInfo parseJavaImage( final AnchorURL location, final BufferedImage image) { final ImageInfo ii = new ImageInfo(location); @@ -284,7 +280,7 @@ public class genericImageParser extends AbstractParser implements Parser { return ii; } - public static class ImageInfo { + private class ImageInfo { public AnchorURL location; public BufferedImage image; public StringBuilder info;