sixcooler 9 years ago
commit 6695e5cdd3

@ -85,7 +85,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<td valign="top" align="left"><input type="checkbox" name="item_#[count]#" value="mark_#[pk]#" /></td>
<td valign="top">#[type]#
#(isCrawlerStart)#::<br/><br/>
<a href="#[url]#" title="clone"><img src="env/grafics/doc.gif"><img src="env/grafics/right.gif"><img src="env/grafics/doc.gif"></a>::<br/><br/>
<a href="#[url]#" title="clone" target="_parent"><img src="env/grafics/doc.gif"><img src="env/grafics/right.gif"><img src="env/grafics/doc.gif"></a>::<br/><br/>
<script>
var f = document.createElement("form");
@ -103,7 +103,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
#{/attr}#
document.body.appendChild(f);
</script>
<a href="#" title="clone" onclick="document.forms['#[pk]#'].submit(); return false;"><img src="env/grafics/doc.gif"><img src="env/grafics/right.gif"><img src="env/grafics/doc.gif"></a>
<a href="#" title="clone" target="_parent" onclick="document.forms['#[pk]#'].submit(); return false;"><img src="env/grafics/doc.gif"><img src="env/grafics/right.gif"><img src="env/grafics/doc.gif"></a>
#(/isCrawlerStart)#</td>
<td valign="top">#[comment]#</td>

@ -57,7 +57,6 @@ import net.yacy.document.SentenceReader;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.Evaluation.Element;
import net.yacy.document.parser.images.genericImageParser;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.ISO639;
@ -552,23 +551,16 @@ public class ContentScraper extends AbstractScraper implements Scraper {
href = CharacterCoding.html2unicode(href);
AnchorURL url;
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
if (genericImageParser.SUPPORTED_EXTENSIONS.contains(ext)) {
// special handling of such urls: put them to the image urls
final ImageEntry ie = new ImageEntry(url, recursiveParse(url, tag.content.getChars()), -1, -1, -1);
this.images.add(ie);
} else {
if (followDenied()) {
String rel = tag.opts.getProperty("rel", EMPTY_STRING);
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
tag.opts.put("rel", rel);
}
tag.opts.put("text", stripAllTags(tag.content.getChars())); // strip any inline html in tag text like "<a ...> <span>test</span> </a>"
tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
url.setAll(tag.opts);
recursiveParse(url, tag.content.getChars());
this.anchors.add(url);
if (followDenied()) {
String rel = tag.opts.getProperty("rel", EMPTY_STRING);
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
tag.opts.put("rel", rel);
}
tag.opts.put("text", stripAllTags(tag.content.getChars())); // strip any inline html in tag text like "<a ...> <span>test</span> </a>"
tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
url.setAll(tag.opts);
recursiveParse(url, tag.content.getChars());
this.anchors.add(url);
}
this.evaluationScores.match(Element.apath, href);
}

@ -291,8 +291,7 @@ public final class TransformerWriter extends Writer {
}
if (this.transformer != null && this.transformer.isTag0(tagname)) {
// this single tag is collected at once here
char[] b = new char[0];
b = this.transformer.transformTag0(tag, quotechar);
char[] b = this.transformer.transformTag0(tag, quotechar);
return b;
} else if ((this.scraper != null && this.scraper.isTag1(tagname)) ||
(this.transformer != null && this.transformer.isTag1(tagname))) {

@ -65,30 +65,25 @@ import com.drew.metadata.Metadata;
import com.drew.metadata.Tag;
import com.drew.metadata.exif.GpsDirectory;
/**
* Parser for images: bmp, jpeg and all other formats supported by the Java Image I/O API.
* By default, Java ImageIO supports bmp, gif, jpg, jpeg, png and wbmp (and tif if jai-imageio is in the classpath/registered).
* See http://download.java.net/media/jai-imageio/javadoc/1.1/overview-summary.html
*/
public class genericImageParser extends AbstractParser implements Parser {
/**
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
public genericImageParser() {
super("Generic Image Parser");
SUPPORTED_EXTENSIONS.add("bmp");
// by default java ImageIO supports bmp, gif, jpg, jpeg, png, wbmp (tif if jai-imageio is in classpath/registered)
// http://download.java.net/media/jai-imageio/javadoc/1.1/overview-summary.html
SUPPORTED_EXTENSIONS.add("jpe"); // not listed in ImageIO extension but sometimes uses for jpeg
SUPPORTED_EXTENSIONS.addAll(Arrays.asList(ImageIO.getReaderFileSuffixes()));
SUPPORTED_MIME_TYPES.add("image/bmp");
SUPPORTED_MIME_TYPES.add("image/jpg"); // this is in fact a 'wrong' mime type. We leave it here because that is a common error that occurs in the internet frequently
SUPPORTED_MIME_TYPES.add("image/jpg"); // this is in fact a 'wrong' mime type. We leave it here because that is a common error that occurs in the internet frequently
SUPPORTED_MIME_TYPES.addAll(Arrays.asList(ImageIO.getReaderMIMETypes()));
}
public genericImageParser() {
super("Generic Image Parser");
}
@Override
public Document[] parse(
final AnchorURL location,
@ -130,7 +125,8 @@ public class genericImageParser extends AbstractParser implements Parser {
try {
b = FileUtils.read(source);
// check jpeg file signature (magic number FF D8 FF)
if ((b[0] != (byte) 0xFF) // cast to signed byte (-1)
if (b.length < 3
|| (b[0] != (byte) 0xFF) // cast to signed byte (-1)
|| (b[1] != (byte) 0xD8) //cast to signed byte (-40)
|| (b[2] != (byte) 0xFF)) {
throw new Parser.Failure("File has no jpeg signature", location);
@ -232,7 +228,7 @@ public class genericImageParser extends AbstractParser implements Parser {
return SUPPORTED_EXTENSIONS;
}
public static ImageInfo parseJavaImage(
private ImageInfo parseJavaImage(
final AnchorURL location,
final InputStream sourceStream) throws Parser.Failure {
BufferedImage image = null;
@ -247,7 +243,7 @@ public class genericImageParser extends AbstractParser implements Parser {
return parseJavaImage(location, image);
}
public static ImageInfo parseJavaImage(
private ImageInfo parseJavaImage(
final AnchorURL location,
final BufferedImage image) {
final ImageInfo ii = new ImageInfo(location);
@ -284,7 +280,7 @@ public class genericImageParser extends AbstractParser implements Parser {
return ii;
}
public static class ImageInfo {
private class ImageInfo {
public AnchorURL location;
public BufferedImage image;
public StringBuilder info;

Loading…
Cancel
Save