luc 9 years ago
commit ff963cbe23

@ -85,7 +85,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<td valign="top" align="left"><input type="checkbox" name="item_#[count]#" value="mark_#[pk]#" /></td>
<td valign="top">#[type]#
#(isCrawlerStart)#::<br/><br/>
<a href="#[url]#" title="clone"><img src="env/grafics/doc.gif"><img src="env/grafics/right.gif"><img src="env/grafics/doc.gif"></a>::<br/><br/>
<a href="#[url]#" title="clone" target="_parent"><img src="env/grafics/doc.gif"><img src="env/grafics/right.gif"><img src="env/grafics/doc.gif"></a>::<br/><br/>
<script>
var f = document.createElement("form");
@ -93,7 +93,9 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
f.setAttribute("enctype", "multipart/form-data");
f.setAttribute("accept-charset", "UTF-8");
f.setAttribute("action", "#[servlet]#");
f.setAttribute("target", "_parent");
f.setAttribute("id", "#[pk]#");
f.setAttribute("name", "#[pk]#");
#{attr}#
var e = document.createElement("input");
e.setAttribute("type", "hidden");

@ -945,6 +945,14 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
return this.searchpart;
}
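/**
* Returns the search part parameters as a key=value map,
* with keys and values in the internal url-encoded format.
* For unescaped (clear text) values
* @see #getAttributes()
*
* @return map of parameter name to parameter value (url-encoded), or null if the search part is null
*/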
public Map<String, String> getSearchpartMap() {
if (this.searchpart == null) return null;
this.searchpart = this.searchpart.replaceAll("&amp;", "&");
@ -1027,6 +1035,10 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
/**
* Evaluates url search part and returns attribute '=' value pairs
* the returned values are in clear text (without urlencoding).
*
* To get the parameter map with url-encoded keys and values
* @see #getSearchpartMap()
*
* @return map key=attribute name, value=string after '='
*/
@ -1037,9 +1049,9 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
for (final String element : questp) {
int p = element.indexOf('=');
if (p != -1) {
map.put(element.substring(0, p), element.substring(p + 1));
map.put(unescape(element.substring(0, p)), unescape(element.substring(p + 1)));
} else {
if (!element.isEmpty()) map.put(element, "");
if (!element.isEmpty()) map.put(unescape(element), "");
}
}
return map;

@ -57,7 +57,6 @@ import net.yacy.document.SentenceReader;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.Evaluation.Element;
import net.yacy.document.parser.images.genericImageParser;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.ISO639;
@ -552,23 +551,16 @@ public class ContentScraper extends AbstractScraper implements Scraper {
href = CharacterCoding.html2unicode(href);
AnchorURL url;
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
if (genericImageParser.SUPPORTED_EXTENSIONS.contains(ext)) {
// special handling of such urls: put them to the image urls
final ImageEntry ie = new ImageEntry(url, recursiveParse(url, tag.content.getChars()), -1, -1, -1);
this.images.add(ie);
} else {
if (followDenied()) {
String rel = tag.opts.getProperty("rel", EMPTY_STRING);
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
tag.opts.put("rel", rel);
}
tag.opts.put("text", stripAllTags(tag.content.getChars())); // strip any inline html in tag text like "<a ...> <span>test</span> </a>"
tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
url.setAll(tag.opts);
recursiveParse(url, tag.content.getChars());
this.anchors.add(url);
if (followDenied()) {
String rel = tag.opts.getProperty("rel", EMPTY_STRING);
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
tag.opts.put("rel", rel);
}
tag.opts.put("text", stripAllTags(tag.content.getChars())); // strip any inline html in tag text like "<a ...> <span>test</span> </a>"
tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
url.setAll(tag.opts);
recursiveParse(url, tag.content.getChars());
this.anchors.add(url);
}
this.evaluationScores.match(Element.apath, href);
}

@ -291,8 +291,7 @@ public final class TransformerWriter extends Writer {
}
if (this.transformer != null && this.transformer.isTag0(tagname)) {
// this single tag is collected at once here
char[] b = new char[0];
b = this.transformer.transformTag0(tag, quotechar);
char[] b = this.transformer.transformTag0(tag, quotechar);
return b;
} else if ((this.scraper != null && this.scraper.isTag1(tagname)) ||
(this.transformer != null && this.transformer.isTag1(tagname))) {

@ -65,30 +65,25 @@ import com.drew.metadata.Metadata;
import com.drew.metadata.Tag;
import com.drew.metadata.exif.GpsDirectory;
/**
* Parser for images: bmp, jpeg and all formats supported by the Java Image I/O API.
* By default Java ImageIO supports bmp, gif, jpg, jpeg, png, wbmp (tif if jai-imageio is in classpath/registered), see
* http://download.java.net/media/jai-imageio/javadoc/1.1/overview-summary.html
*/
public class genericImageParser extends AbstractParser implements Parser {
/**
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
public genericImageParser() {
super("Generic Image Parser");
SUPPORTED_EXTENSIONS.add("bmp");
// by default java ImageIO supports bmp, gif, jpg, jpeg, png, wbmp (tif if jai-imageio is in classpath/registered)
// http://download.java.net/media/jai-imageio/javadoc/1.1/overview-summary.html
SUPPORTED_EXTENSIONS.add("jpe"); // not listed in ImageIO extension but sometimes uses for jpeg
SUPPORTED_EXTENSIONS.addAll(Arrays.asList(ImageIO.getReaderFileSuffixes()));
SUPPORTED_MIME_TYPES.add("image/bmp");
SUPPORTED_MIME_TYPES.add("image/jpg"); // this is in fact a 'wrong' mime type. We leave it here because that is a common error that occurs in the internet frequently
SUPPORTED_MIME_TYPES.add("image/jpg"); // this is in fact a 'wrong' mime type. We leave it here because that is a common error that occurs in the internet frequently
SUPPORTED_MIME_TYPES.addAll(Arrays.asList(ImageIO.getReaderMIMETypes()));
}
public genericImageParser() {
super("Generic Image Parser");
}
@Override
public Document[] parse(
final AnchorURL location,
@ -130,7 +125,8 @@ public class genericImageParser extends AbstractParser implements Parser {
try {
b = FileUtils.read(source);
// check jpeg file signature (magic number FF D8 FF)
if ((b[0] != (byte) 0xFF) // cast to signed byte (-1)
if (b.length < 3
|| (b[0] != (byte) 0xFF) // cast to signed byte (-1)
|| (b[1] != (byte) 0xD8) //cast to signed byte (-40)
|| (b[2] != (byte) 0xFF)) {
throw new Parser.Failure("File has no jpeg signature", location);
@ -232,7 +228,7 @@ public class genericImageParser extends AbstractParser implements Parser {
return SUPPORTED_EXTENSIONS;
}
public static ImageInfo parseJavaImage(
private ImageInfo parseJavaImage(
final AnchorURL location,
final InputStream sourceStream) throws Parser.Failure {
BufferedImage image = null;
@ -247,7 +243,7 @@ public class genericImageParser extends AbstractParser implements Parser {
return parseJavaImage(location, image);
}
public static ImageInfo parseJavaImage(
private ImageInfo parseJavaImage(
final AnchorURL location,
final BufferedImage image) {
final ImageInfo ii = new ImageInfo(location);
@ -284,7 +280,7 @@ public class genericImageParser extends AbstractParser implements Parser {
return ii;
}
public static class ImageInfo {
private class ImageInfo {
public AnchorURL location;
public BufferedImage image;
public StringBuilder info;

Loading…
Cancel
Save