Check the MIME type in preference to the file extension when applying image metadata handling

pull/14/head
reger 10 years ago
parent 19f1308bf0
commit c33229fc0c

@ -1178,6 +1178,10 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
return extension != null && extension.length() > 0 && "cgi.exe".indexOf(extension.toLowerCase()) >= 0;
}
/**
 * Tells whether a file extension is classified as an image document type.
 *
 * The extension is lower-cased with a locale-independent mapping before it is
 * handed to Response.docTypeExt(), so the result does not depend on the JVM
 * default locale (under a Turkish locale "GIF" would otherwise become "gıf").
 *
 * @param extension the file extension to classify; may be null or empty
 * @return true if the extension is non-null, non-empty and
 *         Response.docTypeExt() reports Response.DT_IMAGE for its lower-case form
 * @deprecated use a method that considers the MIME type
 *             (e.g. Document.getContentDomain() == ContentDomain.IMAGE)
 */
@Deprecated
public static final boolean isImage(final String extension) {
    // Nothing to classify: treat a missing or empty extension as "not an image".
    if (extension == null || extension.length() == 0) {
        return false;
    }
    // Locale.ROOT keeps the case mapping stable regardless of the default locale.
    return Response.docTypeExt(extension.toLowerCase(java.util.Locale.ROOT)) == Response.DT_IMAGE;
}

@ -48,6 +48,7 @@ import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.AnchorURL;
@ -537,14 +538,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
LinkedHashMap<DigestURL,String> outboundLinks = document.outboundLinks();
Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size());
List<ImageEntry> images = new ArrayList<ImageEntry>();
int c = 0;
final Object parser = document.getParserObject();
boolean containsCanonical = false;
DigestURL canonical = null;
if (parser instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) parser;
images = html.getImages();
List<ImageEntry> images = html.getImages();
// header tags
int h = 0;
@ -913,7 +913,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
}
if ((allAttr || contains(CollectionSchema.images_text_t)) && MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(digestURL.getFileName()))) {
// handle image source meta data
if ((allAttr || contains(CollectionSchema.images_text_t)) && (document.getContentDomain() == ContentDomain.IMAGE)) {
add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data from the image parser
content = digestURL.toTokens(); // remove all other entry but the url tokens
}

Loading…
Cancel
Save