Merge branch 'master' of ssh://git@github.com/yacy/yacy_search_server.git

pull/12/head
luccioman 9 years ago
commit e0dda0c01c

@ -1178,6 +1178,10 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
return extension != null && extension.length() > 0 && "cgi.exe".indexOf(extension.toLowerCase()) >= 0;
}
/**
* @deprecated use a mimetype considering methode (e.g. Document.getContentDomain() == ContentDomain.IMAGE)
*/
@Deprecated
public static final boolean isImage(final String extension) {
return extension != null && extension.length() > 0 && Response.docTypeExt(extension.toLowerCase()) == Response.DT_IMAGE;
}

@ -1598,7 +1598,7 @@ public final class SearchEvent {
Collection<Object> altO = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName());
Collection<Object> imgO = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName());
if (imgO != null && imgO.size() > 0 && imgO instanceof List<?>) {
List<Object> alt = altO == null ? new ArrayList<Object>(imgO.size()) : (List<Object>) altO;
List<Object> alt = altO == null ? null : (List<Object>) altO;
List<Object> img = (List<Object>) imgO;
List<String> prt = CollectionConfiguration.indexedList2protocolList(doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName()), img.size());
Collection<Object> heightO = doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName());
@ -1608,17 +1608,21 @@ public final class SearchEvent {
for (int c = 0; c < img.size(); c++) {
String image_urlstub = (String) img.get(c);
if (image_urlstub.endsWith(".ico")) continue; // we don't want favicons, makes the result look idiotic
String image_alt = alt != null && alt.size() > c ? (String) alt.get(c) : "";
boolean match = (query.getQueryGoal().matches(image_urlstub) || query.getQueryGoal().matches(image_alt));
try {
int h = height == null ? 0 : (Integer) height.get(c);
int w = width == null ? 0 : (Integer) width.get(c);
// check size good for display (parser may init unknown dimension with -1)
if (h > 0 && h <= 16) continue; // to small for display
if (w > 0 && w <= 16) continue; // to small for display
DigestURL imageUrl = new DigestURL((prt != null && prt.size() > c ? prt.get(c) : "http") + "://" + image_urlstub);
Integer h = height == null ? null : (Integer) height.get(c);
Integer w = width == null ? null : (Integer) width.get(c);
boolean sizeok = h != null && w != null && h.intValue() > 16 && w.intValue() > 16;
String id = ASCII.String(imageUrl.hash());
if (!imageViewed.containsKey(id) && !containsSpare(id)) {
ImageResult imageResult = new ImageResult(doc.url(), imageUrl, "", image_alt, w == null ? 0 : w, h == null ? 0 : h, 0);
if (match || sizeok) imageSpareGood.put(id, imageResult); else imageSpareBad.put(id, imageResult);
String image_alt = (alt != null && alt.size() > c) ? (String) alt.get(c) : "";
ImageResult imageResult = new ImageResult(doc.url(), imageUrl, "", image_alt, w, h, 0);
boolean match = (query.getQueryGoal().matches(image_urlstub) || query.getQueryGoal().matches(image_alt));
if (match) imageSpareGood.put(id, imageResult); else imageSpareBad.put(id, imageResult);
}
} catch (MalformedURLException e) {
continue;

@ -48,6 +48,7 @@ import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.AnchorURL;
@ -537,14 +538,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
LinkedHashMap<DigestURL,String> outboundLinks = document.outboundLinks();
Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size());
List<ImageEntry> images = new ArrayList<ImageEntry>();
int c = 0;
final Object parser = document.getParserObject();
boolean containsCanonical = false;
DigestURL canonical = null;
if (parser instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) parser;
images = html.getImages();
List<ImageEntry> images = html.getImages();
// header tags
int h = 0;
@ -912,12 +912,27 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
!content.endsWith(" " + r)) content += " " + r;
}
}
if ((allAttr || contains(CollectionSchema.images_text_t)) && MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(digestURL.getFileName()))) {
add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data from the image parser
content = digestURL.toTokens(); // remove all other entry but the url tokens
// handle image source meta data
if (document.getContentDomain() == ContentDomain.IMAGE) {
// add image pixel size if known
Iterator<ImageEntry> imgit = document.getImages().values().iterator();
if (imgit.hasNext()) {
ImageEntry img = imgit.next();
int imgpixels = (img.height() < 0 || img.width() < 0) ? -1 : img.height() * img.width();
if (imgpixels > 0) {
if (allAttr || contains(CollectionSchema.images_height_val)) add(doc, CollectionSchema.images_height_val, img.height());
if (allAttr || contains(CollectionSchema.images_width_val)) add(doc, CollectionSchema.images_width_val, img.width());
if (allAttr || contains(CollectionSchema.images_pixel_val)) add(doc, CollectionSchema.images_pixel_val, imgpixels);
}
}
if (allAttr || contains(CollectionSchema.images_text_t)) {
add(doc, CollectionSchema.images_text_t, content); // the content may contain the exif data from the image parser
content = digestURL.toTokens(); // remove all other entry but the url tokens
}
}
// content (must be written after special parser data, since this can influence the content)
if (allAttr || contains(CollectionSchema.text_t)) add(doc, CollectionSchema.text_t, content);
if (allAttr || contains(CollectionSchema.wordcount_i)) {

Loading…
Cancel
Save