bugfix for image search

pull/1/head
Michael Peter Christen 11 years ago
parent c7995d3e2a
commit b893c42a0f

@ -58,6 +58,7 @@ import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.parser.html.CharacterCoding;
/**
@ -1040,7 +1041,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
}
/**
 * Tells whether a file extension denotes an image.
 * @param extension the file extension to test; null and the empty string yield false
 * @return true if the lower-cased extension is recognized as an image extension
 */
public static final boolean isImage(final String extension) {
// NOTE(review): two return statements follow each other here; this looks like the removed and the
// added line of a diff rendered together - confirm which one is live, the second is unreachable as written.
// Variant 1: substring search inside a dot-joined list, so fragments such as "g" or "if" match as well.
return extension != null && extension.length() > 0 && "png.gif.jpg.jpeg.tif.tiff.ico".indexOf(extension.toLowerCase()) >= 0;
// Variant 2: the lower-cased extension is classified by Response.docTypeExt and compared with Response.DT_IMAGE.
return extension != null && extension.length() > 0 && Response.docTypeExt(extension.toLowerCase()) == Response.DT_IMAGE;
}
public final boolean isIndividual() {

@ -69,35 +69,64 @@ public class Response {
private int status; // tracker indexing status, see status defs below
private final boolean fromCache;
// doctype calculation
/**
 * doctype calculation by file extension
 * The extension is expected without a leading dot (e.g. "gif", not ".gif") and is
 * matched case-insensitively, so "JPG" and "jpg" are classified alike.
 * TODO: this must be enhanced with a more generic way of configuration
 * @param ext the file extension; may be null
 * @return a character denoting the file type; DT_UNKNOWN if ext is null or not recognized
 */
public static char docTypeExt(final String ext) {
    if (ext == null) return DT_UNKNOWN;
    if (extensionIsOneOf(ext, "gif", "ico", "bmp", "jpg", "jpeg", "png", "tif", "tiff")) return DT_IMAGE;
    if (extensionIsOneOf(ext, "htm", "html")) return DT_HTML;
    if (extensionIsOneOf(ext, "txt")) return DT_TEXT;
    if (extensionIsOneOf(ext, "doc", "rtf")) return DT_DOC;
    if (extensionIsOneOf(ext, "pdf", "ps")) return DT_PDFPS;
    if (extensionIsOneOf(ext, "mp3", "aac", "m4a", "ogg", "wav", "wma")) return DT_AUDIO;
    if (extensionIsOneOf(ext, "avi", "mov", "qt", "mpg", "mp4", "m4v", "mkv", "mpeg")) return DT_MOVIE;
    if (extensionIsOneOf(ext, "md5")) return DT_SHARE;
    if (extensionIsOneOf(ext, "asf")) return DT_FLASH;
    return DT_UNKNOWN;
}

/**
 * checks an extension against a list of known extensions, ignoring case
 * @param ext the extension to look up; must not be null
 * @param candidates the known extensions of one doctype
 * @return true if ext equals one of the candidates, ignoring case
 */
private static boolean extensionIsOneOf(final String ext, final String... candidates) {
    for (final String candidate : candidates) {
        // equalsIgnoreCase compares per character and does not depend on the default locale
        if (candidate.equalsIgnoreCase(ext)) {
            return true;
        }
    }
    return false;
}
/**
 * doctype calculation based on file extensions; this is the url wrapper:
 * the extension is taken from the file name of the url via MultiProtocolURL.getFileExtension
 * @param url the url whose file name supplies the extension
 * @return a character denoting the file type
 */
public static char docType(final MultiProtocolURL url) {
String ext = MultiProtocolURL.getFileExtension(url.getFileName());
// NOTE(review): everything from here down to "return DT_UNKNOWN;" compares against extensions WITH a
// leading dot and is followed by a second, unreachable return; this looks like the removed side of a diff
// rendered together with the added line - confirm which part is live.
// The dotted list also lacks the audio extensions and mp4/m4v/mkv that docTypeExt knows.
if (ext == null) return DT_UNKNOWN;
if (ext.equals(".gif")) return DT_IMAGE;
if (ext.equals(".ico")) return DT_IMAGE;
if (ext.equals(".bmp")) return DT_IMAGE;
if (ext.equals(".jpg")) return DT_IMAGE;
if (ext.equals(".jpeg")) return DT_IMAGE;
if (ext.equals(".png")) return DT_IMAGE;
if (ext.equals(".tif")) return DT_IMAGE;
if (ext.equals(".tiff")) return DT_IMAGE;
if (ext.equals(".htm")) return DT_HTML;
if (ext.equals(".html")) return DT_HTML;
if (ext.equals(".txt")) return DT_TEXT;
if (ext.equals(".doc")) return DT_DOC;
if (ext.equals(".rtf")) return DT_DOC;
if (ext.equals(".pdf")) return DT_PDFPS;
if (ext.equals(".ps")) return DT_PDFPS;
if (ext.equals(".avi")) return DT_MOVIE;
if (ext.equals(".mov")) return DT_MOVIE;
if (ext.equals(".qt")) return DT_MOVIE;
if (ext.equals(".mpg")) return DT_MOVIE;
if (ext.equals(".md5")) return DT_SHARE;
if (ext.equals(".mpeg")) return DT_MOVIE;
if (ext.equals(".asf")) return DT_FLASH;
return DT_UNKNOWN;
// delegation to the extension-based lookup, which compares against extensions without a leading dot
return docTypeExt(ext);
}
/**
* doctype calculation based on the mime type
* @param mime
* @return a character denoting the file type
*/
public static char docType(final String mime) {
// serverLog.logFinest("PLASMA", "docType mime=" + mime);
char doctype = DT_UNKNOWN;
@ -120,6 +149,12 @@ public class Response {
return doctype;
}
/**
* reverse mime type calculation; this is just a heuristic
* @param ext
* @param doctype
* @return a mime type string
*/
public static String[] doctype2mime(String ext, char doctype) {
if (doctype == DT_PDFPS) return new String[]{"application/pdf"};
if (doctype == DT_HTML) return new String[]{"text/html"};

@ -367,7 +367,9 @@ public class QueryGoal {
// combine these queries for all relevant fields
q.append(" AND (");
q.append('(').append(CollectionSchema.images_text_t.getSolrFieldName()).append(':').append(w).append("^10.0) OR ");
q.append('(').append(CollectionSchema.images_text_t.getSolrFieldName()).append(':').append(w).append("^100.0) OR ");
q.append('(').append(CollectionSchema.title.getSolrFieldName()).append(':').append(w).append("^50.0) OR ");
q.append('(').append(CollectionSchema.keywords.getSolrFieldName()).append(':').append(w).append("^10.0) OR ");
q.append('(').append(CollectionSchema.text_t.getSolrFieldName()).append(':').append(w).append(')');
q.append(')');

@ -65,6 +65,7 @@ import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.Response;
import net.yacy.data.WorkTables;
import net.yacy.document.Condenser;
import net.yacy.document.LargeNumberCache;
@ -1467,42 +1468,38 @@ public final class SearchEvent {
public ImageResult oneImageResult(final int item, final long timeout) throws MalformedURLException {
if (item < imageViewed.size()) return nthImage(item);
if (imageSpare.size() > 0) return nextSpare();
ResultEntry ms = oneResult(item, timeout);
// check if the match was made in the url or in the image links
if (ms != null) {
SolrDocument doc = ms.getNode();
if (ms == null) throw new MalformedURLException("no image url found");
// try to get more
SolrDocument doc = ms.getNode();
// there can be two different kinds of image hits: either the document itself is an image or images are embedded in the links of text documents.
String mime = (String) doc.getFirstValue(CollectionSchema.content_type.getSolrFieldName());
if (Response.docType(ms.url()) == Response.DT_IMAGE || Response.docType(mime) == Response.DT_IMAGE) {
String id = ASCII.String(ms.hash());
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0));
} else {
Collection<Object> alt = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName());
Collection<Object> img = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName());
Collection<Object> prt = doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName());
if (img != null) {
int c = 0;
for (Object i: img) {
String a = alt != null && alt.size() > c ? (String) SetTools.nth(alt, c) : "";
if (query.getQueryGoal().matches((String) i) || query.getQueryGoal().matches(a)) {
if (img != null && img.size() > 0) {
for (int c = 0; c < img.size(); c++) {
String image_urlstub = (String) SetTools.nth(img, c);
String image_alt = alt != null && alt.size() > c ? (String) SetTools.nth(alt, c) : "";
if (query.getQueryGoal().matches(image_urlstub) || query.getQueryGoal().matches(image_alt)) {
try {
DigestURL imageUrl = new DigestURL((prt != null && prt.size() > c ? SetTools.nth(prt, c) : "http") + "://" + i);
DigestURL imageUrl = new DigestURL((prt != null && prt.size() > c ? SetTools.nth(prt, c) : "http") + "://" + image_urlstub);
Object heightO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName()), c);
Object widthO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName()), c);
String id = ASCII.String(imageUrl.hash());
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", a, widthO == null ? 0 : (Integer) widthO, heightO == null ? 0 : (Integer) heightO, 0));
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", image_alt, widthO == null ? 0 : (Integer) widthO, heightO == null ? 0 : (Integer) heightO, 0));
} catch (MalformedURLException e) {
continue;
}
}
c++;
}
}
if (MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(ms.url().getFileName()))) {
String id = ASCII.String(ms.hash());
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0));
}
if (img != null && img.size() > 0) {
DigestURL imageUrl = new DigestURL((prt != null && prt.size() > 0 ? SetTools.nth(prt, 0) : "http") + "://" + SetTools.nth(img, 0));
String imagetext = alt != null && alt.size() > 0 ? (String) SetTools.nth(alt, 0) : "";
String id = ASCII.String(imageUrl.hash());
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", imagetext, 0, 0, 0));
}
}
if (imageSpare.size() > 0) return nextSpare();
throw new MalformedURLException("no image url found");

Loading…
Cancel
Save