enhanced image search: bugfixes and performance enhancements

pull/1/head
Michael Peter Christen 11 years ago
parent ebd0be2cea
commit 1735dbc9d9

@ -106,7 +106,8 @@ public class ViewImage {
if (image == null) {
byte[] resourceb = null;
if (url != null) try {
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
String agentName = post.get("agentName", auth ? ClientIdentification.yacyIntranetCrawlerAgentName : ClientIdentification.yacyInternetCrawlerAgentName);
ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH, agent);
} catch (final IOException e) {
ConcurrentLog.fine("ViewImage", "cannot load: " + e.getMessage());

@ -34,7 +34,6 @@ import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.feed.RSSMessage;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.RequestHeader.FileType;
@ -189,7 +188,7 @@ public class yacysearchitem {
String resultFileName = resultURL.getFileName();
prop.putHTML("content_target", target);
if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null, ClientIdentification.yacyIntranetCrawlerAgent);
//if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null, ClientIdentification.yacyIntranetCrawlerAgent);
prop.putHTML("content_faviconCode", URLLicense.aquireLicense(faviconURL)); // acquire license for favicon url loading
prop.put("content_urlhash", resulthashString);
prop.put("content_ranking", Float.toString(result.score()));
@ -275,8 +274,8 @@ public class yacysearchitem {
final String imageUrlstring = image.imageUrl.toNormalform(true);
final String target = sb.getConfig(imageUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self");
final String license = URLLicense.aquireLicense(image.imageUrl);
sb.loader.loadIfNotExistBackground(image.imageUrl, 1024 * 1024 * 10, null, ClientIdentification.yacyIntranetCrawlerAgent);
final String license = URLLicense.aquireLicense(image.imageUrl); // this is just the license key to get the image forwarded through the YaCy thumbnail viewer, not an actual lawful license
//sb.loader.loadIfNotExistBackground(image.imageUrl, 1024 * 1024 * 10, null, ClientIdentification.yacyIntranetCrawlerAgent);
prop.putHTML("content_item_hrefCache", "/ViewImage.png?maxwidth=128&maxheight=128&quadratic=&url=" + imageUrlstring);
prop.putHTML("content_item_href", imageUrlstring);
prop.putHTML("content_item_target", target);

@ -149,10 +149,13 @@ public final class LoaderDispatcher {
}
public Response load(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
Semaphore check = this.loaderSteering.get(request.url());
Semaphore check = this.loaderSteering.get(request.url());
if (check != null) {
// a loading process may be going on for that url
//ConcurrentLog.info("LoaderDispatcher", "waiting for " + request.url().toNormalform(true));
long t = System.currentTimeMillis();
try { check.tryAcquire(5, TimeUnit.SECONDS);} catch (final InterruptedException e) {}
ConcurrentLog.info("LoaderDispatcher", "waited " + (System.currentTimeMillis() - t) + " ms for " + request.url().toNormalform(true));
// now the process may have terminated and we run a normal loading
// which may be successful faster because of a cache hit
}
@ -163,13 +166,12 @@ public final class LoaderDispatcher {
check = this.loaderSteering.remove(request.url());
if (check != null) check.release(1000);
return response;
} catch (final IOException e) {
} catch (final Throwable e) {
throw new IOException(e);
} finally {
// release the semaphore anyway
check = this.loaderSteering.remove(request.url());
if (check != null) check.release(1000);
// Very noisy: ConcurrentLog.logException(e);
if (check != null) check.release(1000);
}
}

@ -94,6 +94,7 @@ import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.Segment;
import net.yacy.search.ranking.ReferenceOrder;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.snippet.ResultEntry;
import net.yacy.search.snippet.TextSnippet;
@ -185,8 +186,11 @@ public final class SearchEvent {
public final AtomicInteger remote_solr_peerCount;// the number of peers which contributed to the remote search result
/**
 * Reports how many results this search event can offer.
 *
 * The value is the larger of two figures: the sum of the local/remote RWI and
 * Solr counters, and the number of image results already handed out plus those
 * still waiting in the spare queues. Taking the maximum keeps the count from
 * dropping below the number of images that are actually available.
 *
 * @return the larger of the index-based count and the image-based count
 */
public int getResultCount() {
    final int indexCount =
            this.local_rwi_available.get() + this.remote_rwi_available.get() +
            this.remote_solr_available.get() + this.local_solr_stored.get();
    final int imageCount = imageViewed.size() + sizeSpare();
    return Math.max(indexCount, imageCount);
}
protected SearchEvent(
@ -1474,56 +1478,89 @@ public final class SearchEvent {
return null;
}
// index of the next text search result to fetch via oneResult(); kept separate from the
// requested image index because a single result page can contribute several spare images
private int imagePageCounter = 0;
// image results that were already handed out, keyed by hash id, in delivery order (nthImage reads from here)
private LinkedHashMap<String, ImageResult> imageViewed = new LinkedHashMap<String, ImageResult>();
// single spare queue used by the statements that do not distinguish good from bad candidates
// NOTE(review): looks superseded by imageSpareGood/imageSpareBad - confirm before relying on it
private LinkedHashMap<String, ImageResult> imageSpare = new LinkedHashMap<String, ImageResult>();
// candidates delivered first: documents that are images themselves, and embedded images
// whose url/alt text matches the query or whose width and height both exceed 16
private LinkedHashMap<String, ImageResult> imageSpareGood = new LinkedHashMap<String, ImageResult>();
// remaining embedded images (no query match and no sufficient size); only used once the good queue is empty
private LinkedHashMap<String, ImageResult> imageSpareBad = new LinkedHashMap<String, ImageResult>();
/**
 * Looks up an already delivered image by its position in delivery order.
 *
 * @param item zero-based position within {@code imageViewed}
 * @return the image at that position, or {@code null} if {@code SetTools.nth} yields none
 */
private ImageResult nthImage(int item) {
    final Object picked = SetTools.nth(this.imageViewed.values(), item);
    return picked == null ? null : (ImageResult) picked;
}
/**
 * @return {@code true} if at least one undelivered image waits in the good or the bad spare queue
 */
private boolean hasSpare() {
    if (!imageSpareGood.isEmpty()) {
        return true;
    }
    return !imageSpareBad.isEmpty();
}
/**
 * @param id hash id of an image result
 * @return {@code true} if that id is already queued in the good or the bad spare queue
 */
private boolean containsSpare(String id) {
    if (imageSpareGood.containsKey(id)) {
        return true;
    }
    return imageSpareBad.containsKey(id);
}
/**
 * @return the combined number of entries in the good and the bad spare queue
 */
private int sizeSpare() {
    final int good = imageSpareGood.size();
    final int bad = imageSpareBad.size();
    return good + bad;
}
/**
 * Moves the oldest spare image into {@code imageViewed} and returns it.
 *
 * Entries of {@code imageSpareGood} are always served before any entry of
 * {@code imageSpareBad}. The legacy statements that pulled from the single
 * {@code imageSpare} map and returned unconditionally were dropped: they made
 * the good/bad selection unreachable and failed on an empty map.
 *
 * @return the image that was moved, or {@code null} when both spare queues are empty
 */
private ImageResult nextSpare() {
    // good candidates first; fall back to the bad queue only when no good one is left
    final LinkedHashMap<String, ImageResult> source = imageSpareGood.size() > 0 ? imageSpareGood : imageSpareBad;
    if (source.size() == 0) return null;
    final Map.Entry<String, ImageResult> next = source.entrySet().iterator().next();
    final ImageResult result = next.getValue();
    imageViewed.put(next.getKey(), result);
    source.remove(next.getKey());
    return result;
}
public ImageResult oneImageResult(final int item, final long timeout) throws MalformedURLException {
if (item < imageViewed.size()) return nthImage(item);
if (imageSpare.size() > 0) return nextSpare();
ResultEntry ms = oneResult(item, timeout);
if (imageSpareGood.size() > 0) return nextSpare(); // first put out all good spare, but no bad spare
ResultEntry ms = oneResult(imagePageCounter++, timeout); // we must use a different counter here because the image counter can be higher when one page filled up several spare
// check if the match was made in the url or in the image links
if (ms == null) throw new MalformedURLException("no image url found");
if (ms == null) {
if (hasSpare()) return nextSpare();
throw new MalformedURLException("no image url found");
}
// try to get more
SolrDocument doc = ms.getNode();
// there can be two different kinds of image hits: either the document itself is an image or images are embedded in the links of text documents.
String mime = (String) doc.getFirstValue(CollectionSchema.content_type.getSolrFieldName());
if (Response.docType(ms.url()) == Response.DT_IMAGE || Response.docType(mime) == Response.DT_IMAGE) {
boolean fakeImageHost = ms.url().getHost() != null && ms.url().getHost().indexOf("wikipedia") > 0; // pages with image extension from wikipedia do not contain image files but html files... I know this is a bad hack, but many results come from wikipedia and we must handle that
if (!fakeImageHost && (Response.docType(ms.url()) == Response.DT_IMAGE || Response.docType(mime) == Response.DT_IMAGE)) {
String id = ASCII.String(ms.hash());
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0));
if (!imageViewed.containsKey(id) && !containsSpare(id)) imageSpareGood.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0));
} else {
Collection<Object> alt = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName());
Collection<Object> img = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName());
Collection<Object> prt = doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName());
if (img != null && img.size() > 0) {
Collection<Object> altO = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName());
Collection<Object> imgO = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName());
if (imgO != null && imgO.size() > 0 && imgO instanceof List<?>) {
List<Object> alt = altO == null ? new ArrayList<Object>(imgO.size()) : (List<Object>) altO;
List<Object> img = (List<Object>) imgO;
List<String> prt = CollectionConfiguration.indexedList2protocolList(doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName()), img.size());
Collection<Object> heightO = doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName());
Collection<Object> widthO = doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName());
List<Object> height = heightO == null ? new ArrayList<Object>(heightO.size()) : (List<Object>) heightO;
List<Object> width = widthO == null ? new ArrayList<Object>(widthO.size()) : (List<Object>) widthO;
for (int c = 0; c < img.size(); c++) {
String image_urlstub = (String) SetTools.nth(img, c);
String image_alt = alt != null && alt.size() > c ? (String) SetTools.nth(alt, c) : "";
if (query.getQueryGoal().matches(image_urlstub) || query.getQueryGoal().matches(image_alt)) {
try {
DigestURL imageUrl = new DigestURL((prt != null && prt.size() > c ? SetTools.nth(prt, c) : "http") + "://" + image_urlstub);
Object heightO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName()), c);
Object widthO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName()), c);
String id = ASCII.String(imageUrl.hash());
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", image_alt, widthO == null ? 0 : (Integer) widthO, heightO == null ? 0 : (Integer) heightO, 0));
} catch (MalformedURLException e) {
continue;
String image_urlstub = (String) img.get(c);
String image_alt = alt != null && alt.size() > c ? (String) alt.get(c) : "";
boolean match = (query.getQueryGoal().matches(image_urlstub) || query.getQueryGoal().matches(image_alt));
try {
DigestURL imageUrl = new DigestURL((prt != null && prt.size() > c ? prt.get(c) : "http") + "://" + image_urlstub);
Integer h = (Integer) height.get(c);
Integer w = (Integer) width.get(c);
boolean sizeok = h != null && w != null && h.intValue() > 16 && w.intValue() > 16;
String id = ASCII.String(imageUrl.hash());
if (!imageViewed.containsKey(id) && !containsSpare(id)) {
ImageResult imageResult = new ImageResult(ms.url(), imageUrl, "", image_alt, w == null ? 0 : w, h == null ? 0 : h, 0);
if (match || sizeok) imageSpareGood.put(id, imageResult); else imageSpareBad.put(id, imageResult);
}
} catch (MalformedURLException e) {
continue;
}
}
}
}
if (imageSpare.size() > 0) return nextSpare();
if (hasSpare()) return nextSpare();
throw new MalformedURLException("no image url found");
}

Loading…
Cancel
Save