You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
153 lines
6.3 KiB
153 lines
6.3 KiB
// plasmaCrawlResultImages.java
|
|
// (C) 2008 by Detlef Reichl; detlef!reichl()gmx!org and Michael Peter Christen; mc@yacy.net
|
|
// first published 13.04.2008 on http://yacy.net
|
|
//
|
|
// This is a part of YaCy, a peer-to-peer based web search engine
|
|
//
|
|
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
|
|
// $LastChangedRevision: 1986 $
|
|
// $LastChangedBy: orbiter $
|
|
//
|
|
// LICENSE
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
package de.anomic.crawler;
|
|
|
|
import java.util.HashMap;
|
|
import java.util.concurrent.ConcurrentHashMap;
|
|
import java.util.concurrent.ConcurrentLinkedQueue;
|
|
|
|
import de.anomic.document.Document;
|
|
import de.anomic.document.parser.html.ImageEntry;
|
|
import de.anomic.yacy.yacyURL;
|
|
|
|
public class ResultImages {
|
|
|
|
// we maintain two different queues for private and public crawls and divide both into two halves:
|
|
// such images that appear to be good quality for a image monitor bacause their size is known, and other images
|
|
// that are not declared with sizes.
|
|
private static final ConcurrentLinkedQueue<OriginEntry> privateImageQueueHigh = new ConcurrentLinkedQueue<OriginEntry>();
|
|
private static final ConcurrentLinkedQueue<OriginEntry> privateImageQueueLow = new ConcurrentLinkedQueue<OriginEntry>();
|
|
private static final ConcurrentLinkedQueue<OriginEntry> publicImageQueueHigh = new ConcurrentLinkedQueue<OriginEntry>();
|
|
private static final ConcurrentLinkedQueue<OriginEntry> publicImageQueueLow = new ConcurrentLinkedQueue<OriginEntry>();
|
|
|
|
// we also check all links for a double-check so we don't get the same image more than once in any queue
|
|
// image links may appear double here even if the pages where the image links are embedded already are checked for double-occurrence:
|
|
// the same images may be linked from different pages
|
|
private static final ConcurrentHashMap<String, Long> doubleCheck = new ConcurrentHashMap<String, Long>(); // (url-hash, time) when the url appeared first
|
|
|
|
public static void registerImages(final Document document, final boolean privateEntry) {
|
|
if (document == null) return;
|
|
if (document.dc_source() == null) return;
|
|
|
|
final HashMap<String, ImageEntry> images = document.getImages();
|
|
for (final ImageEntry image: images.values()) {
|
|
// do a double-check; attention: this can be time-consuming since this possibly needs a DNS-lookup
|
|
if (doubleCheck.containsKey(image.url().hash())) continue;
|
|
doubleCheck.put(image.url().hash(), System.currentTimeMillis());
|
|
|
|
final String name = image.url().getFile();
|
|
boolean good = false;
|
|
if (image.width() > 120 &&
|
|
image.height() > 100 &&
|
|
image.width() < 1200 &&
|
|
image.height() < 1000 &&
|
|
name.lastIndexOf(".gif") == -1) {
|
|
// && ((urlString.lastIndexOf(".jpg") != -1)) ||
|
|
// ((urlString.lastIndexOf(".png") != -1)){
|
|
|
|
good = true;
|
|
float ratio;
|
|
if (image.width() > image.height()) {
|
|
ratio = (float) image.width() / (float) image.height();
|
|
} else {
|
|
ratio = (float) image.height() / (float) image.width();
|
|
}
|
|
if (ratio < 1.0f || ratio > 2.0f) good = false;
|
|
}
|
|
if (good) {
|
|
if (privateEntry) {
|
|
privateImageQueueHigh.add(new OriginEntry(image, document.dc_source()));
|
|
} else {
|
|
publicImageQueueHigh.add(new OriginEntry(image, document.dc_source()));
|
|
}
|
|
} else {
|
|
if (privateEntry) {
|
|
privateImageQueueLow.add(new OriginEntry(image, document.dc_source()));
|
|
} else {
|
|
publicImageQueueLow.add(new OriginEntry(image, document.dc_source()));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
public static OriginEntry next(final boolean privateEntryOnly) {
|
|
OriginEntry e = null;
|
|
if (privateEntryOnly) {
|
|
e = privateImageQueueHigh.poll();
|
|
if (e == null) e = privateImageQueueLow.poll();
|
|
} else {
|
|
e = publicImageQueueHigh.poll();
|
|
if (e == null) e = privateImageQueueHigh.poll();
|
|
if (e == null) e = publicImageQueueLow.poll();
|
|
if (e == null) e = privateImageQueueLow.poll();
|
|
}
|
|
return e;
|
|
}
|
|
|
|
public static int queueSize(final boolean privateEntryOnly) {
|
|
int publicSize = 0;
|
|
if (!privateEntryOnly) {
|
|
publicSize = publicImageQueueHigh.size() + publicImageQueueLow.size();
|
|
}
|
|
return privateImageQueueHigh.size() + privateImageQueueLow.size() + publicSize;
|
|
}
|
|
|
|
public static int privateQueueHighSize() {
|
|
return privateImageQueueHigh.size();
|
|
}
|
|
|
|
public static int privateQueueLowSize() {
|
|
return privateImageQueueLow.size();
|
|
}
|
|
|
|
public static int publicQueueHighSize() {
|
|
return publicImageQueueHigh.size();
|
|
}
|
|
|
|
public static int publicQueueLowSize() {
|
|
return publicImageQueueLow.size();
|
|
}
|
|
|
|
public static void clearQueues() {
|
|
privateImageQueueHigh.clear();
|
|
privateImageQueueLow.clear();
|
|
publicImageQueueHigh.clear();
|
|
publicImageQueueLow.clear();
|
|
doubleCheck.clear();
|
|
}
|
|
|
|
public static class OriginEntry {
|
|
public ImageEntry imageEntry;
|
|
public yacyURL baseURL;
|
|
public OriginEntry(final ImageEntry imageEntry, final yacyURL baseURL) {
|
|
this.imageEntry = imageEntry;
|
|
this.baseURL = baseURL;
|
|
}
|
|
}
|
|
|
|
}
|