- also renamed the plasmaCrawlResults to get a consistent naming for url and image queues
- added a double-check for the images
- added additional queues for the images: all lower-quality images go there, so the queue can also be used if no sizes are given; no image is lost
- added a cleanup for the stacks so they cannot flood the memory

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4722 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
parent
d7e89c2aca
commit
1995faef8d
@ -1,101 +0,0 @@
|
||||
// collageQueue.java
|
||||
// (C) 2008 by Detlef Reichl; detlef!reichl()gmx!org and Michael Peter Christen; mc@yacy.net
|
||||
// first published 13.04.2008 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
|
||||
// $LastChangedRevision: 1986 $
|
||||
// $LastChangedBy: orbiter $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.data;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.concurrent.ConcurrentLinkedQueue;
|
||||
|
||||
import de.anomic.htmlFilter.htmlFilterImageEntry;
|
||||
import de.anomic.plasma.plasmaParserDocument;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
public class collageQueue {
|
||||
|
||||
private static final ConcurrentLinkedQueue<ImageOriginEntry> privateImageQueue = new ConcurrentLinkedQueue<ImageOriginEntry>();
|
||||
private static final ConcurrentLinkedQueue<ImageOriginEntry> publicImageQueue = new ConcurrentLinkedQueue<ImageOriginEntry>();
|
||||
|
||||
public static void registerImages(plasmaParserDocument document, boolean privateEntry) {
|
||||
if (document == null) return;
|
||||
if (document.dc_source() == null) return;
|
||||
|
||||
HashMap<String, htmlFilterImageEntry> images = document.getImages();
|
||||
for (htmlFilterImageEntry image: images.values()) {
|
||||
String name = image.url().getFile();
|
||||
|
||||
if (image.width() > 120 &&
|
||||
image.height() > 100 &&
|
||||
image.width() < 1200 &&
|
||||
image.height() < 1000 &&
|
||||
name.lastIndexOf(".gif") == -1) {
|
||||
// && ((urlString.lastIndexOf(".jpg") != -1)) ||
|
||||
// ((urlString.lastIndexOf(".png") != -1)){
|
||||
float ratio;
|
||||
if (image.width() > image.height()) {
|
||||
ratio = (float) image.width() / (float) image.height();
|
||||
} else {
|
||||
ratio = (float) image.height() / (float) image.width();
|
||||
}
|
||||
if (ratio >= 1.0f && ratio <= 2.0f) {
|
||||
if (privateEntry) {
|
||||
privateImageQueue.add(new ImageOriginEntry(image, document.dc_source()));
|
||||
} else {
|
||||
publicImageQueue.add(new ImageOriginEntry(image, document.dc_source()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static ImageOriginEntry next(boolean privateEntryOnly) {
|
||||
ImageOriginEntry e = null;
|
||||
if (privateEntryOnly) {
|
||||
e = privateImageQueue.poll();
|
||||
} else {
|
||||
e = publicImageQueue.poll();
|
||||
if (e == null) e = privateImageQueue.poll();
|
||||
}
|
||||
return e;
|
||||
}
|
||||
|
||||
public static int privateQueueSize() {
|
||||
return privateImageQueue.size();
|
||||
}
|
||||
|
||||
public static int publicQueueSize() {
|
||||
return publicImageQueue.size();
|
||||
}
|
||||
|
||||
public static class ImageOriginEntry {
|
||||
public htmlFilterImageEntry imageEntry;
|
||||
public yacyURL baseURL;
|
||||
public ImageOriginEntry(htmlFilterImageEntry imageEntry, yacyURL baseURL) {
|
||||
this.imageEntry = imageEntry;
|
||||
this.baseURL = baseURL;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,152 @@
|
||||
// plasmaCrawlResultImages.java
|
||||
// (C) 2008 by Detlef Reichl; detlef!reichl()gmx!org and Michael Peter Christen; mc@yacy.net
|
||||
// first published 13.04.2008 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
|
||||
// $LastChangedRevision: 1986 $
|
||||
// $LastChangedBy: orbiter $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.plasma;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ConcurrentLinkedQueue;
|
||||
|
||||
import de.anomic.htmlFilter.htmlFilterImageEntry;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
public class plasmaCrawlResultImages {
|
||||
|
||||
// we maintain two different queues for private and public crawls and divide both into two halves:
|
||||
// such images that appear to be good quality for a image monitor bacause their size is known, and other images
|
||||
// that are not declared with sizes.
|
||||
private static final ConcurrentLinkedQueue<OriginEntry> privateImageQueueHigh = new ConcurrentLinkedQueue<OriginEntry>();
|
||||
private static final ConcurrentLinkedQueue<OriginEntry> privateImageQueueLow = new ConcurrentLinkedQueue<OriginEntry>();
|
||||
private static final ConcurrentLinkedQueue<OriginEntry> publicImageQueueHigh = new ConcurrentLinkedQueue<OriginEntry>();
|
||||
private static final ConcurrentLinkedQueue<OriginEntry> publicImageQueueLow = new ConcurrentLinkedQueue<OriginEntry>();
|
||||
|
||||
// we also check all links for a double-check so we don't get the same image more than once in any queue
|
||||
// image links may appear double here even if the pages where the image links are embedded already are checked for double-occurrence:
|
||||
// the same images may be linked from different pages
|
||||
private static final ConcurrentHashMap<String, Long> doubleCheck = new ConcurrentHashMap<String, Long>(); // (url-hash, time) when the url appeared first
|
||||
|
||||
public static void registerImages(plasmaParserDocument document, boolean privateEntry) {
|
||||
if (document == null) return;
|
||||
if (document.dc_source() == null) return;
|
||||
|
||||
HashMap<String, htmlFilterImageEntry> images = document.getImages();
|
||||
for (htmlFilterImageEntry image: images.values()) {
|
||||
// do a double-check; attention: this can be time-consuming since this possibly needs a DNS-lookup
|
||||
if (doubleCheck.containsKey(image.url().hash())) continue;
|
||||
doubleCheck.put(image.url().hash(), System.currentTimeMillis());
|
||||
|
||||
String name = image.url().getFile();
|
||||
boolean good = false;
|
||||
if (image.width() > 120 &&
|
||||
image.height() > 100 &&
|
||||
image.width() < 1200 &&
|
||||
image.height() < 1000 &&
|
||||
name.lastIndexOf(".gif") == -1) {
|
||||
// && ((urlString.lastIndexOf(".jpg") != -1)) ||
|
||||
// ((urlString.lastIndexOf(".png") != -1)){
|
||||
|
||||
good = true;
|
||||
float ratio;
|
||||
if (image.width() > image.height()) {
|
||||
ratio = (float) image.width() / (float) image.height();
|
||||
} else {
|
||||
ratio = (float) image.height() / (float) image.width();
|
||||
}
|
||||
if (ratio < 1.0f || ratio > 2.0f) good = false;
|
||||
}
|
||||
if (good) {
|
||||
if (privateEntry) {
|
||||
privateImageQueueHigh.add(new OriginEntry(image, document.dc_source()));
|
||||
} else {
|
||||
publicImageQueueHigh.add(new OriginEntry(image, document.dc_source()));
|
||||
}
|
||||
} else {
|
||||
if (privateEntry) {
|
||||
privateImageQueueLow.add(new OriginEntry(image, document.dc_source()));
|
||||
} else {
|
||||
publicImageQueueLow.add(new OriginEntry(image, document.dc_source()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static OriginEntry next(boolean privateEntryOnly) {
|
||||
OriginEntry e = null;
|
||||
if (privateEntryOnly) {
|
||||
e = privateImageQueueHigh.poll();
|
||||
if (e == null) e = privateImageQueueLow.poll();
|
||||
} else {
|
||||
e = publicImageQueueHigh.poll();
|
||||
if (e == null) e = privateImageQueueHigh.poll();
|
||||
if (e == null) e = publicImageQueueLow.poll();
|
||||
if (e == null) e = privateImageQueueLow.poll();
|
||||
}
|
||||
return e;
|
||||
}
|
||||
|
||||
public static int queueSize(boolean privateEntryOnly) {
|
||||
if (privateEntryOnly) {
|
||||
return privateImageQueueHigh.size() + privateImageQueueLow.size();
|
||||
} else {
|
||||
return privateImageQueueHigh.size() + privateImageQueueLow.size() +
|
||||
publicImageQueueHigh.size() + publicImageQueueLow.size();
|
||||
}
|
||||
}
|
||||
|
||||
public static int privateQueueHighSize() {
|
||||
return privateImageQueueHigh.size();
|
||||
}
|
||||
|
||||
public static int privateQueueLowSize() {
|
||||
return privateImageQueueLow.size();
|
||||
}
|
||||
|
||||
public static int publicQueueHighSize() {
|
||||
return publicImageQueueHigh.size();
|
||||
}
|
||||
|
||||
public static int publicQueueLowSize() {
|
||||
return publicImageQueueLow.size();
|
||||
}
|
||||
|
||||
public static void clearQueues() {
|
||||
privateImageQueueHigh.clear();
|
||||
privateImageQueueLow.clear();
|
||||
publicImageQueueHigh.clear();
|
||||
publicImageQueueLow.clear();
|
||||
doubleCheck.clear();
|
||||
}
|
||||
|
||||
public static class OriginEntry {
|
||||
public htmlFilterImageEntry imageEntry;
|
||||
public yacyURL baseURL;
|
||||
public OriginEntry(htmlFilterImageEntry imageEntry, yacyURL baseURL) {
|
||||
this.imageEntry = imageEntry;
|
||||
this.baseURL = baseURL;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in new issue