diff --git a/htroot/Collage.html b/htroot/Collage.html
index 753c91f26..61d2b1821 100755
--- a/htroot/Collage.html
+++ b/htroot/Collage.html
@@ -15,7 +15,8 @@ form dd {
width: 130px;
text-align:right;
}
-
+
+
#%env/templates/header.template%#
diff --git a/htroot/Collage.java b/htroot/Collage.java
index e27fefba7..f7a50a829 100755
--- a/htroot/Collage.java
+++ b/htroot/Collage.java
@@ -39,8 +39,8 @@
import java.util.Random;
-import de.anomic.data.collageQueue;
import de.anomic.http.httpHeader;
+import de.anomic.plasma.plasmaCrawlResultImages;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -54,7 +54,7 @@ public class Collage {
private static int fifoSize = 0;
private static long zIndex = 0;
- private static collageQueue.ImageOriginEntry origins[] = new collageQueue.ImageOriginEntry[fifoMax];
+ private static plasmaCrawlResultImages.OriginEntry origins[] = new plasmaCrawlResultImages.OriginEntry[fifoMax];
private static Integer imgWidth[] = new Integer[fifoMax];
private static Integer imgHeight[] = new Integer[fifoMax];
private static Integer imgPosX[] = new Integer[fifoMax];
@@ -66,7 +66,7 @@ public class Collage {
final serverObjects prop = new serverObjects();
final plasmaSwitchboard sb = (plasmaSwitchboard) env;
final boolean authenticated = sb.adminAuthenticated(header) >= 2;
- collageQueue.ImageOriginEntry nextOrigin = collageQueue.next(!authenticated);
+ plasmaCrawlResultImages.OriginEntry nextOrigin = plasmaCrawlResultImages.next(!authenticated);
if (nextOrigin != null) {
if (fifoSize == 0 || origins[fifoPos] != nextOrigin) {
@@ -86,7 +86,6 @@ public class Collage {
}
}
-
if (fifoSize > 0) {
prop.put("imgurl", "1");
@@ -105,8 +104,9 @@ public class Collage {
prop.put("imgurl", "0");
}
- prop.putNum("privateQueueSize", collageQueue.privateQueueSize());
- prop.putNum("publicQueueSize", collageQueue.publicQueueSize());
+ prop.putNum("refresh", Math.max(2, Math.min(5, 500 / (1 + plasmaCrawlResultImages.queueSize(!authenticated)))));
+ prop.put("privateQueueSize", plasmaCrawlResultImages.privateQueueHighSize() + "+" + plasmaCrawlResultImages.privateQueueLowSize());
+ prop.put("publicQueueSize", plasmaCrawlResultImages.publicQueueHighSize() + "+" + plasmaCrawlResultImages.publicQueueLowSize());
return prop;
}
}
\ No newline at end of file
diff --git a/htroot/xml/util/getpageinfo_p.java b/htroot/xml/util/getpageinfo_p.java
index b20d10f02..6b9e9c675 100644
--- a/htroot/xml/util/getpageinfo_p.java
+++ b/htroot/xml/util/getpageinfo_p.java
@@ -82,7 +82,9 @@ public class getpageinfo_p {
if (actions.indexOf("title")>=0) {
try {
yacyURL u = new yacyURL(url, null);
- String contentString=new String(HttpClient.wget(u.toString()));
+ byte[] r = HttpClient.wget(u.toString());
+ if (r == null) return prop;
+ String contentString=new String(r);
htmlFilterContentScraper scraper = new htmlFilterContentScraper(u);
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
diff --git a/source/de/anomic/data/collageQueue.java b/source/de/anomic/data/collageQueue.java
deleted file mode 100755
index dc6369cf7..000000000
--- a/source/de/anomic/data/collageQueue.java
+++ /dev/null
@@ -1,101 +0,0 @@
-// collageQueue.java
-// (C) 2008 by by Detlef Reichl; detlef!reichl()gmx!org and Michael Peter Christen; mc@yacy.net
-// first published 13.04.2008 on http://yacy.net
-//
-// This is a part of YaCy, a peer-to-peer based web search engine
-//
-// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
-// $LastChangedRevision: 1986 $
-// $LastChangedBy: orbiter $
-//
-// LICENSE
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-package de.anomic.data;
-
-import java.util.HashMap;
-import java.util.concurrent.ConcurrentLinkedQueue;
-
-import de.anomic.htmlFilter.htmlFilterImageEntry;
-import de.anomic.plasma.plasmaParserDocument;
-import de.anomic.yacy.yacyURL;
-
-public class collageQueue {
-
- private static final ConcurrentLinkedQueue privateImageQueue = new ConcurrentLinkedQueue();
- private static final ConcurrentLinkedQueue publicImageQueue = new ConcurrentLinkedQueue();
-
- public static void registerImages(plasmaParserDocument document, boolean privateEntry) {
- if (document == null) return;
- if (document.dc_source() == null) return;
-
- HashMap images = document.getImages();
- for (htmlFilterImageEntry image: images.values()) {
- String name = image.url().getFile();
-
- if (image.width() > 120 &&
- image.height() > 100 &&
- image.width() < 1200 &&
- image.height() < 1000 &&
- name.lastIndexOf(".gif") == -1) {
- // && ((urlString.lastIndexOf(".jpg") != -1)) ||
- // ((urlString.lastIndexOf(".png") != -1)){
- float ratio;
- if (image.width() > image.height()) {
- ratio = (float) image.width() / (float) image.height();
- } else {
- ratio = (float) image.height() / (float) image.width();
- }
- if (ratio >= 1.0f && ratio <= 2.0f) {
- if (privateEntry) {
- privateImageQueue.add(new ImageOriginEntry(image, document.dc_source()));
- } else {
- publicImageQueue.add(new ImageOriginEntry(image, document.dc_source()));
- }
- }
- }
- }
- }
-
- public static ImageOriginEntry next(boolean privateEntryOnly) {
- ImageOriginEntry e = null;
- if (privateEntryOnly) {
- e = privateImageQueue.poll();
- } else {
- e = publicImageQueue.poll();
- if (e == null) e = privateImageQueue.poll();
- }
- return e;
- }
-
- public static int privateQueueSize() {
- return privateImageQueue.size();
- }
-
- public static int publicQueueSize() {
- return publicImageQueue.size();
- }
-
- public static class ImageOriginEntry {
- public htmlFilterImageEntry imageEntry;
- public yacyURL baseURL;
- public ImageOriginEntry(htmlFilterImageEntry imageEntry, yacyURL baseURL) {
- this.imageEntry = imageEntry;
- this.baseURL = baseURL;
- }
- }
-
-}
diff --git a/source/de/anomic/plasma/plasmaCrawlResultImages.java b/source/de/anomic/plasma/plasmaCrawlResultImages.java
new file mode 100755
index 000000000..6a54e8ed7
--- /dev/null
+++ b/source/de/anomic/plasma/plasmaCrawlResultImages.java
@@ -0,0 +1,152 @@
+// plasmaCrawlResultImages.java
+// (C) 2008 by Detlef Reichl; detlef!reichl()gmx!org and Michael Peter Christen; mc@yacy.net
+// first published 13.04.2008 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
+// $LastChangedRevision: 1986 $
+// $LastChangedBy: orbiter $
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package de.anomic.plasma;
+
+import java.util.HashMap;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentLinkedQueue;
+
+import de.anomic.htmlFilter.htmlFilterImageEntry;
+import de.anomic.yacy.yacyURL;
+
+public class plasmaCrawlResultImages {
+
+ // we maintain two different queues for private and public crawls and divide both into two halves:
+ // such images that appear to be good quality for an image monitor because their size is known, and other images
+ // that are not declared with sizes.
+ private static final ConcurrentLinkedQueue privateImageQueueHigh = new ConcurrentLinkedQueue();
+ private static final ConcurrentLinkedQueue privateImageQueueLow = new ConcurrentLinkedQueue();
+ private static final ConcurrentLinkedQueue publicImageQueueHigh = new ConcurrentLinkedQueue();
+ private static final ConcurrentLinkedQueue publicImageQueueLow = new ConcurrentLinkedQueue();
+
+ // we also check all links for a double-check so we don't get the same image more than once in any queue
+ // image links may appear double here even if the pages where the image links are embedded already are checked for double-occurrence:
+ // the same images may be linked from different pages
+ private static final ConcurrentHashMap doubleCheck = new ConcurrentHashMap(); // (url-hash, time) when the url appeared first
+
+    /**
+     * Registers the images of a parsed document in the crawl result image queues.
+     * Every image url is accepted only once; images with a declared size of 121-1199 x 101-999 pixels,
+     * a side ratio of at most 2:1 and no ".gif" in the file name go to a "high" queue, all others to a "low" queue.
+     *
+     * @param document the parsed document; nothing is registered if it is null or has no source url
+     * @param privateEntry true to fill the private queues, false to fill the public queues
+     */
+    public static void registerImages(plasmaParserDocument document, boolean privateEntry) {
+        if (document == null) return;
+        if (document.dc_source() == null) return;
+
+        HashMap images = document.getImages();
+        for (htmlFilterImageEntry image: images.values()) {
+            // double-check: putIfAbsent tests and records the url hash in one atomic step, so the same image
+            // cannot be enqueued twice by concurrent callers (a separate containsKey/put pair would leave a gap);
+            // attention: computing the hash can be time-consuming since this possibly needs a DNS-lookup
+            if (doubleCheck.putIfAbsent(image.url().hash(), System.currentTimeMillis()) != null) continue;
+
+            String name = image.url().getFile();
+            boolean good = false;
+            if (image.width() > 120 &&
+                image.height() > 100 &&
+                image.width() < 1200 &&
+                image.height() < 1000 &&
+                name.lastIndexOf(".gif") == -1) {
+                // the size is in range: the image is good unless it is too elongated
+                float ratio;
+                if (image.width() > image.height()) {
+                    ratio = (float) image.width() / (float) image.height();
+                } else {
+                    ratio = (float) image.height() / (float) image.width();
+                }
+                good = ratio >= 1.0f && ratio <= 2.0f;
+            }
+
+            OriginEntry entry = new OriginEntry(image, document.dc_source());
+            if (good) {
+                (privateEntry ? privateImageQueueHigh : publicImageQueueHigh).add(entry);
+            } else {
+                (privateEntry ? privateImageQueueLow : publicImageQueueLow).add(entry);
+            }
+        }
+    }
+
+    /**
+     * Removes and returns the next queued image, preferring the high quality queues.
+     * Poll order: public high, private high, public low, private low; the public queues are skipped if privateEntryOnly is set.
+     * @param privateEntryOnly true to take entries from the private queues only
+     * @return the next entry, or null if every consulted queue is empty
+     */
+    public static OriginEntry next(boolean privateEntryOnly) {
+        OriginEntry entry = privateEntryOnly ? null : publicImageQueueHigh.poll();
+        if (entry == null) entry = privateImageQueueHigh.poll();
+        if (entry == null && !privateEntryOnly) entry = publicImageQueueLow.poll();
+        if (entry == null) entry = privateImageQueueLow.poll();
+        return entry;
+    }
+
+    /**
+     * Counts the entries of both private queues and, unless privateEntryOnly is true, of both public queues as well.
+     * @return the summed queue sizes
+     */
+    public static int queueSize(boolean privateEntryOnly) {
+        int privateCount = privateImageQueueHigh.size() + privateImageQueueLow.size();
+        return privateEntryOnly ? privateCount : privateCount + publicImageQueueHigh.size() + publicImageQueueLow.size();
+    }
+
+ public static int privateQueueHighSize() { // number of entries currently in the private high quality queue
+ return privateImageQueueHigh.size();
+ }
+
+ public static int privateQueueLowSize() { // number of entries currently in the private low quality queue
+ return privateImageQueueLow.size();
+ }
+
+ public static int publicQueueHighSize() { // number of entries currently in the public high quality queue
+ return publicImageQueueHigh.size();
+ }
+
+ public static int publicQueueLowSize() { // number of entries currently in the public low quality queue
+ return publicImageQueueLow.size();
+ }
+
+ public static void clearQueues() { // empties all four image queues and the double-check map
+ privateImageQueueHigh.clear();
+ privateImageQueueLow.clear();
+ publicImageQueueHigh.clear();
+ publicImageQueueLow.clear();
+ doubleCheck.clear(); // forget the seen url hashes, so images registered before are accepted again
+ }
+
+ public static class OriginEntry { // queue element: an image together with the url of the document it was found in
+ public htmlFilterImageEntry imageEntry; // the image as extracted from the parsed document
+ public yacyURL baseURL; // registerImages passes the document's dc_source() here
+ public OriginEntry(htmlFilterImageEntry imageEntry, yacyURL baseURL) {
+ this.imageEntry = imageEntry;
+ this.baseURL = baseURL;
+ }
+ }
+
+}
diff --git a/source/de/anomic/plasma/plasmaCrawlResults.java b/source/de/anomic/plasma/plasmaCrawlResultURLs.java
similarity index 99%
rename from source/de/anomic/plasma/plasmaCrawlResults.java
rename to source/de/anomic/plasma/plasmaCrawlResultURLs.java
index 6bf9ccc6e..6265dfcd1 100644
--- a/source/de/anomic/plasma/plasmaCrawlResults.java
+++ b/source/de/anomic/plasma/plasmaCrawlResultURLs.java
@@ -58,7 +58,7 @@ import de.anomic.index.indexURLReference;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;
-public final class plasmaCrawlResults {
+public final class plasmaCrawlResultURLs {
// result stacks;
// these have all entries of form
@@ -70,7 +70,7 @@ public final class plasmaCrawlResults {
private final LinkedList lcrawlResultStack; // 5 - local index: result of local crawling
private final LinkedList gcrawlResultStack; // 6 - local index: triggered external
- public plasmaCrawlResults() {
+ public plasmaCrawlResultURLs() {
// init result stacks
externResultStack = new LinkedList();
searchResultStack = new LinkedList();
diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java
index 1be02dc56..47b69e865 100644
--- a/source/de/anomic/plasma/plasmaSearchEvent.java
+++ b/source/de/anomic/plasma/plasmaSearchEvent.java
@@ -89,12 +89,12 @@ public final class plasmaSearchEvent {
TreeSet snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
long urlRetrievalAllTime;
long snippetComputationAllTime;
- plasmaCrawlResults crawlResults;
+ plasmaCrawlResultURLs crawlResults;
@SuppressWarnings("unchecked")
private plasmaSearchEvent(plasmaSearchQuery query,
plasmaWordIndex wordIndex,
- plasmaCrawlResults crawlResults,
+ plasmaCrawlResultURLs crawlResults,
TreeMap preselectedPeerHashes,
boolean generateAbstracts) {
this.eventTime = System.currentTimeMillis(); // for lifetime check
@@ -457,7 +457,7 @@ public final class plasmaSearchEvent {
plasmaSearchQuery query,
plasmaSearchRankingProfile ranking,
plasmaWordIndex wordIndex,
- plasmaCrawlResults crawlResults,
+ plasmaCrawlResultURLs crawlResults,
TreeMap preselectedPeerHashes,
boolean generateAbstracts) {
plasmaSearchEvent event = lastEvents.get(query.id(false));
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 019a55655..96d7887b5 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -112,7 +112,6 @@ import de.anomic.data.URLLicense;
import de.anomic.data.blogBoard;
import de.anomic.data.blogBoardComments;
import de.anomic.data.bookmarksDB;
-import de.anomic.data.collageQueue;
import de.anomic.data.listManager;
import de.anomic.data.messageBoard;
import de.anomic.data.userDB;
@@ -202,7 +201,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch rankingPermissions;
public plasmaWordIndex wordIndex;
public plasmaCrawlQueues crawlQueues;
- public plasmaCrawlResults crawlResults;
+ public plasmaCrawlResultURLs crawlResults;
public plasmaSwitchboardQueue sbQueue;
public plasmaCrawlStacker crawlStacker;
public messageBoard messageDB;
@@ -973,7 +972,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch> abstractCache,
indexReferenceBlacklist blacklist,
diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java
index 4159f19f0..27d5a5a15 100644
--- a/source/de/anomic/yacy/yacySearch.java
+++ b/source/de/anomic/yacy/yacySearch.java
@@ -53,7 +53,7 @@ import java.util.TreeMap;
import de.anomic.index.indexReferenceBlacklist;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroMScoreCluster;
-import de.anomic.plasma.plasmaCrawlResults;
+import de.anomic.plasma.plasmaCrawlResultURLs;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProcess;
import de.anomic.plasma.plasmaSearchRankingProfile;
@@ -75,11 +75,11 @@ public class yacySearch extends Thread {
final private plasmaSearchRankingProfile rankingProfile;
final private String prefer, filter;
final private kelondroBitfield constraint;
- plasmaCrawlResults crawlResults;
+ plasmaCrawlResultURLs crawlResults;
public yacySearch(String wordhashes, String excludehashes, String urlhashes, String prefer, String filter, int count, int maxDistance,
boolean global, int partitions, yacySeed targetPeer, plasmaWordIndex wordIndex,
- plasmaCrawlResults crawlResults,
+ plasmaCrawlResultURLs crawlResults,
plasmaSearchRankingProcess containerCache,
Map> abstractCache,
indexReferenceBlacklist blacklist,
@@ -252,7 +252,7 @@ public class yacySearch extends Thread {
String wordhashes, String excludehashes, String urlhashes,
String prefer, String filter, int count, int maxDist,
plasmaWordIndex wordIndex,
- plasmaCrawlResults crawlResults,
+ plasmaCrawlResultURLs crawlResults,
plasmaSearchRankingProcess containerCache,
Map> abstractCache,
int targets,
@@ -280,7 +280,7 @@ public class yacySearch extends Thread {
public static yacySearch secondaryRemoteSearch(String wordhashes, String excludehashes, String urlhashes,
plasmaWordIndex wordIndex,
- plasmaCrawlResults crawlResults,
+ plasmaCrawlResultURLs crawlResults,
plasmaSearchRankingProcess containerCache,
String targethash, indexReferenceBlacklist blacklist,
plasmaSearchRankingProfile rankingProfile,