diff --git a/htroot/Collage.html b/htroot/Collage.html index 753c91f26..61d2b1821 100755 --- a/htroot/Collage.html +++ b/htroot/Collage.html @@ -15,7 +15,8 @@ form dd { width: 130px; text-align:right; } - + + #%env/templates/header.template%# diff --git a/htroot/Collage.java b/htroot/Collage.java index e27fefba7..f7a50a829 100755 --- a/htroot/Collage.java +++ b/htroot/Collage.java @@ -39,8 +39,8 @@ import java.util.Random; -import de.anomic.data.collageQueue; import de.anomic.http.httpHeader; +import de.anomic.plasma.plasmaCrawlResultImages; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -54,7 +54,7 @@ public class Collage { private static int fifoSize = 0; private static long zIndex = 0; - private static collageQueue.ImageOriginEntry origins[] = new collageQueue.ImageOriginEntry[fifoMax]; + private static plasmaCrawlResultImages.OriginEntry origins[] = new plasmaCrawlResultImages.OriginEntry[fifoMax]; private static Integer imgWidth[] = new Integer[fifoMax]; private static Integer imgHeight[] = new Integer[fifoMax]; private static Integer imgPosX[] = new Integer[fifoMax]; @@ -66,7 +66,7 @@ public class Collage { final serverObjects prop = new serverObjects(); final plasmaSwitchboard sb = (plasmaSwitchboard) env; final boolean authenticated = sb.adminAuthenticated(header) >= 2; - collageQueue.ImageOriginEntry nextOrigin = collageQueue.next(!authenticated); + plasmaCrawlResultImages.OriginEntry nextOrigin = plasmaCrawlResultImages.next(!authenticated); if (nextOrigin != null) { if (fifoSize == 0 || origins[fifoPos] != nextOrigin) { @@ -86,7 +86,6 @@ public class Collage { } } - if (fifoSize > 0) { prop.put("imgurl", "1"); @@ -105,8 +104,9 @@ public class Collage { prop.put("imgurl", "0"); } - prop.putNum("privateQueueSize", collageQueue.privateQueueSize()); - prop.putNum("publicQueueSize", collageQueue.publicQueueSize()); + prop.putNum("refresh", Math.max(2, Math.min(5, 500 / (1 + 
plasmaCrawlResultImages.queueSize(!authenticated))))); + prop.put("privateQueueSize", plasmaCrawlResultImages.privateQueueHighSize() + "+" + plasmaCrawlResultImages.privateQueueLowSize()); + prop.put("publicQueueSize", plasmaCrawlResultImages.publicQueueHighSize() + "+" + plasmaCrawlResultImages.publicQueueLowSize()); return prop; } } \ No newline at end of file diff --git a/htroot/xml/util/getpageinfo_p.java b/htroot/xml/util/getpageinfo_p.java index b20d10f02..6b9e9c675 100644 --- a/htroot/xml/util/getpageinfo_p.java +++ b/htroot/xml/util/getpageinfo_p.java @@ -82,7 +82,9 @@ public class getpageinfo_p { if (actions.indexOf("title")>=0) { try { yacyURL u = new yacyURL(url, null); - String contentString=new String(HttpClient.wget(u.toString())); + byte[] r = HttpClient.wget(u.toString()); + if (r == null) return prop; + String contentString=new String(r); htmlFilterContentScraper scraper = new htmlFilterContentScraper(u); //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); diff --git a/source/de/anomic/data/collageQueue.java b/source/de/anomic/data/collageQueue.java deleted file mode 100755 index dc6369cf7..000000000 --- a/source/de/anomic/data/collageQueue.java +++ /dev/null @@ -1,101 +0,0 @@ -// collageQueue.java -// (C) 2008 by by Detlef Reichl; detlef!reichl()gmx!org and Michael Peter Christen; mc@yacy.net -// first published 13.04.2008 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. 
-// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package de.anomic.data; - -import java.util.HashMap; -import java.util.concurrent.ConcurrentLinkedQueue; - -import de.anomic.htmlFilter.htmlFilterImageEntry; -import de.anomic.plasma.plasmaParserDocument; -import de.anomic.yacy.yacyURL; - -public class collageQueue { - - private static final ConcurrentLinkedQueue privateImageQueue = new ConcurrentLinkedQueue(); - private static final ConcurrentLinkedQueue publicImageQueue = new ConcurrentLinkedQueue(); - - public static void registerImages(plasmaParserDocument document, boolean privateEntry) { - if (document == null) return; - if (document.dc_source() == null) return; - - HashMap images = document.getImages(); - for (htmlFilterImageEntry image: images.values()) { - String name = image.url().getFile(); - - if (image.width() > 120 && - image.height() > 100 && - image.width() < 1200 && - image.height() < 1000 && - name.lastIndexOf(".gif") == -1) { - // && ((urlString.lastIndexOf(".jpg") != -1)) || - // ((urlString.lastIndexOf(".png") != -1)){ - float ratio; - if (image.width() > image.height()) { - ratio = (float) image.width() / (float) image.height(); - } else { - ratio = (float) image.height() / (float) image.width(); - } - if (ratio >= 1.0f && ratio <= 2.0f) { - if (privateEntry) { - privateImageQueue.add(new ImageOriginEntry(image, document.dc_source())); - } else { - publicImageQueue.add(new ImageOriginEntry(image, document.dc_source())); - } - } - } - } - } - - public static ImageOriginEntry next(boolean privateEntryOnly) { - ImageOriginEntry e = 
null; - if (privateEntryOnly) { - e = privateImageQueue.poll(); - } else { - e = publicImageQueue.poll(); - if (e == null) e = privateImageQueue.poll(); - } - return e; - } - - public static int privateQueueSize() { - return privateImageQueue.size(); - } - - public static int publicQueueSize() { - return publicImageQueue.size(); - } - - public static class ImageOriginEntry { - public htmlFilterImageEntry imageEntry; - public yacyURL baseURL; - public ImageOriginEntry(htmlFilterImageEntry imageEntry, yacyURL baseURL) { - this.imageEntry = imageEntry; - this.baseURL = baseURL; - } - } - -} diff --git a/source/de/anomic/plasma/plasmaCrawlResultImages.java b/source/de/anomic/plasma/plasmaCrawlResultImages.java new file mode 100755 index 000000000..6a54e8ed7 --- /dev/null +++ b/source/de/anomic/plasma/plasmaCrawlResultImages.java @@ -0,0 +1,152 @@ +// plasmaCrawlResultImages.java +// (C) 2008 by by Detlef Reichl; detlef!reichl()gmx!org and Michael Peter Christen; mc@yacy.net +// first published 13.04.2008 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.plasma; + +import java.util.HashMap; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentLinkedQueue; + +import de.anomic.htmlFilter.htmlFilterImageEntry; +import de.anomic.yacy.yacyURL; + +public class plasmaCrawlResultImages { + + // we maintain two different queues for private and public crawls and divide both into two halves: + // such images that appear to be good quality for an image monitor because their size is known, and other images + // that are not declared with sizes. + private static final ConcurrentLinkedQueue privateImageQueueHigh = new ConcurrentLinkedQueue(); + private static final ConcurrentLinkedQueue privateImageQueueLow = new ConcurrentLinkedQueue(); + private static final ConcurrentLinkedQueue publicImageQueueHigh = new ConcurrentLinkedQueue(); + private static final ConcurrentLinkedQueue publicImageQueueLow = new ConcurrentLinkedQueue(); + + // we also check all links for a double-check so we don't get the same image more than once in any queue + // image links may appear double here even if the pages where the image links are embedded already are checked for double-occurrence: + // the same images may be linked from different pages + private static final ConcurrentHashMap doubleCheck = new ConcurrentHashMap(); // (url-hash, time) when the url appeared first + + public static void registerImages(plasmaParserDocument document, boolean privateEntry) { + if (document == null) return; + if (document.dc_source() == null) return; + + HashMap images = document.getImages(); + for (htmlFilterImageEntry image: images.values()) { + // do a double-check; attention: this can be time-consuming since this possibly needs a DNS-lookup + if (doubleCheck.containsKey(image.url().hash())) continue; 
+ doubleCheck.put(image.url().hash(), System.currentTimeMillis()); + + String name = image.url().getFile(); + boolean good = false; + if (image.width() > 120 && + image.height() > 100 && + image.width() < 1200 && + image.height() < 1000 && + name.lastIndexOf(".gif") == -1) { + // && ((urlString.lastIndexOf(".jpg") != -1)) || + // ((urlString.lastIndexOf(".png") != -1)){ + + good = true; + float ratio; + if (image.width() > image.height()) { + ratio = (float) image.width() / (float) image.height(); + } else { + ratio = (float) image.height() / (float) image.width(); + } + if (ratio < 1.0f || ratio > 2.0f) good = false; + } + if (good) { + if (privateEntry) { + privateImageQueueHigh.add(new OriginEntry(image, document.dc_source())); + } else { + publicImageQueueHigh.add(new OriginEntry(image, document.dc_source())); + } + } else { + if (privateEntry) { + privateImageQueueLow.add(new OriginEntry(image, document.dc_source())); + } else { + publicImageQueueLow.add(new OriginEntry(image, document.dc_source())); + } + } + } + } + + public static OriginEntry next(boolean privateEntryOnly) { + OriginEntry e = null; + if (privateEntryOnly) { + e = privateImageQueueHigh.poll(); + if (e == null) e = privateImageQueueLow.poll(); + } else { + e = publicImageQueueHigh.poll(); + if (e == null) e = privateImageQueueHigh.poll(); + if (e == null) e = publicImageQueueLow.poll(); + if (e == null) e = privateImageQueueLow.poll(); + } + return e; + } + + public static int queueSize(boolean privateEntryOnly) { + if (privateEntryOnly) { + return privateImageQueueHigh.size() + privateImageQueueLow.size(); + } else { + return privateImageQueueHigh.size() + privateImageQueueLow.size() + + publicImageQueueHigh.size() + publicImageQueueLow.size(); + } + } + + public static int privateQueueHighSize() { + return privateImageQueueHigh.size(); + } + + public static int privateQueueLowSize() { + return privateImageQueueLow.size(); + } + + public static int publicQueueHighSize() { + return 
publicImageQueueHigh.size(); + } + + public static int publicQueueLowSize() { + return publicImageQueueLow.size(); + } + + public static void clearQueues() { + privateImageQueueHigh.clear(); + privateImageQueueLow.clear(); + publicImageQueueHigh.clear(); + publicImageQueueLow.clear(); + doubleCheck.clear(); + } + + public static class OriginEntry { + public htmlFilterImageEntry imageEntry; + public yacyURL baseURL; + public OriginEntry(htmlFilterImageEntry imageEntry, yacyURL baseURL) { + this.imageEntry = imageEntry; + this.baseURL = baseURL; + } + } + +} diff --git a/source/de/anomic/plasma/plasmaCrawlResults.java b/source/de/anomic/plasma/plasmaCrawlResultURLs.java similarity index 99% rename from source/de/anomic/plasma/plasmaCrawlResults.java rename to source/de/anomic/plasma/plasmaCrawlResultURLs.java index 6bf9ccc6e..6265dfcd1 100644 --- a/source/de/anomic/plasma/plasmaCrawlResults.java +++ b/source/de/anomic/plasma/plasmaCrawlResultURLs.java @@ -58,7 +58,7 @@ import de.anomic.index.indexURLReference; import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacyURL; -public final class plasmaCrawlResults { +public final class plasmaCrawlResultURLs { // result stacks; // these have all entries of form @@ -70,7 +70,7 @@ public final class plasmaCrawlResults { private final LinkedList lcrawlResultStack; // 5 - local index: result of local crawling private final LinkedList gcrawlResultStack; // 6 - local index: triggered external - public plasmaCrawlResults() { + public plasmaCrawlResultURLs() { // init result stacks externResultStack = new LinkedList(); searchResultStack = new LinkedList(); diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 1be02dc56..47b69e865 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -89,12 +89,12 @@ public final class plasmaSearchEvent { TreeSet snippetFetchWordHashes; // a set of word hashes that are used 
to match with the snippets long urlRetrievalAllTime; long snippetComputationAllTime; - plasmaCrawlResults crawlResults; + plasmaCrawlResultURLs crawlResults; @SuppressWarnings("unchecked") private plasmaSearchEvent(plasmaSearchQuery query, plasmaWordIndex wordIndex, - plasmaCrawlResults crawlResults, + plasmaCrawlResultURLs crawlResults, TreeMap preselectedPeerHashes, boolean generateAbstracts) { this.eventTime = System.currentTimeMillis(); // for lifetime check @@ -457,7 +457,7 @@ public final class plasmaSearchEvent { plasmaSearchQuery query, plasmaSearchRankingProfile ranking, plasmaWordIndex wordIndex, - plasmaCrawlResults crawlResults, + plasmaCrawlResultURLs crawlResults, TreeMap preselectedPeerHashes, boolean generateAbstracts) { plasmaSearchEvent event = lastEvents.get(query.id(false)); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 019a55655..96d7887b5 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -112,7 +112,6 @@ import de.anomic.data.URLLicense; import de.anomic.data.blogBoard; import de.anomic.data.blogBoardComments; import de.anomic.data.bookmarksDB; -import de.anomic.data.collageQueue; import de.anomic.data.listManager; import de.anomic.data.messageBoard; import de.anomic.data.userDB; @@ -202,7 +201,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch rankingPermissions; public plasmaWordIndex wordIndex; public plasmaCrawlQueues crawlQueues; - public plasmaCrawlResults crawlResults; + public plasmaCrawlResultURLs crawlResults; public plasmaSwitchboardQueue sbQueue; public plasmaCrawlStacker crawlStacker; public messageBoard messageDB; @@ -973,7 +972,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch> abstractCache, indexReferenceBlacklist blacklist, diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index 4159f19f0..27d5a5a15 100644 --- 
a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -53,7 +53,7 @@ import java.util.TreeMap; import de.anomic.index.indexReferenceBlacklist; import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroMScoreCluster; -import de.anomic.plasma.plasmaCrawlResults; +import de.anomic.plasma.plasmaCrawlResultURLs; import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSearchRankingProcess; import de.anomic.plasma.plasmaSearchRankingProfile; @@ -75,11 +75,11 @@ public class yacySearch extends Thread { final private plasmaSearchRankingProfile rankingProfile; final private String prefer, filter; final private kelondroBitfield constraint; - plasmaCrawlResults crawlResults; + plasmaCrawlResultURLs crawlResults; public yacySearch(String wordhashes, String excludehashes, String urlhashes, String prefer, String filter, int count, int maxDistance, boolean global, int partitions, yacySeed targetPeer, plasmaWordIndex wordIndex, - plasmaCrawlResults crawlResults, + plasmaCrawlResultURLs crawlResults, plasmaSearchRankingProcess containerCache, Map> abstractCache, indexReferenceBlacklist blacklist, @@ -252,7 +252,7 @@ public class yacySearch extends Thread { String wordhashes, String excludehashes, String urlhashes, String prefer, String filter, int count, int maxDist, plasmaWordIndex wordIndex, - plasmaCrawlResults crawlResults, + plasmaCrawlResultURLs crawlResults, plasmaSearchRankingProcess containerCache, Map> abstractCache, int targets, @@ -280,7 +280,7 @@ public class yacySearch extends Thread { public static yacySearch secondaryRemoteSearch(String wordhashes, String excludehashes, String urlhashes, plasmaWordIndex wordIndex, - plasmaCrawlResults crawlResults, + plasmaCrawlResultURLs crawlResults, plasmaSearchRankingProcess containerCache, String targethash, indexReferenceBlacklist blacklist, plasmaSearchRankingProfile rankingProfile,