- refactoring of Collage back-end: move to plasma package

- renamed also the plasmaCrawlResults to have a consistent naming for url and image queues
- added a double-check for the images
- added additional queues for the images: all worse-quality images go there, so the queue can be used also if no sizes are given; no image is lost
- added a cleanup for the stacks so they cannot flood the memory

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4722 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent d7e89c2aca
commit 1995faef8d

@ -15,7 +15,8 @@ form dd {
width: 130px;
text-align:right;
}
</style><meta http-equiv="REFRESH" content="5" />
</style>
<meta http-equiv="REFRESH" content="#[refresh]#" />
</head>
<body style="margin:0px;">
#%env/templates/header.template%#

@ -39,8 +39,8 @@
import java.util.Random;
import de.anomic.data.collageQueue;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlResultImages;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -54,7 +54,7 @@ public class Collage {
private static int fifoSize = 0;
private static long zIndex = 0;
private static collageQueue.ImageOriginEntry origins[] = new collageQueue.ImageOriginEntry[fifoMax];
private static plasmaCrawlResultImages.OriginEntry origins[] = new plasmaCrawlResultImages.OriginEntry[fifoMax];
private static Integer imgWidth[] = new Integer[fifoMax];
private static Integer imgHeight[] = new Integer[fifoMax];
private static Integer imgPosX[] = new Integer[fifoMax];
@ -66,7 +66,7 @@ public class Collage {
final serverObjects prop = new serverObjects();
final plasmaSwitchboard sb = (plasmaSwitchboard) env;
final boolean authenticated = sb.adminAuthenticated(header) >= 2;
collageQueue.ImageOriginEntry nextOrigin = collageQueue.next(!authenticated);
plasmaCrawlResultImages.OriginEntry nextOrigin = plasmaCrawlResultImages.next(!authenticated);
if (nextOrigin != null) {
if (fifoSize == 0 || origins[fifoPos] != nextOrigin) {
@ -86,7 +86,6 @@ public class Collage {
}
}
if (fifoSize > 0) {
prop.put("imgurl", "1");
@ -105,8 +104,9 @@ public class Collage {
prop.put("imgurl", "0");
}
prop.putNum("privateQueueSize", collageQueue.privateQueueSize());
prop.putNum("publicQueueSize", collageQueue.publicQueueSize());
prop.putNum("refresh", Math.max(2, Math.min(5, 500 / (1 + plasmaCrawlResultImages.queueSize(!authenticated)))));
prop.put("privateQueueSize", plasmaCrawlResultImages.privateQueueHighSize() + "+" + plasmaCrawlResultImages.privateQueueLowSize());
prop.put("publicQueueSize", plasmaCrawlResultImages.publicQueueHighSize() + "+" + plasmaCrawlResultImages.publicQueueLowSize());
return prop;
}
}

@ -82,7 +82,9 @@ public class getpageinfo_p {
if (actions.indexOf("title")>=0) {
try {
yacyURL u = new yacyURL(url, null);
String contentString=new String(HttpClient.wget(u.toString()));
byte[] r = HttpClient.wget(u.toString());
if (r == null) return prop;
String contentString=new String(r);
htmlFilterContentScraper scraper = new htmlFilterContentScraper(u);
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);

@ -1,101 +0,0 @@
// collageQueue.java
// (C) 2008 by Detlef Reichl; detlef!reichl()gmx!org and Michael Peter Christen; mc@yacy.net
// first published 13.04.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.data;
import java.util.HashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.yacy.yacyURL;
/**
 * Static holder of two FIFO queues of images found in parsed documents:
 * one for entries registered as private and one for entries registered as public.
 */
public class collageQueue {

    private static final ConcurrentLinkedQueue<ImageOriginEntry> privateImageQueue = new ConcurrentLinkedQueue<ImageOriginEntry>();
    private static final ConcurrentLinkedQueue<ImageOriginEntry> publicImageQueue = new ConcurrentLinkedQueue<ImageOriginEntry>();

    /**
     * Enqueues every image of the document that passes the size, file-name and
     * aspect-ratio filter. Nothing happens if the document or its source is null.
     *
     * @param document     the parsed document whose images are inspected
     * @param privateEntry true to append to the private queue, false for the public queue
     */
    public static void registerImages(plasmaParserDocument document, boolean privateEntry) {
        if (document == null || document.dc_source() == null) return;

        HashMap<String, htmlFilterImageEntry> images = document.getImages();
        for (htmlFilterImageEntry image: images.values()) {
            String name = image.url().getFile();

            // accept only declared sizes strictly inside 120x100 .. 1200x1000
            if (!(image.width() > 120)) continue;
            if (!(image.height() > 100)) continue;
            if (!(image.width() < 1200)) continue;
            if (!(image.height() < 1000)) continue;

            // skip everything that has ".gif" somewhere in its file name
            if (name.lastIndexOf(".gif") != -1) continue;

            // ratio of the longer side to the shorter side
            float ratio = (image.width() > image.height())
                    ? (float) image.width() / (float) image.height()
                    : (float) image.height() / (float) image.width();
            if (!(ratio >= 1.0f && ratio <= 2.0f)) continue;

            ConcurrentLinkedQueue<ImageOriginEntry> target = privateEntry ? privateImageQueue : publicImageQueue;
            target.add(new ImageOriginEntry(image, document.dc_source()));
        }
    }

    /**
     * Removes and returns the next queued entry.
     *
     * @param privateEntryOnly if true only the private queue is polled; otherwise the
     *                         public queue is polled first and the private queue is the fallback
     * @return the polled entry, or null if the polled queue(s) are empty
     */
    public static ImageOriginEntry next(boolean privateEntryOnly) {
        if (privateEntryOnly) {
            return privateImageQueue.poll();
        }
        ImageOriginEntry fromPublic = publicImageQueue.poll();
        return (fromPublic != null) ? fromPublic : privateImageQueue.poll();
    }

    /** @return the current number of entries in the private queue */
    public static int privateQueueSize() {
        return privateImageQueue.size();
    }

    /** @return the current number of entries in the public queue */
    public static int publicQueueSize() {
        return publicImageQueue.size();
    }

    /** Pairs an image entry with the URL of the document it was found in. */
    public static class ImageOriginEntry {
        public htmlFilterImageEntry imageEntry;
        public yacyURL baseURL;

        public ImageOriginEntry(htmlFilterImageEntry imageEntry, yacyURL baseURL) {
            this.imageEntry = imageEntry;
            this.baseURL = baseURL;
        }
    }
}

@ -0,0 +1,152 @@
// plasmaCrawlResultImages.java
// (C) 2008 by Detlef Reichl; detlef!reichl()gmx!org and Michael Peter Christen; mc@yacy.net
// first published 13.04.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma;
import java.util.HashMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.yacy.yacyURL;
/**
 * Static registry of images found in parsed documents.
 *
 * Entries are kept in four FIFO queues: private and public entries are separated,
 * and each of these is divided into a "high" queue (images whose declared size and
 * aspect ratio pass the quality filter) and a "low" queue (all other images).
 * A map of already seen image URL hashes prevents the same image from being queued twice.
 */
public class plasmaCrawlResultImages {

    // we maintain two different queues for private and public crawls and divide both into two halves:
    // images that appear to be of good quality for an image monitor because their size is known,
    // and other images that are not declared with (suitable) sizes.
    private static final ConcurrentLinkedQueue<OriginEntry> privateImageQueueHigh = new ConcurrentLinkedQueue<OriginEntry>();
    private static final ConcurrentLinkedQueue<OriginEntry> privateImageQueueLow = new ConcurrentLinkedQueue<OriginEntry>();
    private static final ConcurrentLinkedQueue<OriginEntry> publicImageQueueHigh = new ConcurrentLinkedQueue<OriginEntry>();
    private static final ConcurrentLinkedQueue<OriginEntry> publicImageQueueLow = new ConcurrentLinkedQueue<OriginEntry>();

    // we also check all links for a double-check so we don't get the same image more than once in any queue.
    // image links may appear twice here even if the pages where the image links are embedded are already
    // checked for double-occurrence: the same image may be linked from different pages
    private static final ConcurrentHashMap<String, Long> doubleCheck = new ConcurrentHashMap<String, Long>(); // (url-hash, time) when the url appeared first

    /**
     * Registers all images of the document that have not been seen before.
     * Nothing happens if the document or its source is null.
     *
     * @param document     the parsed document whose images are inspected
     * @param privateEntry true to append to the private queues, false for the public queues
     */
    public static void registerImages(plasmaParserDocument document, boolean privateEntry) {
        if (document == null) return;
        final yacyURL source = document.dc_source();
        if (source == null) return;

        HashMap<String, htmlFilterImageEntry> images = document.getImages();
        for (htmlFilterImageEntry image: images.values()) {
            // do a double-check; attention: this can be time-consuming since this possibly needs a DNS-lookup,
            // therefore the hash is computed only once per image
            final String urlHash = image.url().hash();

            // putIfAbsent makes test-and-insert atomic, so two threads that register the
            // same image at the same time cannot both pass the double-check
            if (doubleCheck.putIfAbsent(urlHash, System.currentTimeMillis()) != null) continue;

            final OriginEntry entry = new OriginEntry(image, source);
            if (isHighQuality(image)) {
                (privateEntry ? privateImageQueueHigh : publicImageQueueHigh).add(entry);
            } else {
                (privateEntry ? privateImageQueueLow : publicImageQueueLow).add(entry);
            }
        }
    }

    /**
     * Quality filter: the declared size must be strictly inside 120x100 .. 1200x1000,
     * the file name must not contain ".gif", and the longer side must be at most
     * twice as long as the shorter side.
     */
    private static boolean isHighQuality(htmlFilterImageEntry image) {
        if (image.width() <= 120 || image.height() <= 100) return false;
        if (image.width() >= 1200 || image.height() >= 1000) return false;
        if (image.url().getFile().lastIndexOf(".gif") != -1) return false;

        // longer side divided by shorter side; both sides are > 100 here, so the
        // ratio is always >= 1 and only the upper bound needs a test
        final float ratio = (image.width() > image.height())
                ? (float) image.width() / (float) image.height()
                : (float) image.height() / (float) image.width();
        return ratio <= 2.0f;
    }

    /**
     * Removes and returns the next queued entry.
     *
     * @param privateEntryOnly if true, the private high queue is polled and then the private low queue;
     *                         otherwise the order is public high, private high, public low, private low
     * @return the first entry found in that order, or null if all polled queues are empty
     */
    public static OriginEntry next(boolean privateEntryOnly) {
        OriginEntry e = null;
        if (privateEntryOnly) {
            e = privateImageQueueHigh.poll();
            if (e == null) e = privateImageQueueLow.poll();
        } else {
            e = publicImageQueueHigh.poll();
            if (e == null) e = privateImageQueueHigh.poll();
            if (e == null) e = publicImageQueueLow.poll();
            if (e == null) e = privateImageQueueLow.poll();
        }
        return e;
    }

    /**
     * @param privateEntryOnly if true only the two private queues are counted, otherwise all four queues
     * @return the summed size of the counted queues
     */
    public static int queueSize(boolean privateEntryOnly) {
        int size = privateImageQueueHigh.size() + privateImageQueueLow.size();
        if (!privateEntryOnly) {
            size += publicImageQueueHigh.size() + publicImageQueueLow.size();
        }
        return size;
    }

    /** @return the current number of entries in the private high-quality queue */
    public static int privateQueueHighSize() {
        return privateImageQueueHigh.size();
    }

    /** @return the current number of entries in the private low-quality queue */
    public static int privateQueueLowSize() {
        return privateImageQueueLow.size();
    }

    /** @return the current number of entries in the public high-quality queue */
    public static int publicQueueHighSize() {
        return publicImageQueueHigh.size();
    }

    /** @return the current number of entries in the public low-quality queue */
    public static int publicQueueLowSize() {
        return publicImageQueueLow.size();
    }

    /** Empties all four queues and forgets all URL hashes recorded for the double-check. */
    public static void clearQueues() {
        privateImageQueueHigh.clear();
        privateImageQueueLow.clear();
        publicImageQueueHigh.clear();
        publicImageQueueLow.clear();
        doubleCheck.clear();
    }

    /** Pairs an image entry with the URL of the document it was found in. */
    public static class OriginEntry {
        public htmlFilterImageEntry imageEntry;
        public yacyURL baseURL;

        public OriginEntry(htmlFilterImageEntry imageEntry, yacyURL baseURL) {
            this.imageEntry = imageEntry;
            this.baseURL = baseURL;
        }
    }
}

@ -58,7 +58,7 @@ import de.anomic.index.indexURLReference;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;
public final class plasmaCrawlResults {
public final class plasmaCrawlResultURLs {
// result stacks;
// these have all entries of form
@ -70,7 +70,7 @@ public final class plasmaCrawlResults {
private final LinkedList<String> lcrawlResultStack; // 5 - local index: result of local crawling
private final LinkedList<String> gcrawlResultStack; // 6 - local index: triggered external
public plasmaCrawlResults() {
public plasmaCrawlResultURLs() {
// init result stacks
externResultStack = new LinkedList<String>();
searchResultStack = new LinkedList<String>();

@ -89,12 +89,12 @@ public final class plasmaSearchEvent {
TreeSet<String> snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
long urlRetrievalAllTime;
long snippetComputationAllTime;
plasmaCrawlResults crawlResults;
plasmaCrawlResultURLs crawlResults;
@SuppressWarnings("unchecked")
private plasmaSearchEvent(plasmaSearchQuery query,
plasmaWordIndex wordIndex,
plasmaCrawlResults crawlResults,
plasmaCrawlResultURLs crawlResults,
TreeMap<String, String> preselectedPeerHashes,
boolean generateAbstracts) {
this.eventTime = System.currentTimeMillis(); // for lifetime check
@ -457,7 +457,7 @@ public final class plasmaSearchEvent {
plasmaSearchQuery query,
plasmaSearchRankingProfile ranking,
plasmaWordIndex wordIndex,
plasmaCrawlResults crawlResults,
plasmaCrawlResultURLs crawlResults,
TreeMap<String, String> preselectedPeerHashes,
boolean generateAbstracts) {
plasmaSearchEvent event = lastEvents.get(query.id(false));

@ -112,7 +112,6 @@ import de.anomic.data.URLLicense;
import de.anomic.data.blogBoard;
import de.anomic.data.blogBoardComments;
import de.anomic.data.bookmarksDB;
import de.anomic.data.collageQueue;
import de.anomic.data.listManager;
import de.anomic.data.messageBoard;
import de.anomic.data.userDB;
@ -202,7 +201,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
public HashMap<String, String> rankingPermissions;
public plasmaWordIndex wordIndex;
public plasmaCrawlQueues crawlQueues;
public plasmaCrawlResults crawlResults;
public plasmaCrawlResultURLs crawlResults;
public plasmaSwitchboardQueue sbQueue;
public plasmaCrawlStacker crawlStacker;
public messageBoard messageDB;
@ -973,7 +972,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
log.logConfig("Starting Indexing Management");
String networkName = getConfig("network.unit.name", "");
wordIndex = new plasmaWordIndex(indexPrimaryPath, indexSecondaryPath, networkName, log);
crawlResults = new plasmaCrawlResults();
crawlResults = new plasmaCrawlResultURLs();
// start yacy core
log.logConfig("Starting YaCy Protocol Core");
@ -1970,6 +1969,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
hasDoneSomething = true;
}
}
// clean up image stack
plasmaCrawlResultImages.clearQueues();
// clean up profiles
checkInterruption();
@ -2139,9 +2140,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
}
in.queueEntry.close();
return null;
} else {
plasmaCrawlProfile.entry profile = profilesActiveCrawls.getEntry(in.queueEntry.profileHandle);
collageQueue.registerImages(document, (profile == null) ? true : !profile.remoteIndexing());
}
return new indexingQueueEntry(in.queueEntry, document, null);
}
@ -2226,6 +2224,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
in.queueEntry.close();
return null;
}
// update image result list statistics
// it's good to do this concurrently here, because it needs a DNS lookup
// to compute a URL hash which is necessary for a double-check
plasmaCrawlProfile.entry profile = profilesActiveCrawls.getEntry(in.queueEntry.profileHandle);
plasmaCrawlResultImages.registerImages(in.document, (profile == null) ? true : !profile.remoteIndexing());
return new indexingQueueEntry(in.queueEntry, in.document, condenser);
}
@ -2290,38 +2295,38 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
in.queueEntry.close();
}
private void storeDocumentIndex(plasmaSwitchboardQueue.QueueEntry entry, plasmaParserDocument document, plasmaCondenser condenser) {
private void storeDocumentIndex(plasmaSwitchboardQueue.QueueEntry queueEntry, plasmaParserDocument document, plasmaCondenser condenser) {
// CREATE INDEX
String dc_title = document.dc_title();
yacyURL referrerURL = entry.referrerURL();
int processCase = entry.processCase();
yacyURL referrerURL = queueEntry.referrerURL();
int processCase = queueEntry.processCase();
// remove stopwords
log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + entry.url());
log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + queueEntry.url());
// STORE URL TO LOADED-URL-DB
indexURLReference newEntry = null;
try {
newEntry = wordIndex.storeDocument(entry, document, condenser);
newEntry = wordIndex.storeDocument(queueEntry, document, condenser);
} catch (IOException e) {
if (this.log.isFine()) log.logFine("Not Indexed Resource '" + entry.url().toNormalform(false, true) + "': process case=" + processCase);
addURLtoErrorDB(entry.url(), referrerURL.hash(), entry.initiator(), dc_title, "error storing url: " + e.getMessage(), new kelondroBitfield());
if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase);
addURLtoErrorDB(queueEntry.url(), referrerURL.hash(), queueEntry.initiator(), dc_title, "error storing url: " + e.getMessage(), new kelondroBitfield());
return;
}
// update statistics
// update url result list statistics
crawlResults.stack(
newEntry, // loaded url db entry
entry.initiator(), // initiator peer hash
queueEntry.initiator(), // initiator peer hash
yacyCore.seedDB.mySeed().hash, // executor peer hash
processCase // process case
);
// STORE WORD INDEX
if ((!entry.profile().indexText()) && (!entry.profile().indexMedia())) {
if (this.log.isFine()) log.logFine("Not Indexed Resource '" + entry.url().toNormalform(false, true) + "': process case=" + processCase);
addURLtoErrorDB(entry.url(), referrerURL.hash(), entry.initiator(), dc_title, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new kelondroBitfield());
if ((!queueEntry.profile().indexText()) && (!queueEntry.profile().indexMedia())) {
if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase);
addURLtoErrorDB(queueEntry.url(), referrerURL.hash(), queueEntry.initiator(), dc_title, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new kelondroBitfield());
return;
}
@ -2329,12 +2334,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
indexedPages++;
// update profiling info
plasmaProfiling.updateIndexedPage(entry);
plasmaProfiling.updateIndexedPage(queueEntry);
// if this was performed for a remote crawl request, notify requester
yacySeed initiatorPeer = entry.initiatorPeer();
yacySeed initiatorPeer = queueEntry.initiatorPeer();
if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
log.logInfo("Sending crawl receipt for '" + entry.url().toNormalform(false, true) + "' to " + initiatorPeer.getName());
log.logInfo("Sending crawl receipt for '" + queueEntry.url().toNormalform(false, true) + "' to " + initiatorPeer.getName());
if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash));
// start a thread for receipt sending to avoid a blocking here
new Thread(new receiptSending(initiatorPeer, newEntry)).start();

@ -72,7 +72,7 @@ import de.anomic.index.indexURLReference;
import de.anomic.index.indexWord;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.plasma.plasmaCrawlResults;
import de.anomic.plasma.plasmaCrawlResultURLs;
import de.anomic.plasma.plasmaSearchRankingProcess;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSnippetCache;
@ -419,7 +419,7 @@ public final class yacyClient {
int partitions,
yacySeed target,
plasmaWordIndex wordIndex,
plasmaCrawlResults crawlResults,
plasmaCrawlResultURLs crawlResults,
plasmaSearchRankingProcess containerCache,
Map<String, TreeMap<String, String>> abstractCache,
indexReferenceBlacklist blacklist,

@ -53,7 +53,7 @@ import java.util.TreeMap;
import de.anomic.index.indexReferenceBlacklist;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.plasma.plasmaCrawlResults;
import de.anomic.plasma.plasmaCrawlResultURLs;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProcess;
import de.anomic.plasma.plasmaSearchRankingProfile;
@ -75,11 +75,11 @@ public class yacySearch extends Thread {
final private plasmaSearchRankingProfile rankingProfile;
final private String prefer, filter;
final private kelondroBitfield constraint;
plasmaCrawlResults crawlResults;
plasmaCrawlResultURLs crawlResults;
public yacySearch(String wordhashes, String excludehashes, String urlhashes, String prefer, String filter, int count, int maxDistance,
boolean global, int partitions, yacySeed targetPeer, plasmaWordIndex wordIndex,
plasmaCrawlResults crawlResults,
plasmaCrawlResultURLs crawlResults,
plasmaSearchRankingProcess containerCache,
Map<String, TreeMap<String, String>> abstractCache,
indexReferenceBlacklist blacklist,
@ -252,7 +252,7 @@ public class yacySearch extends Thread {
String wordhashes, String excludehashes, String urlhashes,
String prefer, String filter, int count, int maxDist,
plasmaWordIndex wordIndex,
plasmaCrawlResults crawlResults,
plasmaCrawlResultURLs crawlResults,
plasmaSearchRankingProcess containerCache,
Map<String, TreeMap<String, String>> abstractCache,
int targets,
@ -280,7 +280,7 @@ public class yacySearch extends Thread {
public static yacySearch secondaryRemoteSearch(String wordhashes, String excludehashes, String urlhashes,
plasmaWordIndex wordIndex,
plasmaCrawlResults crawlResults,
plasmaCrawlResultURLs crawlResults,
plasmaSearchRankingProcess containerCache,
String targethash, indexReferenceBlacklist blacklist,
plasmaSearchRankingProfile rankingProfile,

Loading…
Cancel
Save