Better implementation of SortStack and SortStore, plus adaptations in all classes that use them so that they implement the necessary Comparable interface and hash code computation.

The better SortStack performance affects crawling and image search speed and quality.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6492 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent fe41a84330
commit 1dff620181

@ -33,10 +33,11 @@ import net.yacy.document.Document;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.repository.LoaderDispatcher;
public class MediaSnippet {
public class MediaSnippet implements Comparable<MediaSnippet> {
public ContentDomain type;
public DigestURI href, source;
public String name, attr;
@ -58,6 +59,14 @@ public class MediaSnippet {
return href.hashCode();
}
/**
 * Compares two media snippets by the hash of their href.
 *
 * @param other the snippet to compare with; may be null
 * @return true if other is not null and both href hashes are equal
 */
public boolean equals(MediaSnippet other) {
    return other != null && this.href.hash().equals(other.href.hash());
}

/**
 * Overrides Object.equals so that hash-based collections see the same notion of
 * equality as the typed overload above. Collections such as HashSet only ever call
 * equals(Object); without this override they would fall back to identity comparison.
 *
 * @param obj the object to compare with; may be null
 * @return true if obj is a MediaSnippet with an equal href hash
 */
@Override
public boolean equals(Object obj) {
    if (this == obj) return true;
    if (!(obj instanceof MediaSnippet)) return false;
    return equals((MediaSnippet) obj);
}
/**
 * Orders media snippets by comparing the bytes of their href hashes with
 * Base64Order.enhancedCoder.
 *
 * @param o the snippet to compare with
 * @return the result of the coder's comparison of both hash byte arrays
 */
public int compareTo(MediaSnippet o) {
    final byte[] ownHash = this.href.hash().getBytes();
    final byte[] otherHash = o.href.hash().getBytes();
    return Base64Order.enhancedCoder.compare(ownHash, otherHash);
}
public static ArrayList<MediaSnippet> retrieveMediaSnippets(final DigestURI url, final TreeSet<byte[]> queryhashes, final ContentDomain mediatype, final boolean fetchOnline, final int timeout, final boolean reindexing) {
if (queryhashes.size() == 0) {
Log.logFine("snippet fetch", "no query hashes given for url " + url);

@ -41,7 +41,7 @@ import java.util.TreeSet;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.index.Cache;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.ObjectIndex;
@ -111,7 +111,7 @@ public final class MetadataRepository implements Iterable<byte[]> {
return 0;
}
public synchronized URIMetadataRow load(final String urlHash, final WordReference searchedWord, final long ranking) {
public synchronized URIMetadataRow load(final String urlHash, final WordReferenceVars searchedWord, final long ranking) {
// generates an plasmaLURLEntry using the url hash
// if the url cannot be found, this returns null
if (urlHash == null) return null;

@ -50,7 +50,6 @@ import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.index.BinSearch;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Digest;
import net.yacy.kelondro.rwi.Reference;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.TermSearch;
import net.yacy.kelondro.util.FileUtils;
@ -495,11 +494,9 @@ public final class RankingProcess extends Thread {
return this.local_resourceSize;
}
public Reference remove(final String urlHash) {
final SortStack<WordReferenceVars>.stackElement se = stack.remove(urlHash.hashCode());
if (se == null) return null;
urlhashes.remove(urlHash);
return se.element;
/**
 * Removes a reference from this ranking process.
 *
 * @param reference the entry to remove; its urlHash is removed from urlhashes as well
 */
public void remove(final WordReferenceVars reference) {
// drop the entry from the stack first ...
stack.remove(reference);
// ... then forget the url hash that was registered for it
urlhashes.remove(reference.urlHash);
}
public Iterator<String> miss() {

@ -36,13 +36,14 @@ import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.rwi.Reference;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacySeedDB;
public class ResultEntry {
public class ResultEntry implements Comparable<ResultEntry> {
// payload objects
private final URIMetadataRow urlentry;
@ -101,6 +102,9 @@ public class ResultEntry {
/**
 * Computes the hash code from the hash string of the wrapped url entry.
 *
 * @return the hashCode of urlentry.hash()
 */
public int hashCode() {
    final String urlHash = urlentry.hash();
    return urlHash.hashCode();
}
/**
 * Compares two result entries by the hash of their url entries.
 *
 * @param other the entry to compare with; may be null
 * @return true if other is not null and both url entry hashes are equal
 */
public boolean equals(ResultEntry other) {
    return other != null && urlentry.hash().equals(other.urlentry.hash());
}

/**
 * Overrides Object.equals so that hash-based collections see the same notion of
 * equality as the typed overload above. Collections such as HashSet only ever call
 * equals(Object); without this override they would fall back to identity comparison.
 *
 * @param obj the object to compare with; may be null
 * @return true if obj is a ResultEntry with an equal url entry hash
 */
@Override
public boolean equals(Object obj) {
    if (this == obj) return true;
    if (!(obj instanceof ResultEntry)) return false;
    return equals((ResultEntry) obj);
}
/**
 * Returns the hash of the wrapped url entry.
 *
 * @return the value of urlentry.hash()
 */
public String hash() {
return urlentry.hash();
}
@ -161,4 +165,7 @@ public class ResultEntry {
}
return urlentry.toString(textSnippet.getLineRaw());
}
/**
 * Orders result entries by comparing the bytes of their hashes with
 * Base64Order.enhancedCoder.
 *
 * @param o the entry to compare with
 * @return the result of the coder's comparison of both hash byte arrays
 */
public int compareTo(ResultEntry o) {
    final byte[] ownHash = this.hash().getBytes();
    final byte[] otherHash = o.hash().getBytes();
    return Base64Order.enhancedCoder.compare(ownHash, otherHash);
}
}

@ -164,26 +164,24 @@ public class ResultFetcher {
// get next entry
page = rankedCache.takeURL(true, taketimeout);
if (page == null) break;
if (result.exists(page.hash().hashCode())) continue;
if (failedURLs.get(page.hash()) != null) continue;
final ResultEntry resultEntry = fetchSnippet(page, snippetMode);
if (resultEntry == null) continue; // the entry had some problems, cannot be used
if (result.exists(resultEntry)) continue;
urlRetrievalAllTime += resultEntry.dbRetrievalTime;
snippetComputationAllTime += resultEntry.snippetComputationTime;
//System.out.println("+++DEBUG-resultWorker+++ fetched " + resultEntry.urlstring());
// place the result to the result vector
if (!result.exists(resultEntry)) {
// apply post-ranking
long ranking = Long.valueOf(rankedCache.getOrder().cardinal(resultEntry.word()));
ranking += postRanking(resultEntry, rankedCache.getTopics());
result.push(resultEntry, ranking);
if (nav_topics) rankedCache.addTopics(resultEntry);
}
// apply post-ranking
long ranking = Long.valueOf(rankedCache.getOrder().cardinal(resultEntry.word()));
ranking += postRanking(resultEntry, rankedCache.getTopics());
result.push(resultEntry, ranking);
if (nav_topics) rankedCache.addTopics(resultEntry);
//System.out.println("DEBUG SNIPPET_LOADING: thread " + id + " got " + resultEntry.url());
}
} catch (final Exception e) {
@ -234,7 +232,7 @@ public class ResultFetcher {
registerFailure(page.hash(), "no text snippet for URL " + metadata.url());
if (!peers.mySeed().isVirgin())
try {
TextSnippet.failConsequences(this.indexSegment, snippet, query.id(false));
TextSnippet.failConsequences(this.indexSegment, page.word(), snippet, query.id(false));
} catch (IOException e) {
Log.logException(e);
}

@ -34,6 +34,7 @@ import java.util.TreeMap;
import java.util.TreeSet;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.rwi.ReferenceContainer;
@ -384,10 +385,8 @@ public final class SearchEvent {
}
}
public void remove(final String urlhash) {
// removes the url hash reference from last search result
/*indexRWIEntry e =*/ this.rankedCache.remove(urlhash);
//assert e != null;
/**
 * Removes a reference from this search event by delegating to the ranked cache.
 *
 * @param reference the reference to remove
 */
public void remove(final WordReferenceVars reference) {
this.rankedCache.remove(reference);
}
public ResultFetcher result() {

@ -49,6 +49,7 @@ import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceFactory;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.ByteOrder;
@ -442,7 +443,7 @@ public class Segment {
public void run() {
Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread started");
ReferenceContainer<WordReference> container = null;
WordReference entry = null;
WordReferenceVars entry = null;
DigestURI url = null;
final HashSet<String> urlHashs = new HashSet<String>();
try {
@ -454,7 +455,7 @@ public class Segment {
wordHashNow = container.getTermHash();
while (containerIterator.hasNext() && run) {
waiter();
entry = containerIterator.next();
entry = new WordReferenceVars(containerIterator.next());
// System.out.println("Wordhash: "+wordHash+" UrlHash:
// "+entry.getUrlHash());
final URIMetadataRow ue = urlMetadata.load(entry.metadataHash(), entry, 0);

@ -576,7 +576,6 @@ public final class Switchboard extends serverSwitch {
// generate snippets cache
log.logConfig("Initializing Snippet Cache");
TextSnippet.init(log, this);
// init the wiki
wikiParser = new wikiCode(this.peers.mySeed().getClusterAddress());

@ -41,6 +41,7 @@ import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.index.ARC;
import net.yacy.kelondro.index.ConcurrentARC;
import net.yacy.kelondro.logging.Log;
@ -79,18 +80,6 @@ public class TextSnippet {
private TreeSet<byte[]> remaingHashes;
private final DigestURI favicon;
private static Log log = null;
private static Switchboard sb = null;
public static void init(
final Log logx,
final Switchboard switchboard
) {
log = logx;
sb = switchboard;
}
public static boolean existsInCache(final DigestURI url, final TreeSet<byte[]> queryhashes) {
final String hashes = yacySearch.set2string(queryhashes);
return retrieveFromCache(hashes, url.hash()) != null;
@ -361,7 +350,7 @@ public class TextSnippet {
// getting resource metadata (e.g. the http headers for http resources)
if (entry != null) {
// place entry on indexing queue
sb.toIndexer(entry);
Switchboard.getSwitchboard().toIndexer(entry);
// read resource body (if it is there)
final byte[] resourceArray = entry.getContent();
@ -501,7 +490,7 @@ public class TextSnippet {
}
return null;
} catch (final IndexOutOfBoundsException e) {
log.logSevere("computeSnippet: error with string generation", e);
Log.logSevere("computeSnippet", "error with string generation", e);
return new Object[]{null, queryhashes};
}
}
@ -570,12 +559,12 @@ public class TextSnippet {
}
return new Object[] {sentence, remainingHashes};
} catch (final IndexOutOfBoundsException e) {
log.logSevere("computeSnippet: error with string generation", e);
Log.logSevere("computeSnippet", "error with string generation", e);
return null;
}
}
public static String failConsequences(Segment indexSegment, final TextSnippet snippet, final String eventID) throws IOException {
public static String failConsequences(Segment indexSegment, final WordReferenceVars word, final TextSnippet snippet, final String eventID) throws IOException {
// problems with snippet fetch
final String urlHash = snippet.getUrl().hash();
final String querystring = SetTools.setToString(snippet.getRemainingHashes(), ' ');
@ -583,19 +572,19 @@ public class TextSnippet {
(snippet.getErrorCode() == ERROR_RESOURCE_LOADING) ||
(snippet.getErrorCode() == ERROR_PARSER_FAILED) ||
(snippet.getErrorCode() == ERROR_PARSER_NO_LINES)) {
log.logInfo("error: '" + snippet.getError() + "', remove url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError());
Log.logInfo("TextSnippet", "error: '" + snippet.getError() + "', remove url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError());
indexSegment.urlMetadata().remove(urlHash);
final SearchEvent event = SearchEventCache.getEvent(eventID);
assert indexSegment != null;
assert event != null : "eventID = " + eventID;
assert event.getQuery() != null;
indexSegment.termIndex().remove(event.getQuery().queryHashes, urlHash);
event.remove(urlHash);
event.remove(word);
}
if (snippet.getErrorCode() == ERROR_NO_MATCH) {
log.logInfo("error: '" + snippet.getError() + "', remove words '" + querystring + "' for url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError());
Log.logInfo("TextSnippet", "error: '" + snippet.getError() + "', remove words '" + querystring + "' for url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError());
indexSegment.termIndex().remove(snippet.getRemainingHashes(), urlHash);
SearchEventCache.getEvent(eventID).remove(urlHash);
SearchEventCache.getEvent(eventID).remove(word);
}
return snippet.getError();
}

@ -181,6 +181,9 @@ public final class CitationReferenceRow implements Reference /*, Cloneable*/ {
return this.metadataHash().hashCode();
}
/**
 * Compares this reference with another one by their metadata hashes.
 * Note that this is a typed overload and not an override of Object.equals(Object),
 * so collections that call equals(Object) will not use it.
 *
 * @param other the reference to compare with
 * @return true if both metadataHash() values are equal
 */
public boolean equals(Reference other) {
    final boolean sameTarget = this.metadataHash().equals(other.metadataHash());
    return sameTarget;
}
public int distance() {
throw new UnsupportedOperationException();
@ -205,4 +208,5 @@ public final class CitationReferenceRow implements Reference /*, Cloneable*/ {
/**
 * Not supported by this reference type.
 *
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
public int positions() {
throw new UnsupportedOperationException();
}
}

@ -33,8 +33,8 @@ import java.util.Date;
import java.util.Iterator;
import java.util.Properties;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.logging.Log;
@ -119,7 +119,7 @@ public class URIMetadataRow implements URIMetadata {
private final Row.Entry entry;
private final String snippet;
private WordReference word; // this is only used if the url is transported via remote search requests
private WordReferenceVars word; // this is only used if the url is transported via remote search requests
private final long ranking; // during generation of a search result this value is set
private Components comp;
@ -218,7 +218,7 @@ public class URIMetadataRow implements URIMetadata {
}
}
public URIMetadataRow(final Row.Entry entry, final WordReference searchedWord, final long ranking) {
public URIMetadataRow(final Row.Entry entry, final WordReferenceVars searchedWord, final long ranking) {
this.entry = entry;
this.snippet = null;
this.word = searchedWord;
@ -285,7 +285,7 @@ public class URIMetadataRow implements URIMetadata {
this.word = null;
if (prop.containsKey("word")) throw new kelondroException("old database structure is not supported");
if (prop.containsKey("wi")) {
this.word = new WordReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", "")));
this.word = new WordReferenceVars(new WordReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""))));
}
this.ranking = 0;
this.comp = null;
@ -450,7 +450,7 @@ public class URIMetadataRow implements URIMetadata {
return snippet;
}
public WordReference word() {
/**
 * Returns the word reference attached to this metadata row.
 *
 * @return the word field; it is set to null in at least one constructor path, so callers should expect null
 */
public WordReferenceVars word() {
return word;
}

@ -147,6 +147,10 @@ public final class NavigationReferenceRow extends AbstractReference implements N
return this.navigationHash().hashCode();
}
/**
 * Compares this reference with another one by their metadata hashes.
 * NOTE(review): the method directly above (presumably hashCode()) hashes navigationHash(),
 * while this method compares metadataHash() - confirm that references that are equal here
 * always produce the same hash code.
 * This is a typed overload, not an override of Object.equals(Object).
 *
 * @param other the reference to compare with
 * @return true if both metadataHash() values are equal
 */
public boolean equals(Reference other) {
return this.metadataHash().equals(other.metadataHash());
}
/**
 * Age comparison for navigation references; the argument is not inspected.
 *
 * @param other ignored
 * @return always false
 */
public boolean isOlder(Reference other) {
return false;
}

@ -118,7 +118,11 @@ public class NavigationReferenceVars extends AbstractReference implements Navig
/**
 * Computes the hash code from the navigation hash of this reference.
 *
 * @return the hashCode of navigationHash()
 */
public int hashCode() {
    final int code = this.navigationHash().hashCode();
    return code;
}
/**
 * Compares this reference with another one by their metadata hashes.
 * NOTE(review): hashCode() of this class hashes navigationHash(), while this method
 * compares metadataHash() - confirm that references that are equal here always produce
 * the same hash code.
 * This is a typed overload, not an override of Object.equals(Object).
 *
 * @param other the reference to compare with
 * @return true if both metadataHash() values are equal
 */
public boolean equals(Reference other) {
return this.metadataHash().equals(other.metadataHash());
}
/**
 * Age comparison for navigation references; the argument is not inspected.
 *
 * @param other ignored
 * @return always false
 */
public boolean isOlder(Reference other) {
return false;
}

@ -323,6 +323,10 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
return false;
}
/**
 * Compares this reference with another one by their metadata hashes.
 * Note that this is a typed overload and not an override of Object.equals(Object),
 * so collections that call equals(Object) will not use it.
 *
 * @param other the reference to compare with
 * @return true if both metadataHash() values are equal
 */
public boolean equals(Reference other) {
    final boolean sameTarget = this.metadataHash().equals(other.metadataHash());
    return sameTarget;
}
/**
 * Computes the hash code from the metadata hash of this reference.
 *
 * @return the hashCode of metadataHash()
 */
public int hashCode() {
    final int code = this.metadataHash().hashCode();
    return code;
}

@ -31,6 +31,7 @@ import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.kelondro.index.Row.Entry;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.order.MicroDate;
import net.yacy.kelondro.rwi.AbstractReference;
@ -38,7 +39,7 @@ import net.yacy.kelondro.rwi.Reference;
import net.yacy.kelondro.rwi.ReferenceContainer;
public class WordReferenceVars extends AbstractReference implements WordReference, Reference, Cloneable {
public class WordReferenceVars extends AbstractReference implements WordReference, Reference, Cloneable, Comparable<WordReferenceVars> {
/**
* object for termination of concurrent blocking queue processing
@ -357,10 +358,18 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
this.wordsintext = this.wordsintext + oe.wordsintext();
}
/**
 * Compares this reference with another one: this urlHash against the other's metadataHash().
 *
 * @param other the reference to compare with; may be null
 * @return true if other is not null and its metadataHash() equals this urlHash
 */
public boolean equals(Reference other) {
    return other != null && this.urlHash.equals(other.metadataHash());
}

/**
 * Overrides Object.equals so that hash-based collections see the same notion of
 * equality as the typed overload above. Collections such as HashSet only ever call
 * equals(Object); without this override they would fall back to identity comparison.
 * Only other WordReferenceVars instances are accepted here to keep the relation symmetric.
 *
 * @param obj the object to compare with; may be null
 * @return true if obj is a WordReferenceVars with an equal url hash
 */
@Override
public boolean equals(Object obj) {
    if (this == obj) return true;
    if (!(obj instanceof WordReferenceVars)) return false;
    return equals((Reference) obj);
}
/**
 * Computes the hash code from the url hash of this reference.
 *
 * @return the hashCode of the urlHash field
 */
public int hashCode() {
    final int code = this.urlHash.hashCode();
    return code;
}
/**
 * Orders word references by comparing the bytes of this urlHash with the bytes of the
 * other reference's metadataHash(), using Base64Order.enhancedCoder.
 *
 * @param o the reference to compare with
 * @return the result of the coder's comparison of both hash byte arrays
 */
public int compareTo(WordReferenceVars o) {
    final byte[] ownHash = this.urlHash.getBytes();
    final byte[] otherHash = o.metadataHash().getBytes();
    return Base64Order.enhancedCoder.compare(ownHash, otherHash);
}
/**
 * Appends a position value to the positions collection of this reference.
 *
 * @param position the position to add
 */
public void addPosition(int position) {
this.positions.add(position);
}
@ -417,4 +426,5 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
} catch (InterruptedException e) {}
}
}
}

@ -44,6 +44,8 @@ public interface Reference {
public int hashCode();
public boolean equals(Reference other);
public void join(final Reference oe);
public int positions();

@ -26,27 +26,28 @@
package net.yacy.kelondro.util;
import java.util.ConcurrentModificationException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
public class SortStack<E> {
public class SortStack<E extends Comparable<E>> {
// implements a stack where elements 'float' on-top of the stack according to a weight value.
// objects pushed on the stack must implement the hashCode() method to provide a handle
// for a double-check.
private TreeMap<Long, E> onstack; // object within the stack
private HashSet<Integer> instack; // keeps track which element has been on the stack or is now in the offstack
private TreeMap<Long, List<E>> onstack; // object within the stack
private HashSet<E> instack; // keeps track which element has been on the stack or is now in the offstack
protected int maxsize;
public SortStack(final int maxsize) {
// the maxsize is the maximum number of entries in the stack
// if this is set to -1, the size is unlimited
this.onstack = new TreeMap<Long, E>();
this.instack = new HashSet<Integer>();
this.onstack = new TreeMap<Long, List<E>>();
this.instack = new HashSet<E>();
this.maxsize = maxsize;
}
@ -58,17 +59,26 @@ public class SortStack<E> {
push(se.element, se.weight);
}
/**
* put an element on the stack, ordered by its weight
* @param element
* @param weight
*/
public synchronized void push(final E element, Long weight) {
if (exists(element)) return;
// manipulate weight in such a way that it has no conflicts
while (this.onstack.containsKey(weight)) weight = Long.valueOf(weight.longValue() + 1);
if (this.instack.contains(element)) return;
// put the element on the stack
this.onstack.put(weight, element);
List<E> l = this.onstack.get(weight);
if (l == null) {
l = new LinkedList<E>();
l.add(element);
this.onstack.put(weight, l);
} else {
l.add(element);
}
// register it for double-check
this.instack.add(Integer.valueOf(element.hashCode()));
this.instack.add(element);
// check maximum size of the stack an remove elements if the stack gets too large
if (this.maxsize <= 0) return;
@ -77,69 +87,58 @@ public class SortStack<E> {
}
}
/**
* return the element with the smallest weight
* @return
*/
public synchronized stackElement top() {
// returns the element that is currently on top of the stack
if (this.onstack.isEmpty()) return null;
final Long w = this.onstack.firstKey();
final E element = this.onstack.get(w);
final List<E> l = this.onstack.get(w);
final E element = l.get(0);
return new stackElement(element, w);
}
/**
* return the element with the smallest weight and remove it from the stack
* @return
*/
public synchronized stackElement pop() {
// returns the element that is currently on top of the stack
// it is removed and added to the offstack list
// this is exactly the same as element(offstack.size())
if (this.onstack.isEmpty()) return null;
final Long w = this.onstack.firstKey();
final E element = this.onstack.remove(w);
final stackElement se = new stackElement(element, w);
return se;
}
public boolean exists(final E element) {
// uses the hashCode of the element to find out of the element had been on the list or the stack
return this.instack.contains(Integer.valueOf(element.hashCode()));
final List<E> l = this.onstack.get(w);
final E element = l.remove(0);
if (l.size() == 0) this.onstack.remove(w);
return new stackElement(element, w);
}
public boolean exists(final int hashcode) {
public synchronized boolean exists(final E element) {
// uses the hashCode of the element to find out of the element had been on the list or the stack
return this.instack.contains(Integer.valueOf(hashcode));
return this.instack.contains(element);
}
public stackElement get(final int hashcode) {
final Iterator<Map.Entry<Long, E>> i = this.onstack.entrySet().iterator();
Map.Entry<Long, E> entry;
while (i.hasNext()) {
entry = i.next();
if (entry.getValue().hashCode() == hashcode) return new stackElement(entry.getValue(), entry.getKey());
}
return null;
}
public stackElement remove(final int hashcode) {
Map.Entry<Long, E> entry;
stackElement se;
int retry = 3;
retryloop : while (retry-- > 0) {
try {
final Iterator<Map.Entry<Long, E>> i = this.onstack.entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
if (entry.getValue().hashCode() == hashcode) {
se = new stackElement(entry.getValue(), entry.getKey());
this.onstack.remove(se.weight);
return se;
public synchronized void remove(final E element) {
if (!this.instack.contains(element)) return;
for (Map.Entry<Long,List<E>> entry: this.onstack.entrySet()) {
Iterator<E> i = entry.getValue().iterator();
while (i.hasNext()) {
if (i.next().equals(element)) {
i.remove();
if (entry.getValue().size() == 0) {
this.onstack.remove(entry.getKey());
}
return;
}
break retryloop;
} catch (ConcurrentModificationException e) {
continue retryloop;
}
}
return null;
}
public boolean bottom(final long weight) {
/**
 * Tests whether an element with the given weight would be at the bottom of the stack
 * after inserting it.
 * An empty stack has no last key to compare against. The inserted element would then be
 * the only one, and therefore also the bottom one, so true is returned instead of letting
 * TreeMap.lastKey() throw a NoSuchElementException.
 *
 * @param weight the weight of the element that might be inserted
 * @return true if the stack is empty or weight is greater than the largest weight on the stack
 */
public synchronized boolean bottom(final long weight) {
    if (this.onstack.isEmpty()) return true;
    return weight > this.onstack.lastKey().longValue();
}

@ -35,7 +35,7 @@ import java.util.Iterator;
* specific elements in the list.
* @param <E>
*/
public class SortStore<E> extends SortStack<E> {
public class SortStore<E extends Comparable<E>> extends SortStack<E> {
private final ArrayList<stackElement> offstack; // objects that had been on the stack but had been removed
@ -106,27 +106,14 @@ public class SortStore<E> extends SortStack<E> {
return this.offstack;
}
public stackElement get(final int hashcode) {
stackElement se = super.get(hashcode);
if (se != null) return se;
final Iterator<stackElement> j = this.offstack.iterator();
while (j.hasNext()) {
se = j.next();
if (se.element.hashCode() == hashcode) return se;
}
return null;
}
public stackElement remove(final int hashcode) {
stackElement se = super.remove(hashcode);
if (se != null) return se;
for (int j = 0; j < this.offstack.size(); j++) {
se = this.offstack.get(j);
if (se.element.hashCode() == hashcode) {
this.offstack.remove(j);
return se;
public void remove(final E element) {
super.remove(element);
Iterator<stackElement> i = this.offstack.iterator();
while (i.hasNext()) {
if (i.next().element.equals(element)) {
i.remove();
return;
}
}
return null;
}
}

Loading…
Cancel
Save