enhanced remove operations in search consequences (triggered when a snippet fetch proves that a word has disappeared from the page that was stored in the index)

- no direct deletion of references during search (deferred until after the search; see the first sketch below)
- bundling of all deletions for the references of a single word into one remove operation
- enhanced remove operation that relies on the collection being stored sorted (experimental; see the second sketch below)
- more String -> byte[] transitions for search word lists
- cleanup of unused code
- enhanced memory allocation of RowSet objects (uses slightly less memory, which was wasted before; see the third sketch below)


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6823 6c8d7289-2bf4-0310-a012-ef5d649a1542
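
The deferred, bundled deletion can be pictured with the following minimal sketch. The types are hypothetical stand-ins: the real code collects byte[] url hashes in a HandleSet and calls Segment.termIndex() for the removal, as the diff below shows.

```java
import java.util.Collection;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;

// Hypothetical sketch of the deferred, bundled deletion described above.
// During the search, hashes of URLs that fail snippet verification are only
// collected; after the search, each query word gets exactly one bulk remove
// call instead of one index access per failed URL.
public class DeferredRemovalSketch {

    // stand-in for YaCy's HandleSet (the real code stores byte[] url hashes)
    private final SortedSet<String> failedURLs = new TreeSet<String>();

    // assumed interface; the real code calls Segment.termIndex().remove(...)
    public interface TermIndex {
        void remove(String wordHash, Set<String> urlHashes);
    }

    // called from the result fetcher; note: no index access happens here
    public void registerFailure(final String urlHash, final String reason) {
        failedURLs.add(urlHash);
    }

    // called once when the search event is cleaned up
    public void cleanup(final Collection<String> queryWordHashes, final TermIndex index) {
        if (failedURLs.isEmpty()) return;
        for (final String wordHash : queryWordHashes) {
            index.remove(wordHash, failedURLs); // one bundled remove per word
        }
    }
}
```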
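The second sketch shows one way a sorted store pays off for batch removal. The diff only shows that container.sort() is now called before removeEntries; the actual removal code is not part of this excerpt, so this is an illustration of the principle, not of the real implementation: with both the collection and the keys to delete sorted, one linear merge pass replaces a search per key.

```java
import java.util.Arrays;

// Hedged sketch: batch removal from a sorted collection via a single merge
// pass, using int arrays in place of fixed-width byte[] rows for brevity.
public final class SortedRemovalSketch {

    // removes all values in 'sortedKeys' from 'sortedValues'; both arrays
    // must be sorted ascending; returns the surviving values
    public static int[] removeAll(final int[] sortedValues, final int[] sortedKeys) {
        final int[] kept = new int[sortedValues.length];
        int n = 0, k = 0;
        for (final int v : sortedValues) {
            while (k < sortedKeys.length && sortedKeys[k] < v) k++; // advance delete cursor
            if (k < sortedKeys.length && sortedKeys[k] == v) continue; // drop match
            kept[n++] = v; // keep survivor
        }
        return Arrays.copyOf(kept, n);
    }
}
```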
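The third sketch illustrates the RowCollection allocation change: the grow amount is rounded down to a whole number of rows, so the chunk cache never ends with a wasted partial row. The method name and the Math.max safeguard are additions for this illustration; the growFactor100 parameter stands in for the growfactorLarge100/growfactorSmall100 constants used in the diff.

```java
// Minimal sketch of the alignment added to RowCollection's grow computation.
public final class AlignedAllocSketch {

    static long alignedAllocation(final long needed, final int objectsize, final long growFactor100) {
        long allocram = needed * growFactor100 / 100L; // e.g. 120% of the needed size
        allocram -= allocram % objectsize;             // cut off the partial row
        assert allocram > 0;
        return Math.max(allocram, needed);             // hedge: never below the request
    }
}
```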

@@ -61,7 +61,7 @@ public class ResultFetcher {
     protected Worker[] workerThreads;
     protected final SortStore<ResultEntry> result;
     protected final SortStore<MediaSnippet> images; // container to sort images by size
-    protected final HashMap<String, String> failedURLs; // a mapping from a urlhash to a fail reason string
+    protected final HandleSet failedURLs; // a set of urlhashes that could not be verified during search
     protected final HandleSet snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
     long urlRetrievalAllTime;
     long snippetComputationAllTime;
@@ -84,7 +84,7 @@ public class ResultFetcher {
         this.snippetComputationAllTime = 0;
         this.result = new SortStore<ResultEntry>(-1, true); // this is the result, enriched with snippets, ranked and ordered by ranking
         this.images = new SortStore<MediaSnippet>(-1, true);
-        this.failedURLs = new HashMap<String, String>(); // a map of urls to reason strings where a worker thread tried to work on, but failed.
+        this.failedURLs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); // a set of url hashes where a worker thread tried to work on, but failed.
         // snippets do not need to match with the complete query hashes,
         // only with the query minus the stopwords which had not been used for the search
@@ -167,7 +167,7 @@ public class ResultFetcher {
             // get next entry
             page = rankedCache.takeURL(true, taketimeout);
             if (page == null) break;
-            if (failedURLs.get(new String(page.hash())) != null) continue;
+            if (failedURLs.has(page.hash())) continue;
             final ResultEntry resultEntry = fetchSnippet(page, snippetMode); // does not fetch snippets if snippetMode == 0
@@ -230,7 +230,7 @@ public class ResultFetcher {
                     (snippetMode == 2) ? Integer.MAX_VALUE : 30000,
                     query.isGlobal());
             final long snippetComputationTime = System.currentTimeMillis() - startTime;
-            Log.logInfo("SEARCH_EVENT", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
+            Log.logInfo("SEARCH", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
             if (snippet.getErrorCode() < 11) {
                 // we loaded the file and found the snippet
@@ -241,13 +241,7 @@ public class ResultFetcher {
                 return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
             } else {
                 // problems with snippet fetch
-                registerFailure(new String(page.hash()), "no text snippet for URL " + metadata.url());
-                if (!peers.mySeed().isVirgin())
-                    try {
-                        TextSnippet.failConsequences(query.getSegment(), page.word(), snippet, query.id(false));
-                    } catch (IOException e) {
-                        Log.logException(e);
-                    }
+                registerFailure(page.hash(), "no text snippet for URL " + metadata.url());
                 return null;
             }
         } else {
@@ -255,7 +249,7 @@ public class ResultFetcher {
             startTime = System.currentTimeMillis();
             final ArrayList<MediaSnippet> mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, (snippetMode == 2), 6000, query.isGlobal());
             final long snippetComputationTime = System.currentTimeMillis() - startTime;
-            Log.logInfo("SEARCH_EVENT", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime);
+            Log.logInfo("SEARCH", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime);
             if (mediaSnippets != null && !mediaSnippets.isEmpty()) {
                 // found media snippets, return entry
@@ -264,16 +258,20 @@ public class ResultFetcher {
                 return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime);
             } else {
                 // problems with snippet fetch
-                registerFailure(new String(page.hash()), "no media snippet for URL " + metadata.url());
+                registerFailure(page.hash(), "no media snippet for URL " + metadata.url());
                 return null;
             }
         }
         // finished, no more actions possible here
     }
-    private void registerFailure(final String urlhash, final String reason) {
-        this.failedURLs.put(urlhash, reason);
-        Log.logInfo("search", "sorted out hash " + urlhash + " during search: " + reason);
+    private void registerFailure(final byte[] urlhash, final String reason) {
+        try {
+            this.failedURLs.put(urlhash);
+        } catch (RowSpaceExceededException e) {
+            Log.logException(e);
+        }
+        Log.logInfo("SEARCH", "sorted out urlhash " + new String(urlhash) + " during search: " + reason);
     }
     public int resultCount() {

@@ -223,6 +223,7 @@ public final class SearchEvent {
         // execute deletion of failed words
         int rw = this.results.failedURLs.size();
         if (rw > 0) {
+            long start = System.currentTimeMillis();
             final HandleSet removeWords = query.queryHashes;
             try {
                 removeWords.putAll(query.excludeHashes);
@@ -233,12 +234,12 @@ public final class SearchEvent {
                 final Iterator<byte[]> j = removeWords.iterator();
                 // remove the same url hashes for multiple words
                 while (j.hasNext()) {
-                    this.query.getSegment().termIndex().remove(j.next(), this.results.failedURLs.keySet());
+                    this.query.getSegment().termIndex().remove(j.next(), this.results.failedURLs);
                 }
             } catch (IOException e) {
                 Log.logException(e);
             }
-            Log.logInfo("SearchEvents", "cleaning up event " + query.id(true) + ", removed " + rw + " URL references on " + removeWords.size() + " words");
+            Log.logInfo("SearchEvents", "cleaning up event " + query.id(true) + ", removed " + rw + " URL references on " + removeWords.size() + " words in " + (System.currentTimeMillis() - start) + " milliseconds");
         }
     }

@@ -53,7 +53,7 @@ public class SearchEventCache {
         SearchEvent event;
         while (i.hasNext()) {
             event = i.next();
-            if ((all) || (event.getEventTime() + eventLifetime < System.currentTimeMillis())) {
+            if (all || event.getEventTime() + eventLifetime < System.currentTimeMillis()) {
                 event.cleanup();
                 // remove the event

@@ -583,29 +583,4 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
         }
     }
-    public static String failConsequences(Segment indexSegment, final WordReferenceVars word, final TextSnippet snippet, final String eventID) throws IOException {
-        // problems with snippet fetch
-        final byte[] urlHash = snippet.getUrl().hash();
-        final String querystring = SetTools.setToString(snippet.getRemainingHashes(), ' ');
-        if ((snippet.getErrorCode() == ERROR_SOURCE_LOADING) ||
-            (snippet.getErrorCode() == ERROR_RESOURCE_LOADING) ||
-            (snippet.getErrorCode() == ERROR_PARSER_FAILED) ||
-            (snippet.getErrorCode() == ERROR_PARSER_NO_LINES)) {
-            Log.logInfo("TextSnippet", "error: '" + snippet.getError() + "', remove url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError());
-            indexSegment.urlMetadata().remove(urlHash);
-            final SearchEvent event = SearchEventCache.getEvent(eventID);
-            assert indexSegment != null;
-            assert event != null : "eventID = " + eventID;
-            assert event.getQuery() != null;
-            indexSegment.termIndex().remove(event.getQuery().queryHashes, urlHash);
-            event.remove(word);
-        }
-        if (snippet.getErrorCode() == ERROR_NO_MATCH) {
-            Log.logInfo("TextSnippet", "error: '" + snippet.getError() + "', remove words '" + querystring + "' for url = " + snippet.getUrl().toNormalform(false, true) + ", cause: " + snippet.getError());
-            indexSegment.termIndex().remove(snippet.getRemainingHashes(), urlHash);
-            SearchEventCache.getEvent(eventID).remove(word);
-        }
-        return snippet.getError();
-    }
 }

@@ -412,6 +412,7 @@ public class ArrayStack implements BLOB {
         File location;
         BLOB blob;
         public blobItem(Date creation, File location, BLOB blob) {
+            assert blob != null;
             this.creation = creation;
             this.location = location;
             this.blob = blob;

@@ -192,6 +192,10 @@ public final class HandleSet implements Iterable<byte[]>, Cloneable {
         index = null;
     }
+    public final String toString() {
+        return this.index.toString();
+    }
     // set tools
     public HandleSet joinConstructive(final HandleSet other) throws RowSpaceExceededException {

@@ -199,6 +199,8 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
     public synchronized byte[] exportCollection() {
         // returns null if the collection is empty
         trim(false);
+        sort(); // experimental; supervise CPU load
+        assert this.sortBound == this.chunkcount; // in case the collection is sorted
         assert this.size() * this.rowdef.objectsize == this.chunkcache.length : "this.size() = " + this.size() + ", objectsize = " + this.rowdef.objectsize + ", chunkcache.length = " + this.chunkcache.length;
         final Row row = exportRow(chunkcache.length);
         final Row.Entry entry = row.newEntry();
@@ -227,9 +229,11 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
         if (chunkcache.length >= needed) return 0;
         assert needed > 0 : "needed = " + needed;
         long allocram = needed * growfactorLarge100 / 100L;
+        allocram -= allocram % rowdef.objectsize;
         assert allocram > 0 : "elements = " + elements + ", new = " + allocram;
         if (allocram <= Integer.MAX_VALUE && MemoryControl.request(allocram, false)) return allocram;
         allocram = needed * growfactorSmall100 / 100L;
+        allocram -= allocram % rowdef.objectsize;
         assert allocram > 0 : "elements = " + elements + ", new = " + allocram;
         if (allocram <= Integer.MAX_VALUE && MemoryControl.request(allocram, forcegc)) return allocram;
         return needed;
@@ -239,7 +243,8 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
         if (elements == 0) return;
         final long allocram = neededSpaceForEnsuredSize(elements, true);
         if (allocram == 0) return;
-        assert allocram > chunkcache.length : "wrong alloc computation: allocram = " + allocram + ", chunkcache.length = " + chunkcache.length;
+        assert chunkcache.length < elements * rowdef.objectsize : "wrong alloc computation (1): elements * rowdef.objectsize = " + (elements * rowdef.objectsize) + ", chunkcache.length = " + chunkcache.length;
+        assert allocram > chunkcache.length : "wrong alloc computation (2): allocram = " + allocram + ", chunkcache.length = " + chunkcache.length;
         if (allocram > Integer.MAX_VALUE || !MemoryControl.request(allocram, true))
             throw new RowSpaceExceededException(allocram, "RowCollection grow");
         try {
@@ -564,7 +569,7 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
     }
-    protected synchronized final void sort() {
+    public synchronized final void sort() {
         assert (this.rowdef.objectOrder != null);
         if (this.sortBound == this.chunkcount) return; // this is already sorted
         if (this.chunkcount < isortlimit) {
@@ -609,6 +614,7 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
         //assert this.isSorted();
     }
+    /*
     public synchronized final void sort2() {
         assert (this.rowdef.objectOrder != null);
         if (this.sortBound == this.chunkcount) return; // this is already sorted
@@ -643,7 +649,8 @@ public class RowCollection implements Iterable<Row.Entry>, Cloneable {
         this.sortBound = this.chunkcount;
         //assert this.isSorted();
     }
+    */
     private static class qsortthread implements Callable<Object> {
         private RowCollection rc;
         int L, R, S;

@@ -75,7 +75,7 @@ public class RowSet extends RowCollection implements ObjectIndex, Iterable<Row.Entry> {
         if (size < 0) return new RowSet(rowdef);
         final int orderbound = (int) NaturalOrder.decodeLong(b, 10, 4);
         assert orderbound >= 0 : "orderbound = " + orderbound;
-        if (orderbound < 0) return new RowSet(rowdef);
+        if (orderbound < 0) return new RowSet(rowdef); // error
         final byte[] chunkcache = new byte[size * rowdef.objectsize];
         //assert b.length - exportOverheadSize == size * rowdef.objectsize : "b.length = " + b.length + ", size * rowdef.objectsize = " + size * rowdef.objectsize;
         if (b.length - exportOverheadSize != size * rowdef.objectsize) {

@@ -256,12 +256,6 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
         return removed + (reduced / this.array.rowdef().objectsize);
     }
-    public int remove(byte[] termHash, Set<String> urlHashes) throws IOException {
-        int removed = this.ram.remove(termHash, urlHashes);
-        int reduced = this.array.replace(termHash, new RemoveRewriter<ReferenceType>(urlHashes));
-        return removed + (reduced / this.array.rowdef().objectsize);
-    }
     public boolean remove(byte[] termHash, byte[] urlHashBytes) throws IOException {
         boolean removed = this.ram.remove(termHash, urlHashBytes);
         int reduced = this.array.replace(termHash, new RemoveRewriter<ReferenceType>(urlHashBytes));
@@ -276,16 +270,6 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
             this.urlHashes = urlHashes;
         }
-        public RemoveRewriter(Set<String> urlHashes) {
-            this.urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
-            for (String s: urlHashes)
-                try {
-                    this.urlHashes.put(s.getBytes());
-                } catch (RowSpaceExceededException e) {
-                    Log.logException(e);
-                }
-        }
         public RemoveRewriter(byte[] urlHashBytes) {
             this.urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
             try {
@@ -296,6 +280,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
         }
         public ReferenceContainer<ReferenceType> rewrite(ReferenceContainer<ReferenceType> container) {
+            container.sort();
             container.removeEntries(urlHashes);
             return container;
         }

@@ -188,13 +188,6 @@ public class ReferenceContainer<ReferenceType extends Reference> extends RowSet
         return count;
     }
-    public int removeEntries(final Set<String> urlHashes) {
-        int count = 0;
-        final Iterator<String> i = urlHashes.iterator();
-        while (i.hasNext()) count += (remove(i.next().getBytes()) == null) ? 0 : 1;
-        return count;
-    }
     public Iterator<ReferenceType> entries() {
         // returns an iterator of indexRWIEntry objects
         return new entryIterator();

@@ -366,27 +366,7 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
         }
         return 0;
     }
-    public int remove(final byte[] termHash, final Set<String> urlHashes) {
-        assert this.cache != null;
-        if (urlHashes.isEmpty()) return 0;
-        ByteArray tha = new ByteArray(termHash);
-        int count;
-        synchronized (cache) {
-            final ReferenceContainer<ReferenceType> c = cache.get(tha);
-            if ((c != null) && ((count = c.removeEntries(urlHashes)) > 0)) {
-                // removal successful
-                if (c.isEmpty()) {
-                    delete(termHash);
-                } else {
-                    cache.put(tha, c);
-                }
-                return count;
-            }
-        }
-        return 0;
-    }
     public void add(final ReferenceContainer<ReferenceType> container) throws RowSpaceExceededException {
         // this puts the entries into the cache
         assert this.cache != null;
