redesign of CrawlResult data structures because of OOM occurrences during URL deletion processes.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6675 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent bab0438fee
commit 564927ce72

@ -29,11 +29,13 @@ import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.ResultURLs.InitExecEntry;
import de.anomic.crawler.retrieval.EventOrigin;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Segments;
@ -173,21 +175,20 @@ public class CrawlResults {
prop.put("table_showURL", (showURL) ? "1" : "0");
boolean dark = true;
String urlHash, initiatorHash, executorHash;
String urlstr, urltxt;
yacySeed initiatorSeed, executorSeed;
URIMetadataRow urle;
URIMetadataRow.Components metadata;
int i, cnt = 0;
for (i = sb.crawlResults.getStackSize(tabletype) - 1; i >= (sb.crawlResults.getStackSize(tabletype) - lines); i--) {
initiatorHash = sb.crawlResults.getInitiatorHash(tabletype, i);
executorHash = sb.crawlResults.getExecutorHash(tabletype, i);
urlHash = sb.crawlResults.getUrlHash(tabletype, i);
int cnt = 0;
Iterator<Map.Entry<String, InitExecEntry>> i = sb.crawlResults.results(tabletype);
Map.Entry<String, InitExecEntry> entry;
while (i.hasNext()) {
entry = i.next();
try {
urle = sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(urlHash, null, 0);
urle = sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(entry.getKey(), null, 0);
if(urle == null) {
Log.logWarning("PLASMA", "CrawlResults: URL not in index for crawl result "+ i +" with hash "+ urlHash);
Log.logWarning("PLASMA", "CrawlResults: URL not in index for crawl result "+ i +" with hash "+ entry.getKey());
urlstr = null;
urltxt = null;
metadata = null;
@ -196,13 +197,13 @@ public class CrawlResults {
urlstr = metadata.url().toNormalform(false, true);
urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
}
initiatorSeed = sb.peers.getConnected(initiatorHash);
executorSeed = sb.peers.getConnected(executorHash);
initiatorSeed = sb.peers.getConnected(entry.getValue().initiatorHash);
executorSeed = sb.peers.getConnected(entry.getValue().executorHash);
prop.put("table_indexed_" + cnt + "_dark", (dark) ? "1" : "0");
prop.put("table_indexed_" + cnt + "_feedbackpage", "CrawlResults.html");
prop.put("table_indexed_" + cnt + "_tabletype", tabletype.getCode());
prop.put("table_indexed_" + cnt + "_urlhash", urlHash);
prop.put("table_indexed_" + cnt + "_urlhash", entry.getKey());
if (showInit) {
prop.put("table_indexed_" + cnt + "_showInit", "1");
@ -239,7 +240,7 @@ public class CrawlResults {
prop.putHTML("table_indexed_" + cnt + "_showTitle_available_nodescr_urldescr", metadata.dc_title());
}
prop.put("table_indexed_" + cnt + "_showTitle_available_urlHash", urlHash);
prop.put("table_indexed_" + cnt + "_showTitle_available_urlHash", entry.getKey());
prop.putHTML("table_indexed_" + cnt + "_showTitle_available_urltitle", urlstr);
} else
prop.put("table_indexed_" + cnt + "_showTitle", "0");
@ -248,7 +249,7 @@ public class CrawlResults {
prop.put("table_indexed_" + cnt + "_showURL", "1");
prop.put("table_indexed_" + cnt + "_showURL_available", "1");
prop.put("table_indexed_" + cnt + "_showURL_available_urlHash", urlHash);
prop.put("table_indexed_" + cnt + "_showURL_available_urlHash", entry.getKey());
prop.putHTML("table_indexed_" + cnt + "_showURL_available_urltitle", urlstr);
prop.put("table_indexed_" + cnt + "_showURL_available_url", urltxt);
} else

@ -155,7 +155,7 @@ public class IndexControlRWIs_p {
// delete word
if (post.containsKey("keyhashdeleteall")) try {
if (delurl || delurlref) {
// generate an urlx array
// generate urlx: an array of url hashes to be deleted
ReferenceContainer<WordReference> index = null;
index = segment.termIndex().get(keyhash, null);
final Iterator<WordReference> en = index.entries();
@ -169,12 +169,14 @@ public class IndexControlRWIs_p {
if (delurlref) {
for (i = 0; i < urlx.length; i++) segment.removeAllUrlReferences(urlx[i], sb.loader, true);
}
// delete the word first because that is much faster than the deletion of the urls from the url database
segment.termIndex().delete(keyhash);
// now delete all urls if demanded
if (delurl || delurlref) {
for (i = 0; i < urlx.length; i++) {
sb.urlRemove(segment, urlx[i]);
}
}
segment.termIndex().delete(keyhash);
post.remove("keyhashdeleteall");
post.put("urllist", "generated");
} catch (IOException e) {

@ -1,4 +1,4 @@
// plasmaCrawlLURL.java
// ResultURLs.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@yacy.net
@ -23,48 +23,43 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
/*
This class provides storage functions for the plasma search engine.
- the url-specific properties, including condenser results
- the text content of the url
Both entities are accessed with a hash, which is based on the MD5
algorithm. The MD5 is not encoded as a hex value, but a b64 value.
*/
package de.anomic.crawler;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.LinkedHashMap;
import java.util.Map;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.util.ReverseMapIterator;
import net.yacy.kelondro.util.ScoreCluster;
import de.anomic.crawler.retrieval.EventOrigin;
import de.anomic.yacy.yacySeedDB;
public final class ResultURLs {
// result stacks;
// these have all entries of form
// strings: urlHash + initiatorHash + ExecutorHash
private final Map<EventOrigin, LinkedList<String>> resultStacks;
private final Map<EventOrigin, LinkedHashMap<String, InitExecEntry>> resultStacks; // a mapping from urlHash to Entries
private final Map<EventOrigin, ScoreCluster<String>> resultDomains;
/**
 * Value object stored per url hash in a result stack: the pair of
 * initiator hash and executor hash that was recorded together with the url.
 *
 * NOTE(review): nothing in this class touches the enclosing instance, so it
 * looks like it could be declared as a static nested class - confirm against
 * all call sites before changing.
 */
public class InitExecEntry {

    /** hash recorded for the initiator of the entry */
    public String initiatorHash;

    /** hash recorded for the executor of the entry */
    public String executorHash;

    /**
     * @param initiatorHash value stored in {@link #initiatorHash}
     * @param executorHash value stored in {@link #executorHash}
     */
    public InitExecEntry(final String initiatorHash, final String executorHash) {
        this.executorHash = executorHash;
        this.initiatorHash = initiatorHash;
    }
}
public ResultURLs(int initialStackCapacity) {
// init result stacks
resultStacks = new HashMap<EventOrigin, LinkedList<String>>(initialStackCapacity);
resultStacks = new HashMap<EventOrigin, LinkedHashMap<String, InitExecEntry>>(initialStackCapacity);
resultDomains = new HashMap<EventOrigin, ScoreCluster<String>>(initialStackCapacity);
for (EventOrigin origin: EventOrigin.values()) {
resultStacks.put(origin, new LinkedList<String>());
resultStacks.put(origin, new LinkedHashMap<String, InitExecEntry>());
resultDomains.put(origin, new ScoreCluster<String>());
}
}
@ -74,9 +69,9 @@ public final class ResultURLs {
assert executorHash != null;
if (e == null) { return; }
try {
final List<String> resultStack = getStack(stackType);
final LinkedHashMap<String, InitExecEntry> resultStack = getStack(stackType);
if (resultStack != null) {
resultStack.add(e.hash() + initiatorHash + executorHash);
resultStack.put(e.hash(), new InitExecEntry(initiatorHash, executorHash));
}
} catch (final Exception ex) {
System.out.println("INTERNAL ERROR in newEntry/2: " + ex.toString());
@ -94,7 +89,7 @@ public final class ResultURLs {
}
public synchronized int getStackSize(final EventOrigin stack) {
final List<String> resultStack = getStack(stack);
final LinkedHashMap<String, InitExecEntry> resultStack = getStack(stack);
if (resultStack == null) return 0;
return resultStack.size();
}
@ -104,77 +99,11 @@ public final class ResultURLs {
if (domains == null) return 0;
return domains.size();
}
/**
 * Reads the url hash of the entry at <em>pos</em> in <em>stack</em>.
 * The url hash is the first of the three hashes packed into a stack entry.
 *
 * @param stack the result stack to read from
 * @param pos position of the entry inside the stack
 * @return whatever {@link #getHashNo} delivers for hash index 0 (may be null)
 */
public synchronized String getUrlHash(final EventOrigin stack, final int pos) {
    final int urlHashIndex = 0;
    return getHashNo(stack, pos, urlHashIndex);
}
/**
 * Reads the initiator hash of the entry at <em>pos</em> in <em>stack</em>.
 * The initiator hash is the second of the three hashes packed into a stack entry.
 *
 * @param stack the result stack to read from
 * @param pos position of the entry inside the stack
 * @return whatever {@link #getHashNo} delivers for hash index 1 (may be null)
 */
public synchronized String getInitiatorHash(final EventOrigin stack, final int pos) {
    final int initiatorHashIndex = 1;
    return getHashNo(stack, pos, initiatorHashIndex);
}
/**
 * Reads the executor hash of the entry at <em>pos</em> in <em>stack</em>.
 * The executor hash is the third of the three hashes packed into a stack entry.
 *
 * @param stack the result stack to read from
 * @param pos position of the entry inside the stack
 * @return whatever {@link #getHashNo} delivers for hash index 2 (may be null)
 */
public synchronized String getExecutorHash(final EventOrigin stack, final int pos) {
    final int executorHashIndex = 2;
    return getHashNo(stack, pos, executorHashIndex);
}
/**
 * Extracts one hash out of the stack entry at <em>pos</em> in <em>stack</em>.
 * An entry is a string of three hashes written one after the other
 * (url hash, initiator hash, executor hash), each
 * {@link Word#commonHashLength} characters long.
 *
 * <p>simplified example with a hash length of 3 and the entry "123456789":</p>
 * <code>getHashNo(stack, pos, 0); // "123"<br/>
 * getHashNo(stack, pos, 1); // "456"<br/>
 * getHashNo(stack, pos, 2); // "789"</code>
 *
 * <p>If the entry is shorter than three hashes an error is logged. In that
 * case the result does not depend on <em>index</em>: everything behind the
 * first two hash lengths is returned, or null if the entry is not longer
 * than two hash lengths.</p>
 *
 * @param stack the result stack to read from
 * @param pos position of the entry inside the stack
 * @param index number of the wanted hash inside the entry, starting at 0
 * @return the requested hash, or null if no entry could be read
 */
public synchronized String getHashNo(final EventOrigin stack, final int pos, final int index) {
    final String packed = getResultStackAt(stack, pos);

    // no entry: only worth an error message if the stack itself exists
    if (packed == null) {
        if (isValidStack(stack)) {
            Log.logSevere("ResultURLs", "unexpected error: result of stack is null: "+ stack +","+ pos);
        }
        return null;
    }

    final int hashLength = Word.commonHashLength;

    // regular case: the entry carries all three hashes
    if (packed.length() >= hashLength * 3) {
        final int from = hashLength * index;
        final int to = hashLength * (index + 1);
        return packed.substring(from, to);
    }

    // truncated entry: report it, then return what is there behind the first two hashes
    Log.logSevere("ResultURLs", "unexpected error: result of stack is too short: "+ packed.length());
    return (packed.length() <= hashLength * 2) ? null : packed.substring(hashLength * 2);
}
/**
* gets the element at pos in stack
*
* @param stack
* @param pos
* @return null if either stack or element do not exist
*/
private String getResultStackAt(final EventOrigin stack, final int pos) {
assert pos >= 0 : "precondition violated: " + pos + " >= 0";
final List<String> resultStack = getStack(stack);
if(resultStack == null) {
return null;
}
assert pos < resultStack.size() : "pos = " + pos + ", resultStack.size() = " + resultStack.size();
if(pos >= resultStack.size()) {
Log.logSevere("ResultURLs", "unexpected error: Index out of Bounds "+ pos +" of "+ resultStack.size());
return null;
}
return resultStack.get(pos);
/**
 * Provides an iterator over the entries (url hash mapped to its
 * initiator/executor pair) of one result stack.
 *
 * @param stack the result stack to iterate
 * @return a {@link ReverseMapIterator} over the stack, or an iterator
 *         without elements if no stack is registered for <em>stack</em>
 */
public synchronized Iterator<Map.Entry<String, InitExecEntry>> results(final EventOrigin stack) {
    final LinkedHashMap<String, InitExecEntry> entries = getStack(stack);
    if (entries != null) {
        return new ReverseMapIterator<String, InitExecEntry>(entries);
    }
    // unknown stack: hand out an iterator over a fresh, empty map
    final LinkedHashMap<String, InitExecEntry> nothing = new LinkedHashMap<String, InitExecEntry>();
    return nothing.entrySet().iterator();
}
/**
@ -187,14 +116,16 @@ public final class ResultURLs {
}
public int deleteDomain(final EventOrigin stack, String host, String hosthash) {
assert host != null : "host = null";
assert hosthash.length() == 6;
int i = 0;
final Iterator<Map.Entry<String, InitExecEntry>> i = results(stack);
Map.Entry<String, InitExecEntry> w;
String urlhash;
while (i < getStackSize(stack)) {
urlhash = getUrlHash(stack, i);
if (urlhash == null || urlhash.substring(6).equals(hosthash)) getStack(stack).remove(i); else i++;
while (i.hasNext()) {
w = i.next();
urlhash = w.getKey();
if (urlhash == null || urlhash.substring(6).equals(hosthash)) i.remove();
}
assert host != null : "host = null";
assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
return getDomains(stack).deleteScore(host);
}
@ -217,33 +148,15 @@ public final class ResultURLs {
* @param stack id of resultStack
* @return null if stack does not exist (id is unknown or stack is null (which should not occur and an error is logged))
*/
private List<String> getStack(final EventOrigin stack) {
private LinkedHashMap<String, InitExecEntry> getStack(final EventOrigin stack) {
return resultStacks.get(stack);
}
/**
 * Looks up the domain score cluster registered for a result stack.
 *
 * @param stack id of the result stack
 * @return the cluster stored in resultDomains for <em>stack</em>, or null if there is none
 */
private ScoreCluster<String> getDomains(final EventOrigin stack) {
    final ScoreCluster<String> domains = resultDomains.get(stack);
    return domains;
}
/**
 * Checks whether a result stack is registered under the id <em>stack</em>.
 *
 * @param stack id of the result stack
 * @return true if {@link #getStack} delivers a non-null stack for this id
 */
private boolean isValidStack(final EventOrigin stack) {
    final boolean registered = getStack(stack) != null;
    return registered;
}
/**
 * Removes the entry at position <em>pos</em> from a result stack.
 *
 * @param stack id of the result stack
 * @param pos position of the entry to remove
 * @return false if no stack is registered for <em>stack</em> or if the
 *         removed element was null, true otherwise
 */
public synchronized boolean removeStack(final EventOrigin stack, final int pos) {
    final List<String> entries = getStack(stack);
    // short-circuit keeps remove(pos) from being called on a missing stack
    return entries != null && entries.remove(pos) != null;
}
public synchronized void clearStack(final EventOrigin stack) {
final List<String> resultStack = getStack(stack);
final LinkedHashMap<String, InitExecEntry> resultStack = getStack(stack);
if (resultStack != null) resultStack.clear();
final ScoreCluster<String> resultDomains = getDomains(stack);
if (resultDomains != null) {
@ -255,15 +168,10 @@ public final class ResultURLs {
public synchronized boolean remove(final String urlHash) {
if (urlHash == null) return false;
String hash;
LinkedHashMap<String, InitExecEntry> resultStack;
for (EventOrigin origin: EventOrigin.values()) {
for (int i = getStackSize(origin) - 1; i >= 0; i--) {
hash = getUrlHash(origin, i);
if (hash != null && hash.equals(urlHash)) {
removeStack(origin, i);
return true;
}
}
resultStack = getStack(origin);
if (resultStack != null) resultStack.remove(urlHash);
}
return true;
}
@ -283,16 +191,6 @@ public final class ResultURLs {
results.stack(urlRef, urlRef.hash(), url.hash(), stackNo);
// size
System.out.println("size of stack:\t"+ results.getStackSize(stackNo));
// get
System.out.println("url hash:\t"+ results.getUrlHash(stackNo, 0));
System.out.println("executor hash:\t"+ results.getExecutorHash(stackNo, 0));
System.out.println("initiator hash:\t"+ results.getInitiatorHash(stackNo, 0));
// test errors
System.out.println("invalid test:\n=======");
// get
System.out.println("url hash:\t"+ results.getUrlHash(stackNo, 1));
System.out.println("executor hash:\t"+ results.getExecutorHash(stackNo, 1));
System.out.println("initiator hash:\t"+ results.getInitiatorHash(stackNo, 1));
} catch (final MalformedURLException e) {
Log.logException(e);
}

@ -0,0 +1,80 @@
// ReverseMapIterator.java
// (C) 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 16.10.2010 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-11-05 21:28:37 +0100 (Do, 05 Nov 2009) $
// $LastChangedRevision: 6458 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.kelondro.util;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
/**
 * Iterator over the entries of a map that walks the keys in the reverse of
 * the order in which the map's key set delivers them.
 *
 * <p>The keys are copied once when the iterator is created; the values are
 * looked up in the map at the moment {@link #next()} is called. Keys added to
 * the map afterwards are not visited, and a key that was removed from the map
 * in the meantime is delivered with a null value.</p>
 *
 * @param <E> key type of the map
 * @param <F> value type of the map
 */
public class ReverseMapIterator <E, F> implements Iterator<Map.Entry<E, F>> {

    /** keys that have not been delivered yet; consumed from the end */
    ArrayList<E> a;
    /** the map this iterator reads from and removes from */
    Map<E, F> map;
    /** key of the entry delivered by the most recent call of next() */
    E last;
    /** true between a call of next() and the matching call of remove() */
    private boolean removable = false;

    /**
     * @param map the map to iterate; its key set is copied immediately
     */
    public ReverseMapIterator(Map<E, F> map) {
        this.map = map;
        this.a = new ArrayList<E>(map.keySet());
    }

    /**
     * @return true as long as there are keys that have not been delivered
     */
    public boolean hasNext() {
        return !a.isEmpty();
    }

    /**
     * Delivers the entry for the last key that has not been delivered yet.
     *
     * @return a detached entry holding the key and the value currently mapped to it
     * @throws NoSuchElementException if all keys have been delivered
     */
    public Map.Entry<E, F> next() {
        if (a.isEmpty()) {
            // the Iterator contract asks for this exception instead of an index error
            throw new NoSuchElementException();
        }
        this.last = a.remove(a.size() - 1);
        this.removable = true;
        return new Entry0(this.last, this.map.get(this.last));
    }

    /**
     * Removes the entry delivered by the most recent call of {@link #next()}
     * from the underlying map.
     *
     * @throws IllegalStateException if next() has not been called yet or
     *         remove() was already called for the current entry
     */
    public void remove() {
        if (!this.removable) {
            throw new IllegalStateException("remove() requires a preceding call of next()");
        }
        this.map.remove(this.last);
        this.removable = false;
    }

    /**
     * Key/value pair handed out by {@link ReverseMapIterator#next()}.
     */
    public class Entry0 implements Map.Entry<E, F> {

        E e;
        F f;

        public Entry0(final E e, final F f) {
            this.e = e;
            this.f = f;
        }

        public E getKey() {
            return this.e;
        }

        public F getValue() {
            return this.f;
        }

        /**
         * Replaces the value of this entry and writes it through to the map,
         * provided the key is still present there (a mapping that has been
         * removed is not brought back).
         *
         * @param value the new value
         * @return the value this entry held before the call
         */
        public F setValue(F value) {
            F f0 = this.f;
            this.f = value;
            if (map.containsKey(this.e)) {
                map.put(this.e, value);
            }
            return f0;
        }
    }
}
Loading…
Cancel
Save