Redesign of the CrawlResults data structures because of OOM occurrences during URL deletion.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6675 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 15 years ago
parent bab0438fee
commit 564927ce72
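In outline: each per-origin result stack used to be a LinkedList<String> whose entries concatenated urlHash + initiatorHash + executorHash, so deleting one URL (or a whole domain) meant scanning the list and substring-decoding every entry. The commit replaces that with a LinkedHashMap<String, InitExecEntry> keyed by the URL hash. A minimal before/after sketch of the removal path, assuming the names from the diff below (this helper class itself is hypothetical and not part of the commit):

import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;

// Hypothetical sketch; only the names mirror the diff below.
class ResultStackSketch {

    static class InitExecEntry { // shape of the inner class added to ResultURLs
        final String initiatorHash, executorHash;
        InitExecEntry(final String initiatorHash, final String executorHash) {
            this.initiatorHash = initiatorHash;
            this.executorHash = executorHash;
        }
    }

    // before: flat strings "urlHash + initiatorHash + executorHash"; removing
    // one URL scans the whole list and substring-decodes every entry.
    static boolean removeOld(final LinkedList<String> stack, final String urlHash) {
        final Iterator<String> it = stack.iterator();
        while (it.hasNext()) {
            if (it.next().startsWith(urlHash)) { it.remove(); return true; }
        }
        return false;
    }

    // after: the url hash is the map key, so removal is one hash lookup and
    // LinkedHashMap still preserves insertion order for display.
    static boolean removeNew(final LinkedHashMap<String, InitExecEntry> stack, final String urlHash) {
        return stack.remove(urlHash) != null;
    }
}

Besides the cheaper removal, the map form also stops duplicate URL hashes from piling up on a stack, since put() overwrites instead of appending.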

CrawlResults.java
@@ -29,11 +29,13 @@ import java.text.SimpleDateFormat;
 import java.util.Date;
 import java.util.Iterator;
 import java.util.Locale;
+import java.util.Map;
 
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.logging.Log;
 
+import de.anomic.crawler.ResultURLs.InitExecEntry;
 import de.anomic.crawler.retrieval.EventOrigin;
 import de.anomic.http.server.RequestHeader;
 import de.anomic.search.Segments;
@@ -173,21 +175,20 @@ public class CrawlResults {
         prop.put("table_showURL", (showURL) ? "1" : "0");
 
         boolean dark = true;
-        String urlHash, initiatorHash, executorHash;
         String urlstr, urltxt;
         yacySeed initiatorSeed, executorSeed;
         URIMetadataRow urle;
         URIMetadataRow.Components metadata;
 
-        int i, cnt = 0;
-        for (i = sb.crawlResults.getStackSize(tabletype) - 1; i >= (sb.crawlResults.getStackSize(tabletype) - lines); i--) {
-            initiatorHash = sb.crawlResults.getInitiatorHash(tabletype, i);
-            executorHash = sb.crawlResults.getExecutorHash(tabletype, i);
-            urlHash = sb.crawlResults.getUrlHash(tabletype, i);
+        int cnt = 0;
+        Iterator<Map.Entry<String, InitExecEntry>> i = sb.crawlResults.results(tabletype);
+        Map.Entry<String, InitExecEntry> entry;
+        while (i.hasNext()) {
+            entry = i.next();
             try {
-                urle = sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(urlHash, null, 0);
+                urle = sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(entry.getKey(), null, 0);
                 if(urle == null) {
-                    Log.logWarning("PLASMA", "CrawlResults: URL not in index for crawl result "+ i +" with hash "+ urlHash);
+                    Log.logWarning("PLASMA", "CrawlResults: URL not in index for crawl result "+ i +" with hash "+ entry.getKey());
                     urlstr = null;
                     urltxt = null;
                     metadata = null;
@@ -196,13 +197,13 @@ public class CrawlResults {
                     urlstr = metadata.url().toNormalform(false, true);
                     urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
                 }
-                initiatorSeed = sb.peers.getConnected(initiatorHash);
-                executorSeed = sb.peers.getConnected(executorHash);
+                initiatorSeed = sb.peers.getConnected(entry.getValue().initiatorHash);
+                executorSeed = sb.peers.getConnected(entry.getValue().executorHash);
 
                 prop.put("table_indexed_" + cnt + "_dark", (dark) ? "1" : "0");
                 prop.put("table_indexed_" + cnt + "_feedbackpage", "CrawlResults.html");
                 prop.put("table_indexed_" + cnt + "_tabletype", tabletype.getCode());
-                prop.put("table_indexed_" + cnt + "_urlhash", urlHash);
+                prop.put("table_indexed_" + cnt + "_urlhash", entry.getKey());
 
                 if (showInit) {
                     prop.put("table_indexed_" + cnt + "_showInit", "1");
@@ -239,7 +240,7 @@ public class CrawlResults {
 
                         prop.putHTML("table_indexed_" + cnt + "_showTitle_available_nodescr_urldescr", metadata.dc_title());
                     }
-                    prop.put("table_indexed_" + cnt + "_showTitle_available_urlHash", urlHash);
+                    prop.put("table_indexed_" + cnt + "_showTitle_available_urlHash", entry.getKey());
                     prop.putHTML("table_indexed_" + cnt + "_showTitle_available_urltitle", urlstr);
                 } else
                     prop.put("table_indexed_" + cnt + "_showTitle", "0");
@@ -248,7 +249,7 @@ public class CrawlResults {
 
                     prop.put("table_indexed_" + cnt + "_showURL", "1");
                     prop.put("table_indexed_" + cnt + "_showURL_available", "1");
-                    prop.put("table_indexed_" + cnt + "_showURL_available_urlHash", urlHash);
+                    prop.put("table_indexed_" + cnt + "_showURL_available_urlHash", entry.getKey());
                     prop.putHTML("table_indexed_" + cnt + "_showURL_available_urltitle", urlstr);
                     prop.put("table_indexed_" + cnt + "_showURL_available_url", urltxt);
                 } else

IndexControlRWIs_p.java
@@ -155,7 +155,7 @@ public class IndexControlRWIs_p {
             // delete word
             if (post.containsKey("keyhashdeleteall")) try {
                 if (delurl || delurlref) {
-                    // generate an urlx array
+                    // generate urlx: an array of url hashes to be deleted
                     ReferenceContainer<WordReference> index = null;
                     index = segment.termIndex().get(keyhash, null);
                     final Iterator<WordReference> en = index.entries();
@@ -169,12 +169,14 @@ public class IndexControlRWIs_p {
                 if (delurlref) {
                     for (i = 0; i < urlx.length; i++) segment.removeAllUrlReferences(urlx[i], sb.loader, true);
                 }
+                // delete the word first because that is much faster than the deletion of the urls from the url database
+                segment.termIndex().delete(keyhash);
+                // now delete all urls if demanded
                 if (delurl || delurlref) {
                     for (i = 0; i < urlx.length; i++) {
                         sb.urlRemove(segment, urlx[i]);
                     }
                 }
-                segment.termIndex().delete(keyhash);
                 post.remove("keyhashdeleteall");
                 post.put("urllist", "generated");
             } catch (IOException e) {

ResultURLs.java
@@ -1,4 +1,4 @@
-// plasmaCrawlLURL.java
+// ResultURLs.java
 // -----------------------
 // part of YaCy
 // (C) by Michael Peter Christen; mc@yacy.net
@@ -23,48 +23,43 @@
 // along with this program; if not, write to the Free Software
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 
-/*
-   This class provides storage functions for the plasma search engine.
-   - the url-specific properties, including condenser results
-   - the text content of the url
-   Both entities are accessed with a hash, which is based on the MD5
-   algorithm. The MD5 is not encoded as a hex value, but a b64 value.
-*/
-
 package de.anomic.crawler;
 
 import java.net.MalformedURLException;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
+import java.util.LinkedHashMap;
 import java.util.Map;
 
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
-import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.order.Bitfield;
+import net.yacy.kelondro.util.ReverseMapIterator;
 import net.yacy.kelondro.util.ScoreCluster;
 
 import de.anomic.crawler.retrieval.EventOrigin;
-import de.anomic.yacy.yacySeedDB;
 
 public final class ResultURLs {
 
-    // result stacks;
-    // these have all entries of form
-    // strings: urlHash + initiatorHash + ExecutorHash
-    private final Map<EventOrigin, LinkedList<String>> resultStacks;
+    private final Map<EventOrigin, LinkedHashMap<String, InitExecEntry>> resultStacks; // a mapping from urlHash to Entries
    private final Map<EventOrigin, ScoreCluster<String>> resultDomains;
 
+    public class InitExecEntry {
+        public String initiatorHash, executorHash;
+        public InitExecEntry(final String initiatorHash, final String executorHash) {
+            this.initiatorHash = initiatorHash;
+            this.executorHash = executorHash;
+        }
+    }
+
     public ResultURLs(int initialStackCapacity) {
         // init result stacks
-        resultStacks = new HashMap<EventOrigin, LinkedList<String>>(initialStackCapacity);
+        resultStacks = new HashMap<EventOrigin, LinkedHashMap<String, InitExecEntry>>(initialStackCapacity);
         resultDomains = new HashMap<EventOrigin, ScoreCluster<String>>(initialStackCapacity);
         for (EventOrigin origin: EventOrigin.values()) {
-            resultStacks.put(origin, new LinkedList<String>());
+            resultStacks.put(origin, new LinkedHashMap<String, InitExecEntry>());
             resultDomains.put(origin, new ScoreCluster<String>());
         }
     }
@@ -74,9 +69,9 @@ public final class ResultURLs {
         assert executorHash != null;
         if (e == null) { return; }
         try {
-            final List<String> resultStack = getStack(stackType);
+            final LinkedHashMap<String, InitExecEntry> resultStack = getStack(stackType);
             if (resultStack != null) {
-                resultStack.add(e.hash() + initiatorHash + executorHash);
+                resultStack.put(e.hash(), new InitExecEntry(initiatorHash, executorHash));
             }
         } catch (final Exception ex) {
             System.out.println("INTERNAL ERROR in newEntry/2: " + ex.toString());
@@ -94,7 +89,7 @@ public final class ResultURLs {
     }
 
     public synchronized int getStackSize(final EventOrigin stack) {
-        final List<String> resultStack = getStack(stack);
+        final LinkedHashMap<String, InitExecEntry> resultStack = getStack(stack);
         if (resultStack == null) return 0;
         return resultStack.size();
     }
@@ -104,77 +99,11 @@ public final class ResultURLs {
         if (domains == null) return 0;
         return domains.size();
     }
 
-    public synchronized String getUrlHash(final EventOrigin stack, final int pos) {
-        return getHashNo(stack, pos, 0);
-    }
-
-    public synchronized String getInitiatorHash(final EventOrigin stack, final int pos) {
-        return getHashNo(stack, pos, 1);
-    }
-
-    public synchronized String getExecutorHash(final EventOrigin stack, final int pos) {
-        return getHashNo(stack, pos, 2);
-    }
-
-    /**
-     * gets the hash at <em>index</em> in element at <em>pos</em> in <em>stack</em> (based on {@link yacySeedDB#commonHashLength})
-     *
-     * <p>simplified example with {@link yacySeedDB#commonHashLength} = 3:</p>
-     * <code>String[][] stacks[1][0] = "123456789";
-     * System.out.println(getHashNo(1, 0, 0));
-     * System.out.println(getHashNo(1, 0, 0));
-     * System.out.println(getHashNo(1, 0, 0));</code>
-     * <p>Output:
-     * 123<br/>
-     * 456<br/>
-     * 789</p>
-     *
-     * @param stack
-     * @param pos
-     * @param index starting at 0
-     * @return
-     */
-    public synchronized String getHashNo(final EventOrigin stack, final int pos, final int index) {
-        final String result = getResultStackAt(stack, pos);
-        if (result != null) {
-            if (result.length() < Word.commonHashLength * 3) {
-                Log.logSevere("ResultURLs", "unexpected error: result of stack is too short: "+ result.length());
-                if(result.length() <= Word.commonHashLength * 2) {
-                    return null;
-                }
-                // return what is there
-                return result.substring(Word.commonHashLength * 2);
-            }
-            return result.substring(Word.commonHashLength * index, Word.commonHashLength * (index + 1));
-        } else if(isValidStack(stack)) {
-            Log.logSevere("ResultURLs", "unexpected error: result of stack is null: "+ stack +","+ pos);
-        }
-        return result;
-    }
-
-    /**
-     * gets the element at pos in stack
-     *
-     * @param stack
-     * @param pos
-     * @return null if either stack or element do not exist
-     */
-    private String getResultStackAt(final EventOrigin stack, final int pos) {
-        assert pos >= 0 : "precondition violated: " + pos + " >= 0";
-        final List<String> resultStack = getStack(stack);
-        if(resultStack == null) {
-            return null;
-        }
-        assert pos < resultStack.size() : "pos = " + pos + ", resultStack.size() = " + resultStack.size();
-        if(pos >= resultStack.size()) {
-            Log.logSevere("ResultURLs", "unexpected error: Index out of Bounds "+ pos +" of "+ resultStack.size());
-            return null;
-        }
-        return resultStack.get(pos);
-    }
+    public synchronized Iterator<Map.Entry<String, InitExecEntry>> results(final EventOrigin stack) {
+        final LinkedHashMap<String, InitExecEntry> resultStack = getStack(stack);
+        if (resultStack == null) return new LinkedHashMap<String, InitExecEntry>().entrySet().iterator();
+        return new ReverseMapIterator<String, InitExecEntry>(resultStack);
+    }
 
     /**
@@ -187,14 +116,16 @@ public final class ResultURLs {
     }
 
     public int deleteDomain(final EventOrigin stack, String host, String hosthash) {
+        assert host != null : "host = null";
         assert hosthash.length() == 6;
-        int i = 0;
+        final Iterator<Map.Entry<String, InitExecEntry>> i = results(stack);
+        Map.Entry<String, InitExecEntry> w;
         String urlhash;
-        while (i < getStackSize(stack)) {
-            urlhash = getUrlHash(stack, i);
-            if (urlhash == null || urlhash.substring(6).equals(hosthash)) getStack(stack).remove(i); else i++;
+        while (i.hasNext()) {
+            w = i.next();
+            urlhash = w.getKey();
+            if (urlhash == null || urlhash.substring(6).equals(hosthash)) i.remove();
         }
-        assert host != null : "host = null";
         assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
         return getDomains(stack).deleteScore(host);
     }
@@ -217,33 +148,15 @@ public final class ResultURLs {
      * @param stack id of resultStack
      * @return null if stack does not exist (id is unknown or stack is null (which should not occur and an error is logged))
      */
-    private List<String> getStack(final EventOrigin stack) {
+    private LinkedHashMap<String, InitExecEntry> getStack(final EventOrigin stack) {
         return resultStacks.get(stack);
     }
 
     private ScoreCluster<String> getDomains(final EventOrigin stack) {
         return resultDomains.get(stack);
     }
 
-    /**
-     * tests if a stack with id <em>stack</em> exists
-     *
-     * @param stack
-     * @return
-     */
-    private boolean isValidStack(final EventOrigin stack) {
-        return getStack(stack) != null;
-    }
-
-    public synchronized boolean removeStack(final EventOrigin stack, final int pos) {
-        final List<String> resultStack = getStack(stack);
-        if (resultStack == null) {
-            return false;
-        }
-        return resultStack.remove(pos) != null;
-    }
-
     public synchronized void clearStack(final EventOrigin stack) {
-        final List<String> resultStack = getStack(stack);
+        final LinkedHashMap<String, InitExecEntry> resultStack = getStack(stack);
         if (resultStack != null) resultStack.clear();
         final ScoreCluster<String> resultDomains = getDomains(stack);
         if (resultDomains != null) {
@@ -255,15 +168,10 @@ public final class ResultURLs {
 
     public synchronized boolean remove(final String urlHash) {
         if (urlHash == null) return false;
-        String hash;
+        LinkedHashMap<String, InitExecEntry> resultStack;
         for (EventOrigin origin: EventOrigin.values()) {
-            for (int i = getStackSize(origin) - 1; i >= 0; i--) {
-                hash = getUrlHash(origin, i);
-                if (hash != null && hash.equals(urlHash)) {
-                    removeStack(origin, i);
-                    return true;
-                }
-            }
+            resultStack = getStack(origin);
+            if (resultStack != null) resultStack.remove(urlHash);
         }
         return true;
     }
@@ -283,16 +191,6 @@ public final class ResultURLs {
             results.stack(urlRef, urlRef.hash(), url.hash(), stackNo);
             // size
             System.out.println("size of stack:\t"+ results.getStackSize(stackNo));
-            // get
-            System.out.println("url hash:\t"+ results.getUrlHash(stackNo, 0));
-            System.out.println("executor hash:\t"+ results.getExecutorHash(stackNo, 0));
-            System.out.println("initiator hash:\t"+ results.getInitiatorHash(stackNo, 0));
-            // test errors
-            System.out.println("invalid test:\n=======");
-            // get
-            System.out.println("url hash:\t"+ results.getUrlHash(stackNo, 1));
-            System.out.println("executor hash:\t"+ results.getExecutorHash(stackNo, 1));
-            System.out.println("initiator hash:\t"+ results.getInitiatorHash(stackNo, 1));
         } catch (final MalformedURLException e) {
             Log.logException(e);
         }
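For any other callers of ResultURLs outside this diff, the migration is the same as in CrawlResults.java above: the positional accessors getUrlHash/getInitiatorHash/getExecutorHash are gone, and iteration goes through results(). A sketch of the replacement pattern, assuming only the API shown in this diff (the dump helper and its name are illustrative):

import java.util.Iterator;
import java.util.Map;

import de.anomic.crawler.ResultURLs;
import de.anomic.crawler.ResultURLs.InitExecEntry;
import de.anomic.crawler.retrieval.EventOrigin;

class ResultsMigrationSketch {
    // old (removed): for (int i = results.getStackSize(origin) - 1; i >= 0; i--)
    //                    urlHash = results.getUrlHash(origin, i); ...
    // new: results(origin) yields entries newest-first via ReverseMapIterator.
    static void dump(final ResultURLs results, final EventOrigin origin) {
        final Iterator<Map.Entry<String, InitExecEntry>> it = results.results(origin);
        while (it.hasNext()) {
            final Map.Entry<String, InitExecEntry> entry = it.next();
            System.out.println("url hash:\t" + entry.getKey());
            System.out.println("initiator hash:\t" + entry.getValue().initiatorHash);
            System.out.println("executor hash:\t" + entry.getValue().executorHash);
        }
    }
}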

ReverseMapIterator.java (new file)
@@ -0,0 +1,80 @@
+// ReverseMapIterator.java
+// (C) 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 16.10.2010 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate: 2009-11-05 21:28:37 +0100 (Do, 05 Nov 2009) $
+// $LastChangedRevision: 6458 $
+// $LastChangedBy: orbiter $
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package net.yacy.kelondro.util;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.Map;
+
+public class ReverseMapIterator <E, F> implements Iterator<Map.Entry<E, F>> {
+
+    ArrayList<E> a;
+    Map<E, F> map;
+    E last;
+
+    public ReverseMapIterator(Map<E, F> map) {
+        this.map = map;
+        this.a = new ArrayList<E>();
+        for (E e: map.keySet()) a.add(e);
+    }
+
+    public boolean hasNext() {
+        return a.size() > 0;
+    }
+
+    public Map.Entry<E, F> next() {
+        this.last = a.remove(a.size() - 1);
+        return new Entry0(this.last, this.map.get(this.last));
+    }
+
+    public void remove() {
+        this.map.remove(this.last);
+    }
+
+    public class Entry0 implements Map.Entry<E, F> {
+
+        E e;
+        F f;
+
+        public Entry0(final E e, final F f) {
+            this.e = e;
+            this.f = f;
+        }
+
+        public E getKey() {
+            return this.e;
+        }
+
+        public F getValue() {
+            return this.f;
+        }
+
+        public F setValue(F value) {
+            F f0 = this.f;
+            this.f = value;
+            return f0;
+        }
+    }
+}
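Usage note on the new iterator: the constructor snapshots the map's keys into an ArrayList, so iteration walks from the newest insertion backwards without risking a ConcurrentModificationException, and remove() deletes the last returned key from the backing map; keys added after construction are simply not visited. A small self-contained example, assuming only the class above (the demo values are made up):

import java.util.LinkedHashMap;
import java.util.Map;

import net.yacy.kelondro.util.ReverseMapIterator;

class ReverseMapIteratorDemo {
    public static void main(final String[] args) {
        final LinkedHashMap<String, Integer> m = new LinkedHashMap<String, Integer>();
        m.put("oldest", 1);
        m.put("middle", 2);
        m.put("newest", 3);

        // walks entries in reverse insertion order: newest, middle, oldest
        final ReverseMapIterator<String, Integer> it =
                new ReverseMapIterator<String, Integer>(m);
        while (it.hasNext()) {
            final Map.Entry<String, Integer> e = it.next();
            System.out.println(e.getKey() + " = " + e.getValue());
            if (e.getValue().intValue() == 2) it.remove(); // removes "middle" from m
        }
        System.out.println(m.keySet()); // [oldest, newest]
    }
}

The key snapshot costs one ArrayList per iteration, which looks acceptable for the bounded, UI-sized result stacks this commit targets; removing from the tail of that list keeps next() at O(1).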