diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java index 0bea535dd..8f164bc6d 100644 --- a/htroot/CrawlResults.java +++ b/htroot/CrawlResults.java @@ -29,11 +29,13 @@ import java.text.SimpleDateFormat; import java.util.Date; import java.util.Iterator; import java.util.Locale; +import java.util.Map; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; +import de.anomic.crawler.ResultURLs.InitExecEntry; import de.anomic.crawler.retrieval.EventOrigin; import de.anomic.http.server.RequestHeader; import de.anomic.search.Segments; @@ -173,21 +175,20 @@ public class CrawlResults { prop.put("table_showURL", (showURL) ? "1" : "0"); boolean dark = true; - String urlHash, initiatorHash, executorHash; String urlstr, urltxt; yacySeed initiatorSeed, executorSeed; URIMetadataRow urle; URIMetadataRow.Components metadata; - int i, cnt = 0; - for (i = sb.crawlResults.getStackSize(tabletype) - 1; i >= (sb.crawlResults.getStackSize(tabletype) - lines); i--) { - initiatorHash = sb.crawlResults.getInitiatorHash(tabletype, i); - executorHash = sb.crawlResults.getExecutorHash(tabletype, i); - urlHash = sb.crawlResults.getUrlHash(tabletype, i); + int cnt = 0; + Iterator> i = sb.crawlResults.results(tabletype); + Map.Entry entry; + while (i.hasNext()) { + entry = i.next(); try { - urle = sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(urlHash, null, 0); + urle = sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(entry.getKey(), null, 0); if(urle == null) { - Log.logWarning("PLASMA", "CrawlResults: URL not in index for crawl result "+ i +" with hash "+ urlHash); + Log.logWarning("PLASMA", "CrawlResults: URL not in index for crawl result "+ i +" with hash "+ entry.getKey()); urlstr = null; urltxt = null; metadata = null; @@ -196,13 +197,13 @@ public class CrawlResults { urlstr = metadata.url().toNormalform(false, true); urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL } - initiatorSeed = sb.peers.getConnected(initiatorHash); - executorSeed = sb.peers.getConnected(executorHash); + initiatorSeed = sb.peers.getConnected(entry.getValue().initiatorHash); + executorSeed = sb.peers.getConnected(entry.getValue().executorHash); prop.put("table_indexed_" + cnt + "_dark", (dark) ? "1" : "0"); prop.put("table_indexed_" + cnt + "_feedbackpage", "CrawlResults.html"); prop.put("table_indexed_" + cnt + "_tabletype", tabletype.getCode()); - prop.put("table_indexed_" + cnt + "_urlhash", urlHash); + prop.put("table_indexed_" + cnt + "_urlhash", entry.getKey()); if (showInit) { prop.put("table_indexed_" + cnt + "_showInit", "1"); @@ -239,7 +240,7 @@ public class CrawlResults { prop.putHTML("table_indexed_" + cnt + "_showTitle_available_nodescr_urldescr", metadata.dc_title()); } - prop.put("table_indexed_" + cnt + "_showTitle_available_urlHash", urlHash); + prop.put("table_indexed_" + cnt + "_showTitle_available_urlHash", entry.getKey()); prop.putHTML("table_indexed_" + cnt + "_showTitle_available_urltitle", urlstr); } else prop.put("table_indexed_" + cnt + "_showTitle", "0"); @@ -248,7 +249,7 @@ public class CrawlResults { prop.put("table_indexed_" + cnt + "_showURL", "1"); prop.put("table_indexed_" + cnt + "_showURL_available", "1"); - prop.put("table_indexed_" + cnt + "_showURL_available_urlHash", urlHash); + prop.put("table_indexed_" + cnt + "_showURL_available_urlHash", entry.getKey()); prop.putHTML("table_indexed_" + cnt + "_showURL_available_urltitle", urlstr); prop.put("table_indexed_" + cnt + "_showURL_available_url", urltxt); } else diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index 674d4bfbb..c5dd47de9 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -155,7 +155,7 @@ public class IndexControlRWIs_p { // delete word if (post.containsKey("keyhashdeleteall")) try { if (delurl || delurlref) { - // generate an urlx array + // generate urlx: an array of url hashes to be deleted ReferenceContainer index = null; index = segment.termIndex().get(keyhash, null); final Iterator en = index.entries(); @@ -169,12 +169,14 @@ public class IndexControlRWIs_p { if (delurlref) { for (i = 0; i < urlx.length; i++) segment.removeAllUrlReferences(urlx[i], sb.loader, true); } + // delete the word first because that is much faster than the deletion of the urls from the url database + segment.termIndex().delete(keyhash); + // now delete all urls if demanded if (delurl || delurlref) { for (i = 0; i < urlx.length; i++) { sb.urlRemove(segment, urlx[i]); } } - segment.termIndex().delete(keyhash); post.remove("keyhashdeleteall"); post.put("urllist", "generated"); } catch (IOException e) { diff --git a/source/de/anomic/crawler/ResultURLs.java b/source/de/anomic/crawler/ResultURLs.java index e32d2079c..7308b0658 100644 --- a/source/de/anomic/crawler/ResultURLs.java +++ b/source/de/anomic/crawler/ResultURLs.java @@ -1,4 +1,4 @@ -// plasmaCrawlLURL.java +// ResultURLs.java // ----------------------- // part of YaCy // (C) by Michael Peter Christen; mc@yacy.net @@ -23,48 +23,43 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -/* - This class provides storage functions for the plasma search engine. - - the url-specific properties, including condenser results - - the text content of the url - Both entities are accessed with a hash, which is based on the MD5 - algorithm. The MD5 is not encoded as a hex value, but a b64 value. -*/ - package de.anomic.crawler; import java.net.MalformedURLException; import java.util.Date; import java.util.HashMap; import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; +import java.util.LinkedHashMap; import java.util.Map; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; -import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Bitfield; +import net.yacy.kelondro.util.ReverseMapIterator; import net.yacy.kelondro.util.ScoreCluster; import de.anomic.crawler.retrieval.EventOrigin; -import de.anomic.yacy.yacySeedDB; public final class ResultURLs { - // result stacks; - // these have all entries of form - // strings: urlHash + initiatorHash + ExecutorHash - private final Map> resultStacks; + private final Map> resultStacks; // a mapping from urlHash to Entries private final Map> resultDomains; + public class InitExecEntry { + public String initiatorHash, executorHash; + public InitExecEntry(final String initiatorHash, final String executorHash) { + this.initiatorHash = initiatorHash; + this.executorHash = executorHash; + } + } + public ResultURLs(int initialStackCapacity) { // init result stacks - resultStacks = new HashMap>(initialStackCapacity); + resultStacks = new HashMap>(initialStackCapacity); resultDomains = new HashMap>(initialStackCapacity); for (EventOrigin origin: EventOrigin.values()) { - resultStacks.put(origin, new LinkedList()); + resultStacks.put(origin, new LinkedHashMap()); resultDomains.put(origin, new ScoreCluster()); } } @@ -74,9 +69,9 @@ public final class ResultURLs { assert executorHash != null; if (e == null) { return; } try { - final List resultStack = getStack(stackType); + final LinkedHashMap resultStack = getStack(stackType); if (resultStack != null) { - resultStack.add(e.hash() + initiatorHash + executorHash); + resultStack.put(e.hash(), new InitExecEntry(initiatorHash, executorHash)); } } catch (final Exception ex) { System.out.println("INTERNAL ERROR in newEntry/2: " + ex.toString()); @@ -94,7 +89,7 @@ public final class ResultURLs { } public synchronized int getStackSize(final EventOrigin stack) { - final List resultStack = getStack(stack); + final LinkedHashMap resultStack = getStack(stack); if (resultStack == null) return 0; return resultStack.size(); } @@ -104,77 +99,11 @@ public final class ResultURLs { if (domains == null) return 0; return domains.size(); } - - public synchronized String getUrlHash(final EventOrigin stack, final int pos) { - return getHashNo(stack, pos, 0); - } - - public synchronized String getInitiatorHash(final EventOrigin stack, final int pos) { - return getHashNo(stack, pos, 1); - } - - public synchronized String getExecutorHash(final EventOrigin stack, final int pos) { - return getHashNo(stack, pos, 2); - } - /** - * gets the hash at index in element at pos in stack (based on {@link yacySeedDB#commonHashLength}) - * - *

simplified example with {@link yacySeedDB#commonHashLength} = 3:

- * String[][] stacks[1][0] = "123456789"; - * System.out.println(getHashNo(1, 0, 0)); - * System.out.println(getHashNo(1, 0, 0)); - * System.out.println(getHashNo(1, 0, 0)); - *

Output: - * 123
- * 456
- * 789

- * - * @param stack - * @param pos - * @param index starting at 0 - * @return - */ - public synchronized String getHashNo(final EventOrigin stack, final int pos, final int index) { - final String result = getResultStackAt(stack, pos); - if (result != null) { - if (result.length() < Word.commonHashLength * 3) { - Log.logSevere("ResultURLs", "unexpected error: result of stack is too short: "+ result.length()); - if(result.length() <= Word.commonHashLength * 2) { - return null; - } - // return what is there - return result.substring(Word.commonHashLength * 2); - } - return result.substring(Word.commonHashLength * index, Word.commonHashLength * (index + 1)); - } else if(isValidStack(stack)) { - Log.logSevere("ResultURLs", "unexpected error: result of stack is null: "+ stack +","+ pos); - } - return result; - } - - /** - * gets the element at pos in stack - * - * @param stack - * @param pos - * @return null if either stack or element do not exist - */ - private String getResultStackAt(final EventOrigin stack, final int pos) { - assert pos >= 0 : "precondition violated: " + pos + " >= 0"; - - final List resultStack = getStack(stack); - - if(resultStack == null) { - return null; - } - assert pos < resultStack.size() : "pos = " + pos + ", resultStack.size() = " + resultStack.size(); - if(pos >= resultStack.size()) { - Log.logSevere("ResultURLs", "unexpected error: Index out of Bounds "+ pos +" of "+ resultStack.size()); - return null; - } - - return resultStack.get(pos); + public synchronized Iterator> results(final EventOrigin stack) { + final LinkedHashMap resultStack = getStack(stack); + if (resultStack == null) return new LinkedHashMap().entrySet().iterator(); + return new ReverseMapIterator(resultStack); } /** @@ -187,14 +116,16 @@ public final class ResultURLs { } public int deleteDomain(final EventOrigin stack, String host, String hosthash) { + assert host != null : "host = null"; assert hosthash.length() == 6; - int i = 0; + final Iterator> i = results(stack); + Map.Entry w; String urlhash; - while (i < getStackSize(stack)) { - urlhash = getUrlHash(stack, i); - if (urlhash == null || urlhash.substring(6).equals(hosthash)) getStack(stack).remove(i); else i++; + while (i.hasNext()) { + w = i.next(); + urlhash = w.getKey(); + if (urlhash == null || urlhash.substring(6).equals(hosthash)) i.remove(); } - assert host != null : "host = null"; assert getDomains(stack) != null : "getDomains(" + stack + ") = null"; return getDomains(stack).deleteScore(host); } @@ -217,33 +148,15 @@ public final class ResultURLs { * @param stack id of resultStack * @return null if stack does not exist (id is unknown or stack is null (which should not occur and an error is logged)) */ - private List getStack(final EventOrigin stack) { + private LinkedHashMap getStack(final EventOrigin stack) { return resultStacks.get(stack); } private ScoreCluster getDomains(final EventOrigin stack) { return resultDomains.get(stack); } - - /** - * tests if a stack with id stack exists - * - * @param stack - * @return - */ - private boolean isValidStack(final EventOrigin stack) { - return getStack(stack) != null; - } - - public synchronized boolean removeStack(final EventOrigin stack, final int pos) { - final List resultStack = getStack(stack); - if (resultStack == null) { - return false; - } - return resultStack.remove(pos) != null; - } public synchronized void clearStack(final EventOrigin stack) { - final List resultStack = getStack(stack); + final LinkedHashMap resultStack = getStack(stack); if (resultStack != null) resultStack.clear(); final ScoreCluster resultDomains = getDomains(stack); if (resultDomains != null) { @@ -255,15 +168,10 @@ public final class ResultURLs { public synchronized boolean remove(final String urlHash) { if (urlHash == null) return false; - String hash; + LinkedHashMap resultStack; for (EventOrigin origin: EventOrigin.values()) { - for (int i = getStackSize(origin) - 1; i >= 0; i--) { - hash = getUrlHash(origin, i); - if (hash != null && hash.equals(urlHash)) { - removeStack(origin, i); - return true; - } - } + resultStack = getStack(origin); + if (resultStack != null) resultStack.remove(urlHash); } return true; } @@ -283,16 +191,6 @@ public final class ResultURLs { results.stack(urlRef, urlRef.hash(), url.hash(), stackNo); // size System.out.println("size of stack:\t"+ results.getStackSize(stackNo)); - // get - System.out.println("url hash:\t"+ results.getUrlHash(stackNo, 0)); - System.out.println("executor hash:\t"+ results.getExecutorHash(stackNo, 0)); - System.out.println("initiator hash:\t"+ results.getInitiatorHash(stackNo, 0)); - // test errors - System.out.println("invalid test:\n======="); - // get - System.out.println("url hash:\t"+ results.getUrlHash(stackNo, 1)); - System.out.println("executor hash:\t"+ results.getExecutorHash(stackNo, 1)); - System.out.println("initiator hash:\t"+ results.getInitiatorHash(stackNo, 1)); } catch (final MalformedURLException e) { Log.logException(e); } diff --git a/source/net/yacy/kelondro/util/ReverseMapIterator.java b/source/net/yacy/kelondro/util/ReverseMapIterator.java new file mode 100644 index 000000000..6818c68e9 --- /dev/null +++ b/source/net/yacy/kelondro/util/ReverseMapIterator.java @@ -0,0 +1,80 @@ +// ReverseMapIterator.java +// (C) 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 16.10.2010 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2009-11-05 21:28:37 +0100 (Do, 05 Nov 2009) $ +// $LastChangedRevision: 6458 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package net.yacy.kelondro.util; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.Map; + +public class ReverseMapIterator implements Iterator> { + ArrayList a; + Map map; + E last; + + public ReverseMapIterator(Map map) { + this.map = map; + this.a = new ArrayList(); + for (E e: map.keySet()) a.add(e); + } + + public boolean hasNext() { + return a.size() > 0; + } + + public Map.Entry next() { + this.last = a.remove(a.size() - 1); + return new Entry0(this.last, this.map.get(this.last)); + } + + public void remove() { + this.map.remove(this.last); + } + + public class Entry0 implements Map.Entry { + E e; + F f; + public Entry0(final E e, final F f) { + this.e = e; + this.f = f; + } + + public E getKey() { + return this.e; + } + + public F getValue() { + return this.f; + } + + public F setValue(F value) { + F f0 = this.f; + this.f = value; + return f0; + } + + } +}