diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java
index a2af194c9..476f722cd 100644
--- a/htroot/yacy/search.java
+++ b/htroot/yacy/search.java
@@ -31,6 +31,7 @@
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Map;
 import java.util.TreeSet;
@@ -49,6 +50,7 @@
 import de.anomic.http.server.HeaderFramework;
 import de.anomic.http.server.RequestHeader;
 import de.anomic.net.natLib;
 import de.anomic.search.ContentDomain;
+import de.anomic.search.Navigator;
 import de.anomic.search.QueryParams;
 import de.anomic.search.RankingProfile;
 import de.anomic.search.SearchEvent;
@@ -57,7 +59,6 @@
 import de.anomic.search.Segment;
 import de.anomic.search.Segments;
 import de.anomic.search.Switchboard;
 import de.anomic.search.ResultEntry;
-import de.anomic.search.RankingProcess.NavigatorEntry;
 import de.anomic.server.serverCore;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
@@ -336,9 +337,9 @@ public final class search {
 
         // prepare reference hints
         final long timer = System.currentTimeMillis();
-        final ArrayList<NavigatorEntry> ws = theSearch.getTopicNavigator(10);
+        final List<Navigator.Item> ws = theSearch.getTopicNavigator(10);
         final StringBuilder refstr = new StringBuilder(6000);
-        for (NavigatorEntry e: ws) {
+        for (Navigator.Item e: ws) {
             refstr.append(",").append(e.name);
         }
         prop.put("references", (refstr.length() > 0) ? refstr.substring(1) : refstr.toString());
diff --git a/htroot/yacysearch.html b/htroot/yacysearch.html
index f63bfab5a..3381eed47 100644
--- a/htroot/yacysearch.html
+++ b/htroot/yacysearch.html
@@ -53,9 +53,10 @@ $(function() {
   });
   $("#sidebar1").accordion({});
   $("#sidebar2").accordion({});
-  $("#sidebar2").accordion('activate', false);
   $("#sidebar3").accordion({});
   $("#sidebar3").accordion('activate', false);
+  $("#sidebar4").accordion({});
+  $("#sidebar4").accordion('activate', false);
   $("#sidebarAbout").accordion({});
   $("#search").focus();
 });
diff --git a/htroot/yacysearchtrailer.html b/htroot/yacysearchtrailer.html
index e6f868031..f1a8a3de0 100644
--- a/htroot/yacysearchtrailer.html
+++ b/htroot/yacysearchtrailer.html
@@ -8,6 +8,15 @@
+#(nav-namespace)#::
+
+Name Space Navigator
+
+    #{element}#
+  • #[url]#
+  •
+#{/element}#
+#(/nav-namespace)#
 
 #(nav-authors)#::
 
 Author Navigator
 
     #{element}#
@@ -16,7 +25,7 @@
 #(/nav-authors)#
-
+
 #(nav-topics)#::
 
 Topic Navigator
 
     #{element}#
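Note: the trailer templates above use YaCy's servlet placeholder syntax, with "#(key)#...::...#(/key)#" switches, "#{element}#...#{/element}#" loops and "#[field]#" values, which the servlet fills through numbered serverObjects keys. The standalone sketch below only mimics that key layout so the data flow is easier to follow; the class name and the sample namespace data are hypothetical, and the real expansion is done by YaCy's template engine, not by this code.

import java.util.LinkedHashMap;
import java.util.Map;

public class TemplateKeysDemo {
    public static void main(String[] args) {
        // the servlet writes one group of keys per repetition of the #{element}# block
        Map<String, String> prop = new LinkedHashMap<String, String>();
        String[] names = {"Help", "Category", "Talk"};   // hypothetical namespace names
        int[] counts = {12, 7, 3};                       // hypothetical counts
        for (int i = 0; i < names.length; i++) {
            prop.put("nav-namespace_element_" + i + "_name", names[i]);                    // fills #[name]#
            prop.put("nav-namespace_element_" + i + "_count", Integer.toString(counts[i])); // fills #[count]#
            prop.put("nav-namespace_element_" + i + "_modifier", "inurl:" + names[i]);      // fills #[modifier]#
        }
        prop.put("nav-namespace_element", Integer.toString(names.length)); // number of repetitions
        // a renderer would emit the #{element}# block once per index:
        for (int i = 0; i < names.length; i++) {
            System.out.println("• " + prop.get("nav-namespace_element_" + i + "_name")
                    + " (" + prop.get("nav-namespace_element_" + i + "_count") + ")");
        }
    }
}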
diff --git a/htroot/yacysearchtrailer.java b/htroot/yacysearchtrailer.java
index ec00474c8..24ca54b9e 100644
--- a/htroot/yacysearchtrailer.java
+++ b/htroot/yacysearchtrailer.java
@@ -26,14 +26,15 @@
 import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.List;
 
 import net.yacy.kelondro.util.EventTracker;
 
 import de.anomic.http.server.RequestHeader;
+import de.anomic.search.Navigator;
 import de.anomic.search.QueryParams;
 import de.anomic.search.SearchEvent;
 import de.anomic.search.SearchEventCache;
-import de.anomic.search.RankingProcess.NavigatorEntry;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
 import de.anomic.yacy.graphics.ProfilingGraph;
@@ -58,39 +59,61 @@ public class yacysearchtrailer {
         final QueryParams theQuery = theSearch.getQuery();
 
         // compose search navigation
+
+        // namespace navigators
+        ArrayList<Navigator.Item> namespaceNavigator = theSearch.getNamespaceNavigator(10);
+        if (namespaceNavigator == null || namespaceNavigator.isEmpty()) {
+            prop.put("nav-namespace", 0);
+        } else {
+            prop.put("nav-namespace", 1);
+            Navigator.Item entry;
+            int i;
+            for (i = 0; i < Math.min(10, namespaceNavigator.size()); i++) {
+                entry = namespaceNavigator.get(i);
+                prop.put("nav-namespace_element_" + i + "_name", entry.name);
+                prop.put("nav-namespace_element_" + i + "_url", "" + entry.name + " (" + entry.count + ")");
+                prop.putJSON("nav-namespace_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask, "inurl:" + entry.name, theQuery.navigators));
+                prop.put("nav-namespace_element_" + i + "_count", entry.count);
+                prop.put("nav-namespace_element_" + i + "_modifier", "inurl:" + entry.name);
+                prop.put("nav-namespace_element_" + i + "_nl", 1);
+            }
+            i--;
+            prop.put("nav-namespace_element_" + i + "_nl", 0);
+            prop.put("nav-namespace_element", namespaceNavigator.size());
+        }
 
         // host navigators
-        ArrayList<NavigatorEntry> hostNavigator = theSearch.getHostNavigator(10);
+        List<Navigator.Item> hostNavigator = theSearch.getHostNavigator(10);
         if (hostNavigator == null || hostNavigator.isEmpty()) {
-            prop.put("nav-domains", 0);
+            prop.put("nav-domains", 0);
         } else {
-            prop.put("nav-domains", 1);
-            NavigatorEntry entry;
-            int i;
-            for (i = 0; i < hostNavigator.size(); i++) {
-                entry = hostNavigator.get(i);
-                prop.put("nav-domains_element_" + i + "_name", entry.name);
+            prop.put("nav-domains", 1);
+            Navigator.Item entry;
+            int i;
+            for (i = 0; i < Math.min(10, hostNavigator.size()); i++) {
+                entry = hostNavigator.get(i);
+                prop.put("nav-domains_element_" + i + "_name", entry.name);
                 prop.put("nav-domains_element_" + i + "_url", "" + entry.name + " (" + entry.count + ")");
-                prop.putJSON("nav-domains_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask, "site:" + entry.name, theQuery.navigators));
+                prop.putJSON("nav-domains_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.urlMask, "site:" + entry.name, theQuery.navigators));
                 prop.put("nav-domains_element_" + i + "_count", entry.count);
-                prop.put("nav-domains_element_" + i + "_modifier", "site:" + entry.name);
+                prop.put("nav-domains_element_" + i + "_modifier", "site:" + entry.name);
                 prop.put("nav-domains_element_" + i + "_nl", 1);
-            }
-            i--;
-            prop.put("nav-domains_element_" + i + "_nl", 0);
-            prop.put("nav-domains_element", hostNavigator.size());
+            }
+            i--;
+            prop.put("nav-domains_element_" + i + "_nl", 0);
+            prop.put("nav-domains_element", hostNavigator.size());
         }
 
         // author navigators
-        ArrayList<NavigatorEntry> authorNavigator = theSearch.getAuthorNavigator(10);
+        List<Navigator.Item> authorNavigator = theSearch.getAuthorNavigator(10);
         if (authorNavigator == null || authorNavigator.isEmpty()) {
             prop.put("nav-authors", 0);
         } else {
             prop.put("nav-authors", 1);
-            NavigatorEntry entry;
+            Navigator.Item entry;
             int i;
             String anav;
-            for (i = 0; i < authorNavigator.size(); i++) {
+            for (i = 0; i < Math.min(10, authorNavigator.size()); i++) {
                 entry = authorNavigator.get(i);
                 anav = (entry.name.indexOf(' ') < 0) ? "author:" + entry.name : "author:'" + entry.name + "'";
                 prop.put("nav-authors_element_" + i + "_name", entry.name);
@@ -106,15 +129,15 @@
         }
 
         // topics navigator
-        ArrayList<NavigatorEntry> topicNavigator = theSearch.getTopicNavigator(10);
+        List<Navigator.Item> topicNavigator = theSearch.getTopicNavigator(10);
         if (topicNavigator == null || topicNavigator.isEmpty()) {
-            topicNavigator = new ArrayList<NavigatorEntry>();
+            topicNavigator = new ArrayList<Navigator.Item>();
             prop.put("nav-topics", "0");
         } else {
             prop.put("nav-topics", "1");
             int i = 0;
-            NavigatorEntry e;
-            Iterator<NavigatorEntry> iter = topicNavigator.iterator();
+            Navigator.Item e;
+            Iterator<Navigator.Item> iter = topicNavigator.iterator();
             while (iter.hasNext()) {
                 e = iter.next();
                 if (/*(theQuery == null) ||*/ (theQuery.queryString == null)) break;
diff --git a/htroot/yacysearchtrailer.json b/htroot/yacysearchtrailer.json
index 65f394d9f..21e155669 100644
--- a/htroot/yacysearchtrailer.json
+++ b/htroot/yacysearchtrailer.json
@@ -11,7 +11,20 @@
 {"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)#
 #{/element}#
   ]
-  },#(/nav-domains)##(nav-authors)#::
+  },#(/nav-domains)##(nav-namespace)#::
+  {
+    "facetname": "namespace",
+    "displayname": "Name Space",
+    "type": "String",
+    "min": "0",
+    "max": "0",
+    "mean": "0",
+    "elements": [
+#{element}#
+    {"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)#
+#{/element}#
+    ]
+  },#(/nav-namespace)##(nav-authors)#::
   {
     "facetname": "authors",
     "displayname": "Authors",
diff --git a/htroot/yacysearchtrailer.xml b/htroot/yacysearchtrailer.xml
index f9c10cc94..dc2aca388 100644
--- a/htroot/yacysearchtrailer.xml
+++ b/htroot/yacysearchtrailer.xml
@@ -6,6 +6,20 @@
 #{/element}#
 
 #(/nav-domains)#
+#(nav-namespace)#::
+
+#{element}#
+
+#{/element}#
+
+#(/nav-namespace)#
+#(nav-authors)#::
+
+#{element}#
+
+#{/element}#
+
+#(/nav-authors)#
 #(nav-topics)#::
 
 #{element}#
diff --git a/source/de/anomic/data/list/ListAccumulator.java b/source/de/anomic/data/list/ListAccumulator.java
index e73d7b8ad..f11494a7f 100644
--- a/source/de/anomic/data/list/ListAccumulator.java
+++ b/source/de/anomic/data/list/ListAccumulator.java
@@ -65,7 +65,7 @@ public class ListAccumulator {
 
     /**
      * Adds a new entry to a list identified by a given name.
-     * @param name The name of the list the entry is to be added to.
+     * @param key The name of the list the entry is to be added to.
      * @param entry The new entry.
      * @return True if the entry has been added, else false (if list does not exists).
      */
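Note: each navigator entry built in yacysearchtrailer.java above carries a query modifier that is appended to the search when a facet is clicked: "inurl:" for namespaces, "site:" for hosts, and "author:" with single quotes when the author name contains a space. The sketch below restates those three rules outside the servlet; the class name and the sample values are hypothetical, only the quoting rule is taken from the code above.

public class NavigatorModifierDemo {
    static String namespaceModifier(String name) { return "inurl:" + name; }
    static String hostModifier(String host)      { return "site:" + host; }
    static String authorModifier(String author) {
        // authors with spaces are wrapped in single quotes, as in yacysearchtrailer.java
        return (author.indexOf(' ') < 0) ? "author:" + author : "author:'" + author + "'";
    }
    public static void main(String[] args) {
        System.out.println(namespaceModifier("Help"));       // inurl:Help
        System.out.println(hostModifier("example.org"));     // site:example.org
        System.out.println(authorModifier("Jane Doe"));      // author:'Jane Doe'
    }
}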
diff --git a/source/de/anomic/search/DocumentIndex.java b/source/de/anomic/search/DocumentIndex.java
index 09a0079ad..29c50231b 100644
--- a/source/de/anomic/search/DocumentIndex.java
+++ b/source/de/anomic/search/DocumentIndex.java
@@ -29,6 +29,7 @@ package de.anomic.search;
 
 import java.io.File;
 import java.io.IOException;
+import java.net.MalformedURLException;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.concurrent.BlockingQueue;
@@ -54,18 +55,24 @@ public class DocumentIndex extends Segment {
     private static final RankingProfile textRankingDefault = new RankingProfile(ContentDomain.TEXT);
     //private Bitfield zeroConstraint = new Bitfield(4);
     
-    final static File poison = new File(".");
-    BlockingQueue<File> queue;
+    private static DigestURI poison;
+    static {
+        try {
+            poison = new DigestURI("file://.");
+        } catch (MalformedURLException e) {}
+    }
+    BlockingQueue<DigestURI> queue; // a queue of document ID's
     private Worker[] worker;
     CallbackListener callback;
     
     static final ThreadGroup workerThreadGroup = new ThreadGroup("workerThreadGroup");
+    
     public DocumentIndex(final File segmentPath, CallbackListener callback, int cachesize) throws IOException {
         super(new Log("DocumentIndex"), segmentPath, cachesize, targetFileSize * 4 - 1, false, false);
         int cores = Runtime.getRuntime().availableProcessors() + 1;
         this.callback = callback;
-        this.queue = new LinkedBlockingQueue<File>(cores * 300);
+        this.queue = new LinkedBlockingQueue<DigestURI>(cores * 300);
         this.worker = new Worker[cores];
         for (int i = 0; i < cores; i++) {
             this.worker[i] = new Worker(i);
@@ -79,7 +86,7 @@ public class DocumentIndex extends Segment {
         }
         
         public void run() {
-            File f;
+            DigestURI f;
             URIMetadataRow resultRow;
             try {
                 while ((f = queue.take()) != poison) try {
@@ -110,31 +117,24 @@ public class DocumentIndex extends Segment {
         this.queue.clear();
     }
     
-    /**
-     * put a single file into the index
-     * @param file
-     * @return a metadata object that has been generated to identify the file
-     * @throws IOException in case that the file does not exist or cannot be parsed
-     */
-    public URIMetadataRow add(File file) throws IOException {
-        if (file == null) throw new IOException("file = null");
-        if (file.isDirectory()) throw new IOException("file should be a document, not a path");
-        if (!file.canRead()) throw new IOException("cannot read file");
-        DigestURI url = new DigestURI("file:" + file.getAbsolutePath());
-        Document document;
+    public URIMetadataRow add(DigestURI url) throws IOException {
+        if (url == null) throw new IOException("file = null");
+        if (url.isDirectory()) throw new IOException("file should be a document, not a path");
+        if (!url.canRead()) throw new IOException("cannot read file");
+        Document document;
         try {
-            document = TextParser.parseSource(url, null, null, file);
+            document = TextParser.parseSource(url, null, null, url.length(), url.getInputStream());
         } catch (InterruptedException e) {
-            throw new IOException("cannot parse " + file.toString() + ": " + e.getMessage());
+            throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
         } catch (ParserException e) {
-            throw new IOException("cannot parse " + file.toString() + ": " + e.getMessage());
+            throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
         }
         final Condenser condenser = new Condenser(document, true, true);
         return super.storeDocument(
                 url,
                 null,
-                new Date(file.lastModified()),
-                file.length(),
+                new Date(url.lastModified()),
+                url.length(),
                 document,
                 condenser
                 );
@@ -145,7 +145,7 @@ public class DocumentIndex extends Segment {
      * If the given file is a path to a directory, the complete sub-tree is indexed
      * @param start
      */
-    public void addConcurrent(File start) {
+    public void addConcurrent(DigestURI start) {
         assert (start != null);
         assert (start.canRead()) : start.toString();
         if (!start.isDirectory()) {
@@ -155,17 +155,21 @@ public class DocumentIndex extends Segment {
             return;
         }
         String[] s = start.list();
-        File w;
+        DigestURI w;
         for (String t: s) {
-            w = new File(start, t);
-            if (w.canRead() && !w.isHidden()) {
-                if (w.isDirectory()) {
-                    addConcurrent(w);
-                } else {
-                    try {
-                        this.queue.put(w);
-                    } catch (InterruptedException e) {}
+            try {
+                w = new DigestURI(start, t);
+                if (w.canRead() && !w.isHidden()) {
+                    if (w.isDirectory()) {
+                        addConcurrent(w);
+                    } else {
+                        try {
+                            this.queue.put(w);
+                        } catch (InterruptedException e) {}
+                    }
                 }
+            } catch (MalformedURLException e1) {
+                Log.logException(e1);
             }
         }
     }
@@ -177,14 +181,14 @@ public class DocumentIndex extends Segment {
      * @param count
      * @return a list of files that contain the given string
      */
-    public ArrayList<File> find(String querystring, int pos, int count) {
+    public ArrayList<DigestURI> find(String querystring, int pos, int count) {
         ArrayList<URIMetadataRow> result = findMetadata(querystring, this);
-        ArrayList<File> files = new ArrayList<File>();
+        ArrayList<DigestURI> files = new ArrayList<DigestURI>();
         Components metadata;
         for (URIMetadataRow row : result) {
             metadata = row.metadata();
             if (metadata == null) continue;
-            files.add(metadata.url().getLocalFile());
+            files.add(metadata.url());
             count--;
             if (count == 0) break;
         }
@@ -216,7 +220,7 @@ public class DocumentIndex extends Segment {
      * @param querystring
      * @return a list of files that contain the word
      */
-    public ArrayList<File> find(String querystring) {
+    public ArrayList<DigestURI> find(String querystring) {
         return find(querystring, 0, 100);
     }
     
@@ -242,8 +246,8 @@ public class DocumentIndex extends Segment {
     }
     
     public interface CallbackListener {
-        public void commit(File f, URIMetadataRow resultRow);
-        public void fail(File f, String failReason);
+        public void commit(DigestURI f, URIMetadataRow resultRow);
+        public void fail(DigestURI f, String failReason);
     }
     
     public static void main(String[] args) {
@@ -259,16 +263,16 @@ public class DocumentIndex extends Segment {
         File segmentPath = new File(args[0]);
         System.out.println("using index files at " + segmentPath.getAbsolutePath());
         CallbackListener callback = new CallbackListener() {
-            public void commit(File f, URIMetadataRow resultRow) {
+            public void commit(DigestURI f, URIMetadataRow resultRow) {
                 System.out.println("indexed: " + f.toString());
             }
-            public void fail(File f, String failReason) {
+            public void fail(DigestURI f, String failReason) {
                 System.out.println("not indexed " + f.toString() + ": " + failReason);
             }
         };
         try {
             if (args[1].equals("add")) {
-                File f = new File(args[2]);
+                DigestURI f = new DigestURI(args[2]);
                 DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
                 di.addConcurrent(f);
                 di.close();
@@ -277,8 +281,8 @@ public class DocumentIndex extends Segment {
                 for (int i = 2; i < args.length; i++) query += args[i];
                 query.trim();
                 DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
-                ArrayList<File> results = di.find(query);
-                for (File f: results) {
+                ArrayList<DigestURI> results = di.find(query);
+                for (DigestURI f: results) {
                     if (f != null) System.out.println(f.toString());
                 }
                 di.close();
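Note: with the change above, DocumentIndex addresses documents by DigestURI instead of java.io.File. The following hedged usage sketch mirrors the main() method shown in the diff; the constructor, addConcurrent, find, close and CallbackListener signatures are the ones from the diff, while the package names in the imports, the segment path and the start URL are assumptions made only for illustration.

import java.io.File;
import java.util.ArrayList;

import net.yacy.kelondro.data.meta.DigestURI;       // assumed package location
import net.yacy.kelondro.data.meta.URIMetadataRow;  // assumed package location
import de.anomic.search.DocumentIndex;

public class DocumentIndexDemo {
    public static void main(String[] args) throws Exception {
        File segmentPath = new File("/tmp/yacy-segment");   // hypothetical index location
        DocumentIndex.CallbackListener callback = new DocumentIndex.CallbackListener() {
            public void commit(DigestURI f, URIMetadataRow resultRow) {
                System.out.println("indexed: " + f.toString());
            }
            public void fail(DigestURI f, String failReason) {
                System.out.println("not indexed " + f.toString() + ": " + failReason);
            }
        };
        DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000);
        // index a local directory tree; documents are queued and parsed by worker threads
        di.addConcurrent(new DigestURI("file:///home/user/docs"));  // hypothetical start URL
        // query the index; results are now DigestURI objects, not File objects
        ArrayList<DigestURI> hits = di.find("yacy");
        for (DigestURI u : hits) System.out.println(u.toString());
        di.close();
    }
}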
diff --git a/source/de/anomic/search/Navigator.java b/source/de/anomic/search/Navigator.java
new file mode 100644
index 000000000..c951f0e3c
--- /dev/null
+++ b/source/de/anomic/search/Navigator.java
@@ -0,0 +1,97 @@
+// Navigator.java
+// (C) 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 05.03.2010 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate: 2010-01-29 16:59:24 +0100 (Fr, 29 Jan 2010) $
+// $LastChangedRevision: 6630 $
+// $LastChangedBy: orbiter $
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+package de.anomic.search;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+public class Navigator {
+
+    private ConcurrentHashMap<String, Item> map;
+
+    public Navigator() {
+        this.map = new ConcurrentHashMap<String, Item>();
+    }
+
+    /**
+     * a reverse comparator for navigator items
+     */
+    public static final Comparator<Item> itemComp = new Comparator<Item>() {
+        public int compare(Item o1, Item o2) {
+            if (o1.count < o2.count) return 1;
+            if (o2.count < o1.count) return -1;
+            return 0;
+        }
+    };
+
+    public void inc(String key, String name) {
+        Item item = map.get(key);
+        if (item == null) {
+            map.put(key, new Item(name));
+        } else {
+            item.inc();
+        }
+    }
+
+    public Map<String, Item> map() {
+        return this.map;
+    }
+
+    public Item[] entries() {
+        Item[] ii = this.map.values().toArray(new Item[this.map.size()]);
+        Arrays.sort(ii, itemComp);
+        return ii;
+    }
+
+    public List<Item> entries(int maxcount) {
+        Item[] ii = entries();
+        int c = Math.min(ii.length, maxcount);
+        ArrayList<Item> a = new ArrayList<Item>(c);
+        for (int i = 0; i < c; i++) a.add(ii[i]);
+        return a;
+    }
+
+    public static class Item {
+        public int count;
+        public String name;
+        public Item(String name) {
+            this.count = 1;
+            this.name = name;
+        }
+        public Item(String name, int count) {
+            this.count = count;
+            this.name = name;
+        }
+        public void inc() {
+            this.count++;
+        }
+    }
+}
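Note: the new Navigator class above replaces the per-facet HostInfo/AuthorInfo counters in RankingProcess with one generic counter. A minimal usage sketch follows: inc(key, name) creates an Item with count 1 or increments an existing one, and entries(maxcount) returns the items sorted by descending count. The sample keys are hypothetical.

import java.util.List;

import de.anomic.search.Navigator;

public class NavigatorDemo {
    public static void main(String[] args) {
        Navigator nav = new Navigator();
        nav.inc("example.org", "example.org");   // first call stores a new Item with count 1
        nav.inc("example.org", "example.org");   // second call increments it to 2
        nav.inc("yacy.net", "yacy.net");
        // entries(10) returns at most 10 items, largest count first
        List<Navigator.Item> top = nav.entries(10);
        for (Navigator.Item item : top) {
            System.out.println(item.name + " (" + item.count + ")");
        }
    }
}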
diff --git a/source/de/anomic/search/RankingProcess.java b/source/de/anomic/search/RankingProcess.java
index 00793af87..84a62e9b6 100644
--- a/source/de/anomic/search/RankingProcess.java
+++ b/source/de/anomic/search/RankingProcess.java
@@ -1,4 +1,4 @@
-// plasmaSearchRankingProcess.java
+// RankingProcess.java
 // (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
 // first published 07.11.2007 on http://yacy.net
 //
@@ -29,12 +29,12 @@ package de.anomic.search;
 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Comparator;
 import java.util.ConcurrentModificationException;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Map;
 import java.util.TreeSet;
 import java.util.concurrent.BlockingQueue;
@@ -81,9 +81,10 @@ public final class RankingProcess extends Thread {
     private final ConcurrentHashMap<String, SortStack<WordReferenceVars>> doubleDomCache; // key = domhash (6 bytes); value = like stack
     private final HashSet<String> handover; // key = urlhash; used for double-check of urls that had been handed over to search process
-    private final ConcurrentHashMap<String, Integer> ref; // reference score computation for the commonSense heuristic
-    private final ConcurrentHashMap<String, HostInfo> hostNavigator;
-    private final ConcurrentHashMap<String, AuthorInfo> authorNavigator;
+    private final Navigator ref; // reference score computation for the commonSense heuristic
+    private final Navigator hostNavigator;
+    private final Navigator authorNavigator;
+    private final Navigator namespaceNavigator;
     private final ReferenceOrder order;
     
     public RankingProcess(final QueryParams query, final ReferenceOrder order, final int maxentries, final int concurrency) {
@@ -106,9 +107,10 @@
         this.misses = new TreeSet<String>();
         this.flagcount = new int[32];
         for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;}
-        this.hostNavigator = new ConcurrentHashMap<String, HostInfo>();
-        this.authorNavigator = new ConcurrentHashMap<String, AuthorInfo>();
-        this.ref = new ConcurrentHashMap<String, Integer>();
+        this.hostNavigator = new Navigator();
+        this.authorNavigator = new Navigator();
+        this.namespaceNavigator = new Navigator();
+        this.ref = new Navigator();
         //this.domZones = new int[8];
         //for (int i = 0; i < 8; i++) {this.domZones[i] = 0;}
         this.feeders = concurrency;
@@ -171,8 +173,7 @@
         EventTracker.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), SearchEvent.NORMALIZING, index.size(), System.currentTimeMillis() - timer), false, 30000, ProfilingGraph.maxTime);
 
         // iterate over normalized entries and select some that are better than currently stored
-        timer = System.currentTimeMillis();
-        HostInfo hs;
+        timer = System.currentTimeMillis();
         String domhash;
         boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0;
         Long r;
@@ -221,12 +222,7 @@
                 // get statistics for host navigator
                 if (nav_hosts) {
                     domhash = iEntry.urlHash.substring(6);
-                    hs = this.hostNavigator.get(domhash);
-                    if (hs == null) {
-                        this.hostNavigator.put(domhash, new HostInfo(iEntry.urlHash));
-                    } else {
-                        hs.inc();
-                    }
+                    this.hostNavigator.inc(domhash, iEntry.urlHash);
                 }
                 
                 // accept
@@ -377,6 +373,7 @@
     public URIMetadataRow takeURL(final boolean skipDoubleDom, final int timeout) {
         // returns from the current RWI list the best URL entry and removes this entry from the list
        long timeLimit = System.currentTimeMillis() + timeout;
+        int p = -1;
        while (System.currentTimeMillis() < timeLimit) {
            final SortStack.stackElement obrwi = takeRWI(skipDoubleDom);
            if (obrwi == null) {
@@ -444,17 +441,22 @@
                    }
                    
                    // add author to the author navigator
-                    AuthorInfo in = this.authorNavigator.get(authorhash);
-                    if (in == null) {
-                        this.authorNavigator.put(authorhash, new AuthorInfo(pageauthor));
-                    } else {
-                        in.inc();
-                        this.authorNavigator.put(authorhash, in);
-                    }
+                    this.authorNavigator.inc(authorhash, pageauthor);
                 } else if (this.query.authorhash != null) {
                     continue;
                 }
                 
+                // namespace navigation
+                String pagepath = metadata.url().getPath();
+                if ((p = pagepath.indexOf(':')) >= 0) {
+                    pagepath = pagepath.substring(0,p);
+                    p = pagepath.lastIndexOf('/');
+                    if (p >= 0) {
+                        pagepath = pagepath.substring(p + 1);
+                        this.namespaceNavigator.inc(pagepath, pagepath);
+                    }
+                }
+                
                 // accept url
                 //System.out.println("handing over hash " + page.hash());
                 this.handover.add(page.hash()); // remember that we handed over this url
@@ -525,68 +527,32 @@
         return this.misses.iterator();
     }
     
-    public class HostInfo {
-        public int count;
-        public String hashsample;
-        public HostInfo(String urlhash) {
-            this.count = 1;
-            this.hashsample = urlhash;
-        }
-        public void inc() {
-            this.count++;
-        }
-    }
-    
-    public class AuthorInfo {
-        public int count;
-        public String author;
-        public AuthorInfo(String author) {
-            this.count = 1;
-            this.author = author;
-        }
-        public void inc() {
-            this.count++;
-        }
+    public ArrayList<Navigator.Item> getNamespaceNavigator(int count) {
+        if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("namespace") < 0) return new ArrayList<Navigator.Item>(0);
+        
+        Navigator.Item[] hsa = this.namespaceNavigator.entries();
+        int rc = Math.min(count, hsa.length);
+        ArrayList<Navigator.Item> result = new ArrayList<Navigator.Item>();
+        for (int i = 0; i < rc; i++) result.add(hsa[i]);
+        return result;
     }
     
-    public static final Comparator<HostInfo> hscomp = new Comparator<HostInfo>() {
-        public int compare(HostInfo o1, HostInfo o2) {
-            if (o1.count < o2.count) return 1;
-            if (o2.count < o1.count) return -1;
-            return 0;
-        }
-    };
-    
-    public static final Comparator<AuthorInfo> aicomp = new Comparator<AuthorInfo>() {
-        public int compare(AuthorInfo o1, AuthorInfo o2) {
-            if (o1.count < o2.count) return 1;
-            if (o2.count < o1.count) return -1;
-            return 0;
-        }
-    };
-    
-    public class NavigatorEntry {
-        public int count;
-        public String name;
-        public NavigatorEntry(String name, int count) {
-            this.name = name;
-            this.count = count;
-        }
+    public List<Navigator.Item> getHostNavigators(int count) {
+        if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts") < 0) return new ArrayList<Navigator.Item>(0);
+        
+        return this.hostNavigator.entries(10);
     }
-    
-    public ArrayList<NavigatorEntry> getHostNavigator(int count) {
-        if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts") < 0) return new ArrayList<NavigatorEntry>(0);
+    public List<Navigator.Item> getHostNavigator(int count) {
+        List<Navigator.Item> result = new ArrayList<Navigator.Item>();
+        if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts") < 0) return result;
         
-        HostInfo[] hsa = this.hostNavigator.values().toArray(new HostInfo[this.hostNavigator.size()]);
-        Arrays.sort(hsa, hscomp);
-        int rc = Math.min(count, hsa.length);
-        ArrayList<NavigatorEntry> result = new ArrayList<NavigatorEntry>();
+        List<Navigator.Item> hsa = this.hostNavigator.entries(10);
         URIMetadataRow mr;
         DigestURI url;
         String hostname;
         Components metadata;
-        loop: for (int i = 0; i < rc; i++) {
-            mr = this.query.getSegment().urlMetadata().load(hsa[i].hashsample, null, 0);
+        loop: for (Navigator.Item item: hsa) {
+            mr = this.query.getSegment().urlMetadata().load(item.name, null, 0);
             if (mr == null) continue;
             metadata = mr.metadata();
             if (metadata == null) continue;
@@ -595,12 +561,13 @@
             hostname = url.getHost();
             if (hostname == null) continue;
             if (query.tenant != null && !hostname.contains(query.tenant) && !url.toNormalform(true, true).contains(query.tenant)) continue;
-            for (NavigatorEntry entry: result) if (entry.name.equals(hostname)) continue loop; // check if one entry already exists
-            result.add(new NavigatorEntry(hostname, hsa[i].count));
+            for (Navigator.Item entry: result) if (entry.name.equals(hostname)) continue loop; // check if one entry already exists
+            result.add(new Navigator.Item(hostname, item.count));
         }
         return result;
     }
+    
     public static final Comparator<Map.Entry<String, Integer>> mecomp = new Comparator<Map.Entry<String, Integer>>() {
         public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
             if (o1.getValue().intValue() < o2.getValue().intValue()) return 1;
@@ -609,44 +576,29 @@
         }
     };
     
-    public Map<String, Integer> getTopics() {
-        return this.ref;
+    public Map<String, Navigator.Item> getTopics() {
+        return this.ref.map();
     }
     
-    @SuppressWarnings("unchecked")
-    public ArrayList<NavigatorEntry> getTopicNavigator(final int count) {
+    public List<Navigator.Item> getTopicNavigator(final int count) {
         // create a list of words that had been computed by statistics over all
         // words that appeared in the url or the description of all urls
-        if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("topics") < 0) return new ArrayList<NavigatorEntry>(0);
+        if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("topics") < 0) return new ArrayList<Navigator.Item>(0);
         
-        Map.Entry<String, Integer>[] a = this.ref.entrySet().toArray(new Map.Entry[this.ref.size()]);
-        Arrays.sort(a, mecomp);
-        int rc = Math.min(count, a.length);
-        ArrayList<NavigatorEntry> result = new ArrayList<NavigatorEntry>();
-        Map.Entry<String, Integer> e;
-        int c;
-        for (int i = 0; i < rc; i++) {
-            e = a[i];
-            c = e.getValue().intValue();
-            if (c == 0) break;
-            result.add(new NavigatorEntry(e.getKey(), c));
-        }
-        return result;
+        return this.ref.entries(10);
     }
     
     public void addTopic(final String[] words) {
         String word;
         for (int i = 0; i < words.length; i++) {
             word = words[i].toLowerCase();
-            Integer c;
             if (word.length() > 2 &&
                 "http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0 &&
                 !query.queryHashes.contains(Word.word2hash(word)) &&
                 word.matches("[a-z]+") &&
                 !Switchboard.badwords.contains(word) &&
                 !Switchboard.stopwords.contains(word)) {
-                c = ref.get(word);
-                if (c == null) ref.put(word, 1); else ref.put(word, c.intValue() + 1);
+                ref.inc(word, word);
             }
         }
     }
@@ -662,22 +614,12 @@
         addTopic(descrcomps);
     }
     
-    public ArrayList<NavigatorEntry> getAuthorNavigator(final int count) {
+    public List<Navigator.Item> getAuthorNavigator(final int count) {
         // create a list of words that had been computed by statistics over all
         // words that appeared in the url or the description of all urls
-        if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("authors") < 0) return new ArrayList<NavigatorEntry>(0);
+        if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("authors") < 0) return new ArrayList<Navigator.Item>(0);
        
-        AuthorInfo[] a = this.authorNavigator.values().toArray(new AuthorInfo[this.authorNavigator.size()]);
-        Arrays.sort(a, aicomp);
-        int rc = Math.min(count, a.length);
-        ArrayList<NavigatorEntry> result = new ArrayList<NavigatorEntry>();
-        AuthorInfo e;
-        for (int i = 0; i < rc; i++) {
-            e = a[i];
-            //System.out.println("*** DEBUG Author = " + e.author + ", count = " + e.count);
-            result.add(new NavigatorEntry(e.author, e.count));
-        }
-        return result;
+        return this.authorNavigator.entries(count);
     }
     
     public static void loadYBR(final File rankingPath, final int count) {
@@ -733,12 +675,3 @@
     }
 
 }
-/*
-Thread= Thread-937 id=4224 BLOCKED
-Thread= Thread-919 id=4206 BLOCKED
-Thread= Thread-936 id=4223 BLOCKED
-at net.yacy.kelondro.util.SortStack.pop(SortStack.java:118)
-at de.anomic.search.RankingProcess.takeRWI(RankingProcess.java:310)
-at de.anomic.search.RankingProcess.takeURL(RankingProcess.java:371)
-at de.anomic.search.ResultFetcher$Worker.run(ResultFetcher.java:161)
-*/
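Note: the namespace navigation added to takeURL() above derives a wiki-style namespace from the URL path. Everything before the first colon is kept, and the last path segment of that prefix becomes the namespace, so a path like "/wiki/Help:Contents" is counted under "Help" while paths without a colon are skipped. The sketch below is a standalone restatement of that logic with a hypothetical class name.

public class NamespaceExtractDemo {
    static String namespaceOf(String pagepath) {
        int p = pagepath.indexOf(':');
        if (p < 0) return null;                  // no namespace marker in the path
        pagepath = pagepath.substring(0, p);     // keep the part before the colon
        p = pagepath.lastIndexOf('/');
        if (p < 0) return null;                  // colon not inside a path segment
        return pagepath.substring(p + 1);        // last path element before the colon
    }
    public static void main(String[] args) {
        System.out.println(namespaceOf("/wiki/Help:Contents"));   // Help
        System.out.println(namespaceOf("/index.html"));           // null
    }
}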
diff --git a/source/de/anomic/search/ResultFetcher.java b/source/de/anomic/search/ResultFetcher.java
index 3996c7365..521e19bc0 100644
--- a/source/de/anomic/search/ResultFetcher.java
+++ b/source/de/anomic/search/ResultFetcher.java
@@ -44,7 +44,6 @@ import net.yacy.kelondro.util.SetTools;
 import net.yacy.kelondro.util.SortStack;
 import net.yacy.kelondro.util.SortStore;
 
-import de.anomic.search.RankingProcess.NavigatorEntry;
 import de.anomic.search.MediaSnippet;
 import de.anomic.yacy.yacySeedDB;
 import de.anomic.yacy.graphics.ProfilingGraph;
@@ -260,20 +259,6 @@ public class ResultFetcher {
         Log.logInfo("search", "sorted out hash " + urlhash + " during search: " + reason);
     }
 
-    public ArrayList<NavigatorEntry> getHostNavigator(int maxentries) {
-        return this.rankedCache.getHostNavigator(maxentries);
-    }
-
-    public ArrayList<NavigatorEntry> getTopicNavigator(final int maxentries) {
-        // returns a set of words that are computed as toplist
-        return this.rankedCache.getTopicNavigator(maxentries);
-    }
-
-    public ArrayList<NavigatorEntry> getAuthorNavigator(final int maxentries) {
-        // returns a list of authors so far seen on result set
-        return this.rankedCache.getAuthorNavigator(maxentries);
-    }
-
     public int resultCount() {
         return this.result.size();
     }
@@ -357,7 +342,7 @@ public class ResultFetcher {
 
     public long postRanking(
             final ResultEntry rentry,
-            final Map<String, Integer> topwords) {
+            final Map<String, Navigator.Item> topwords) {
 
         long r = 0;
 
@@ -375,14 +360,14 @@ public class ResultFetcher {
         final String urlstring = rentry.url().toNormalform(true, true);
         final String[] urlcomps = DigestURI.urlComps(urlstring);
         final String[] descrcomps = DigestURI.splitpattern.split(rentry.title().toLowerCase());
-        Integer tc;
+        Navigator.Item tc;
         for (int j = 0; j < urlcomps.length; j++) {
             tc = topwords.get(urlcomps[j]);
-            if (tc != null) r += Math.max(1, tc.intValue()) << query.ranking.coeff_urlcompintoplist;
+            if (tc != null) r += Math.max(1, tc.count) << query.ranking.coeff_urlcompintoplist;
         }
         for (int j = 0; j < descrcomps.length; j++) {
             tc = topwords.get(descrcomps[j]);
-            if (tc != null) r += Math.max(1, tc) << query.ranking.coeff_descrcompintoplist;
+            if (tc != null) r += Math.max(1, tc.count) << query.ranking.coeff_descrcompintoplist;
         }
 
         // apply query-in-result matching
diff --git a/source/de/anomic/search/SearchEvent.java b/source/de/anomic/search/SearchEvent.java
index 17776226f..466359009 100644
--- a/source/de/anomic/search/SearchEvent.java
+++ b/source/de/anomic/search/SearchEvent.java
@@ -29,6 +29,7 @@ package de.anomic.search;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Map;
 import java.util.TreeMap;
 import java.util.TreeSet;
@@ -43,7 +44,6 @@ import net.yacy.kelondro.util.MemoryControl;
 import net.yacy.kelondro.util.SetTools;
 
 import de.anomic.crawler.ResultURLs;
-import de.anomic.search.RankingProcess.NavigatorEntry;
 import de.anomic.yacy.yacySearch;
 import de.anomic.yacy.yacySeedDB;
 import de.anomic.yacy.dht.FlatWordPartitionScheme;
@@ -289,16 +289,20 @@ public final class SearchEvent {
         return this.rankedCache;
     }
     
-    public ArrayList<NavigatorEntry> getHostNavigator(int maxentries) {
-        return this.rankedCache.getHostNavigator(maxentries);
+    public ArrayList<Navigator.Item> getNamespaceNavigator(int maxentries) {
+        return this.rankedCache.getNamespaceNavigator(maxentries);
     }
     
-    public ArrayList<NavigatorEntry> getTopicNavigator(final int maxentries) {
+    public List<Navigator.Item> getHostNavigator(int maxentries) {
+        return this.rankedCache.getHostNavigator(maxentries);
+    }
+    
+    public List<Navigator.Item> getTopicNavigator(final int maxentries) {
         // returns a set of words that are computed as toplist
         return this.rankedCache.getTopicNavigator(maxentries);
     }
     
-    public ArrayList<NavigatorEntry> getAuthorNavigator(final int maxentries) {
+    public List<Navigator.Item> getAuthorNavigator(final int maxentries) {
         // returns a list of authors so far seen on result set
         return this.rankedCache.getAuthorNavigator(maxentries);
     }
diff --git a/source/net/yacy/kelondro/blob/BEncodedHeap.java b/source/net/yacy/kelondro/blob/BEncodedHeap.java
index 849370b38..0436a381d 100644
--- a/source/net/yacy/kelondro/blob/BEncodedHeap.java
+++ b/source/net/yacy/kelondro/blob/BEncodedHeap.java
@@ -105,7 +105,7 @@ public class BEncodedHeap implements Iterable