diff --git a/htroot/yacysearchtrailer.html b/htroot/yacysearchtrailer.html index 3966a65fb..62d3d190f 100644 --- a/htroot/yacysearchtrailer.html +++ b/htroot/yacysearchtrailer.html @@ -1,11 +1,8 @@ -#(nav-domains)# -:: +#(nav-domains)#::

Domains

-
-
#(/nav-domains)# #(nav-topics)#::

Topics

@@ -13,6 +10,12 @@
  • #[url]#
  • #{/element}# #(/nav-topics)# +#(nav-authors)#:: +

    Authors

    +
    +#(/nav-authors)#

    Timeline

    diff --git a/htroot/yacysearchtrailer.java b/htroot/yacysearchtrailer.java index 5a315d234..0fc758602 100644 --- a/htroot/yacysearchtrailer.java +++ b/htroot/yacysearchtrailer.java @@ -55,8 +55,9 @@ public class yacysearchtrailer { } final plasmaSearchQuery theQuery = theSearch.getQuery(); - // compose search navigation + + // host navigators ArrayList hostNavigator = theSearch.getHostNavigator(10); if (hostNavigator == null) { prop.put("nav-domains", 0); @@ -66,10 +67,10 @@ public class yacysearchtrailer { int i; for (i = 0; i < hostNavigator.size(); i++) { entry = hostNavigator.get(i); - prop.put("nav-domains_element_" + i + "_url", "" + entry.name + " (" + entry.count + ")"); - prop.putJSON("nav_domains_element_" + i + "_url-json", plasmaSearchQuery.navurl("json", 0, display, theQuery, theQuery.urlMask, "site:" + entry.name, theQuery.navigators)); - prop.put("nav-domains_element_" + i + "_name", entry.name); - prop.put("nav-domains_element_" + i + "_count", entry.count); + prop.put("nav-domains_element_" + i + "_name", entry.name); + prop.put("nav-domains_element_" + i + "_url", "" + entry.name + " (" + entry.count + ")"); + prop.putJSON("nav-domains_element_" + i + "_url-json", plasmaSearchQuery.navurl("json", 0, display, theQuery, theQuery.urlMask, "site:" + entry.name, theQuery.navigators)); + prop.put("nav-domains_element_" + i + "_count", entry.count); prop.put("nav-domains_element_" + i + "_modifier", "site:" + entry.name); prop.put("nav-domains_element_" + i + "_nl", 1); } @@ -95,11 +96,7 @@ public class yacysearchtrailer { prop.put("nav-topics_element_" + i + "_url", "" + e.name + " (" + e.count + ")"); prop.putJSON("nav-topics_element_" + i + "_url-json", plasmaSearchQuery.navurl("json", 0, display, theQuery, theQuery.urlMask, e.name, theQuery.navigators)); prop.put("nav-topics_element_" + i + "_count", e.count); - prop.put("nav-topics_element_" + i + "_offset", "0"); - prop.put("nav-topics_element_" + i + "_display", display); 
prop.put("nav-topics_element_" + i + "_modifier", e.name); - prop.put("nav-topics_element_" + i + "_contentdom", theQuery.contentdom()); - prop.put("nav-topics_element_" + i + "_resource", ((theQuery.isLocal()) ? "local" : "global")); prop.put("nav-topics_element_" + i + "_nl", (iter.hasNext() && i < MAX_TOPWORDS) ? 1 : 0); } if (i++ > MAX_TOPWORDS) { @@ -109,6 +106,29 @@ public class yacysearchtrailer { prop.put("nav-topics_element", i); prop.put("nav-topics", "1"); } + + // author navigators + ArrayList authorNavigator = theSearch.getAuthorNavigator(10); + if (authorNavigator == null) { + prop.put("nav-authors", 0); + } else { + prop.put("nav-authors", 1); + NavigatorEntry entry; + int i; + for (i = 0; i < authorNavigator.size(); i++) { + entry = authorNavigator.get(i); + prop.put("nav-authors_element_" + i + "_name", entry.name); + prop.put("nav-authors_element_" + i + "_url", "" + entry.name + " (" + entry.count + ")"); + prop.putJSON("nav-authors_element_" + i + "_url-json", plasmaSearchQuery.navurl("json", 0, display, theQuery, theQuery.urlMask, "author:'" + entry.name + "'", theQuery.navigators)); + prop.put("nav-authors_element_" + i + "_count", entry.count); + prop.put("nav-authors_element_" + i + "_modifier", "author:'" + entry.name + "'"); + prop.put("nav-authors_element_" + i + "_nl", 1); + } + i--; + prop.put("nav-authors_element_" + i + "_nl", 0); + prop.put("nav-authors_element", authorNavigator.size()); + } + serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(theQuery.id(true), plasmaSearchEvent.FINALIZATION + "-" + "bottomline", 0, 0), false); return prop; diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java index 55d952b44..b0693beea 100644 --- a/source/de/anomic/plasma/parser/pdf/pdfParser.java +++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java @@ -99,6 +99,7 @@ public class pdfParser extends AbstractParser implements Parser { checkInterruption(); // creating a text 
stripper + synchronized (SUPPORTED_MIME_TYPES) { final PDFTextStripper stripper = new PDFTextStripper(); theDocument = parser.getPDDocument(); @@ -168,6 +169,7 @@ public class pdfParser extends AbstractParser implements Parser { } return theDoc; + } } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 198b50e1b..b4c22ad1a 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -586,6 +586,11 @@ public final class plasmaSearchEvent { return this.rankedCache.getTopicNavigator(maxentries); } + public ArrayList getAuthorNavigator(final int maxentries) { + // returns a list of authors so far seen on result set + return this.rankedCache.getAuthorNavigator(maxentries); + } + public ResultEntry oneResult(final int item) { // check if we already retrieved this item (happens if a search // pages is accessed a second time) diff --git a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java index 3e86527f2..7da3d6540 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java @@ -35,11 +35,13 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Set; +import java.util.TreeMap; import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.kelondro.index.BinSearch; +import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.order.Digest; import de.anomic.kelondro.text.Reference; import de.anomic.kelondro.text.ReferenceContainer; @@ -76,8 +78,9 @@ public final class plasmaSearchRankingProcess { private final Segment indexSegment; private HashMap> localSearchInclusion; private final int[] domZones; - 
private final ConcurrentHashMap hostNavigator; + private final ConcurrentHashMap hostNavigator; private final ConcurrentHashMap ref; // reference score computation for the commonSense heuristic + private final TreeMap authorNavigator; public plasmaSearchRankingProcess( final Segment indexSegment, @@ -103,7 +106,8 @@ public final class plasmaSearchRankingProcess { this.indexSegment = indexSegment; this.flagcount = new int[32]; for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;} - this.hostNavigator = new ConcurrentHashMap(); + this.hostNavigator = new ConcurrentHashMap(); + this.authorNavigator = new TreeMap(Base64Order.enhancedCoder); this.ref = new ConcurrentHashMap(); this.domZones = new int[8]; for (int i = 0; i < 8; i++) {this.domZones[i] = 0;} @@ -160,7 +164,7 @@ public final class plasmaSearchRankingProcess { final Iterator i = decodedEntries.iterator(); WordReferenceVars iEntry; Long r; - hoststat hs; + HostInfo hs; String domhash; boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0; while (i.hasNext()) { @@ -208,7 +212,7 @@ public final class plasmaSearchRankingProcess { domhash = iEntry.urlHash.substring(6); hs = this.hostNavigator.get(domhash); if (hs == null) { - this.hostNavigator.put(domhash, new hoststat(iEntry.urlHash)); + this.hostNavigator.put(domhash, new HostInfo(iEntry.urlHash)); } else { hs.inc(); } @@ -321,6 +325,24 @@ public final class plasmaSearchRankingProcess { final URLMetadataRow u = indexSegment.urlMetadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue()); if (u != null) { final URLMetadataRow.Components metadata = u.metadata(); + + // evaluate information of metadata for navigation + // author navigation: + String author = metadata.dc_creator(); + if (author != null && author.length() > 0) { + byte[] authorhash = Word.word2hash(author); + //synchronized (this.authorNavigator) { + AuthorInfo in = this.authorNavigator.get(authorhash); + if (in == null) { + 
this.authorNavigator.put(authorhash, new AuthorInfo(author)); + } else { + in.inc(); + this.authorNavigator.put(authorhash, in); + } + //} + } + + // get the url if (metadata.url() != null) { String urlstring = metadata.url().toNormalform(true, true); if (urlstring == null || !urlstring.matches(query.urlMask)) continue; @@ -383,10 +405,10 @@ public final class plasmaSearchRankingProcess { return this.misses.iterator(); } - public class hoststat { + public class HostInfo { public int count; public String hashsample; - public hoststat(String urlhash) { + public HostInfo(String urlhash) { this.count = 1; this.hashsample = urlhash; } @@ -395,8 +417,28 @@ public final class plasmaSearchRankingProcess { } } - public static final Comparator hscomp = new Comparator() { - public int compare(hoststat o1, hoststat o2) { + public class AuthorInfo { + public int count; + public String author; + public AuthorInfo(String author) { + this.count = 1; + this.author = author; + } + public void inc() { + this.count++; + } + } + + public static final Comparator hscomp = new Comparator() { + public int compare(HostInfo o1, HostInfo o2) { + if (o1.count < o2.count) return 1; + if (o2.count < o1.count) return -1; + return 0; + } + }; + + public static final Comparator aicomp = new Comparator() { + public int compare(AuthorInfo o1, AuthorInfo o2) { if (o1.count < o2.count) return 1; if (o2.count < o1.count) return -1; return 0; @@ -415,7 +457,7 @@ public final class plasmaSearchRankingProcess { public ArrayList getHostNavigator(int count) { if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts") < 0) return new ArrayList(0); - hoststat[] hsa = this.hostNavigator.values().toArray(new hoststat[this.hostNavigator.size()]); + HostInfo[] hsa = this.hostNavigator.values().toArray(new HostInfo[this.hostNavigator.size()]); Arrays.sort(hsa, hscomp); int rc = Math.min(count, hsa.length); ArrayList result = new ArrayList(); @@ -488,6 +530,24 @@ public final class 
plasmaSearchRankingProcess { addTopic(descrcomps); } + public ArrayList getAuthorNavigator(final int count) { + // returns at most 'count' of the authors collected in authorNavigator, ordered by descending + // occurrence count; returns an empty list if query.navigators is neither "all" nor contains "authors" + if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("authors") < 0) return new ArrayList(0); + + AuthorInfo[] a = this.authorNavigator.values().toArray(new AuthorInfo[this.authorNavigator.size()]); + Arrays.sort(a, aicomp); + int rc = Math.min(count, a.length); + ArrayList result = new ArrayList(); + AuthorInfo e; + for (int i = 0; i < rc; i++) { + e = a[i]; + //System.out.println("*** DEBUG Author = " + e.author + ", count = " + e.count); + result.add(new NavigatorEntry(e.author, e.count)); + } + return result; + } + public ReferenceOrder getOrder() { return this.order; }