diff --git a/htroot/yacysearchtrailer.html b/htroot/yacysearchtrailer.html
index 3966a65fb..62d3d190f 100644
--- a/htroot/yacysearchtrailer.html
+++ b/htroot/yacysearchtrailer.html
@@ -1,11 +1,8 @@
-#(nav-domains)#
-::
+#(nav-domains)#::
Domains
-
-
#{element}#
+#{element}#
- #[url]#
-#{/element}#
-
+#{/element}#
#(/nav-domains)#
#(nav-topics)#::
Topics
@@ -13,6 +10,12 @@
#[url]#
#{/element}#
#(/nav-topics)#
+#(nav-authors)#::
+Authors
+#{element}#
+- #[url]#
+#{/element}#
+#(/nav-authors)#
Timeline
diff --git a/htroot/yacysearchtrailer.java b/htroot/yacysearchtrailer.java
index 5a315d234..0fc758602 100644
--- a/htroot/yacysearchtrailer.java
+++ b/htroot/yacysearchtrailer.java
@@ -55,8 +55,9 @@ public class yacysearchtrailer {
}
final plasmaSearchQuery theQuery = theSearch.getQuery();
-
// compose search navigation
+
+ // host navigators
ArrayList
hostNavigator = theSearch.getHostNavigator(10);
if (hostNavigator == null) {
prop.put("nav-domains", 0);
@@ -66,10 +67,10 @@ public class yacysearchtrailer {
int i;
for (i = 0; i < hostNavigator.size(); i++) {
entry = hostNavigator.get(i);
- prop.put("nav-domains_element_" + i + "_url", "" + entry.name + " (" + entry.count + ")");
- prop.putJSON("nav_domains_element_" + i + "_url-json", plasmaSearchQuery.navurl("json", 0, display, theQuery, theQuery.urlMask, "site:" + entry.name, theQuery.navigators));
- prop.put("nav-domains_element_" + i + "_name", entry.name);
- prop.put("nav-domains_element_" + i + "_count", entry.count);
+ prop.put("nav-domains_element_" + i + "_name", entry.name);
+ prop.put("nav-domains_element_" + i + "_url", "" + entry.name + " (" + entry.count + ")");
+ prop.putJSON("nav-domains_element_" + i + "_url-json", plasmaSearchQuery.navurl("json", 0, display, theQuery, theQuery.urlMask, "site:" + entry.name, theQuery.navigators));
+ prop.put("nav-domains_element_" + i + "_count", entry.count);
prop.put("nav-domains_element_" + i + "_modifier", "site:" + entry.name);
prop.put("nav-domains_element_" + i + "_nl", 1);
}
@@ -95,11 +96,7 @@ public class yacysearchtrailer {
prop.put("nav-topics_element_" + i + "_url", "" + e.name + " (" + e.count + ")");
prop.putJSON("nav-topics_element_" + i + "_url-json", plasmaSearchQuery.navurl("json", 0, display, theQuery, theQuery.urlMask, e.name, theQuery.navigators));
prop.put("nav-topics_element_" + i + "_count", e.count);
- prop.put("nav-topics_element_" + i + "_offset", "0");
- prop.put("nav-topics_element_" + i + "_display", display);
prop.put("nav-topics_element_" + i + "_modifier", e.name);
- prop.put("nav-topics_element_" + i + "_contentdom", theQuery.contentdom());
- prop.put("nav-topics_element_" + i + "_resource", ((theQuery.isLocal()) ? "local" : "global"));
prop.put("nav-topics_element_" + i + "_nl", (iter.hasNext() && i < MAX_TOPWORDS) ? 1 : 0);
}
if (i++ > MAX_TOPWORDS) {
@@ -109,6 +106,29 @@ public class yacysearchtrailer {
prop.put("nav-topics_element", i);
prop.put("nav-topics", "1");
}
+
+ // author navigators; an empty list is handled like null so that no bare heading and no "_-1_" key is written
+ ArrayList<NavigatorEntry> authorNavigator = theSearch.getAuthorNavigator(10);
+ if (authorNavigator == null || authorNavigator.isEmpty()) {
+ prop.put("nav-authors", 0);
+ } else {
+ prop.put("nav-authors", 1);
+ NavigatorEntry entry;
+ int i;
+ for (i = 0; i < authorNavigator.size(); i++) {
+ entry = authorNavigator.get(i);
+ prop.put("nav-authors_element_" + i + "_name", entry.name);
+ prop.put("nav-authors_element_" + i + "_url", "" + entry.name + " (" + entry.count + ")");
+ prop.putJSON("nav-authors_element_" + i + "_url-json", plasmaSearchQuery.navurl("json", 0, display, theQuery, theQuery.urlMask, "author:'" + entry.name + "'", theQuery.navigators));
+ prop.put("nav-authors_element_" + i + "_count", entry.count);
+ prop.put("nav-authors_element_" + i + "_modifier", "author:'" + entry.name + "'");
+ prop.put("nav-authors_element_" + i + "_nl", 1);
+ }
+ i--; // step back to the last written entry; the list is non-empty here, so i >= 0
+ prop.put("nav-authors_element_" + i + "_nl", 0);
+ prop.put("nav-authors_element", authorNavigator.size());
+ }
+
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(theQuery.id(true), plasmaSearchEvent.FINALIZATION + "-" + "bottomline", 0, 0), false);
return prop;
diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java
index 55d952b44..b0693beea 100644
--- a/source/de/anomic/plasma/parser/pdf/pdfParser.java
+++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java
@@ -99,6 +99,7 @@ public class pdfParser extends AbstractParser implements Parser {
checkInterruption();
// creating a text stripper
+ synchronized (SUPPORTED_MIME_TYPES) {
final PDFTextStripper stripper = new PDFTextStripper();
theDocument = parser.getPDDocument();
@@ -168,6 +169,7 @@ public class pdfParser extends AbstractParser implements Parser {
}
return theDoc;
+ }
}
catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java
index 198b50e1b..b4c22ad1a 100644
--- a/source/de/anomic/plasma/plasmaSearchEvent.java
+++ b/source/de/anomic/plasma/plasmaSearchEvent.java
@@ -586,6 +586,11 @@ public final class plasmaSearchEvent {
return this.rankedCache.getTopicNavigator(maxentries);
}
+ public ArrayList getAuthorNavigator(final int maxentries) {
+ // returns the authors seen so far in the result set, as computed by rankedCache.getAuthorNavigator(maxentries)
+ return this.rankedCache.getAuthorNavigator(maxentries);
+ }
+
public ResultEntry oneResult(final int item) {
// check if we already retrieved this item (happens if a search
// pages is accessed a second time)
diff --git a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java
index 3e86527f2..7da3d6540 100644
--- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java
+++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java
@@ -35,11 +35,13 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
+import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.kelondro.index.BinSearch;
+import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.Digest;
import de.anomic.kelondro.text.Reference;
import de.anomic.kelondro.text.ReferenceContainer;
@@ -76,8 +78,9 @@ public final class plasmaSearchRankingProcess {
private final Segment indexSegment;
private HashMap> localSearchInclusion;
private final int[] domZones;
- private final ConcurrentHashMap hostNavigator;
+ private final ConcurrentHashMap hostNavigator;
private final ConcurrentHashMap ref; // reference score computation for the commonSense heuristic
+ private final TreeMap authorNavigator;
public plasmaSearchRankingProcess(
final Segment indexSegment,
@@ -103,7 +106,8 @@ public final class plasmaSearchRankingProcess {
this.indexSegment = indexSegment;
this.flagcount = new int[32];
for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;}
- this.hostNavigator = new ConcurrentHashMap();
+ this.hostNavigator = new ConcurrentHashMap();
+ this.authorNavigator = new TreeMap(Base64Order.enhancedCoder);
this.ref = new ConcurrentHashMap();
this.domZones = new int[8];
for (int i = 0; i < 8; i++) {this.domZones[i] = 0;}
@@ -160,7 +164,7 @@ public final class plasmaSearchRankingProcess {
final Iterator i = decodedEntries.iterator();
WordReferenceVars iEntry;
Long r;
- hoststat hs;
+ HostInfo hs;
String domhash;
boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0;
while (i.hasNext()) {
@@ -208,7 +212,7 @@ public final class plasmaSearchRankingProcess {
domhash = iEntry.urlHash.substring(6);
hs = this.hostNavigator.get(domhash);
if (hs == null) {
- this.hostNavigator.put(domhash, new hoststat(iEntry.urlHash));
+ this.hostNavigator.put(domhash, new HostInfo(iEntry.urlHash));
} else {
hs.inc();
}
@@ -321,6 +325,24 @@ public final class plasmaSearchRankingProcess {
final URLMetadataRow u = indexSegment.urlMetadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue());
if (u != null) {
final URLMetadataRow.Components metadata = u.metadata();
+
+ // evaluate information of metadata for navigation
+ // author navigation: count this result under the hash of its dc_creator value
+ String author = metadata.dc_creator();
+ if (author != null && author.length() > 0) {
+ byte[] authorhash = Word.word2hash(author);
+ //synchronized (this.authorNavigator) { // NOTE(review): locking is disabled although authorNavigator is a plain TreeMap - confirm there are no concurrent callers
+ AuthorInfo in = this.authorNavigator.get(authorhash);
+ if (in == null) {
+ this.authorNavigator.put(authorhash, new AuthorInfo(author)); // first sighting; AuthorInfo starts with count 1
+ } else {
+ in.inc();
+ this.authorNavigator.put(authorhash, in); // stores the same instance that get() returned above
+ }
+ //}
+ }
+
+ // get the url
if (metadata.url() != null) {
String urlstring = metadata.url().toNormalform(true, true);
if (urlstring == null || !urlstring.matches(query.urlMask)) continue;
@@ -383,10 +405,10 @@ public final class plasmaSearchRankingProcess {
return this.misses.iterator();
}
- public class hoststat {
+ public class HostInfo {
public int count;
public String hashsample;
- public hoststat(String urlhash) {
+ public HostInfo(String urlhash) {
this.count = 1;
this.hashsample = urlhash;
}
@@ -395,8 +417,28 @@ public final class plasmaSearchRankingProcess {
}
}
- public static final Comparator hscomp = new Comparator() {
- public int compare(hoststat o1, hoststat o2) {
+ public class AuthorInfo { // hit counter for one author name
+ public int count; // number of times this author was counted; starts at 1
+ public String author; // the author name given to the constructor
+ public AuthorInfo(String author) {
+ this.count = 1;
+ this.author = author;
+ }
+ public void inc() { // adds one to count
+ this.count++;
+ }
+ }
+
+ public static final Comparator hscomp = new Comparator() { // orders HostInfo by count, highest count first
+ public int compare(HostInfo o1, HostInfo o2) {
+ if (o1.count < o2.count) return 1;
+ if (o2.count < o1.count) return -1;
+ return 0;
+ }
+ };
+
+ public static final Comparator aicomp = new Comparator() {
+ public int compare(AuthorInfo o1, AuthorInfo o2) {
if (o1.count < o2.count) return 1;
if (o2.count < o1.count) return -1;
return 0;
@@ -415,7 +457,7 @@ public final class plasmaSearchRankingProcess {
public ArrayList getHostNavigator(int count) {
if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts") < 0) return new ArrayList(0);
- hoststat[] hsa = this.hostNavigator.values().toArray(new hoststat[this.hostNavigator.size()]);
+ HostInfo[] hsa = this.hostNavigator.values().toArray(new HostInfo[this.hostNavigator.size()]);
Arrays.sort(hsa, hscomp);
int rc = Math.min(count, hsa.length);
ArrayList result = new ArrayList();
@@ -488,6 +530,24 @@ public final class plasmaSearchRankingProcess {
addTopic(descrcomps);
}
+ public ArrayList<NavigatorEntry> getAuthorNavigator(final int count) {
+ // returns at most count authors collected in authorNavigator, sorted with aicomp (highest count first);
+ // the list is empty when query.navigators is neither "all" nor contains "authors"
+ if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("authors") < 0) return new ArrayList<NavigatorEntry>(0);
+
+ AuthorInfo[] a = this.authorNavigator.values().toArray(new AuthorInfo[this.authorNavigator.size()]);
+ Arrays.sort(a, aicomp);
+ int rc = Math.min(count, a.length);
+ ArrayList<NavigatorEntry> result = new ArrayList<NavigatorEntry>();
+ AuthorInfo e;
+ for (int i = 0; i < rc; i++) {
+ e = a[i];
+ // each author becomes a NavigatorEntry carrying its name and its count
+ result.add(new NavigatorEntry(e.author, e.count));
+ }
+ return result;
+ }
+
public ReferenceOrder getOrder() {
return this.order;
}