diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list
index a28f28b65..01ded4b47 100644
--- a/defaults/solr.keys.list
+++ b/defaults/solr.keys.list
@@ -187,7 +187,7 @@ h6_txt
#inboundlinks_rel_sxt
## internal links, the rel property of the a-tag, coded binary
-#inboundlinks_relflags_sxt
+#inboundlinks_relflags_val
## internal links, the text content of the a-tag
#inboundlinks_text_txt
@@ -208,7 +208,7 @@ h6_txt
#outboundlinks_rel_sxt
## external links, the rel property of the a-tag, coded binary
-#outboundlinks_relflags_sxt
+#outboundlinks_relflags_val
## external links, the text content of the a-tag
#outboundlinks_text_txt
@@ -306,10 +306,28 @@ italic_txt
## the remaining part of the host without organizationdnc
#host_subdomain_s
+## number of titles (counting the 'title' field) in the document
+#title_count_i
+
+## number of characters for each title
+#title_chars_val
+
+## number of words in each title
+#title_words_val
+
+## number of descriptions in the document. This does not count the 'description' field, since there is only one; instead it counts the number of descriptions that appear in the document (if any)
+#description_count_i
+
+## number of characters for each description
+#description_chars_val
+
+## number of words in each description
+#description_words_val
+
## names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias
#ext_cms_txt
-##number of attributes that count for a specific cms in attr_cms
+## number of attributes that count for a specific cms in attr_cms
#ext_cms_val
## names of ad-servers/ad-services
diff --git a/defaults/yacy.logging b/defaults/yacy.logging
index e0967fcbb..8e0ed3673 100644
--- a/defaults/yacy.logging
+++ b/defaults/yacy.logging
@@ -79,9 +79,12 @@ httpclient.wire.level = OFF
org.apache.pdfbox.level = INFO
# Properties for solr
+org.apache.solr.util.plugin.AbstractPluginLoader.enabled = FALSE
+org.apache.solr.schema.IndexSchema.enabled = FALSE
org.apache.solr.handler.enabled = FALSE
org.apache.solr.update.enabled = TRUE
org.apache.solr.update.level = INFO
org.apache.solr.core.enabled = TRUE
org.apache.solr.core.level = INFO
+javax.management.mbeanserver.level = INFO
javax.management.mbeanserver.enabled = FALSE
\ No newline at end of file
diff --git a/htroot/AccessTracker_p.html b/htroot/AccessTracker_p.html
index 48aa6870a..7e69b2d67 100644
--- a/htroot/AccessTracker_p.html
+++ b/htroot/AccessTracker_p.html
@@ -57,7 +57,7 @@
#{list}#
- #[host]# |
+ #[host]# |
#[date]# |
#[path]# |
diff --git a/htroot/api/getpageinfo.java b/htroot/api/getpageinfo.java
index dc74ed3d5..40b3a61a8 100644
--- a/htroot/api/getpageinfo.java
+++ b/htroot/api/getpageinfo.java
@@ -124,7 +124,7 @@ public class getpageinfo {
prop.putXML("desc", removelinebreaks(scraper.dc_description()));
// put language
final Set languages = scraper.getContentLanguages();
- prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next());
+ prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next());
// get links and put them into a semicolon-separated list
final Set uris = scraper.getAnchors().keySet();
diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java
index 6cba92e09..29e68da9d 100644
--- a/htroot/api/getpageinfo_p.java
+++ b/htroot/api/getpageinfo_p.java
@@ -124,7 +124,7 @@ public class getpageinfo_p {
prop.putXML("desc", scraper.dc_description());
// put language
final Set languages = scraper.getContentLanguages();
- prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next());
+ prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next());
// get links and put them into a semicolon-separated list
final Set uris = scraper.getAnchors().keySet();
diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java
index 46e674bee..37fa30d89 100644
--- a/source/de/anomic/crawler/retrieval/Response.java
+++ b/source/de/anomic/crawler/retrieval/Response.java
@@ -117,21 +117,21 @@ public class Response {
return doctype;
}
- public static String doctype2mime(String ext, char doctype) {
- if (doctype == DT_PDFPS) return "application/pdf";
- if (doctype == DT_HTML) return "text/html";
- if (doctype == DT_DOC) return "application/msword";
- if (doctype == DT_FLASH) return "application/x-shockwave-flash";
- if (doctype == DT_SHARE) return "text/plain";
- if (doctype == DT_BINARY) return "application/octet-stream";
+ public static String[] doctype2mime(String ext, char doctype) {
+ if (doctype == DT_PDFPS) return new String[]{"application/pdf"};
+ if (doctype == DT_HTML) return new String[]{"text/html"};
+ if (doctype == DT_DOC) return new String[]{"application/msword"};
+ if (doctype == DT_FLASH) return new String[]{"application/x-shockwave-flash"};
+ if (doctype == DT_SHARE) return new String[]{"text/plain"};
+ if (doctype == DT_BINARY) return new String[]{"application/octet-stream"};
String mime = Classification.ext2mime(ext);
int p = mime.indexOf('/');
- if (p < 0) return mime;
- if (doctype == DT_TEXT) return "text" + mime.substring(p);
- if (doctype == DT_IMAGE) return "image" + mime.substring(p);
- if (doctype == DT_AUDIO) return "audio" + mime.substring(p);
- if (doctype == DT_MOVIE) return "video" + mime.substring(p);
- return mime;
+ if (p < 0) return new String[]{mime};
+ if (doctype == DT_TEXT) return new String[]{"text" + mime.substring(p)};
+ if (doctype == DT_IMAGE) return new String[]{"image" + mime.substring(p)};
+ if (doctype == DT_AUDIO) return new String[]{"audio" + mime.substring(p)};
+ if (doctype == DT_MOVIE) return new String[]{"video" + mime.substring(p)};
+ return new String[]{mime};
}
public static final int QUEUE_STATE_FRESH = 0;
diff --git a/source/de/anomic/http/server/HTTPDFileHandler.java b/source/de/anomic/http/server/HTTPDFileHandler.java
index c50c6ba8d..642759775 100644
--- a/source/de/anomic/http/server/HTTPDFileHandler.java
+++ b/source/de/anomic/http/server/HTTPDFileHandler.java
@@ -80,6 +80,7 @@ import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
@@ -493,7 +494,8 @@ public final class HTTPDFileHandler {
} else {
if (element.endsWith("html") || (element.endsWith("htm"))) {
scraper = ContentScraper.parseResource(f, 10000);
- headline = scraper.getTitle();
+ Collection t = scraper.getTitles();
+ headline = t.size() > 0 ? t.iterator().next() : "";
author = scraper.getAuthor();
publisher = scraper.getPublisher();
description = scraper.getDescription();
diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java
index a03577e96..c735b76c6 100644
--- a/source/net/yacy/cora/document/MultiProtocolURI.java
+++ b/source/net/yacy/cora/document/MultiProtocolURI.java
@@ -707,6 +707,10 @@ public class MultiProtocolURI implements Serializable, Comparable.
@@ -32,13 +32,13 @@ import java.util.Set;
public class RSSFeed implements Iterable {
public static final int DEFAULT_MAXSIZE = 1000;
-
+
// class variables
private RSSMessage channel;
private String imageURL;
- private Map messages; // a guid:Item map
- private int maxsize;
-
+ private final Map messages; // a guid:Item map
+ private final int maxsize;
+
public RSSFeed(final int maxsize) {
this.messages = Collections.synchronizedMap(new LinkedHashMap());
this.channel = null;
@@ -62,13 +62,13 @@ public class RSSFeed implements Iterable {
this.addMessage(message);
}
}
-
+
public void setChannel(final RSSMessage channelItem) {
this.channel = channelItem;
}
-
+
public RSSMessage getChannel() {
- return channel;
+ return this.channel;
}
public void setImage(final String imageURL) {
@@ -78,82 +78,86 @@ public class RSSFeed implements Iterable {
public String getImage() {
return this.imageURL;
}
-
+
public Set getLinks() {
Set links = new HashSet();
- for (RSSMessage message: messages.values()) {
+ for (RSSMessage message: this.messages.values()) {
try {links.add(new MultiProtocolURI(message.getLink()));} catch (MalformedURLException e) {}
}
return links;
}
-
+
public void addMessage(final RSSMessage item) {
final String guid = item.getGuid();
- messages.put(guid, item);
+ this.messages.put(guid, item);
// in case that the feed is full (size > maxsize) flush the oldest element
- while (messages.size() > this.maxsize) pollMessage();
+ while (this.messages.size() > this.maxsize) pollMessage();
}
-
+
public RSSMessage getMessage(final String guid) {
// retrieve item by guid
- return messages.get(guid);
+ return this.messages.get(guid);
}
public boolean isEmpty() {
- return messages.isEmpty();
+ return this.messages.isEmpty();
}
-
+
public int size() {
- return messages.size();
+ return this.messages.size();
}
-
+
+ @Override
public Iterator iterator() {
return new messageIterator();
}
-
+
public RSSMessage pollMessage() {
// retrieve and delete item
- synchronized (messages) {
- if (messages.isEmpty()) return null;
- final String nextGUID = messages.keySet().iterator().next();
+ synchronized (this.messages) {
+ if (this.messages.isEmpty()) return null;
+ final String nextGUID = this.messages.keySet().size() == 0 ? null : this.messages.keySet().iterator().next();
if (nextGUID == null) return null;
- return messages.remove(nextGUID);
+ return this.messages.remove(nextGUID);
}
}
public class messageIterator implements Iterator{
-
+
Iterator GUIDiterator;
String lastGUID;
int t;
-
+
public messageIterator() {
- t = messages.size(); // termination counter
- GUIDiterator = messages.keySet().iterator();
- lastGUID = null;
+ this.t = RSSFeed.this.messages.size(); // termination counter
+ this.GUIDiterator = RSSFeed.this.messages.keySet().iterator();
+ this.lastGUID = null;
}
+ @Override
public boolean hasNext() {
- if (t <= 0) return false; // ensure termination
- return GUIDiterator.hasNext();
+ if (this.t <= 0) return false; // ensure termination
+ return this.GUIDiterator.hasNext();
}
+ @Override
public RSSMessage next() {
- t--; // ensure termination
+ this.t--; // ensure termination
try {
- lastGUID = GUIDiterator.next();
+ this.lastGUID = this.GUIDiterator.next();
} catch (ConcurrentModificationException e) {
return null;
}
- if (lastGUID == null) return null;
- return messages.get(lastGUID);
+ if (this.lastGUID == null) return null;
+ return RSSFeed.this.messages.get(this.lastGUID);
}
+ @Override
public void remove() {
- if (lastGUID == null) return;
- GUIDiterator.remove();
- messages.remove(lastGUID);
+ if (this.lastGUID == null) return;
+ this.GUIDiterator.remove();
+ RSSFeed.this.messages.remove(this.lastGUID);
}
}
-
+
}
diff --git a/source/net/yacy/cora/document/RSSMessage.java b/source/net/yacy/cora/document/RSSMessage.java
index 1e10a1480..867cc49ba 100644
--- a/source/net/yacy/cora/document/RSSMessage.java
+++ b/source/net/yacy/cora/document/RSSMessage.java
@@ -84,7 +84,7 @@ public class RSSMessage implements Hit, Comparable, Comparator supportedMimeTypes() {
return this.SUPPORTED_MIME_TYPES;
}
@@ -62,6 +66,7 @@ public abstract class AbstractParser implements Parser {
* each parser must define a set of supported file extensions
* @return a set of file name extensions that are supported
*/
+ @Override
public Set supportedExtensions() {
return this.SUPPORTED_EXTENSIONS;
}
@@ -71,6 +76,7 @@ public abstract class AbstractParser implements Parser {
* @param o
* @return
*/
+ @Override
public boolean equals(final Object o) {
return getName().equals(((Parser) o).getName());
}
@@ -79,8 +85,15 @@ public abstract class AbstractParser implements Parser {
* the hash code of a parser
* @return the hash code of the parser name string
*/
+ @Override
public int hashCode() {
return getName().hashCode();
}
+ public static List singleList(String t) { // wraps one title into a new mutable list; a null title yields an empty list
+ List c = new ArrayList(1);
+ if (t != null) c.add(t); // skip null so Document.dc_title() keeps returning "" instead of null for title-less documents
+ return c;
+ }
+
}
diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java
index 53b1532e9..11a25b590 100644
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -44,6 +44,7 @@ import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
+import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
@@ -75,7 +76,7 @@ public class Document {
private final String mimeType; // mimeType as taken from http header
private final String charset; // the charset of the document
private final List keywords; // most resources provide a keyword field
- private StringBuilder title; // a document title, taken from title or h1 tag; shall appear as headline of search result
+ private List titles; // the document titles, taken from title and/or h1 tag; shall appear as headline of search result
private final StringBuilder creator; // author or copyright
private final String publisher; // publisher
private final List sections; // if present: more titles/headlines appearing in the document
@@ -99,7 +100,9 @@ public class Document {
public Document(final DigestURI location, final String mimeType, final String charset,
final Object parserObject,
final Set languages,
- final String[] keywords, final String title, final String author, final String publisher,
+ final String[] keywords,
+ final List titles,
+ final String author, final String publisher,
final String[] sections, final String abstrct,
final double lon, final double lat,
final Object text,
@@ -113,7 +116,7 @@ public class Document {
this.parserObject = parserObject;
this.keywords = new LinkedList();
if (keywords != null) this.keywords.addAll(Arrays.asList(keywords));
- this.title = (title == null) ? new StringBuilder(0) : new StringBuilder(title);
+ this.titles = (titles == null) ? new ArrayList() : titles; // normalize null like the neighbouring fields: addTitle() and addSubDocuments() dereference this.titles without a null check
this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
this.sections = (sections == null) ? new LinkedList() : Arrays.asList(sections);
this.description = (abstrct == null) ? new StringBuilder(0) : new StringBuilder(abstrct);
@@ -186,11 +189,20 @@ dc_rights
*/
public String dc_title() {
- return (this.title == null) ? "" : this.title.toString();
+ return (this.titles == null || this.titles.size() == 0) ? "" : this.titles.iterator().next();
+ }
+
+ public List titles() {
+ return this.titles;
}
public void setTitle(final String title) {
- this.title = new StringBuilder(title);
+ this.titles = new ArrayList();
+ if (title != null) this.titles.add(title);
+ }
+
+ public void addTitle(final String title) {
+ if (title != null) this.titles.add(title);
}
public String dc_creator() {
@@ -620,10 +632,7 @@ dc_rights
public void addSubDocuments(final Document[] docs) throws IOException {
for (final Document doc: docs) {
this.sections.addAll(Arrays.asList(doc.getSectionTitles()));
-
- if (this.title.length() > 0) this.title.append('\n');
- this.title.append(doc.dc_title());
-
+ this.titles.addAll(doc.titles());
this.keywords.addAll(doc.getKeywords());
if (this.description.length() > 0) this.description.append('\n');
@@ -760,10 +769,9 @@ dc_rights
final StringBuilder authors = new StringBuilder(80);
final StringBuilder publishers = new StringBuilder(80);
final StringBuilder subjects = new StringBuilder(80);
- final StringBuilder title = new StringBuilder(80);
final StringBuilder description = new StringBuilder(80);
- final LinkedList sectionTitles = new LinkedList();
-
+ final Collection titles = new LinkedHashSet();
+ final Collection sectionTitles = new LinkedHashSet();
final Map anchors = new HashMap();
final Map rss = new HashMap();
final Map images = new HashMap();
@@ -790,9 +798,7 @@ dc_rights
subjects.append(subject);
}
- if (title.length() > 0) title.append("\n");
- title.append(doc.dc_title());
-
+ titles.addAll(doc.titles());
sectionTitles.addAll(Arrays.asList(doc.getSectionTitles()));
if (description.length() > 0) description.append("\n");
@@ -822,6 +828,8 @@ dc_rights
}
// return consolidation
+ ArrayList titlesa = new ArrayList();
+ titlesa.addAll(titles);
return new Document(
location,
globalMime,
@@ -829,7 +837,7 @@ dc_rights
null,
null,
subjects.toString().split(" |,"),
- title.toString(),
+ titlesa,
authors.toString(),
publishers.toString(),
sectionTitles.toArray(new String[sectionTitles.size()]),
diff --git a/source/net/yacy/document/content/DCEntry.java b/source/net/yacy/document/content/DCEntry.java
index c310fa5e3..0c12f8665 100644
--- a/source/net/yacy/document/content/DCEntry.java
+++ b/source/net/yacy/document/content/DCEntry.java
@@ -30,8 +30,10 @@ import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.text.Collator;
import java.text.ParseException;
+import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
+import java.util.List;
import java.util.Locale;
import java.util.TreeMap;
@@ -261,7 +263,8 @@ public class DCEntry extends TreeMap {
public Document document() {
HashSet languages = new HashSet();
languages.add(getLanguage());
-
+ List t = new ArrayList(1);
+ t.add(getTitle());
return new Document(
getIdentifier(true),
"text/html",
@@ -269,7 +272,7 @@ public class DCEntry extends TreeMap {
this,
languages,
getSubject(),
- getTitle(),
+ t,
getCreator(),
getPublisher(),
null,
diff --git a/source/net/yacy/document/parser/augment/AugmentParser.java b/source/net/yacy/document/parser/augment/AugmentParser.java
index 9728fce88..b96d48ef1 100644
--- a/source/net/yacy/document/parser/augment/AugmentParser.java
+++ b/source/net/yacy/document/parser/augment/AugmentParser.java
@@ -60,14 +60,14 @@ public class AugmentParser extends AbstractParser implements Parser {
private static Document analyze (Document alreadyParsedDocument, DigestURI url,
String mimeType, String charset) {
- Document newDoc = new Document(url, mimeType, charset, null, null, null, "", "",
+ Document newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
"", null, "", 0, 0, null, null, null, null, false);
// if the magic word appears in the document, perform extra actions.
if (alreadyParsedDocument.getKeywords().contains("magicword")) {
String all = "";
all = "yacylatest";
- newDoc = new Document(url, mimeType, charset, null, null, null, "", "",
+ newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
"", null, "", 0, 0, all, null, null, null, false);
}
@@ -77,7 +77,7 @@ public class AugmentParser extends AbstractParser implements Parser {
private Document parseAndAugment(DigestURI url, String mimeType, String charset) {
String all = "";
- Document newDoc = new Document(url, mimeType, charset, null, null, null, "", "",
+ Document newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
"", null, "", 0, 0, all, null, null, null, false);
Iterator it;
diff --git a/source/net/yacy/document/parser/csvParser.java b/source/net/yacy/document/parser/csvParser.java
index 42a849864..2c5ca3793 100644
--- a/source/net/yacy/document/parser/csvParser.java
+++ b/source/net/yacy/document/parser/csvParser.java
@@ -67,7 +67,7 @@ public class csvParser extends AbstractParser implements Parser {
this,
null,
null,
- concatRow(table.get(0)),
+ singleList(concatRow(table.get(0))),
"",
"",
null,
diff --git a/source/net/yacy/document/parser/docParser.java b/source/net/yacy/document/parser/docParser.java
index f33940d5c..163a23024 100644
--- a/source/net/yacy/document/parser/docParser.java
+++ b/source/net/yacy/document/parser/docParser.java
@@ -28,10 +28,12 @@
package net.yacy.document.parser;
import java.io.InputStream;
+
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.kelondro.data.meta.DigestURI;
+
import org.apache.poi.hwpf.extractor.WordExtractor;
public class docParser extends AbstractParser implements Parser {
@@ -50,6 +52,7 @@ public class docParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("application/x-msword");
}
+ @Override
public Document[] parse(final DigestURI location, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
@@ -90,7 +93,7 @@ public class docParser extends AbstractParser implements Parser {
this,
null,
null,
- title,
+ singleList(title),
"", // TODO: AUTHOR
extractor.getDocSummaryInformation().getCompany(), // publisher
null,
diff --git a/source/net/yacy/document/parser/genericParser.java b/source/net/yacy/document/parser/genericParser.java
index 623c907a1..9ad666367 100644
--- a/source/net/yacy/document/parser/genericParser.java
+++ b/source/net/yacy/document/parser/genericParser.java
@@ -43,6 +43,7 @@ public class genericParser extends AbstractParser implements Parser {
// this parser is used if no other fits. This parser fits all
}
+ @Override
public Document[] parse(final DigestURI location, final String mimeType,
final String charset, final InputStream source1)
throws Parser.Failure, InterruptedException {
@@ -54,7 +55,7 @@ public class genericParser extends AbstractParser implements Parser {
this,
null,
null,
- location.getFileName().isEmpty() ? location.toTokens() : MultiProtocolURI.unescape(location.getFileName()), // title
+ singleList(location.getFileName().isEmpty() ? location.toTokens() : MultiProtocolURI.unescape(location.getFileName())), // title
"", // author
location.getHost(),
null,
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index 650a1d6b5..b11ef58ec 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -32,8 +32,10 @@ import java.io.Writer;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.util.ArrayList;
+import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
+import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
@@ -125,7 +127,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final Map embeds; // urlhash/embed relation
private final Map images; // urlhash/image relation
private final Map metas;
- private String title;
+ private Collection titles;
//private String headline;
private List[] headlines;
private final ClusteredScoreMap bold, italic;
@@ -170,7 +172,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.iframes = new SizeLimitedSet(maxLinks);
this.metas = new SizeLimitedMap(maxLinks);
this.script = new SizeLimitedSet(maxLinks);
- this.title = EMPTY_STRING;
+ this.titles = new LinkedHashSet();
this.headlines = new ArrayList[6];
for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList();
this.bold = new ClusteredScoreMap();
@@ -391,7 +393,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.images.put(ie.url(), ie);
this.favicon = newLink;
} else if (rel.equalsIgnoreCase("canonical")) {
- tagopts.put("name", this.title);
+ tagopts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next());
mergeAnchors(newLink, tagopts);
this.canonical = newLink;
} else if (rel.equalsIgnoreCase("alternate") && type.equalsIgnoreCase("application/rss+xml")) {
@@ -480,8 +482,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
h = recursiveParse(text);
if (h.length() > 0) this.headlines[5].add(h);
} else if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
- this.title = recursiveParse(text);
- this.evaluationScores.match(Element.title, this.title);
+ String t = recursiveParse(text);
+ this.titles.add(t);
+ this.evaluationScores.match(Element.title, t);
} else if ((tagname.equalsIgnoreCase("b")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) this.bold.inc(h);
@@ -542,35 +545,37 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return line;
}
- public String getTitle() {
- // construct a title string, even if the document has no title
+ public List getTitles() {
// some documents have a title tag as meta tag
String s = this.metas.get("title");
-
- // try to construct the title with the content of the title tag
- if (this.title.length() > 0) {
- if (s == null) {
- return this.title;
+ if (s != null && s.length() > 0) {
+ LinkedHashSet t = new LinkedHashSet();
+ t.add(s);
+ t.addAll(this.titles);
+ this.titles = t;
+ }
+
+ if (this.titles.size() == 0) {
+ // take any headline
+ for (int i = 0; i < this.headlines.length; i++) {
+ if (!this.headlines[i].isEmpty()) {
+ this.titles.add(this.headlines[i].get(0));
+ break;
+ }
}
- if ((this.title.compareToIgnoreCase(s) == 0) || (this.title.indexOf(s) >= 0)) return s;
- return this.title + ": " + s;
- }
- if (s != null) {
- return s;
}
- // otherwise take any headline
- for (int i = 0; i < this.headlines.length; i++) {
- if (!this.headlines[i].isEmpty()) return this.headlines[i].get(0);
+ if (this.titles.size() == 0) {
+ // take description tag
+ s = getDescription();
+ if (!s.isEmpty()) this.titles.add(s);
}
- // take description tag
- s = getDescription();
- if (!s.isEmpty()) return s;
-
// extract headline from file name
- return MultiProtocolURI.unescape(this.root.getFileName());
+ ArrayList t = new ArrayList();
+ t.addAll(this.titles);
+ return t;
}
public String[] getHeadlines(final int i) {
@@ -875,7 +880,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.embeds.clear();
this.images.clear();
this.metas.clear();
- this.title = null;
+ this.titles.clear();
this.headlines = null;
this.bold.clear();
this.italic.clear();
@@ -884,7 +889,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
public void print() {
- System.out.println("TITLE :" + this.title);
+ for (String t: this.titles) {
+ System.out.println("TITLE :" + t);
+ }
for (int i = 0; i < 4; i++) {
System.out.println("HEADLINE" + i + ":" + this.headlines[i].toString());
}
diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java
index 56e1e2fee..a0fd34f05 100644
--- a/source/net/yacy/document/parser/htmlParser.java
+++ b/source/net/yacy/document/parser/htmlParser.java
@@ -132,7 +132,7 @@ public class htmlParser extends AbstractParser implements Parser {
scraper,
scraper.getContentLanguages(),
scraper.getKeywords(),
- scraper.getTitle(),
+ scraper.getTitles(),
scraper.getAuthor(),
scraper.getPublisher(),
sections,
diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java
index 334bb16f8..06e6ee6c6 100644
--- a/source/net/yacy/document/parser/images/genericImageParser.java
+++ b/source/net/yacy/document/parser/images/genericImageParser.java
@@ -199,7 +199,7 @@ public class genericImageParser extends AbstractParser implements Parser {
this,
languages,
keywords == null ? new String[]{} : keywords.split(keywords.indexOf(',') > 0 ? "," : " "), // keywords
- title, // title
+ singleList(title), // title
author == null ? "" : author, // author
location.getHost(), // Publisher
new String[]{}, // sections
diff --git a/source/net/yacy/document/parser/mmParser.java b/source/net/yacy/document/parser/mmParser.java
index 1f32e0667..b40b78f0a 100644
--- a/source/net/yacy/document/parser/mmParser.java
+++ b/source/net/yacy/document/parser/mmParser.java
@@ -68,6 +68,7 @@ public class mmParser extends AbstractParser implements Parser {
return parser;
}
+ @Override
public Document[] parse(final DigestURI location, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException
@@ -105,7 +106,7 @@ public class mmParser extends AbstractParser implements Parser {
this,
null,
null,
- rootElementText,
+ singleList(rootElementText),
null,
null,
null,
diff --git a/source/net/yacy/document/parser/odtParser.java b/source/net/yacy/document/parser/odtParser.java
index f9cf04276..670f68580 100644
--- a/source/net/yacy/document/parser/odtParser.java
+++ b/source/net/yacy/document/parser/odtParser.java
@@ -185,7 +185,7 @@ public class odtParser extends AbstractParser implements Parser {
this,
languages,
docKeywords,
- docLongTitle,
+ singleList(docLongTitle),
docAuthor,
"",
null,
diff --git a/source/net/yacy/document/parser/ooxmlParser.java b/source/net/yacy/document/parser/ooxmlParser.java
index f1cf0717b..7021b6ac4 100644
--- a/source/net/yacy/document/parser/ooxmlParser.java
+++ b/source/net/yacy/document/parser/ooxmlParser.java
@@ -170,7 +170,7 @@ public class ooxmlParser extends AbstractParser implements Parser {
this,
languages,
docKeywords,
- docLongTitle,
+ singleList(docLongTitle),
docAuthor,
"",
null,
diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java
index 874f328e5..841f56c0d 100644
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@@ -190,7 +190,7 @@ public class pdfParser extends AbstractParser implements Parser {
this,
null,
docKeywords,
- docTitle,
+ singleList(docTitle),
docAuthor,
docPublisher,
null,
diff --git a/source/net/yacy/document/parser/pptParser.java b/source/net/yacy/document/parser/pptParser.java
index 72ab753b8..260546fad 100644
--- a/source/net/yacy/document/parser/pptParser.java
+++ b/source/net/yacy/document/parser/pptParser.java
@@ -58,6 +58,7 @@ public class pptParser extends AbstractParser implements Parser {
* parses the source documents and returns a plasmaParserDocument containing
* all extracted information about the parsed document
*/
+ @Override
public Document[] parse(final DigestURI location, final String mimeType,
final String charset, final InputStream source) throws Parser.Failure,
InterruptedException {
@@ -88,7 +89,7 @@ public class pptParser extends AbstractParser implements Parser {
this,
null,
null,
- title,
+ singleList(title),
"", // TODO: AUTHOR
pptExtractor.getDocSummaryInformation().getCompany(),
null,
diff --git a/source/net/yacy/document/parser/rdfParser.java b/source/net/yacy/document/parser/rdfParser.java
index 1bb48bcd0..4d495ca08 100644
--- a/source/net/yacy/document/parser/rdfParser.java
+++ b/source/net/yacy/document/parser/rdfParser.java
@@ -43,6 +43,7 @@ public class rdfParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("application/rdf+xml");
}
+ @Override
public Document[] parse(final DigestURI url, final String mimeType,
final String charset, final InputStream source)
throws Failure, InterruptedException {
@@ -57,7 +58,7 @@ public class rdfParser extends AbstractParser implements Parser {
Document doc;
String all = "rdfdatasource";
- doc = new Document(url, mimeType, charset, null, null, null, "", "",
+ doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
"", null, "", 0, 0, all, null, null, null, false);
docs.add(doc);
diff --git a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java
index 7b9c8e50f..b3d3cb957 100644
--- a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java
+++ b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java
@@ -78,7 +78,7 @@ public class RDFaParser extends AbstractParser implements Parser {
Log.logWarning("RDFA PARSER", "Triple extraction failed");
}
- Document doc = new Document(url, mimeType, charset, null, null, null, "", "",
+ Document doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
"", null, "", 0, 0, null, null, null, null, false);
try {
@@ -137,7 +137,7 @@ public class RDFaParser extends AbstractParser implements Parser {
all += string + ",";
}
- Document doc = new Document(url, mimeType, charset, null, null, null, "", "",
+ Document doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
"", null, "", 0, 0, all, null, null, null, false);
return doc;
}
diff --git a/source/net/yacy/document/parser/rssParser.java b/source/net/yacy/document/parser/rssParser.java
index db798f1ee..4917f0fdd 100644
--- a/source/net/yacy/document/parser/rssParser.java
+++ b/source/net/yacy/document/parser/rssParser.java
@@ -92,7 +92,7 @@ public class rssParser extends AbstractParser implements Parser {
this,
languages,
item.getSubject(),
- item.getTitle(),
+ singleList(item.getTitle()),
item.getAuthor(),
item.getCopyright(),
new String[0],
diff --git a/source/net/yacy/document/parser/rtfParser.java b/source/net/yacy/document/parser/rtfParser.java
index 7b0994b99..22844ec48 100644
--- a/source/net/yacy/document/parser/rtfParser.java
+++ b/source/net/yacy/document/parser/rtfParser.java
@@ -50,6 +50,7 @@ public class rtfParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("application/x-soffice");
}
+ @Override
public Document[] parse(final DigestURI location, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
@@ -69,11 +70,11 @@ public class rtfParser extends AbstractParser implements Parser {
this,
null,
null,
- ((bodyText.length() > 80)? bodyText.substring(0, 80):bodyText.trim()).
+ singleList(((bodyText.length() > 80)? bodyText.substring(0, 80):bodyText.trim()).
replaceAll("\r\n"," ").
replaceAll("\n"," ").
replaceAll("\r"," ").
- replaceAll("\t"," "),
+ replaceAll("\t"," ")),
"", // TODO: AUTHOR
"", // TODO: publisher
null,
diff --git a/source/net/yacy/document/parser/sidAudioParser.java b/source/net/yacy/document/parser/sidAudioParser.java
index 8f793d9db..77bed6ed0 100644
--- a/source/net/yacy/document/parser/sidAudioParser.java
+++ b/source/net/yacy/document/parser/sidAudioParser.java
@@ -85,7 +85,7 @@ public class sidAudioParser extends AbstractParser implements Parser {
this,
null,
null,
- header.get("name"),
+ singleList(header.get("name")),
header.get("author"),
header.get("publisher"),
null,
diff --git a/source/net/yacy/document/parser/sitemapParser.java b/source/net/yacy/document/parser/sitemapParser.java
index 832fae74a..5d0fccddb 100644
--- a/source/net/yacy/document/parser/sitemapParser.java
+++ b/source/net/yacy/document/parser/sitemapParser.java
@@ -88,7 +88,7 @@ public class sitemapParser extends AbstractParser implements Parser {
this,
null,
null,
- "",
+ singleList(""),
"",
"",
new String[0],
diff --git a/source/net/yacy/document/parser/swfParser.java b/source/net/yacy/document/parser/swfParser.java
index fb31a8f69..58d80e399 100644
--- a/source/net/yacy/document/parser/swfParser.java
+++ b/source/net/yacy/document/parser/swfParser.java
@@ -110,11 +110,11 @@ public class swfParser extends AbstractParser implements Parser {
this,
null,
null, //keywords
- ((contents.length() > 80)? contents.substring(0, 80):contents.trim()).
+ singleList(((contents.length() > 80)? contents.substring(0, 80):contents.trim()).
replaceAll("\r\n"," ").
replaceAll("\n"," ").
replaceAll("\r"," ").
- replaceAll("\t"," "), // title
+ replaceAll("\t"," ")), // title
"", // TODO: AUTHOR
"",
sections, // an array of section headlines
diff --git a/source/net/yacy/document/parser/torrentParser.java b/source/net/yacy/document/parser/torrentParser.java
index 339946d83..d83b162ac 100644
--- a/source/net/yacy/document/parser/torrentParser.java
+++ b/source/net/yacy/document/parser/torrentParser.java
@@ -100,7 +100,7 @@ public class torrentParser extends AbstractParser implements Parser {
this,
null,
null,
- title, // title
+ singleList(title), // title
comment, // author
location.getHost(),
null,
diff --git a/source/net/yacy/document/parser/vcfParser.java b/source/net/yacy/document/parser/vcfParser.java
index 0ab5680df..44c432464 100644
--- a/source/net/yacy/document/parser/vcfParser.java
+++ b/source/net/yacy/document/parser/vcfParser.java
@@ -215,7 +215,7 @@ public class vcfParser extends AbstractParser implements Parser {
this,
null, // set of languages
null, // a list of extracted keywords
- parsedTitle.toString(), // a long document title
+ singleList(parsedTitle.toString()), // a long document title
"", // TODO: AUTHOR
"", // the publisher
sections, // an array of section headlines
diff --git a/source/net/yacy/document/parser/vsdParser.java b/source/net/yacy/document/parser/vsdParser.java
index fbfcf5580..eadf496ae 100644
--- a/source/net/yacy/document/parser/vsdParser.java
+++ b/source/net/yacy/document/parser/vsdParser.java
@@ -62,6 +62,7 @@ public class vsdParser extends AbstractParser implements Parser {
* parses the source documents and returns a plasmaParserDocument containing
* all extracted information about the parsed document
*/
+ @Override
public Document[] parse(final DigestURI location, final String mimeType, final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
@@ -108,7 +109,7 @@ public class vsdParser extends AbstractParser implements Parser {
this,
null, // language
keywords,
- title,
+ singleList(title),
author,
"",
null, // an array of section headlines
diff --git a/source/net/yacy/document/parser/xlsParser.java b/source/net/yacy/document/parser/xlsParser.java
index 48b65a158..1ad1d55fe 100644
--- a/source/net/yacy/document/parser/xlsParser.java
+++ b/source/net/yacy/document/parser/xlsParser.java
@@ -119,7 +119,7 @@ public class xlsParser extends AbstractParser implements Parser {
this,
null,
null,
- location.getFile(),
+ singleList(location.getFile()),
"", // TODO: AUTHOR
"", // TODO: publisher
null,
diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
index 26be0e084..ea6f6d26e 100644
--- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
+++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
@@ -31,6 +31,7 @@ import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.lod.vocabulary.Tagging;
+import net.yacy.cora.services.federated.solr.SolrType;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.logging.Log;
@@ -116,31 +117,38 @@ public class URIMetadataNode implements URIMetadata {
}
private int getInt(YaCySchema field) {
+ assert !field.isMultiValued();
+ assert field.getType() == SolrType.integer;
Integer x = (Integer) this.doc.getFieldValue(field.name());
if (x == null) return 0;
return x.intValue();
}
- /*
- private double getDouble(YaCySchema field) {
- Double x = (Double) this.doc.getFieldValue(field.name());
- if (x == null) return 0.0d;
- return x.doubleValue();
- }
- */
+
private Date getDate(YaCySchema field) {
+ assert !field.isMultiValued();
+ assert field.getType() == SolrType.date;
Date x = (Date) this.doc.getFieldValue(field.name());
if (x == null) return new Date(0);
return x;
}
private String getString(YaCySchema field) {
- String x = (String) this.doc.getFieldValue(field.name());
+ assert !field.isMultiValued();
+ assert field.getType() == SolrType.string || field.getType() == SolrType.text_general || field.getType() == SolrType.text_en_splitting_tight;
+ Object x = this.doc.getFieldValue(field.name());
if (x == null) return "";
- return x;
+ if (x instanceof ArrayList) {
+ @SuppressWarnings("unchecked")
+ ArrayList xa = (ArrayList) x;
+ return xa.size() == 0 ? "" : xa.get(0);
+ }
+ return (String) x;
}
@SuppressWarnings("unchecked")
private ArrayList getArrayList(YaCySchema field) {
+ assert field.isMultiValued();
+ assert field.getType() == SolrType.string || field.getType() == SolrType.text_general;
Object r = this.doc.getFieldValue(field.name());
if (r == null) return new ArrayList(0);
if (r instanceof ArrayList) {
diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java
index addefade4..6d5f1bc06 100644
--- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java
+++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java
@@ -111,7 +111,7 @@ public class URIMetadataRow implements URIMetadata {
public URIMetadataRow() {
// create a dummy entry, good to produce poison objects
this.entry = rowdef.newEntry();
- this.snippet = null;
+ this.snippet = "";
this.word = null;
this.ranking = 0;
this.comp = null;
@@ -161,7 +161,7 @@ public class URIMetadataRow implements URIMetadata {
this.entry.setCol(col_lvideo, lvideo);
this.entry.setCol(col_lapp, lapp);
//System.out.println("===DEBUG=== " + load.toString() + ", " + decodeDate(col_load).toString());
- this.snippet = null;
+ this.snippet = "";
this.word = null;
this.ranking = 0;
this.comp = null;
@@ -207,7 +207,7 @@ public class URIMetadataRow implements URIMetadata {
public URIMetadataRow(final Row.Entry entry, final WordReference searchedWord, final long ranking) {
this.entry = entry;
- this.snippet = null;
+ this.snippet = "";
this.word = searchedWord;
this.ranking = ranking;
this.comp = null;
diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java
index 34187db76..b210a53dc 100644
--- a/source/net/yacy/search/index/SolrConfiguration.java
+++ b/source/net/yacy/search/index/SolrConfiguration.java
@@ -110,46 +110,62 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
}
protected void add(final SolrInputDocument doc, final YaCySchema key, final byte[] value) {
+ assert !key.isMultiValued();
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.length != 0))) key.add(doc, UTF8.String(value));
}
protected void add(final SolrInputDocument doc, final YaCySchema key, final String value) {
+ assert !key.isMultiValued();
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && !value.isEmpty()))) key.add(doc, value);
}
protected void add(final SolrInputDocument doc, final YaCySchema key, final String value, final float boost) {
+ assert !key.isMultiValued();
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && !value.isEmpty()))) key.add(doc, value, boost);
}
protected void add(final SolrInputDocument doc, final YaCySchema key, final Date value) {
+ assert !key.isMultiValued();
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.getTime() > 0))) key.add(doc, value);
}
protected void add(final SolrInputDocument doc, final YaCySchema key, final String[] value) {
+ assert key.isMultiValued();
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.length > 0))) key.add(doc, value);
}
- protected void add(final SolrInputDocument doc, final YaCySchema key, final List value) {
- if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && !value.isEmpty()))) key.add(doc, value);
+ protected void add(final SolrInputDocument doc, final YaCySchema key, final Integer[] value) {
+ assert key.isMultiValued();
+ if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.length > 0))) key.add(doc, value);
+ }
+
+    /** Adds a list of values to a multi-valued field if the configuration is empty or contains the key; in lazy mode null or empty lists are skipped. */
+    protected void add(final SolrInputDocument doc, final YaCySchema key, final List<?> values) {
+        assert key.isMultiValued();
+        if ((isEmpty() || contains(key)) && (!this.lazy || (values != null && !values.isEmpty()))) key.add(doc, values);
+    }
protected void add(final SolrInputDocument doc, final YaCySchema key, final int value) {
+ assert !key.isMultiValued();
if ((isEmpty() || contains(key)) && (!this.lazy || value != 0)) key.add(doc, value);
}
protected void add(final SolrInputDocument doc, final YaCySchema key, final long value) {
+ assert !key.isMultiValued();
if ((isEmpty() || contains(key)) && (!this.lazy || value != 0)) key.add(doc, value);
}
protected void add(final SolrInputDocument doc, final YaCySchema key, final float value) {
+ assert !key.isMultiValued();
if ((isEmpty() || contains(key)) && (!this.lazy || value != 0.0f)) key.add(doc, value);
}
protected void add(final SolrInputDocument doc, final YaCySchema key, final double value) {
+ assert !key.isMultiValued();
if ((isEmpty() || contains(key)) && (!this.lazy || value != 0.0d)) key.add(doc, value);
}
protected void add(final SolrInputDocument doc, final YaCySchema key, final boolean value) {
+ assert !key.isMultiValued();
if (isEmpty() || contains(key)) key.add(doc, value);
}
@@ -224,9 +240,32 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.host_organizationdnc_s)) add(doc, YaCySchema.host_organizationdnc_s, orga + '.' + dnc);
if (allAttr || contains(YaCySchema.host_subdomain_s)) add(doc, YaCySchema.host_subdomain_s, subdom);
}
- if (allAttr || contains(YaCySchema.title)) add(doc, YaCySchema.title, md.dc_title());
+
+ String title = md.dc_title();
+ if (allAttr || contains(YaCySchema.title)) add(doc, YaCySchema.title, new String[]{title});
+ if (allAttr || contains(YaCySchema.title_count_i)) add(doc, YaCySchema.title_count_i, 1);
+ if (allAttr || contains(YaCySchema.title_chars_val)) {
+ Integer[] cv = new Integer[]{new Integer(title.length())};
+ add(doc, YaCySchema.title_chars_val, cv);
+ }
+ if (allAttr || contains(YaCySchema.title_words_val)) {
+ Integer[] cv = new Integer[]{new Integer(title.split(" ").length)};
+ add(doc, YaCySchema.title_words_val, cv);
+ }
+
+ String description = md.snippet(); if (description == null) description = "";
+ if (allAttr || contains(YaCySchema.description)) add(doc, YaCySchema.description, description);
+ if (allAttr || contains(YaCySchema.description_count_i)) add(doc, YaCySchema.description_count_i, 1);
+ if (allAttr || contains(YaCySchema.description_chars_val)) {
+ Integer[] cv = new Integer[]{new Integer(description.length())};
+ add(doc, YaCySchema.description_chars_val, cv);
+ }
+ if (allAttr || contains(YaCySchema.description_words_val)) {
+ Integer[] cv = new Integer[]{new Integer(description.split(" ").length)};
+ add(doc, YaCySchema.description_words_val, cv);
+ }
+
if (allAttr || contains(YaCySchema.author)) add(doc, YaCySchema.author, md.dc_creator());
- if (allAttr || contains(YaCySchema.description)) add(doc, YaCySchema.description, md.snippet());
if (allAttr || contains(YaCySchema.content_type)) add(doc, YaCySchema.content_type, Response.doctype2mime(digestURI.getFileExtension(), md.doctype()));
if (allAttr || contains(YaCySchema.last_modified)) add(doc, YaCySchema.last_modified, md.moddate());
if (allAttr || contains(YaCySchema.wordcount_i)) add(doc, YaCySchema.wordcount_i, md.wordCount());
@@ -243,10 +282,8 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
}
// path elements of link
- final String path = digestURI.getPath();
- if (path != null && (allAttr || contains(YaCySchema.url_paths_sxt))) {
- final String[] paths = path.split("/");
- if (paths.length > 0) add(doc, YaCySchema.url_paths_sxt, paths);
+ if (allAttr || contains(YaCySchema.url_paths_sxt)) {
+ add(doc, YaCySchema.url_paths_sxt, digestURI.getPaths());
}
if (allAttr || contains(YaCySchema.imagescount_i)) add(doc, YaCySchema.imagescount_i, md.limage());
@@ -331,10 +368,39 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.host_organizationdnc_s)) add(doc, YaCySchema.host_organizationdnc_s, orga + '.' + dnc);
if (allAttr || contains(YaCySchema.host_subdomain_s)) add(doc, YaCySchema.host_subdomain_s, subdom);
}
- if (allAttr || contains(YaCySchema.title)) add(doc, YaCySchema.title, yacydoc.dc_title());
+
+ List titles = yacydoc.titles();
+ if (allAttr || contains(YaCySchema.title)) add(doc, YaCySchema.title, titles);
+ if (allAttr || contains(YaCySchema.title_count_i)) add(doc, YaCySchema.title_count_i, titles.size());
+ if (allAttr || contains(YaCySchema.title_chars_val)) {
+ ArrayList cv = new ArrayList(titles.size());
+ for (String s: titles) cv.add(new Integer(s.length()));
+ add(doc, YaCySchema.title_chars_val, cv);
+ }
+ if (allAttr || contains(YaCySchema.title_words_val)) {
+ ArrayList cv = new ArrayList(titles.size());
+ for (String s: titles) cv.add(new Integer(s.split(" ").length));
+ add(doc, YaCySchema.title_words_val, cv);
+ }
+
+ String description = yacydoc.dc_description();
+ List descriptions = new ArrayList();
+ for (String s: description.split("\n")) descriptions.add(s);
+ if (allAttr || contains(YaCySchema.description)) add(doc, YaCySchema.description, description);
+ if (allAttr || contains(YaCySchema.description_count_i)) add(doc, YaCySchema.description_count_i, descriptions.size());
+ if (allAttr || contains(YaCySchema.description_chars_val)) {
+ ArrayList cv = new ArrayList(descriptions.size());
+ for (String s: descriptions) cv.add(new Integer(s.length()));
+ add(doc, YaCySchema.description_chars_val, cv);
+ }
+ if (allAttr || contains(YaCySchema.description_words_val)) {
+ ArrayList cv = new ArrayList(descriptions.size());
+ for (String s: descriptions) cv.add(new Integer(s.split(" ").length));
+ add(doc, YaCySchema.description_words_val, cv);
+ }
+
if (allAttr || contains(YaCySchema.author)) add(doc, YaCySchema.author, yacydoc.dc_creator());
- if (allAttr || contains(YaCySchema.description)) add(doc, YaCySchema.description, yacydoc.dc_description());
- if (allAttr || contains(YaCySchema.content_type)) add(doc, YaCySchema.content_type, yacydoc.dc_format());
+ if (allAttr || contains(YaCySchema.content_type)) add(doc, YaCySchema.content_type, new String[]{yacydoc.dc_format()});
if (allAttr || contains(YaCySchema.last_modified)) add(doc, YaCySchema.last_modified, header == null ? new Date() : header.lastModified());
if (allAttr || contains(YaCySchema.keywords)) add(doc, YaCySchema.keywords, yacydoc.dc_subject(' '));
final String content = yacydoc.getTextString();
@@ -345,10 +411,8 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
}
// path elements of link
- final String path = digestURI.getPath();
- if (path != null && (allAttr || contains(YaCySchema.url_paths_sxt))) {
- final String[] paths = path.split("/");
- if (paths.length > 0) add(doc, YaCySchema.url_paths_sxt, paths);
+ if (allAttr || contains(YaCySchema.url_paths_sxt)) {
+ add(doc, YaCySchema.url_paths_sxt, digestURI.getPaths());
}
// get list of all links; they will be shrinked by urls that appear in other fields of the solr scheme
@@ -614,7 +678,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.inboundlinks_urlstub_txt)) add(doc, YaCySchema.inboundlinks_urlstub_txt, inboundlinksURLStub);
if (allAttr || contains(YaCySchema.inboundlinks_name_txt)) add(doc, YaCySchema.inboundlinks_name_txt, inboundlinksName);
if (allAttr || contains(YaCySchema.inboundlinks_rel_sxt)) add(doc, YaCySchema.inboundlinks_rel_sxt, inboundlinksRel);
- if (allAttr || contains(YaCySchema.inboundlinks_relflags_sxt)) add(doc, YaCySchema.inboundlinks_relflags_sxt, relEval(inboundlinksRel));
+ if (allAttr || contains(YaCySchema.inboundlinks_relflags_val)) add(doc, YaCySchema.inboundlinks_relflags_val, relEval(inboundlinksRel));
if (allAttr || contains(YaCySchema.inboundlinks_text_txt)) add(doc, YaCySchema.inboundlinks_text_txt, inboundlinksText);
c = 0;
@@ -652,7 +716,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.outboundlinks_urlstub_txt)) add(doc, YaCySchema.outboundlinks_urlstub_txt, outboundlinksURLStub);
if (allAttr || contains(YaCySchema.outboundlinks_name_txt)) add(doc, YaCySchema.outboundlinks_name_txt, outboundlinksName);
if (allAttr || contains(YaCySchema.outboundlinks_rel_sxt)) add(doc, YaCySchema.outboundlinks_rel_sxt, outboundlinksRel);
- if (allAttr || contains(YaCySchema.outboundlinks_relflags_sxt)) add(doc, YaCySchema.outboundlinks_relflags_sxt, relEval(inboundlinksRel));
+ if (allAttr || contains(YaCySchema.outboundlinks_relflags_val)) add(doc, YaCySchema.outboundlinks_relflags_val, relEval(outboundlinksRel)); // flags must be computed from the outbound rel list, the same list that is stored in outboundlinks_rel_sxt
if (allAttr || contains(YaCySchema.outboundlinks_text_txt)) add(doc, YaCySchema.outboundlinks_text_txt, outboundlinksText);
// charset
@@ -701,14 +765,16 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
* @param rel
* @return binary encoded information about rel
*/
- private static int relEval(final List rel) {
- int i = 0;
+ private static List relEval(final List rel) {
+ List il = new ArrayList(rel.size());
for (final String s: rel) {
+ int i = 0;
final String s0 = s.toLowerCase().trim();
if ("me".equals(s0)) i += 1;
if ("nofollow".equals(s0)) i += 2;
+ il.add(i);
}
- return i;
+ return il;
}
public String solrGetID(final SolrDocument solr) {
@@ -768,11 +834,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (digestURI.getHost() != null) add(solrdoc, YaCySchema.host_s, digestURI.getHost());
// path elements of link
- final String path = digestURI.getPath();
- if (path != null) {
- final String[] paths = path.split("/");
- if (paths.length > 0) add(solrdoc, YaCySchema.url_paths_sxt, paths);
- }
+ add(solrdoc, YaCySchema.url_paths_sxt, digestURI.getPaths());
add(solrdoc, YaCySchema.failreason_t, failReason);
add(solrdoc, YaCySchema.httpstatus_i, httpstatus);
return solrdoc;
diff --git a/source/net/yacy/search/index/YaCySchema.java b/source/net/yacy/search/index/YaCySchema.java
index 0ca35f370..882d9e4a0 100644
--- a/source/net/yacy/search/index/YaCySchema.java
+++ b/source/net/yacy/search/index/YaCySchema.java
@@ -103,14 +103,14 @@ public enum YaCySchema implements Schema {
inboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "internal links, the url only without the protocol"),
inboundlinks_name_txt(SolrType.text_general, true, true, true, "internal links, the name property of the a-tag"),
inboundlinks_rel_sxt(SolrType.string, true, true, true, "internal links, the rel property of the a-tag"),
- inboundlinks_relflags_sxt(SolrType.string, true, true, true, "internal links, the rel property of the a-tag, coded binary"),
+ inboundlinks_relflags_val(SolrType.integer, true, true, true, "internal links, the rel property of the a-tag, coded binary"),
inboundlinks_text_txt(SolrType.text_general, true, true, true, "internal links, the text content of the a-tag"),
outboundlinks_tag_txt(SolrType.text_general, true, true, true, "external links, normalized (absolute URLs), as - tag with anchor text and nofollow"),
outboundlinks_protocol_sxt(SolrType.string, true, true, true, "external links, only the protocol"),
outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "external links, the url only without the protocol"),
outboundlinks_name_txt(SolrType.text_general, true, true, true, "external links, the name property of the a-tag"),
outboundlinks_rel_sxt(SolrType.string, true, true, true, "external links, the rel property of the a-tag"),
- outboundlinks_relflags_sxt(SolrType.string, true, true, true, "external links, the rel property of the a-tag, coded binary"),
+ outboundlinks_relflags_val(SolrType.integer, true, true, true, "external links, the rel property of the a-tag, coded binary"),
outboundlinks_text_txt(SolrType.text_general, true, true, true, "external links, the text content of the a-tag"),
images_tag_txt(SolrType.text_general, true, true, true, " all image tags, encoded as
tag inclusive alt- and title property"),
images_urlstub_txt(SolrType.text_general, true, true, true, "all image links without the protocol and '://'"),
@@ -131,6 +131,7 @@ public enum YaCySchema implements Schema {
iframes_txt(SolrType.text_general, true, true, true, "list of all links to iframes"),
iframesscount_i(SolrType.integer, true, true, false, "number of iframes_txt"),
+ url_protocol_s(SolrType.string, true, true, false, "the protocol of the url"),
url_paths_sxt(SolrType.string, true, true, true, "all path elements in the url"),
url_parameter_i(SolrType.integer, true, true, false, "number of key-value pairs in search part of the url"),
url_parameter_key_sxt(SolrType.string, true, true, true, "the keys from key-value pairs in the search part of the url"),
@@ -138,15 +139,18 @@ public enum YaCySchema implements Schema {
url_chars_i(SolrType.integer, true, true, false, "number of all characters in the url == length of sku field"),
host_s(SolrType.string, true, true, false, "host of the url"),
- url_protocol_s(SolrType.string, true, true, false, "the protocol of the url"),
host_dnc_s(SolrType.string, true, true, false, "the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used."),
host_organization_s(SolrType.string, true, true, false, "either the second level domain or, if a ccSLD is used, the third level domain"),
host_organizationdnc_s(SolrType.string, true, true, false, "the organization and dnc concatenated with '.'"),
host_subdomain_s(SolrType.string, true, true, false, "the remaining part of the host without organizationdnc"),
- //title_count_i(SolrType.integer, true, true, false, ""),
- //title_chars_i(SolrType.integer, true, true, false, ""),
- //title_words_i(SolrType.integer, true, true, false, ""),
+ title_count_i(SolrType.integer, true, true, false, "number of titles (counting the 'title' field) in the document"),
+ title_chars_val(SolrType.integer, true, true, true, "number of characters for each title"),
+ title_words_val(SolrType.integer, true, true, true, "number of words in each title"),
+
+ description_count_i(SolrType.integer, true, true, false, "number of descriptions in the document. Its not counting the 'description' field since there is only one. But it counts the number of descriptions that appear in the document (if any)"),
+ description_chars_val(SolrType.integer, true, true, true, "number of characters for each description"),
+ description_words_val(SolrType.integer, true, true, true, "number of words in each description"),
// special values; can only be used if '_val' type is defined in schema file; this is not standard
bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"),
@@ -239,42 +243,77 @@ public enum YaCySchema implements Schema {
}
public final void add(final SolrInputDocument doc, final String value) {
+ assert !this.isMultiValued();
doc.setField(this.getSolrFieldName(), value);
}
public final void add(final SolrInputDocument doc, final Date value) {
+ assert !this.isMultiValued();
doc.setField(this.getSolrFieldName(), value);
}
public final void add(final SolrInputDocument doc, final int value) {
+ assert !this.isMultiValued();
doc.setField(this.getSolrFieldName(), value);
}
public final void add(final SolrInputDocument doc, final long value) {
+ assert !this.isMultiValued();
doc.setField(this.getSolrFieldName(), value);
}
public final void add(final SolrInputDocument doc, final String[] value) {
+ assert this.isMultiValued();
doc.setField(this.getSolrFieldName(), value);
}
- public final void add(final SolrInputDocument doc, final List value) {
- doc.setField(this.getSolrFieldName(), value.toArray(new String[value.size()]));
+ public final void add(final SolrInputDocument doc, final Integer[] value) {
+ assert this.isMultiValued();
+ doc.setField(this.getSolrFieldName(), value);
+ }
+
+    /**
+     * Sets this multi-valued field to the elements of the given list.
+     * The array type handed to Solr follows the declared type of this field:
+     * Integer[] for integer fields, String[] for string and text_general fields.
+     * A null or empty list yields an empty array of that same type, so every field
+     * type that accepts a filled list also accepts an empty one.
+     * @param doc the Solr input document that receives the field
+     * @param value the values to store; may be null or empty
+     */
+    public final void add(final SolrInputDocument doc, final List<?> value) {
+        assert this.isMultiValued();
+        final boolean empty = value == null || value.isEmpty();
+        if (this.type == SolrType.integer) {
+            assert empty || value.iterator().next() instanceof Integer;
+            doc.setField(this.getSolrFieldName(), empty ? new Integer[0] : value.toArray(new Integer[value.size()]));
+        } else if (this.type == SolrType.string || this.type == SolrType.text_general) {
+            assert empty || value.iterator().next() instanceof String;
+            doc.setField(this.getSolrFieldName(), empty ? new String[0] : value.toArray(new String[value.size()]));
+        } else {
+            // unsupported field type: fail when assertions are enabled, otherwise fall back to Object[]
+            assert false : "ADD: type is " + this.type.name();
+            doc.setField(this.getSolrFieldName(), empty ? new Object[0] : value.toArray(new Object[value.size()]));
+        }
+    }
public final void add(final SolrInputDocument doc, final float value) {
+ assert !this.isMultiValued();
doc.setField(this.getSolrFieldName(), value);
}
public final void add(final SolrInputDocument doc, final double value) {
+ assert !this.isMultiValued();
doc.setField(this.getSolrFieldName(), value);
}
public final void add(final SolrInputDocument doc, final boolean value) {
+ assert !this.isMultiValued();
doc.setField(this.getSolrFieldName(), value);
}
public final void add(final SolrInputDocument doc, final String value, final float boost) {
+ assert !this.isMultiValued();
doc.setField(this.getSolrFieldName(), value, boost);
}