- added new solr fields:

title_count_i, title_chars_val, title_words_val
description_count_i, description_chars_val, description_words_val
- added many asserts to ensure data type correctness from YaCy to Solr
and vice versa
- made many fixes according to new findings from these asserts (!)
pull/1/head
Michael Peter Christen 13 years ago
parent 3142e675e8
commit 528d6763fa

@ -187,7 +187,7 @@ h6_txt
#inboundlinks_rel_sxt
## internal links, the rel property of the a-tag, coded binary
#inboundlinks_relflags_sxt
#inboundlinks_relflags_val
## internal links, the text content of the a-tag
#inboundlinks_text_txt
@ -208,7 +208,7 @@ h6_txt
#outboundlinks_rel_sxt
## external links, the rel property of the a-tag, coded binary
#outboundlinks_relflags_sxt
#outboundlinks_relflags_val
## external links, the text content of the a-tag
#outboundlinks_text_txt
@ -306,10 +306,28 @@ italic_txt
## the remaining part of the host without organizationdnc
#host_subdomain_s
## number of titles (counting the 'title' field) in the document
#title_count_i
## number of characters for each title
#title_chars_val
## number of words in each title
#title_words_val
## number of descriptions in the document. It's not counting the 'description' field since there is only one. But it counts the number of descriptions that appear in the document (if any)
#description_count_i
## number of characters for each description
#description_chars_val
## number of words in each description
#description_words_val
## names of cms attributes; if several are recognized then they are listed in decreasing order of number of matching criteria
#ext_cms_txt
##number of attributes that count for a specific cms in attr_cms
## number of attributes that count for a specific cms in attr_cms
#ext_cms_val
## names of ad-servers/ad-services

@ -79,9 +79,12 @@ httpclient.wire.level = OFF
org.apache.pdfbox.level = INFO
# Properties for solr
org.apache.solr.util.plugin.AbstractPluginLoader.enabled = FALSE
org.apache.solr.schema.IndexSchema.enabled = FALSE
org.apache.solr.handler.enabled = FALSE
org.apache.solr.update.enabled = TRUE
org.apache.solr.update.level = INFO
org.apache.solr.core.enabled = TRUE
org.apache.solr.core.level = INFO
javax.management.mbeanserver.level = INFO
javax.management.mbeanserver.enabled = FALSE

@ -57,7 +57,7 @@
</tr>
#{list}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
<td>#[host]#</td>
<td><a href="AccessTracker_p.html?page=1&amp;host=#[host]#">#[host]#</a></td>
<td>#[date]#</td>
<td>#[path]#</td>
</tr>

@ -124,7 +124,7 @@ public class getpageinfo {
prop.putXML("desc", removelinebreaks(scraper.dc_description()));
// put language
final Set<String> languages = scraper.getContentLanguages();
prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next());
prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next());
// get links and put them into a semicolon-separated list
final Set<MultiProtocolURI> uris = scraper.getAnchors().keySet();

@ -124,7 +124,7 @@ public class getpageinfo_p {
prop.putXML("desc", scraper.dc_description());
// put language
final Set<String> languages = scraper.getContentLanguages();
prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next());
prop.putXML("lang", (languages == null || languages.size() == 0) ? "unknown" : languages.iterator().next());
// get links and put them into a semicolon-separated list
final Set<MultiProtocolURI> uris = scraper.getAnchors().keySet();

@ -117,21 +117,21 @@ public class Response {
return doctype;
}
public static String doctype2mime(String ext, char doctype) {
if (doctype == DT_PDFPS) return "application/pdf";
if (doctype == DT_HTML) return "text/html";
if (doctype == DT_DOC) return "application/msword";
if (doctype == DT_FLASH) return "application/x-shockwave-flash";
if (doctype == DT_SHARE) return "text/plain";
if (doctype == DT_BINARY) return "application/octet-stream";
public static String[] doctype2mime(String ext, char doctype) {
if (doctype == DT_PDFPS) return new String[]{"application/pdf"};
if (doctype == DT_HTML) return new String[]{"text/html"};
if (doctype == DT_DOC) return new String[]{"application/msword"};
if (doctype == DT_FLASH) return new String[]{"application/x-shockwave-flash"};
if (doctype == DT_SHARE) return new String[]{"text/plain"};
if (doctype == DT_BINARY) return new String[]{"application/octet-stream"};
String mime = Classification.ext2mime(ext);
int p = mime.indexOf('/');
if (p < 0) return mime;
if (doctype == DT_TEXT) return "text" + mime.substring(p);
if (doctype == DT_IMAGE) return "image" + mime.substring(p);
if (doctype == DT_AUDIO) return "audio" + mime.substring(p);
if (doctype == DT_MOVIE) return "video" + mime.substring(p);
return mime;
if (p < 0) return new String[]{mime};
if (doctype == DT_TEXT) return new String[]{"text" + mime.substring(p)};
if (doctype == DT_IMAGE) return new String[]{"image" + mime.substring(p)};
if (doctype == DT_AUDIO) return new String[]{"audio" + mime.substring(p)};
if (doctype == DT_MOVIE) return new String[]{"video" + mime.substring(p)};
return new String[]{mime};
}
public static final int QUEUE_STATE_FRESH = 0;

@ -80,6 +80,7 @@ import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
@ -493,7 +494,8 @@ public final class HTTPDFileHandler {
} else {
if (element.endsWith("html") || (element.endsWith("htm"))) {
scraper = ContentScraper.parseResource(f, 10000);
headline = scraper.getTitle();
Collection<String> t = scraper.getTitles();
headline = t.size() > 0 ? t.iterator().next() : "";
author = scraper.getAuthor();
publisher = scraper.getPublisher();
description = scraper.getDescription();

@ -707,6 +707,10 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
return this.path;
}
/**
 * Split the path of this URI into its segments, using '/' as separator.
 * A single leading '/' is dropped before splitting so that the first
 * element is the first path segment and not an empty string.
 *
 * @return the path segments, or null if this URI has no path
 */
public String[] getPaths() {
    if (this.path == null) return null;
    // startsWith() instead of charAt(0): an empty path must not raise a StringIndexOutOfBoundsException
    final String relative = this.path.startsWith("/") ? this.path.substring(1) : this.path;
    return relative.split("/");
}
/**
* return the file object to a local file
* this patches also 'strange' windows file paths

@ -7,12 +7,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -32,13 +32,13 @@ import java.util.Set;
public class RSSFeed implements Iterable<RSSMessage> {
public static final int DEFAULT_MAXSIZE = 1000;
// class variables
private RSSMessage channel;
private String imageURL;
private Map<String, RSSMessage> messages; // a guid:Item map
private int maxsize;
private final Map<String, RSSMessage> messages; // a guid:Item map
private final int maxsize;
public RSSFeed(final int maxsize) {
this.messages = Collections.synchronizedMap(new LinkedHashMap<String, RSSMessage>());
this.channel = null;
@ -62,13 +62,13 @@ public class RSSFeed implements Iterable<RSSMessage> {
this.addMessage(message);
}
}
public void setChannel(final RSSMessage channelItem) {
this.channel = channelItem;
}
public RSSMessage getChannel() {
return channel;
return this.channel;
}
public void setImage(final String imageURL) {
@ -78,82 +78,86 @@ public class RSSFeed implements Iterable<RSSMessage> {
public String getImage() {
return this.imageURL;
}
public Set<MultiProtocolURI> getLinks() {
Set<MultiProtocolURI> links = new HashSet<MultiProtocolURI>();
for (RSSMessage message: messages.values()) {
for (RSSMessage message: this.messages.values()) {
try {links.add(new MultiProtocolURI(message.getLink()));} catch (MalformedURLException e) {}
}
return links;
}
public void addMessage(final RSSMessage item) {
final String guid = item.getGuid();
messages.put(guid, item);
this.messages.put(guid, item);
// in case that the feed is full (size > maxsize) flush the oldest element
while (messages.size() > this.maxsize) pollMessage();
while (this.messages.size() > this.maxsize) pollMessage();
}
public RSSMessage getMessage(final String guid) {
// retrieve item by guid
return messages.get(guid);
return this.messages.get(guid);
}
public boolean isEmpty() {
return messages.isEmpty();
return this.messages.isEmpty();
}
public int size() {
return messages.size();
return this.messages.size();
}
@Override
public Iterator<RSSMessage> iterator() {
return new messageIterator();
}
public RSSMessage pollMessage() {
// retrieve and delete item
synchronized (messages) {
if (messages.isEmpty()) return null;
final String nextGUID = messages.keySet().iterator().next();
synchronized (this.messages) {
if (this.messages.isEmpty()) return null;
final String nextGUID = this.messages.keySet().size() == 0 ? null : this.messages.keySet().iterator().next();
if (nextGUID == null) return null;
return messages.remove(nextGUID);
return this.messages.remove(nextGUID);
}
}
public class messageIterator implements Iterator<RSSMessage>{
Iterator<String> GUIDiterator;
String lastGUID;
int t;
public messageIterator() {
t = messages.size(); // termination counter
GUIDiterator = messages.keySet().iterator();
lastGUID = null;
this.t = RSSFeed.this.messages.size(); // termination counter
this.GUIDiterator = RSSFeed.this.messages.keySet().iterator();
this.lastGUID = null;
}
@Override
public boolean hasNext() {
if (t <= 0) return false; // ensure termination
return GUIDiterator.hasNext();
if (this.t <= 0) return false; // ensure termination
return this.GUIDiterator.hasNext();
}
@Override
public RSSMessage next() {
t--; // ensure termination
this.t--; // ensure termination
try {
lastGUID = GUIDiterator.next();
this.lastGUID = this.GUIDiterator.next();
} catch (ConcurrentModificationException e) {
return null;
}
if (lastGUID == null) return null;
return messages.get(lastGUID);
if (this.lastGUID == null) return null;
return RSSFeed.this.messages.get(this.lastGUID);
}
@Override
public void remove() {
if (lastGUID == null) return;
GUIDiterator.remove();
messages.remove(lastGUID);
if (this.lastGUID == null) return;
this.GUIDiterator.remove();
RSSFeed.this.messages.remove(this.lastGUID);
}
}
}

@ -84,7 +84,7 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
@Override
public String toString() {
return this.keys.iterator().next();
return this.keys.size() == 0 ? "" : this.keys.iterator().next();
}
}

@ -23,7 +23,9 @@
package net.yacy.document;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import net.yacy.kelondro.logging.Log;
@ -46,6 +48,7 @@ public abstract class AbstractParser implements Parser {
/**
* return the name of the parser
*/
@Override
public String getName() {
return this.name;
}
@ -54,6 +57,7 @@ public abstract class AbstractParser implements Parser {
* each parser must define a set of supported mime types
* @return a set of mime type strings that are supported
*/
@Override
public Set<String> supportedMimeTypes() {
return this.SUPPORTED_MIME_TYPES;
}
@ -62,6 +66,7 @@ public abstract class AbstractParser implements Parser {
* each parser must define a set of supported file extensions
* @return a set of file name extensions that are supported
*/
@Override
public Set<String> supportedExtensions() {
return this.SUPPORTED_EXTENSIONS;
}
@ -71,6 +76,7 @@ public abstract class AbstractParser implements Parser {
* @param o
* @return
*/
@Override
public boolean equals(final Object o) {
    // the equals contract requires 'false' (not an exception) for null and for
    // objects of a foreign type; instanceof covers both cases
    if (!(o instanceof Parser)) return false;
    // two parsers are considered equal if they carry the same name
    return getName().equals(((Parser) o).getName());
}
@ -79,8 +85,15 @@ public abstract class AbstractParser implements Parser {
* the hash code of a parser
* @return the hash code of the parser name string
*/
@Override
public int hashCode() {
return getName().hashCode();
}
/**
 * Wrap a single title string into a new, mutable list.
 * A null string is not stored: the result is then an empty list, so that
 * consumers which take the first element never receive a null entry.
 *
 * @param t the string to wrap, may be null
 * @return a new list containing t, or an empty list if t is null
 */
public static List<String> singleList(String t) {
    List<String> c = new ArrayList<String>(1);
    if (t != null) c.add(t);
    return c;
}
}

@ -44,6 +44,7 @@ import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
@ -75,7 +76,7 @@ public class Document {
private final String mimeType; // mimeType as taken from http header
private final String charset; // the charset of the document
private final List<String> keywords; // most resources provide a keyword field
private StringBuilder title; // a document title, taken from title or h1 tag; shall appear as headline of search result
private List<String> titles; // the document titles, taken from title and/or h1 tag; shall appear as headline of search result
private final StringBuilder creator; // author or copyright
private final String publisher; // publisher
private final List<String> sections; // if present: more titles/headlines appearing in the document
@ -99,7 +100,9 @@ public class Document {
public Document(final DigestURI location, final String mimeType, final String charset,
final Object parserObject,
final Set<String> languages,
final String[] keywords, final String title, final String author, final String publisher,
final String[] keywords,
final List<String> titles,
final String author, final String publisher,
final String[] sections, final String abstrct,
final double lon, final double lat,
final Object text,
@ -113,7 +116,7 @@ public class Document {
this.parserObject = parserObject;
this.keywords = new LinkedList<String>();
if (keywords != null) this.keywords.addAll(Arrays.asList(keywords));
this.title = (title == null) ? new StringBuilder(0) : new StringBuilder(title);
this.titles = titles;
this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
this.sections = (sections == null) ? new LinkedList<String>() : Arrays.asList(sections);
this.description = (abstrct == null) ? new StringBuilder(0) : new StringBuilder(abstrct);
@ -186,11 +189,20 @@ dc_rights
*/
public String dc_title() {
return (this.title == null) ? "" : this.title.toString();
return (this.titles == null || this.titles.size() == 0) ? "" : this.titles.iterator().next();
}
public List<String> titles() {
return this.titles;
}
public void setTitle(final String title) {
this.title = new StringBuilder(title);
this.titles = new ArrayList<String>();
if (title != null) this.titles.add(title);
}
/**
 * Append a further title to the titles of this document.
 * A null title is ignored.
 *
 * @param title the title to add, may be null
 */
public void addTitle(final String title) {
    if (title == null) return;
    // the constructor stores the given list as-is, so it can be null here
    if (this.titles == null) this.titles = new ArrayList<String>();
    this.titles.add(title);
}
public String dc_creator() {
@ -620,10 +632,7 @@ dc_rights
public void addSubDocuments(final Document[] docs) throws IOException {
for (final Document doc: docs) {
this.sections.addAll(Arrays.asList(doc.getSectionTitles()));
if (this.title.length() > 0) this.title.append('\n');
this.title.append(doc.dc_title());
this.titles.addAll(doc.titles());
this.keywords.addAll(doc.getKeywords());
if (this.description.length() > 0) this.description.append('\n');
@ -760,10 +769,9 @@ dc_rights
final StringBuilder authors = new StringBuilder(80);
final StringBuilder publishers = new StringBuilder(80);
final StringBuilder subjects = new StringBuilder(80);
final StringBuilder title = new StringBuilder(80);
final StringBuilder description = new StringBuilder(80);
final LinkedList<String> sectionTitles = new LinkedList<String>();
final Collection<String> titles = new LinkedHashSet<String>();
final Collection<String> sectionTitles = new LinkedHashSet<String>();
final Map<MultiProtocolURI, Properties> anchors = new HashMap<MultiProtocolURI, Properties>();
final Map<MultiProtocolURI, String> rss = new HashMap<MultiProtocolURI, String>();
final Map<MultiProtocolURI, ImageEntry> images = new HashMap<MultiProtocolURI, ImageEntry>();
@ -790,9 +798,7 @@ dc_rights
subjects.append(subject);
}
if (title.length() > 0) title.append("\n");
title.append(doc.dc_title());
titles.addAll(doc.titles());
sectionTitles.addAll(Arrays.asList(doc.getSectionTitles()));
if (description.length() > 0) description.append("\n");
@ -822,6 +828,8 @@ dc_rights
}
// return consolidation
ArrayList<String> titlesa = new ArrayList<String>();
titlesa.addAll(titles);
return new Document(
location,
globalMime,
@ -829,7 +837,7 @@ dc_rights
null,
null,
subjects.toString().split(" |,"),
title.toString(),
titlesa,
authors.toString(),
publishers.toString(),
sectionTitles.toArray(new String[sectionTitles.size()]),

@ -30,8 +30,10 @@ import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.text.Collator;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.TreeMap;
@ -261,7 +263,8 @@ public class DCEntry extends TreeMap<String, String> {
public Document document() {
HashSet<String> languages = new HashSet<String>();
languages.add(getLanguage());
List<String> t = new ArrayList<String>(1);
t.add(getTitle());
return new Document(
getIdentifier(true),
"text/html",
@ -269,7 +272,7 @@ public class DCEntry extends TreeMap<String, String> {
this,
languages,
getSubject(),
getTitle(),
t,
getCreator(),
getPublisher(),
null,

@ -60,14 +60,14 @@ public class AugmentParser extends AbstractParser implements Parser {
private static Document analyze (Document alreadyParsedDocument, DigestURI url,
String mimeType, String charset) {
Document newDoc = new Document(url, mimeType, charset, null, null, null, "", "",
Document newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
"", null, "", 0, 0, null, null, null, null, false);
// if the magic word appears in the document, perform extra actions.
if (alreadyParsedDocument.getKeywords().contains("magicword")) {
String all = "";
all = "yacylatest";
newDoc = new Document(url, mimeType, charset, null, null, null, "", "",
newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
"", null, "", 0, 0, all, null, null, null, false);
}
@ -77,7 +77,7 @@ public class AugmentParser extends AbstractParser implements Parser {
private Document parseAndAugment(DigestURI url, String mimeType, String charset) {
String all = "";
Document newDoc = new Document(url, mimeType, charset, null, null, null, "", "",
Document newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
"", null, "", 0, 0, all, null, null, null, false);
Iterator<net.yacy.kelondro.blob.Tables.Row> it;

@ -67,7 +67,7 @@ public class csvParser extends AbstractParser implements Parser {
this,
null,
null,
concatRow(table.get(0)),
singleList(concatRow(table.get(0))),
"",
"",
null,

@ -28,10 +28,12 @@
package net.yacy.document.parser;
import java.io.InputStream;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.kelondro.data.meta.DigestURI;
import org.apache.poi.hwpf.extractor.WordExtractor;
public class docParser extends AbstractParser implements Parser {
@ -50,6 +52,7 @@ public class docParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("application/x-msword");
}
@Override
public Document[] parse(final DigestURI location, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
@ -90,7 +93,7 @@ public class docParser extends AbstractParser implements Parser {
this,
null,
null,
title,
singleList(title),
"", // TODO: AUTHOR
extractor.getDocSummaryInformation().getCompany(), // publisher
null,

@ -43,6 +43,7 @@ public class genericParser extends AbstractParser implements Parser {
// this parser is used if no other fits. This parser fits all
}
@Override
public Document[] parse(final DigestURI location, final String mimeType,
final String charset, final InputStream source1)
throws Parser.Failure, InterruptedException {
@ -54,7 +55,7 @@ public class genericParser extends AbstractParser implements Parser {
this,
null,
null,
location.getFileName().isEmpty() ? location.toTokens() : MultiProtocolURI.unescape(location.getFileName()), // title
singleList(location.getFileName().isEmpty() ? location.toTokens() : MultiProtocolURI.unescape(location.getFileName())), // title
"", // author
location.getHost(),
null,

@ -32,8 +32,10 @@ import java.io.Writer;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
@ -125,7 +127,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final Map<MultiProtocolURI, EmbedEntry> embeds; // urlhash/embed relation
private final Map<MultiProtocolURI, ImageEntry> images; // urlhash/image relation
private final Map<String, String> metas;
private String title;
private Collection<String> titles;
//private String headline;
private List<String>[] headlines;
private final ClusteredScoreMap<String> bold, italic;
@ -170,7 +172,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.iframes = new SizeLimitedSet<MultiProtocolURI>(maxLinks);
this.metas = new SizeLimitedMap<String, String>(maxLinks);
this.script = new SizeLimitedSet<MultiProtocolURI>(maxLinks);
this.title = EMPTY_STRING;
this.titles = new LinkedHashSet<String>();
this.headlines = new ArrayList[6];
for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList<String>();
this.bold = new ClusteredScoreMap<String>();
@ -391,7 +393,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.images.put(ie.url(), ie);
this.favicon = newLink;
} else if (rel.equalsIgnoreCase("canonical")) {
tagopts.put("name", this.title);
tagopts.put("name", this.titles.size() == 0 ? "" : this.titles.iterator().next());
mergeAnchors(newLink, tagopts);
this.canonical = newLink;
} else if (rel.equalsIgnoreCase("alternate") && type.equalsIgnoreCase("application/rss+xml")) {
@ -480,8 +482,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
h = recursiveParse(text);
if (h.length() > 0) this.headlines[5].add(h);
} else if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
this.title = recursiveParse(text);
this.evaluationScores.match(Element.title, this.title);
String t = recursiveParse(text);
this.titles.add(t);
this.evaluationScores.match(Element.title, t);
} else if ((tagname.equalsIgnoreCase("b")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) this.bold.inc(h);
@ -542,35 +545,37 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return line;
}
public String getTitle() {
// construct a title string, even if the document has no title
public List<String> getTitles() {
// some documents have a title tag as meta tag
String s = this.metas.get("title");
// try to construct the title with the content of the title tag
if (this.title.length() > 0) {
if (s == null) {
return this.title;
if (s != null && s.length() > 0) {
LinkedHashSet<String> t = new LinkedHashSet<String>();
t.add(s);
t.addAll(this.titles);
this.titles = t;
}
if (this.titles.size() == 0) {
// take any headline
for (int i = 0; i < this.headlines.length; i++) {
if (!this.headlines[i].isEmpty()) {
this.titles.add(this.headlines[i].get(0));
break;
}
}
if ((this.title.compareToIgnoreCase(s) == 0) || (this.title.indexOf(s) >= 0)) return s;
return this.title + ": " + s;
}
if (s != null) {
return s;
}
// otherwise take any headline
for (int i = 0; i < this.headlines.length; i++) {
if (!this.headlines[i].isEmpty()) return this.headlines[i].get(0);
if (this.titles.size() == 0) {
// take description tag
s = getDescription();
if (!s.isEmpty()) this.titles.add(s);
}
// take description tag
s = getDescription();
if (!s.isEmpty()) return s;
// extract headline from file name
return MultiProtocolURI.unescape(this.root.getFileName());
ArrayList<String> t = new ArrayList<String>();
t.addAll(this.titles);
return t;
}
public String[] getHeadlines(final int i) {
@ -875,7 +880,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.embeds.clear();
this.images.clear();
this.metas.clear();
this.title = null;
this.titles.clear();
this.headlines = null;
this.bold.clear();
this.italic.clear();
@ -884,7 +889,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
public void print() {
System.out.println("TITLE :" + this.title);
for (String t: this.titles) {
System.out.println("TITLE :" + t);
}
for (int i = 0; i < 4; i++) {
System.out.println("HEADLINE" + i + ":" + this.headlines[i].toString());
}

@ -132,7 +132,7 @@ public class htmlParser extends AbstractParser implements Parser {
scraper,
scraper.getContentLanguages(),
scraper.getKeywords(),
scraper.getTitle(),
scraper.getTitles(),
scraper.getAuthor(),
scraper.getPublisher(),
sections,

@ -199,7 +199,7 @@ public class genericImageParser extends AbstractParser implements Parser {
this,
languages,
keywords == null ? new String[]{} : keywords.split(keywords.indexOf(',') > 0 ? "," : " "), // keywords
title, // title
singleList(title), // title
author == null ? "" : author, // author
location.getHost(), // Publisher
new String[]{}, // sections

@ -68,6 +68,7 @@ public class mmParser extends AbstractParser implements Parser {
return parser;
}
@Override
public Document[] parse(final DigestURI location, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException
@ -105,7 +106,7 @@ public class mmParser extends AbstractParser implements Parser {
this,
null,
null,
rootElementText,
singleList(rootElementText),
null,
null,
null,

@ -185,7 +185,7 @@ public class odtParser extends AbstractParser implements Parser {
this,
languages,
docKeywords,
docLongTitle,
singleList(docLongTitle),
docAuthor,
"",
null,

@ -170,7 +170,7 @@ public class ooxmlParser extends AbstractParser implements Parser {
this,
languages,
docKeywords,
docLongTitle,
singleList(docLongTitle),
docAuthor,
"",
null,

@ -190,7 +190,7 @@ public class pdfParser extends AbstractParser implements Parser {
this,
null,
docKeywords,
docTitle,
singleList(docTitle),
docAuthor,
docPublisher,
null,

@ -58,6 +58,7 @@ public class pptParser extends AbstractParser implements Parser {
* parses the source documents and returns a plasmaParserDocument containing
* all extracted information about the parsed document
*/
@Override
public Document[] parse(final DigestURI location, final String mimeType,
final String charset, final InputStream source) throws Parser.Failure,
InterruptedException {
@ -88,7 +89,7 @@ public class pptParser extends AbstractParser implements Parser {
this,
null,
null,
title,
singleList(title),
"", // TODO: AUTHOR
pptExtractor.getDocSummaryInformation().getCompany(),
null,

@ -43,6 +43,7 @@ public class rdfParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("application/rdf+xml");
}
@Override
public Document[] parse(final DigestURI url, final String mimeType,
final String charset, final InputStream source)
throws Failure, InterruptedException {
@ -57,7 +58,7 @@ public class rdfParser extends AbstractParser implements Parser {
Document doc;
String all = "rdfdatasource";
doc = new Document(url, mimeType, charset, null, null, null, "", "",
doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
"", null, "", 0, 0, all, null, null, null, false);
docs.add(doc);

@ -78,7 +78,7 @@ public class RDFaParser extends AbstractParser implements Parser {
Log.logWarning("RDFA PARSER", "Triple extraction failed");
}
Document doc = new Document(url, mimeType, charset, null, null, null, "", "",
Document doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
"", null, "", 0, 0, null, null, null, null, false);
try {
@ -137,7 +137,7 @@ public class RDFaParser extends AbstractParser implements Parser {
all += string + ",";
}
Document doc = new Document(url, mimeType, charset, null, null, null, "", "",
Document doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
"", null, "", 0, 0, all, null, null, null, false);
return doc;
}

@ -92,7 +92,7 @@ public class rssParser extends AbstractParser implements Parser {
this,
languages,
item.getSubject(),
item.getTitle(),
singleList(item.getTitle()),
item.getAuthor(),
item.getCopyright(),
new String[0],

@ -50,6 +50,7 @@ public class rtfParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("application/x-soffice");
}
@Override
public Document[] parse(final DigestURI location, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
@ -69,11 +70,11 @@ public class rtfParser extends AbstractParser implements Parser {
this,
null,
null,
((bodyText.length() > 80)? bodyText.substring(0, 80):bodyText.trim()).
singleList(((bodyText.length() > 80)? bodyText.substring(0, 80):bodyText.trim()).
replaceAll("\r\n"," ").
replaceAll("\n"," ").
replaceAll("\r"," ").
replaceAll("\t"," "),
replaceAll("\t"," ")),
"", // TODO: AUTHOR
"", // TODO: publisher
null,

@ -85,7 +85,7 @@ public class sidAudioParser extends AbstractParser implements Parser {
this,
null,
null,
header.get("name"),
singleList(header.get("name")),
header.get("author"),
header.get("publisher"),
null,

@ -88,7 +88,7 @@ public class sitemapParser extends AbstractParser implements Parser {
this,
null,
null,
"",
singleList(""),
"",
"",
new String[0],

@ -110,11 +110,11 @@ public class swfParser extends AbstractParser implements Parser {
this,
null,
null, //keywords
((contents.length() > 80)? contents.substring(0, 80):contents.trim()).
singleList(((contents.length() > 80)? contents.substring(0, 80):contents.trim()).
replaceAll("\r\n"," ").
replaceAll("\n"," ").
replaceAll("\r"," ").
replaceAll("\t"," "), // title
replaceAll("\t"," ")), // title
"", // TODO: AUTHOR
"",
sections, // an array of section headlines

@ -100,7 +100,7 @@ public class torrentParser extends AbstractParser implements Parser {
this,
null,
null,
title, // title
singleList(title), // title
comment, // author
location.getHost(),
null,

@ -215,7 +215,7 @@ public class vcfParser extends AbstractParser implements Parser {
this,
null, // set of languages
null, // a list of extracted keywords
parsedTitle.toString(), // a long document title
singleList(parsedTitle.toString()), // a long document title
"", // TODO: AUTHOR
"", // the publisher
sections, // an array of section headlines

@ -62,6 +62,7 @@ public class vsdParser extends AbstractParser implements Parser {
* parses the source documents and returns a plasmaParserDocument containing
* all extracted information about the parsed document
*/
@Override
public Document[] parse(final DigestURI location, final String mimeType, final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
@ -108,7 +109,7 @@ public class vsdParser extends AbstractParser implements Parser {
this,
null, // language
keywords,
title,
singleList(title),
author,
"",
null, // an array of section headlines

@ -119,7 +119,7 @@ public class xlsParser extends AbstractParser implements Parser {
this,
null,
null,
location.getFile(),
singleList(location.getFile()),
"", // TODO: AUTHOR
"", // TODO: publisher
null,

@ -31,6 +31,7 @@ import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.services.federated.solr.SolrType;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.logging.Log;
@ -116,31 +117,38 @@ public class URIMetadataNode implements URIMetadata {
}
/**
 * Reads a single-valued integer field from the wrapped Solr document.
 *
 * @param field the schema field to read; asserted to be single-valued and of type integer
 * @return the stored value, or 0 if the document holds no value for this field
 */
private int getInt(YaCySchema field) {
    assert !field.isMultiValued();
    assert field.getType() == SolrType.integer;
    final Integer stored = (Integer) this.doc.getFieldValue(field.name());
    return stored == null ? 0 : stored.intValue();
}
/*
private double getDouble(YaCySchema field) {
Double x = (Double) this.doc.getFieldValue(field.name());
if (x == null) return 0.0d;
return x.doubleValue();
}
*/
/**
 * Reads a single-valued date field from the wrapped Solr document.
 *
 * @param field the schema field to read; asserted to be single-valued and of type date
 * @return the stored date, or a date with time value 0 if the document holds no value for this field
 */
private Date getDate(YaCySchema field) {
    assert !field.isMultiValued();
    assert field.getType() == SolrType.date;
    final Date stored = (Date) this.doc.getFieldValue(field.name());
    if (stored != null) {
        return stored;
    }
    return new Date(0);
}
/**
 * Reads a single-valued textual field from the wrapped Solr document.
 *
 * @param field the schema field to read; asserted to be single-valued and of a textual type
 * @return the stored string; the first element if the document delivers an ArrayList for this
 *         field; the empty string if there is no value or the delivered list is empty
 */
private String getString(YaCySchema field) {
    assert !field.isMultiValued();
    assert field.getType() == SolrType.string || field.getType() == SolrType.text_general || field.getType() == SolrType.text_en_splitting_tight;
    // fetch as Object first: the value is not cast to String before its actual type is known
    final Object x = this.doc.getFieldValue(field.name());
    if (x == null) return "";
    if (x instanceof ArrayList) {
        // the document delivered a list although the field is declared single-valued; use its head
        @SuppressWarnings("unchecked")
        final ArrayList<String> xa = (ArrayList<String>) x;
        return xa.isEmpty() ? "" : xa.get(0);
    }
    return (String) x;
}
@SuppressWarnings("unchecked")
private ArrayList<String> getArrayList(YaCySchema field) {
assert field.isMultiValued();
assert field.getType() == SolrType.string || field.getType() == SolrType.text_general;
Object r = this.doc.getFieldValue(field.name());
if (r == null) return new ArrayList<String>(0);
if (r instanceof ArrayList) {

@ -111,7 +111,7 @@ public class URIMetadataRow implements URIMetadata {
public URIMetadataRow() {
// create a dummy entry, good to produce poison objects
this.entry = rowdef.newEntry();
this.snippet = null;
this.snippet = "";
this.word = null;
this.ranking = 0;
this.comp = null;
@ -161,7 +161,7 @@ public class URIMetadataRow implements URIMetadata {
this.entry.setCol(col_lvideo, lvideo);
this.entry.setCol(col_lapp, lapp);
//System.out.println("===DEBUG=== " + load.toString() + ", " + decodeDate(col_load).toString());
this.snippet = null;
this.snippet = "";
this.word = null;
this.ranking = 0;
this.comp = null;
@ -207,7 +207,7 @@ public class URIMetadataRow implements URIMetadata {
public URIMetadataRow(final Row.Entry entry, final WordReference searchedWord, final long ranking) {
this.entry = entry;
this.snippet = null;
this.snippet = "";
this.word = searchedWord;
this.ranking = ranking;
this.comp = null;

@ -110,46 +110,62 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
}
protected void add(final SolrInputDocument doc, final YaCySchema key, final byte[] value) {
assert !key.isMultiValued();
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.length != 0))) key.add(doc, UTF8.String(value));
}
protected void add(final SolrInputDocument doc, final YaCySchema key, final String value) {
assert !key.isMultiValued();
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && !value.isEmpty()))) key.add(doc, value);
}
protected void add(final SolrInputDocument doc, final YaCySchema key, final String value, final float boost) {
assert !key.isMultiValued();
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && !value.isEmpty()))) key.add(doc, value, boost);
}
protected void add(final SolrInputDocument doc, final YaCySchema key, final Date value) {
assert !key.isMultiValued();
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.getTime() > 0))) key.add(doc, value);
}
protected void add(final SolrInputDocument doc, final YaCySchema key, final String[] value) {
assert key.isMultiValued();
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.length > 0))) key.add(doc, value);
}
protected void add(final SolrInputDocument doc, final YaCySchema key, final List<String> value) {
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && !value.isEmpty()))) key.add(doc, value);
protected void add(final SolrInputDocument doc, final YaCySchema key, final Integer[] value) {
assert key.isMultiValued();
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.length > 0))) key.add(doc, value);
}
protected void add(final SolrInputDocument doc, final YaCySchema key, final List<?> values) {
assert key.isMultiValued();
if ((isEmpty() || contains(key)) && (!this.lazy || (values != null && !values.isEmpty()))) key.add(doc, values);
}
protected void add(final SolrInputDocument doc, final YaCySchema key, final int value) {
assert !key.isMultiValued();
if ((isEmpty() || contains(key)) && (!this.lazy || value != 0)) key.add(doc, value);
}
protected void add(final SolrInputDocument doc, final YaCySchema key, final long value) {
assert !key.isMultiValued();
if ((isEmpty() || contains(key)) && (!this.lazy || value != 0)) key.add(doc, value);
}
protected void add(final SolrInputDocument doc, final YaCySchema key, final float value) {
assert !key.isMultiValued();
if ((isEmpty() || contains(key)) && (!this.lazy || value != 0.0f)) key.add(doc, value);
}
protected void add(final SolrInputDocument doc, final YaCySchema key, final double value) {
assert !key.isMultiValued();
if ((isEmpty() || contains(key)) && (!this.lazy || value != 0.0d)) key.add(doc, value);
}
protected void add(final SolrInputDocument doc, final YaCySchema key, final boolean value) {
assert !key.isMultiValued();
if (isEmpty() || contains(key)) key.add(doc, value);
}
@ -224,9 +240,32 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.host_organizationdnc_s)) add(doc, YaCySchema.host_organizationdnc_s, orga + '.' + dnc);
if (allAttr || contains(YaCySchema.host_subdomain_s)) add(doc, YaCySchema.host_subdomain_s, subdom);
}
if (allAttr || contains(YaCySchema.title)) add(doc, YaCySchema.title, md.dc_title());
String title = md.dc_title();
if (allAttr || contains(YaCySchema.title)) add(doc, YaCySchema.title, new String[]{title});
if (allAttr || contains(YaCySchema.title_count_i)) add(doc, YaCySchema.title_count_i, 1);
if (allAttr || contains(YaCySchema.title_chars_val)) {
Integer[] cv = new Integer[]{new Integer(title.length())};
add(doc, YaCySchema.title_chars_val, cv);
}
if (allAttr || contains(YaCySchema.title_words_val)) {
Integer[] cv = new Integer[]{new Integer(title.split(" ").length)};
add(doc, YaCySchema.title_words_val, cv);
}
String description = md.snippet(); if (description == null) description = "";
if (allAttr || contains(YaCySchema.description)) add(doc, YaCySchema.description, description);
if (allAttr || contains(YaCySchema.description_count_i)) add(doc, YaCySchema.description_count_i, 1);
if (allAttr || contains(YaCySchema.description_chars_val)) {
Integer[] cv = new Integer[]{new Integer(description.length())};
add(doc, YaCySchema.description_chars_val, cv);
}
if (allAttr || contains(YaCySchema.description_words_val)) {
Integer[] cv = new Integer[]{new Integer(description.split(" ").length)};
add(doc, YaCySchema.description_words_val, cv);
}
if (allAttr || contains(YaCySchema.author)) add(doc, YaCySchema.author, md.dc_creator());
if (allAttr || contains(YaCySchema.description)) add(doc, YaCySchema.description, md.snippet());
if (allAttr || contains(YaCySchema.content_type)) add(doc, YaCySchema.content_type, Response.doctype2mime(digestURI.getFileExtension(), md.doctype()));
if (allAttr || contains(YaCySchema.last_modified)) add(doc, YaCySchema.last_modified, md.moddate());
if (allAttr || contains(YaCySchema.wordcount_i)) add(doc, YaCySchema.wordcount_i, md.wordCount());
@ -243,10 +282,8 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
}
// path elements of link
final String path = digestURI.getPath();
if (path != null && (allAttr || contains(YaCySchema.url_paths_sxt))) {
final String[] paths = path.split("/");
if (paths.length > 0) add(doc, YaCySchema.url_paths_sxt, paths);
if (allAttr || contains(YaCySchema.url_paths_sxt)) {
add(doc, YaCySchema.url_paths_sxt, digestURI.getPaths());
}
if (allAttr || contains(YaCySchema.imagescount_i)) add(doc, YaCySchema.imagescount_i, md.limage());
@ -331,10 +368,39 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.host_organizationdnc_s)) add(doc, YaCySchema.host_organizationdnc_s, orga + '.' + dnc);
if (allAttr || contains(YaCySchema.host_subdomain_s)) add(doc, YaCySchema.host_subdomain_s, subdom);
}
if (allAttr || contains(YaCySchema.title)) add(doc, YaCySchema.title, yacydoc.dc_title());
List<String> titles = yacydoc.titles();
if (allAttr || contains(YaCySchema.title)) add(doc, YaCySchema.title, titles);
if (allAttr || contains(YaCySchema.title_count_i)) add(doc, YaCySchema.title_count_i, titles.size());
if (allAttr || contains(YaCySchema.title_chars_val)) {
ArrayList<Integer> cv = new ArrayList<Integer>(titles.size());
for (String s: titles) cv.add(new Integer(s.length()));
add(doc, YaCySchema.title_chars_val, cv);
}
if (allAttr || contains(YaCySchema.title_words_val)) {
ArrayList<Integer> cv = new ArrayList<Integer>(titles.size());
for (String s: titles) cv.add(new Integer(s.split(" ").length));
add(doc, YaCySchema.title_words_val, cv);
}
String description = yacydoc.dc_description();
List<String> descriptions = new ArrayList<String>();
for (String s: description.split("\n")) descriptions.add(s);
if (allAttr || contains(YaCySchema.description)) add(doc, YaCySchema.description, description);
if (allAttr || contains(YaCySchema.description_count_i)) add(doc, YaCySchema.description_count_i, descriptions.size());
if (allAttr || contains(YaCySchema.description_chars_val)) {
ArrayList<Integer> cv = new ArrayList<Integer>(descriptions.size());
for (String s: descriptions) cv.add(new Integer(s.length()));
add(doc, YaCySchema.description_chars_val, cv);
}
if (allAttr || contains(YaCySchema.description_words_val)) {
ArrayList<Integer> cv = new ArrayList<Integer>(descriptions.size());
for (String s: descriptions) cv.add(new Integer(s.split(" ").length));
add(doc, YaCySchema.description_words_val, cv);
}
if (allAttr || contains(YaCySchema.author)) add(doc, YaCySchema.author, yacydoc.dc_creator());
if (allAttr || contains(YaCySchema.description)) add(doc, YaCySchema.description, yacydoc.dc_description());
if (allAttr || contains(YaCySchema.content_type)) add(doc, YaCySchema.content_type, yacydoc.dc_format());
if (allAttr || contains(YaCySchema.content_type)) add(doc, YaCySchema.content_type, new String[]{yacydoc.dc_format()});
if (allAttr || contains(YaCySchema.last_modified)) add(doc, YaCySchema.last_modified, header == null ? new Date() : header.lastModified());
if (allAttr || contains(YaCySchema.keywords)) add(doc, YaCySchema.keywords, yacydoc.dc_subject(' '));
final String content = yacydoc.getTextString();
@ -345,10 +411,8 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
}
// path elements of link
final String path = digestURI.getPath();
if (path != null && (allAttr || contains(YaCySchema.url_paths_sxt))) {
final String[] paths = path.split("/");
if (paths.length > 0) add(doc, YaCySchema.url_paths_sxt, paths);
if (allAttr || contains(YaCySchema.url_paths_sxt)) {
add(doc, YaCySchema.url_paths_sxt, digestURI.getPaths());
}
// get list of all links; they will be shrinked by urls that appear in other fields of the solr scheme
@ -614,7 +678,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.inboundlinks_urlstub_txt)) add(doc, YaCySchema.inboundlinks_urlstub_txt, inboundlinksURLStub);
if (allAttr || contains(YaCySchema.inboundlinks_name_txt)) add(doc, YaCySchema.inboundlinks_name_txt, inboundlinksName);
if (allAttr || contains(YaCySchema.inboundlinks_rel_sxt)) add(doc, YaCySchema.inboundlinks_rel_sxt, inboundlinksRel);
if (allAttr || contains(YaCySchema.inboundlinks_relflags_sxt)) add(doc, YaCySchema.inboundlinks_relflags_sxt, relEval(inboundlinksRel));
if (allAttr || contains(YaCySchema.inboundlinks_relflags_val)) add(doc, YaCySchema.inboundlinks_relflags_val, relEval(inboundlinksRel));
if (allAttr || contains(YaCySchema.inboundlinks_text_txt)) add(doc, YaCySchema.inboundlinks_text_txt, inboundlinksText);
c = 0;
@ -652,7 +716,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (allAttr || contains(YaCySchema.outboundlinks_urlstub_txt)) add(doc, YaCySchema.outboundlinks_urlstub_txt, outboundlinksURLStub);
if (allAttr || contains(YaCySchema.outboundlinks_name_txt)) add(doc, YaCySchema.outboundlinks_name_txt, outboundlinksName);
if (allAttr || contains(YaCySchema.outboundlinks_rel_sxt)) add(doc, YaCySchema.outboundlinks_rel_sxt, outboundlinksRel);
if (allAttr || contains(YaCySchema.outboundlinks_relflags_sxt)) add(doc, YaCySchema.outboundlinks_relflags_sxt, relEval(inboundlinksRel));
if (allAttr || contains(YaCySchema.outboundlinks_relflags_val)) add(doc, YaCySchema.outboundlinks_relflags_val, relEval(inboundlinksRel));
if (allAttr || contains(YaCySchema.outboundlinks_text_txt)) add(doc, YaCySchema.outboundlinks_text_txt, outboundlinksText);
// charset
@ -701,14 +765,16 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
* @param rel
* @return binary encoded information about rel
*/
private static int relEval(final List<String> rel) {
int i = 0;
private static List<Integer> relEval(final List<String> rel) {
List<Integer> il = new ArrayList<Integer>(rel.size());
for (final String s: rel) {
int i = 0;
final String s0 = s.toLowerCase().trim();
if ("me".equals(s0)) i += 1;
if ("nofollow".equals(s0)) i += 2;
il.add(i);
}
return i;
return il;
}
public String solrGetID(final SolrDocument solr) {
@ -768,11 +834,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (digestURI.getHost() != null) add(solrdoc, YaCySchema.host_s, digestURI.getHost());
// path elements of link
final String path = digestURI.getPath();
if (path != null) {
final String[] paths = path.split("/");
if (paths.length > 0) add(solrdoc, YaCySchema.url_paths_sxt, paths);
}
add(solrdoc, YaCySchema.url_paths_sxt, digestURI.getPaths());
add(solrdoc, YaCySchema.failreason_t, failReason);
add(solrdoc, YaCySchema.httpstatus_i, httpstatus);
return solrdoc;

@ -103,14 +103,14 @@ public enum YaCySchema implements Schema {
inboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "internal links, the url only without the protocol"),
inboundlinks_name_txt(SolrType.text_general, true, true, true, "internal links, the name property of the a-tag"),
inboundlinks_rel_sxt(SolrType.string, true, true, true, "internal links, the rel property of the a-tag"),
inboundlinks_relflags_sxt(SolrType.string, true, true, true, "internal links, the rel property of the a-tag, coded binary"),
inboundlinks_relflags_val(SolrType.integer, true, true, true, "internal links, the rel property of the a-tag, coded binary"),
inboundlinks_text_txt(SolrType.text_general, true, true, true, "internal links, the text content of the a-tag"),
outboundlinks_tag_txt(SolrType.text_general, true, true, true, "external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
outboundlinks_protocol_sxt(SolrType.string, true, true, true, "external links, only the protocol"),
outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "external links, the url only without the protocol"),
outboundlinks_name_txt(SolrType.text_general, true, true, true, "external links, the name property of the a-tag"),
outboundlinks_rel_sxt(SolrType.string, true, true, true, "external links, the rel property of the a-tag"),
outboundlinks_relflags_sxt(SolrType.string, true, true, true, "external links, the rel property of the a-tag, coded binary"),
outboundlinks_relflags_val(SolrType.integer, true, true, true, "external links, the rel property of the a-tag, coded binary"),
outboundlinks_text_txt(SolrType.text_general, true, true, true, "external links, the text content of the a-tag"),
images_tag_txt(SolrType.text_general, true, true, true, " all image tags, encoded as <img> tag inclusive alt- and title property"),
images_urlstub_txt(SolrType.text_general, true, true, true, "all image links without the protocol and '://'"),
@ -131,6 +131,7 @@ public enum YaCySchema implements Schema {
iframes_txt(SolrType.text_general, true, true, true, "list of all links to iframes"),
iframesscount_i(SolrType.integer, true, true, false, "number of iframes_txt"),
url_protocol_s(SolrType.string, true, true, false, "the protocol of the url"),
url_paths_sxt(SolrType.string, true, true, true, "all path elements in the url"),
url_parameter_i(SolrType.integer, true, true, false, "number of key-value pairs in search part of the url"),
url_parameter_key_sxt(SolrType.string, true, true, true, "the keys from key-value pairs in the search part of the url"),
@ -138,15 +139,18 @@ public enum YaCySchema implements Schema {
url_chars_i(SolrType.integer, true, true, false, "number of all characters in the url == length of sku field"),
host_s(SolrType.string, true, true, false, "host of the url"),
url_protocol_s(SolrType.string, true, true, false, "the protocol of the url"),
host_dnc_s(SolrType.string, true, true, false, "the Domain Class Name, either the TLD or a combination of ccSLD+TLD if a ccSLD is used."),
host_organization_s(SolrType.string, true, true, false, "either the second level domain or, if a ccSLD is used, the third level domain"),
host_organizationdnc_s(SolrType.string, true, true, false, "the organization and dnc concatenated with '.'"),
host_subdomain_s(SolrType.string, true, true, false, "the remaining part of the host without organizationdnc"),
//title_count_i(SolrType.integer, true, true, false, ""),
//title_chars_i(SolrType.integer, true, true, false, ""),
//title_words_i(SolrType.integer, true, true, false, ""),
title_count_i(SolrType.integer, true, true, false, "number of titles (counting the 'title' field) in the document"),
title_chars_val(SolrType.integer, true, true, true, "number of characters for each title"),
title_words_val(SolrType.integer, true, true, true, "number of words in each title"),
description_count_i(SolrType.integer, true, true, false, "number of descriptions in the document. Its not counting the 'description' field since there is only one. But it counts the number of descriptions that appear in the document (if any)"),
description_chars_val(SolrType.integer, true, true, true, "number of characters for each description"),
description_words_val(SolrType.integer, true, true, true, "number of words in each description"),
// special values; can only be used if '_val' type is defined in schema file; this is not standard
bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"),
@ -239,42 +243,77 @@ public enum YaCySchema implements Schema {
}
/**
 * Sets this (single-valued) field in the given document to a string value.
 */
public final void add(final SolrInputDocument doc, final String value) {
    assert !this.isMultiValued();
    final String solrField = this.getSolrFieldName();
    doc.setField(solrField, value);
}

/**
 * Sets this (single-valued) field in the given document to a date value.
 */
public final void add(final SolrInputDocument doc, final Date value) {
    assert !this.isMultiValued();
    final String solrField = this.getSolrFieldName();
    doc.setField(solrField, value);
}

/**
 * Sets this (single-valued) field in the given document to an int value.
 */
public final void add(final SolrInputDocument doc, final int value) {
    assert !this.isMultiValued();
    final String solrField = this.getSolrFieldName();
    doc.setField(solrField, value);
}

/**
 * Sets this (single-valued) field in the given document to a long value.
 */
public final void add(final SolrInputDocument doc, final long value) {
    assert !this.isMultiValued();
    final String solrField = this.getSolrFieldName();
    doc.setField(solrField, value);
}

/**
 * Sets this (multi-valued) field in the given document to an array of strings.
 */
public final void add(final SolrInputDocument doc, final String[] value) {
    assert this.isMultiValued();
    final String solrField = this.getSolrFieldName();
    doc.setField(solrField, value);
}
public final void add(final SolrInputDocument doc, final List<String> value) {
doc.setField(this.getSolrFieldName(), value.toArray(new String[value.size()]));
public final void add(final SolrInputDocument doc, final Integer[] value) {
assert this.isMultiValued();
doc.setField(this.getSolrFieldName(), value);
}
public final void add(final SolrInputDocument doc, final List<?> value) {
assert this.isMultiValued();
if (value == null || value.size() == 0) {
if (this.type == SolrType.integer) {
doc.setField(this.getSolrFieldName(), new Integer[0]);
} else if (this.type == SolrType.string) {
doc.setField(this.getSolrFieldName(), new String[0]);
} else {
assert false;
doc.setField(this.getSolrFieldName(), new Object[0]);
}
return;
}
if (this.type == SolrType.integer) {
assert (value.iterator().next() instanceof Integer);
doc.setField(this.getSolrFieldName(), value.toArray(new Integer[value.size()]));
} else if (this.type == SolrType.string || this.type == SolrType.text_general) {
assert (value.iterator().next() instanceof String);
doc.setField(this.getSolrFieldName(), value.toArray(new String[value.size()]));
} else {
assert false : "ADD: type is " + this.type.name();
doc.setField(this.getSolrFieldName(), value.toArray(new Object[value.size()]));
}
}
/**
 * Sets this (single-valued) field in the given document to a float value.
 */
public final void add(final SolrInputDocument doc, final float value) {
    assert !this.isMultiValued();
    final String solrField = this.getSolrFieldName();
    doc.setField(solrField, value);
}

/**
 * Sets this (single-valued) field in the given document to a double value.
 */
public final void add(final SolrInputDocument doc, final double value) {
    assert !this.isMultiValued();
    final String solrField = this.getSolrFieldName();
    doc.setField(solrField, value);
}

/**
 * Sets this (single-valued) field in the given document to a boolean value.
 */
public final void add(final SolrInputDocument doc, final boolean value) {
    assert !this.isMultiValued();
    final String solrField = this.getSolrFieldName();
    doc.setField(solrField, value);
}

/**
 * Sets this (single-valued) field in the given document to a string value with a boost.
 */
public final void add(final SolrInputDocument doc, final String value, final float boost) {
    assert !this.isMultiValued();
    final String solrField = this.getSolrFieldName();
    doc.setField(solrField, value, boost);
}

Loading…
Cancel
Save