made the solr connection more generic

pull/1/head
Michael Peter Christen 13 years ago
parent ea2bd43b28
commit c00efc2717

@ -32,7 +32,6 @@ import java.util.Iterator;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.solr.SolrConnector;
import net.yacy.cora.services.federated.solr.SolrScheme;
import net.yacy.cora.services.federated.solr.SolrShardingConnector;
import net.yacy.cora.services.federated.solr.SolrShardingSelection;
import net.yacy.cora.services.federated.solr.SolrSingleConnector;
@ -40,6 +39,8 @@ import net.yacy.cora.storage.ConfigurationSet;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segments;
import net.yacy.search.index.SolrField;
import net.yacy.search.index.SolrScheme;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -92,7 +93,7 @@ public class IndexFederated_p {
// switch on
final boolean usesolr = sb.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0;
try {
sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr((usesolr) ? new SolrShardingConnector(solrurls, scheme, SolrShardingSelection.Method.MODULO_HOST_MD5, 10000) : null);
sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr((usesolr) ? new SolrShardingConnector(solrurls, SolrShardingSelection.Method.MODULO_HOST_MD5, 10000) : null);
} catch (final IOException e) {
Log.logException(e);
sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(null);
@ -138,21 +139,17 @@ public class IndexFederated_p {
}
// write scheme
SolrScheme scheme = (sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null) ? null : sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().getScheme();
final String schemename = sb.getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list");
if (scheme == null) {
scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename));
}
final Iterator<ConfigurationSet.Entry> i = scheme.allIterator();
final Iterator<ConfigurationSet.Entry> i = sb.solrScheme.allIterator();
int c = 0;
boolean dark = false;
ConfigurationSet.Entry entry;
SolrScheme.Field field;
SolrField field;
while (i.hasNext()) {
entry = i.next();
try {
field = SolrScheme.Field.valueOf(entry.key());
field = SolrField.valueOf(entry.key());
} catch (IllegalArgumentException e) {
continue;
}

@ -22,15 +22,12 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.File;
import java.util.Iterator;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.solr.SolrScheme;
import net.yacy.cora.services.federated.solr.SolrScheme.Field;
import net.yacy.cora.storage.ConfigurationSet;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segments;
import net.yacy.search.index.SolrField;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -42,21 +39,16 @@ public class schema_p {
final Switchboard sb = (Switchboard) env;
// write scheme
SolrScheme scheme = (sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null) ? null : sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().getScheme();
final String schemename = sb.getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list");
if (scheme == null) {
scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename));
}
final Iterator<ConfigurationSet.Entry> i = scheme.allIterator();
final Iterator<ConfigurationSet.Entry> i = sb.solrScheme.allIterator();
int c = 0;
ConfigurationSet.Entry entry;
SolrScheme.Field field = null;
SolrField field = null;
while (i.hasNext()) {
entry = i.next();
if (!entry.enabled()) continue; //scheme.contains(entry.key())
try {
field = Field.valueOf(entry.key());
field = SolrField.valueOf(entry.key());
} catch (IllegalArgumentException e) {
continue;
}

@ -27,8 +27,6 @@ package net.yacy.cora.services.federated.solr;
import java.io.IOException;
import java.util.List;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import org.apache.solr.common.SolrDocumentList;
@ -37,12 +35,6 @@ import org.apache.solr.common.SolrInputDocument;
public interface SolrConnector {
/**
* with a scheme the fields of a SolrDocument can be translated to actual data values
* @return the solr scheme that can translate the SolrDocument
*/
public SolrScheme getScheme();
public void close();
/**
@ -73,15 +65,6 @@ public interface SolrConnector {
*/
public boolean exists(final String id) throws IOException;
/**
* add a YaCy document. This calls the scheme processor to add the document as solr document
* @param id the url hash of the entry
* @param header the http response header
* @param doc the YaCy document
* @throws IOException
*/
public void add(final String id, final ResponseHeader header, final Document doc) throws IOException;
/**
* add a solr input document
* @param solrdoc

@ -0,0 +1,32 @@
/**
* SolrField
* Copyright 2011 by Michael Peter Christen
* First released 14.04.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7654 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.services.federated.solr;
/**
 * Minimal contract for the description of a field of a solr document:
 * an implementation only has to report the name of the field.
 */
public interface SolrField {

    /**
     * @return the name of this field
     */
    String name();

}

@ -27,8 +27,6 @@ package net.yacy.cora.services.federated.solr;
import java.io.IOException;
import java.util.List;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import org.apache.solr.common.SolrDocumentList;
@ -45,11 +43,6 @@ public class SolrRetryConnector implements SolrConnector {
this.retryMaxTime = retryMaxTime;
}
@Override
public SolrScheme getScheme() {
return this.solrConnector.getScheme();
}
@Override
public void close() {
this.solrConnector.close();
@ -115,21 +108,6 @@ public class SolrRetryConnector implements SolrConnector {
return false;
}
@Override
public void add(final String id, final ResponseHeader header, final Document doc) throws IOException {
final long t = System.currentTimeMillis() + this.retryMaxTime;
Throwable ee = null;
while (System.currentTimeMillis() < t) try {
this.solrConnector.add(id, header, doc);
return;
} catch (final Throwable e) {
ee = e;
try {Thread.sleep(10);} catch (final InterruptedException e1) {}
continue;
}
if (ee != null) throw (ee instanceof IOException) ? (IOException) ee : new IOException(ee.getMessage());
}
@Override
public void add(final SolrInputDocument solrdoc) throws IOException, SolrException {
final long t = System.currentTimeMillis() + this.retryMaxTime;

@ -31,8 +31,7 @@ import java.util.Collection;
import java.util.List;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.document.Document;
import net.yacy.cora.services.federated.solr.SolrShardingSelection.Method;
import net.yacy.kelondro.data.meta.DigestURI;
import org.apache.solr.common.SolrDocument;
@ -43,25 +42,20 @@ import org.apache.solr.common.SolrInputDocument;
public class SolrShardingConnector implements SolrConnector {
private final List<SolrConnector> connectors;
private final SolrScheme scheme;
private final SolrShardingSelection sharding;
private final String[] urls;
public SolrShardingConnector(final String urlList, final SolrScheme scheme, final SolrShardingSelection.Method method, final long timeout) throws IOException {
public SolrShardingConnector(final String urlList, final SolrShardingSelection.Method method, final long timeout) throws IOException {
urlList.replace(' ', ',');
this.urls = urlList.split(",");
this.connectors = new ArrayList<SolrConnector>();
for (final String u: this.urls) {
this.connectors.add(new SolrRetryConnector(new SolrSingleConnector(u.trim(), scheme), timeout));
this.connectors.add(new SolrRetryConnector(new SolrSingleConnector(u.trim()), timeout));
}
this.sharding = new SolrShardingSelection(method, this.urls.length);
this.scheme = scheme;
}
public SolrScheme getScheme() {
return this.scheme;
}
@Override
public void close() {
for (final SolrConnector connector: this.connectors) connector.close();
}
@ -70,6 +64,7 @@ public class SolrShardingConnector implements SolrConnector {
* delete everything in the solr index
* @throws IOException
*/
@Override
public void clear() throws IOException {
for (final SolrConnector connector: this.connectors) connector.clear();
}
@ -79,6 +74,7 @@ public class SolrShardingConnector implements SolrConnector {
* @param id the url hash of the entry
* @throws IOException
*/
@Override
public void delete(final String id) throws IOException {
for (final SolrConnector connector: this.connectors) connector.delete(id);
}
@ -88,6 +84,7 @@ public class SolrShardingConnector implements SolrConnector {
* @param ids a list of url hashes
* @throws IOException
*/
@Override
public void delete(final List<String> ids) throws IOException {
for (final SolrConnector connector: this.connectors) connector.delete(ids);
}
@ -98,6 +95,7 @@ public class SolrShardingConnector implements SolrConnector {
* @return true if any entry in solr exists
* @throws IOException
*/
@Override
public boolean exists(final String id) throws IOException {
for (final SolrConnector connector: this.connectors) {
if (connector.exists(id)) return true;
@ -105,22 +103,12 @@ public class SolrShardingConnector implements SolrConnector {
return false;
}
/**
* add a YaCy document. This calls the scheme processor to add the document as solr document
* @param id the url hash of the entry
* @param header the http response header
* @param doc the YaCy document
* @throws IOException
*/
public void add(final String id, final ResponseHeader header, final Document doc) throws IOException {
add(this.scheme.yacy2solr(id, header, doc));
}
/**
* add a Solr document
* @param solrdoc
* @throws IOException
*/
@Override
public void add(final SolrInputDocument solrdoc) throws IOException {
this.connectors.get(this.sharding.select(solrdoc)).add(solrdoc);
}
@ -141,6 +129,7 @@ public class SolrShardingConnector implements SolrConnector {
* @param httpstatus
* @throws IOException
*/
@Override
public void err(final DigestURI digestURI, final String failReason, final int httpstatus) throws IOException {
this.connectors.get(this.sharding.selectURL(digestURI.toNormalform(true, false))).err(digestURI, failReason, httpstatus);
}
@ -152,6 +141,7 @@ public class SolrShardingConnector implements SolrConnector {
* @param querystring
* @throws IOException
*/
@Override
public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException {
final SolrDocumentList list = new SolrDocumentList();
for (final SolrConnector connector: this.connectors) {
@ -181,6 +171,7 @@ public class SolrShardingConnector implements SolrConnector {
return size;
}
@Override
public long getSize() {
final long[] size = getSizeList();
long s = 0;

@ -36,8 +36,6 @@ import java.util.concurrent.BlockingQueue;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
@ -65,7 +63,6 @@ public class SolrSingleConnector implements SolrConnector {
private final String solrurl, host, solrpath, solraccount, solrpw;
private final int port;
private HttpSolrServer server;
private final SolrScheme scheme;
private final static int transmissionQueueCount = 4; // allow concurrent http sessions to solr
private final static int transmissionQueueSize = 50; // number of documents that are collected until a commit is sent
@ -80,9 +77,8 @@ public class SolrSingleConnector implements SolrConnector {
* @throws IOException
*/
@SuppressWarnings("unchecked")
public SolrSingleConnector(final String url, final SolrScheme scheme) throws IOException {
public SolrSingleConnector(final String url) throws IOException {
this.solrurl = url;
this.scheme = scheme;
this.transmissionRoundRobinCounter = 0;
this.transmissionQueue = new ArrayBlockingQueue[transmissionQueueCount];
for (int i = 0; i < transmissionQueueCount; i++) {
@ -187,11 +183,6 @@ public class SolrSingleConnector implements SolrConnector {
}
}
@Override
public SolrScheme getScheme() {
return this.scheme;
}
@Override
public long getSize() {
try {
@ -261,11 +252,6 @@ public class SolrSingleConnector implements SolrConnector {
}
}
@Override
public void add(final String id, final ResponseHeader header, final Document doc) throws IOException, SolrException {
add(this.scheme.yacy2solr(id, header, doc));
}
@Override
public void add(final SolrInputDocument solrdoc) throws IOException, SolrException {
int thisrrc = this.transmissionRoundRobinCounter;
@ -384,7 +370,8 @@ public class SolrSingleConnector implements SolrConnector {
public static void main(final String args[]) {
SolrSingleConnector solr;
try {
solr = new SolrSingleConnector("http://127.0.0.1:8983/solr", new SolrScheme());
//SolrScheme scheme = new SolrScheme();
solr = new SolrSingleConnector("http://127.0.0.1:8983/solr");
solr.clear();
final File exampleDir = new File("/Data/workspace2/yacy/test/parsertest/");
long t, t0, a = 0;

@ -0,0 +1,47 @@
/**
* SolrType
* Copyright 2011 by Michael Peter Christen
* First released 14.04.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7654 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.services.federated.solr;
/**
 * Enumeration of field types. Every type has a print name which is the
 * name of the enum constant unless a different one was declared explicitly
 * (i.e. <code>integer</code> prints as "int", <code>bool</code> as "boolean").
 */
public enum SolrType {

    string,
    text_general,
    text_en_splitting_tight,
    date,
    integer("int"),
    tdouble,
    bool("boolean");

    /** explicitly declared print name; null if the name of the constant itself is printed */
    private final String alias;

    /**
     * declare a type that is printed with its own constant name
     */
    SolrType() {
        this(null);
    }

    /**
     * declare a type that is printed with a name different from the constant name
     * @param alias the name to be printed
     */
    SolrType(final String alias) {
        this.alias = alias;
    }

    /**
     * @return the explicitly declared print name, or the constant name if none was declared
     */
    public String printName() {
        return (this.alias == null) ? name() : this.alias;
    }
}

@ -91,7 +91,6 @@ import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.TimeoutRequest;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.protocol.http.ProxySettings;
import net.yacy.cora.services.federated.solr.SolrScheme;
import net.yacy.cora.services.federated.solr.SolrShardingConnector;
import net.yacy.cora.services.federated.solr.SolrShardingSelection;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
@ -142,6 +141,7 @@ import net.yacy.repository.FilterEngine;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segments;
import net.yacy.search.index.SolrScheme;
import net.yacy.search.query.AccessTracker;
import net.yacy.search.query.QueryParams;
import net.yacy.search.query.SearchEvent;
@ -242,6 +242,7 @@ public final class Switchboard extends serverSwitch
public SeedDB peers;
public WorkTables tables;
public Tray tray;
public SolrScheme solrScheme;
public WorkflowProcessor<indexingQueueEntry> indexingDocumentProcessor;
public WorkflowProcessor<indexingQueueEntry> indexingCondensementProcessor;
@ -640,22 +641,20 @@ public final class Switchboard extends serverSwitch
FileUtils.copy(solrBackupProfile, solrWorkProfile);
}
final SolrScheme backupScheme = new SolrScheme(solrBackupProfile);
final SolrScheme workingScheme = new SolrScheme(solrWorkProfile);
this.solrScheme = new SolrScheme(solrWorkProfile);
// update the working scheme with the backup scheme. This is necessary to include new features.
// new features are always activated by default
workingScheme.fill(backupScheme, false);
this.solrScheme.fill(backupScheme, false);
// set up the solr interface
final String solrurls =
getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
final boolean usesolr =
getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0;
final String solrurls = getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
final boolean usesolr = getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0;
try {
this.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(
(usesolr) ? new SolrShardingConnector(
solrurls,
workingScheme,
SolrShardingSelection.Method.MODULO_HOST_MD5,
10000) : null);
} catch ( final IOException e ) {
@ -2432,7 +2431,7 @@ public final class Switchboard extends serverSwitch
this.indexSegments
.segment(Segments.Process.LOCALCRAWLING)
.getSolr()
.add(id, in.queueEntry.getResponseHeader(), doc);
.add(this.solrScheme.yacy2solr(id, in.queueEntry.getResponseHeader(), doc));
} catch ( final IOException e ) {
Log.logWarning(
"SOLR",

@ -0,0 +1,173 @@
/**
* SolrField
* Copyright 2011 by Michael Peter Christen
* First released 14.04.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7654 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.search.index;
import net.yacy.cora.services.federated.solr.SolrType;
/**
 * Enumeration of all fields that are known for a solr document.
 * Each field declares its type, the flags indexed, stored, multiValued and omitNorms
 * and a comment that describes the content of the field.
 * All attributes are immutable; the flags multiValued and omitNorms are false
 * unless they are given explicitly in the declaration of the constant.
 */
public enum SolrField implements net.yacy.cora.services.federated.solr.SolrField {

    id(SolrType.string, true, true, "primary key of document, the URL hash"),
    sku(SolrType.text_en_splitting_tight, true, true, false, true, "url of document"),
    ip_s(SolrType.string, true, true, "ip of host of url (after DNS lookup)"),
    host_s(SolrType.string, true, true, "host of the url"),
    title(SolrType.text_general, true, true, true, "content of title tag"),
    author(SolrType.text_general, true, true, "content of author-tag"),
    description(SolrType.text_general, true, true, "content of description-tag"),
    content_type(SolrType.string, true, true, true, "mime-type of document"),
    last_modified(SolrType.date, true, true, "last-modified from http header"),
    keywords(SolrType.text_general, true, true, "content of keywords tag; words are separated by space"),
    text_t(SolrType.text_general, true, true, "all visible text"),
    wordcount_i(SolrType.integer, true, true, "number of words in visible area"),
    paths_txt(SolrType.text_general, true, true, true, "all path elements in the url"),
    // encoded as binary value into an integer:
    // bit 0: "all" contained in html header meta
    // bit 1: "index" contained in html header meta
    // bit 2: "noindex" contained in html header meta
    // bit 3: "nofollow" contained in html header meta
    // bit 8: "noarchive" contained in http header properties
    // bit 9: "nosnippet" contained in http header properties
    // bit 10: "noindex" contained in http header properties
    // bit 11: "nofollow" contained in http header properties
    // bit 12: "unavailable_after" contained in http header properties
    robots_i(SolrType.integer, true, true, "content of <meta name=\"robots\" content=#content#> tag and the \"X-Robots-Tag\" HTTP property"),
    inboundlinkscount_i(SolrType.integer, true, true, "total number of inbound links"),
    inboundlinksnofollowcount_i(SolrType.integer, true, true, "number of inbound links with nofollow tag"),
    inboundlinks_tag_txt(SolrType.text_general, true, true, true, "internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
    inboundlinks_protocol_txt(SolrType.text_general, true, true, true, "internal links, only the protocol"),
    inboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "internal links, the url only without the protocol"),
    inboundlinks_name_txt(SolrType.text_general, true, true, true, "internal links, the name property of the a-tag"),
    inboundlinks_rel_txt(SolrType.text_general, true, true, true, "internal links, the rel property of the a-tag"),
    inboundlinks_relflags_txt(SolrType.text_general, true, true, true, "internal links, the rel property of the a-tag, coded binary"),
    inboundlinks_text_txt(SolrType.text_general, true, true, true, "internal links, the text content of the a-tag"),
    // NOTE(review): the description below speaks of "inbound" links; presumably it should read
    // "total number of outbound links" - the runtime string is left unchanged here, confirm before editing
    outboundlinkscount_i(SolrType.integer, true, true, "external number of inbound links"),
    outboundlinksnofollowcount_i(SolrType.integer, true, true, "number of external links with nofollow tag"),
    outboundlinks_tag_txt(SolrType.text_general, true, true, true, "external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
    outboundlinks_protocol_txt(SolrType.text_general, true, true, true, "external links, only the protocol"),
    outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "external links, the url only without the protocol"),
    outboundlinks_name_txt(SolrType.text_general, true, true, true, "external links, the name property of the a-tag"),
    outboundlinks_rel_txt(SolrType.text_general, true, true, true, "external links, the rel property of the a-tag"),
    outboundlinks_relflags_txt(SolrType.text_general, true, true, true, "external links, the rel property of the a-tag, coded binary"),
    outboundlinks_text_txt(SolrType.text_general, true, true, true, "external links, the text content of the a-tag"),
    charset_s(SolrType.string, true, true, "character encoding"),
    lon_coordinate(SolrType.tdouble, true, false, "longitude of location as declared in WSG84"),
    lat_coordinate(SolrType.tdouble, true, false, "latitude of location as declared in WSG84"),
    httpstatus_i(SolrType.integer, true, true, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
    h1_txt(SolrType.text_general, true, true, true, "h1 header"),
    h2_txt(SolrType.text_general, true, true, true, "h2 header"),
    h3_txt(SolrType.text_general, true, true, true, "h3 header"),
    h4_txt(SolrType.text_general, true, true, true, "h4 header"),
    h5_txt(SolrType.text_general, true, true, true, "h5 header"),
    h6_txt(SolrType.text_general, true, true, true, "h6 header"),
    htags_i(SolrType.integer, true, true, "binary pattern for the existance of h1..h6 headlines"),
    canonical_s(SolrType.string, true, true, "url inside the canonical link element"),
    metagenerator_t(SolrType.text_general, true, true, "content of <meta name=\"generator\" content=#content#> tag"),
    boldcount_i(SolrType.integer, true, true, "total number of occurrences of <b> or <strong>"),
    bold_txt(SolrType.text_general, true, true, true, "all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
    bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"),
    italiccount_i(SolrType.integer, true, true, "total number of occurrences of <i>"),
    italic_txt(SolrType.text_general, true, true, true, "all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
    italic_val(SolrType.integer, true, true, true, "number of occurrences of texts in italic_txt"),
    licount_i(SolrType.integer, true, true, "number of <li> tags"),
    li_txt(SolrType.text_general, true, true, true, "all texts in <li> tags"),
    imagescount_i(SolrType.integer, true, true, "number of images"),
    images_tag_txt(SolrType.text_general, true, true, true, " all image tags, encoded as <img> tag inclusive alt- and title property"),
    images_protocol_txt(SolrType.text_general, true, true, true, "all image link protocols"),
    images_urlstub_txt(SolrType.text_general, true, true, true, "all image links without the protocol and '://'"),
    images_alt_txt(SolrType.text_general, true, true, true, "all image link alt tag"),
    csscount_i(SolrType.integer, true, true, "number of entries in css_tag_txt and css_url_txt"),
    css_tag_txt(SolrType.text_general, true, true, true, "full css tag with normalized url"),
    css_url_txt(SolrType.text_general, true, true, true, "normalized urls within a css tag"),
    scripts_txt(SolrType.text_general, true, true, true, "normaluzed urls within a scripts tag"),
    scriptscount_i(SolrType.integer, true, true, "number of entries in scripts_txt"),
    frames_txt(SolrType.text_general, true, true, true, "list of all links to frames"),
    framesscount_i(SolrType.integer, true, true, "number of frames_txt"),
    iframes_txt(SolrType.text_general, true, true, true, "list of all links to iframes"),
    iframesscount_i(SolrType.integer, true, true, "number of iframes_txt"),
    flash_b(SolrType.bool, true, true, "flag that shows if a swf file is linked"),
    responsetime_i(SolrType.integer, true, true, "response time of target server in milliseconds"),
    ext_cms_txt(SolrType.text_general, true, true, true, "names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias"),
    ext_cms_val(SolrType.integer, true, true, true, "number of attributes that count for a specific cms in ext_cms_txt"),
    ext_ads_txt(SolrType.text_general, true, true, true, "names of ad-servers/ad-services"),
    ext_ads_val(SolrType.integer, true, true, true, "number of attributes counts in ext_ads_txt"),
    ext_community_txt(SolrType.text_general, true, true, true, "names of recognized community functions"),
    ext_community_val(SolrType.integer, true, true, true, "number of attribute counts in attr_community"),
    ext_maps_txt(SolrType.text_general, true, true, true, "names of map services"),
    ext_maps_val(SolrType.integer, true, true, true, "number of attribute counts in ext_maps_txt"),
    ext_tracker_txt(SolrType.text_general, true, true, true, "names of tracker server"),
    ext_tracker_val(SolrType.integer, true, true, true, "number of attribute counts in ext_tracker_txt"),
    ext_title_txt(SolrType.text_general, true, true, true, "names matching title expressions"),
    ext_title_val(SolrType.integer, true, true, true, "number of matching title expressions"),
    failreason_t(SolrType.text_general, true, true, "fail reason if a page was not loaded. if the page was loaded then this field is empty");

    // all attributes are assigned exactly once in the six-argument constructor,
    // therefore every one of them can be final; the package visibility is kept as it was
    final SolrType type;
    final boolean indexed, stored;
    final boolean multiValued, omitNorms;
    final String comment;

    /**
     * declare a single-valued field with norms
     * @param type the type of the field
     * @param indexed the indexed-flag of the field
     * @param stored the stored-flag of the field
     * @param comment a description of the content of the field
     */
    private SolrField(final SolrType type, final boolean indexed, final boolean stored, final String comment) {
        this(type, indexed, stored, false, false, comment);
    }

    /**
     * declare a field with norms where the multiValued-flag is given explicitly
     * @param type the type of the field
     * @param indexed the indexed-flag of the field
     * @param stored the stored-flag of the field
     * @param multiValued the multiValued-flag of the field
     * @param comment a description of the content of the field
     */
    private SolrField(final SolrType type, final boolean indexed, final boolean stored, final boolean multiValued, final String comment) {
        this(type, indexed, stored, multiValued, false, comment);
    }

    /**
     * declare a field with all attributes given explicitly;
     * the other constructors delegate to this one
     * @param type the type of the field
     * @param indexed the indexed-flag of the field
     * @param stored the stored-flag of the field
     * @param multiValued the multiValued-flag of the field
     * @param omitNorms the omitNorms-flag of the field
     * @param comment a description of the content of the field
     */
    private SolrField(final SolrType type, final boolean indexed, final boolean stored, final boolean multiValued, final boolean omitNorms, final String comment) {
        this.type = type;
        this.indexed = indexed;
        this.stored = stored;
        this.multiValued = multiValued;
        this.omitNorms = omitNorms;
        this.comment = comment;
    }

    /**
     * @return the type of the field
     */
    public final SolrType getType() {
        return this.type;
    }

    /**
     * @return the indexed-flag as declared for the field
     */
    public final boolean isIndexed() {
        return this.indexed;
    }

    /**
     * @return the stored-flag as declared for the field
     */
    public final boolean isStored() {
        return this.stored;
    }

    /**
     * @return the multiValued-flag; false if it was not declared for the field
     */
    public final boolean isMultiValued() {
        return this.multiValued;
    }

    /**
     * @return the omitNorms-flag; false if it was not declared for the field
     */
    public final boolean isOmitNorms() {
        return this.omitNorms;
    }

    /**
     * @return the description of the content of the field
     */
    public final String getComment() {
        return this.comment;
    }
}

@ -22,7 +22,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.services.federated.solr;
package net.yacy.search.index;
import java.io.File;
@ -67,16 +67,16 @@ public class SolrScheme extends ConfigurationSet {
*/
public SolrScheme(final File configurationFile) {
super(configurationFile);
// check consistency: compare with Field enum
// check consistency: compare with SolrField enum
for (String name: this) {
try {
Field.valueOf(name);
SolrField.valueOf(name);
} catch (IllegalArgumentException e) {
Log.logWarning("SolrScheme", "solr scheme file " + configurationFile.getAbsolutePath() + " defines unknown attribute '" + name + "'");
}
}
/*
for (Field field: Field.values()) {
for (YaCyField field: YaCyField.values()) {
if (!this.contains(field.name())) {
Log.logWarning("SolrScheme", "solr scheme file " + configurationFile.getAbsolutePath() + " omits known attribute '" + field.name() + "'");
}
@ -84,228 +84,62 @@ public class SolrScheme extends ConfigurationSet {
*/
}
protected void addSolr(final SolrInputDocument solrdoc, final Field key, final String value) {
protected void addSolr(final SolrInputDocument solrdoc, final SolrField key, final String value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
}
protected void addSolr(final SolrInputDocument solrdoc, final Field key, final Date value) {
protected void addSolr(final SolrInputDocument solrdoc, final SolrField key, final Date value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
}
protected void addSolr(final SolrInputDocument solrdoc, final Field key, final int value) {
protected void addSolr(final SolrInputDocument solrdoc, final SolrField key, final int value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
}
protected void addSolr(final SolrInputDocument solrdoc, final Field key, final String[] value) {
protected void addSolr(final SolrInputDocument solrdoc, final SolrField key, final String[] value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
}
protected void addSolr(final SolrInputDocument solrdoc, final Field key, final float value) {
protected void addSolr(final SolrInputDocument solrdoc, final SolrField key, final float value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
}
protected void addSolr(final SolrInputDocument solrdoc, final Field key, final boolean value) {
protected void addSolr(final SolrInputDocument solrdoc, final SolrField key, final boolean value) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value);
}
protected void addSolr(final SolrInputDocument solrdoc, final Field key, final String value, final float boost) {
protected void addSolr(final SolrInputDocument solrdoc, final SolrField key, final String value, final float boost) {
if (isEmpty() || contains(key.name())) solrdoc.setField(key.name(), value, boost);
}
public static enum Types {
string,
text_general,
text_en_splitting_tight,
date,
integer("int"),
tdouble,
bool("boolean");
private String printName;
private Types() {
this.printName = this.name();
}
private Types(String printName) {
this.printName = printName;
}
public String printName() {
return this.printName;
}
}
public static enum Field {
id(Types.string, true, true, "primary key of document, the URL hash"),
sku(Types.text_en_splitting_tight, true, true, false, true, "url of document"),
ip_s(Types.string, true, true, "ip of host of url (after DNS lookup)"),
host_s(Types.string, true, true, "host of the url"),
title(Types.text_general, true, true, true, "content of title tag"),
author(Types.text_general, true, true, "content of author-tag"),
description(Types.text_general, true, true, "content of description-tag"),
content_type(Types.string, true, true, true, "mime-type of document"),
last_modified(Types.date, true, true, "last-modified from http header"),
keywords(Types.text_general, true, true, "content of keywords tag; words are separated by space"),
text_t(Types.text_general, true, true, "all visible text"),
wordcount_i(Types.integer, true, true, "number of words in visible area"),
paths_txt(Types.text_general, true, true, true, "all path elements in the url"),
// encoded as binary value into an integer:
// bit 0: "all" contained in html header meta
// bit 1: "index" contained in html header meta
// bit 2: "noindex" contained in html header meta
// bit 3: "nofollow" contained in html header meta
// bit 8: "noarchive" contained in http header properties
// bit 9: "nosnippet" contained in http header properties
// bit 10: "noindex" contained in http header properties
// bit 11: "nofollow" contained in http header properties
// bit 12: "unavailable_after" contained in http header properties
robots_i(Types.integer, true, true, "content of <meta name=\"robots\" content=#content#> tag and the \"X-Robots-Tag\" HTTP property"),
inboundlinkscount_i(Types.integer, true, true, "total number of inbound links"),
inboundlinksnofollowcount_i(Types.integer, true, true, "number of inbound links with nofollow tag"),
inboundlinks_tag_txt(Types.text_general, true, true, true, "internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
inboundlinks_protocol_txt(Types.text_general, true, true, true, "internal links, only the protocol"),
inboundlinks_urlstub_txt(Types.text_general, true, true, true, "internal links, the url only without the protocol"),
inboundlinks_name_txt(Types.text_general, true, true, true, "internal links, the name property of the a-tag"),
inboundlinks_rel_txt(Types.text_general, true, true, true, "internal links, the rel property of the a-tag"),
inboundlinks_relflags_txt(Types.text_general, true, true, true, "internal links, the rel property of the a-tag, coded binary"),
inboundlinks_text_txt(Types.text_general, true, true, true, "internal links, the text content of the a-tag"),
outboundlinkscount_i(Types.integer, true, true, "external number of inbound links"),
outboundlinksnofollowcount_i(Types.integer, true, true, "number of external links with nofollow tag"),
outboundlinks_tag_txt(Types.text_general, true, true, true, "external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow"),
outboundlinks_protocol_txt(Types.text_general, true, true, true, "external links, only the protocol"),
outboundlinks_urlstub_txt(Types.text_general, true, true, true, "external links, the url only without the protocol"),
outboundlinks_name_txt(Types.text_general, true, true, true, "external links, the name property of the a-tag"),
outboundlinks_rel_txt(Types.text_general, true, true, true, "external links, the rel property of the a-tag"),
outboundlinks_relflags_txt(Types.text_general, true, true, true, "external links, the rel property of the a-tag, coded binary"),
outboundlinks_text_txt(Types.text_general, true, true, true, "external links, the text content of the a-tag"),
charset_s(Types.string, true, true, "character encoding"),
lon_coordinate(Types.tdouble, true, false, "longitude of location as declared in WSG84"),
lat_coordinate(Types.tdouble, true, false, "latitude of location as declared in WSG84"),
httpstatus_i(Types.integer, true, true, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
h1_txt(Types.text_general, true, true, true, "h1 header"),
h2_txt(Types.text_general, true, true, true, "h2 header"),
h3_txt(Types.text_general, true, true, true, "h3 header"),
h4_txt(Types.text_general, true, true, true, "h4 header"),
h5_txt(Types.text_general, true, true, true, "h5 header"),
h6_txt(Types.text_general, true, true, true, "h6 header"),
htags_i(Types.integer, true, true, "binary pattern for the existance of h1..h6 headlines"),
canonical_s(Types.string, true, true, "url inside the canonical link element"),
metagenerator_t(Types.text_general, true, true, "content of <meta name=\"generator\" content=#content#> tag"),
boldcount_i(Types.integer, true, true, "total number of occurrences of <b> or <strong>"),
bold_txt(Types.text_general, true, true, true, "all texts inside of <b> or <strong> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
bold_val(Types.integer, true, true, true, "number of occurrences of texts in bold_txt"),
italiccount_i(Types.integer, true, true, "total number of occurrences of <i>"),
italic_txt(Types.text_general, true, true, true, "all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
italic_val(Types.integer, true, true, true, "number of occurrences of texts in italic_txt"),
licount_i(Types.integer, true, true, "number of <li> tags"),
li_txt(Types.text_general, true, true, true, "all texts in <li> tags"),
imagescount_i(Types.integer, true, true, "number of images"),
images_tag_txt(Types.text_general, true, true, true, " all image tags, encoded as <img> tag inclusive alt- and title property"),
images_protocol_txt(Types.text_general, true, true, true, "all image link protocols"),
images_urlstub_txt(Types.text_general, true, true, true, "all image links without the protocol and '://'"),
images_alt_txt(Types.text_general, true, true, true, "all image link alt tag"),
csscount_i(Types.integer, true, true, "number of entries in css_tag_txt and css_url_txt"),
css_tag_txt(Types.text_general, true, true, true, "full css tag with normalized url"),
css_url_txt(Types.text_general, true, true, true, "normalized urls within a css tag"),
scripts_txt(Types.text_general, true, true, true, "normaluzed urls within a scripts tag"),
scriptscount_i(Types.integer, true, true, "number of entries in scripts_txt"),
frames_txt(Types.text_general, true, true, true, "list of all links to frames"),
framesscount_i(Types.integer, true, true, "number of frames_txt"),
iframes_txt(Types.text_general, true, true, true, "list of all links to iframes"),
iframesscount_i(Types.integer, true, true, "number of iframes_txt"),
flash_b(Types.bool, true, true, "flag that shows if a swf file is linked"),
responsetime_i(Types.integer, true, true, "response time of target server in milliseconds"),
ext_cms_txt(Types.text_general, true, true, true, "names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias"),
ext_cms_val(Types.integer, true, true, true, "number of attributes that count for a specific cms in ext_cms_txt"),
ext_ads_txt(Types.text_general, true, true, true, "names of ad-servers/ad-services"),
ext_ads_val(Types.integer, true, true, true, "number of attributes counts in ext_ads_txt"),
ext_community_txt(Types.text_general, true, true, true, "names of recognized community functions"),
ext_community_val(Types.integer, true, true, true, "number of attribute counts in attr_community"),
ext_maps_txt(Types.text_general, true, true, true, "names of map services"),
ext_maps_val(Types.integer, true, true, true, "number of attribute counts in ext_maps_txt"),
ext_tracker_txt(Types.text_general, true, true, true, "names of tracker server"),
ext_tracker_val(Types.integer, true, true, true, "number of attribute counts in ext_tracker_txt"),
ext_title_txt(Types.text_general, true, true, true, "names matching title expressions"),
ext_title_val(Types.integer, true, true, true, "number of matching title expressions"),
failreason_t(Types.text_general, true, true, "fail reason if a page was not loaded. if the page was loaded then this field is empty");
final Types type;
final boolean indexed, stored;
boolean multiValued, omitNorms;
final String comment;
private Field(final Types type, final boolean indexed, final boolean stored, final String comment) {
this.type = type;
this.indexed = indexed;
this.stored = stored;
this.multiValued = false;
this.omitNorms = false;
this.comment = comment;
}
private Field(final Types type, final boolean indexed, final boolean stored, final boolean multiValued, final String comment) {
this(type, indexed, stored, comment);
this.multiValued = multiValued;
}
private Field(final Types type, final boolean indexed, final boolean stored, final boolean multiValued, final boolean omitNorms, final String comment) {
this(type, indexed, stored, multiValued, comment);
this.omitNorms = omitNorms;
}
public final Types getType() {
return this.type;
}
public final boolean isIndexed() {
return this.indexed;
}
public final boolean isStored() {
return this.stored;
}
public final boolean isMultiValued() {
return this.multiValued;
}
public final boolean isOmitNorms() {
return this.omitNorms;
}
public final String getComment() {
return this.comment;
}
}
public SolrInputDocument yacy2solr(final String id, final ResponseHeader header, final Document yacydoc) {
// we user the SolrCell design as index scheme
final SolrInputDocument solrdoc = new SolrInputDocument();
final DigestURI digestURI = new DigestURI(yacydoc.dc_source());
addSolr(solrdoc, Field.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before)
addSolr(solrdoc, Field.id, id);
addSolr(solrdoc, Field.sku, digestURI.toNormalform(true, false));
addSolr(solrdoc, SolrField.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before)
addSolr(solrdoc, SolrField.id, id);
addSolr(solrdoc, SolrField.sku, digestURI.toNormalform(true, false));
final InetAddress address = digestURI.getInetAddress();
if (address != null) addSolr(solrdoc, Field.ip_s, address.getHostAddress());
if (digestURI.getHost() != null) addSolr(solrdoc, Field.host_s, digestURI.getHost());
addSolr(solrdoc, Field.title, yacydoc.dc_title());
addSolr(solrdoc, Field.author, yacydoc.dc_creator());
addSolr(solrdoc, Field.description, yacydoc.dc_description());
addSolr(solrdoc, Field.content_type, yacydoc.dc_format());
addSolr(solrdoc, Field.last_modified, header.lastModified());
addSolr(solrdoc, Field.keywords, yacydoc.dc_subject(' '));
if (address != null) addSolr(solrdoc, SolrField.ip_s, address.getHostAddress());
if (digestURI.getHost() != null) addSolr(solrdoc, SolrField.host_s, digestURI.getHost());
addSolr(solrdoc, SolrField.title, yacydoc.dc_title());
addSolr(solrdoc, SolrField.author, yacydoc.dc_creator());
addSolr(solrdoc, SolrField.description, yacydoc.dc_description());
addSolr(solrdoc, SolrField.content_type, yacydoc.dc_format());
addSolr(solrdoc, SolrField.last_modified, header.lastModified());
addSolr(solrdoc, SolrField.keywords, yacydoc.dc_subject(' '));
final String content = UTF8.String(yacydoc.getTextBytes());
addSolr(solrdoc, Field.text_t, content);
if (isEmpty() || contains(Field.wordcount_i.name())) {
addSolr(solrdoc, SolrField.text_t, content);
if (isEmpty() || contains(SolrField.wordcount_i.name())) {
final int contentwc = content.split(" ").length;
addSolr(solrdoc, Field.wordcount_i, contentwc);
addSolr(solrdoc, SolrField.wordcount_i, contentwc);
}
// path elements of link
final String path = digestURI.getPath();
if (path != null && (isEmpty() || contains(Field.paths_txt.name()))) {
if (path != null && (isEmpty() || contains(SolrField.paths_txt.name()))) {
final String[] paths = path.split("/");
if (paths.length > 0) addSolr(solrdoc, Field.paths_txt, paths);
if (paths.length > 0) addSolr(solrdoc, SolrField.paths_txt, paths);
}
// get list of all links; they will be shrinked by urls that appear in other fields of the solr scheme
@ -322,17 +156,17 @@ public class SolrScheme extends ConfigurationSet {
int f = 1;
String[] hs;
hs = html.getHeadlines(1); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h1_txt, hs);
hs = html.getHeadlines(2); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h2_txt, hs);
hs = html.getHeadlines(3); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h3_txt, hs);
hs = html.getHeadlines(4); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h4_txt, hs);
hs = html.getHeadlines(5); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h5_txt, hs);
hs = html.getHeadlines(6); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, Field.h6_txt, hs);
hs = html.getHeadlines(1); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, SolrField.h1_txt, hs);
hs = html.getHeadlines(2); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, SolrField.h2_txt, hs);
hs = html.getHeadlines(3); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, SolrField.h3_txt, hs);
hs = html.getHeadlines(4); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, SolrField.h4_txt, hs);
hs = html.getHeadlines(5); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, SolrField.h5_txt, hs);
hs = html.getHeadlines(6); h = h | (hs.length > 0 ? f : 0); f = f * 2; addSolr(solrdoc, SolrField.h6_txt, hs);
addSolr(solrdoc, Field.htags_i, h);
addSolr(solrdoc, SolrField.htags_i, h);
// canonical tag
if (html.getCanonical() != null) addSolr(solrdoc, Field.canonical_s, html.getCanonical().toNormalform(false, false));
if (html.getCanonical() != null) addSolr(solrdoc, SolrField.canonical_s, html.getCanonical().toNormalform(false, false));
// noindex and nofollow attributes
// from HTML (meta-tag in HTML header: robots)
@ -366,32 +200,32 @@ public class SolrScheme extends ConfigurationSet {
if (x_robots_tag.indexOf("nofollow",0) >= 0) b += 2048; // set bit 11
if (x_robots_tag.indexOf("unavailable_after",0) >=0) b += 4096; // set bit 12
}
addSolr(solrdoc, Field.robots_i, b);
addSolr(solrdoc, SolrField.robots_i, b);
// meta tags: generator
final String generator = html.getMetas().get("generator");
if (generator != null) addSolr(solrdoc, Field.metagenerator_t, generator);
if (generator != null) addSolr(solrdoc, SolrField.metagenerator_t, generator);
// bold, italic
final String[] bold = html.getBold();
addSolr(solrdoc, Field.boldcount_i, bold.length);
addSolr(solrdoc, SolrField.boldcount_i, bold.length);
if (bold.length > 0) {
addSolr(solrdoc, Field.bold_txt, bold);
if (isEmpty() || contains(Field.bold_val.name())) {
addSolr(solrdoc, Field.bold_val, html.getBoldCount(bold));
addSolr(solrdoc, SolrField.bold_txt, bold);
if (isEmpty() || contains(SolrField.bold_val.name())) {
addSolr(solrdoc, SolrField.bold_val, html.getBoldCount(bold));
}
}
final String[] italic = html.getItalic();
addSolr(solrdoc, Field.italiccount_i, italic.length);
addSolr(solrdoc, SolrField.italiccount_i, italic.length);
if (italic.length > 0) {
addSolr(solrdoc, Field.italic_txt, italic);
if (isEmpty() || contains(Field.italic_val.name())) {
addSolr(solrdoc, Field.italic_val, html.getItalicCount(italic));
addSolr(solrdoc, SolrField.italic_txt, italic);
if (isEmpty() || contains(SolrField.italic_val.name())) {
addSolr(solrdoc, SolrField.italic_val, html.getItalicCount(italic));
}
}
final String[] li = html.getLi();
addSolr(solrdoc, Field.licount_i, li.length);
if (li.length > 0) addSolr(solrdoc, Field.li_txt, li);
addSolr(solrdoc, SolrField.licount_i, li.length);
if (li.length > 0) addSolr(solrdoc, SolrField.li_txt, li);
// images
final Collection<ImageEntry> imagesc = html.getImages().values();
@ -410,14 +244,14 @@ public class SolrScheme extends ConfigurationSet {
imgalts[c] = ie.alt();
c++;
}
addSolr(solrdoc, Field.imagescount_i, imgtags.length);
if (isEmpty() || contains(Field.images_tag_txt.name())) addSolr(solrdoc, Field.images_tag_txt, imgtags);
if (isEmpty() || contains(Field.images_protocol_txt.name())) addSolr(solrdoc, Field.images_protocol_txt, protocolList2indexedList(imgprots));
if (isEmpty() || contains(Field.images_urlstub_txt.name())) addSolr(solrdoc, Field.images_urlstub_txt, imgstubs);
if (isEmpty() || contains(Field.images_alt_txt.name())) addSolr(solrdoc, Field.images_alt_txt, imgalts);
addSolr(solrdoc, SolrField.imagescount_i, imgtags.length);
if (isEmpty() || contains(SolrField.images_tag_txt.name())) addSolr(solrdoc, SolrField.images_tag_txt, imgtags);
if (isEmpty() || contains(SolrField.images_protocol_txt.name())) addSolr(solrdoc, SolrField.images_protocol_txt, protocolList2indexedList(imgprots));
if (isEmpty() || contains(SolrField.images_urlstub_txt.name())) addSolr(solrdoc, SolrField.images_urlstub_txt, imgstubs);
if (isEmpty() || contains(SolrField.images_alt_txt.name())) addSolr(solrdoc, SolrField.images_alt_txt, imgalts);
// style sheets
if (isEmpty() || contains(Field.css_tag_txt.name())) {
if (isEmpty() || contains(SolrField.css_tag_txt.name())) {
final Map<MultiProtocolURI, String> csss = html.getCSS();
final String[] css_tag = new String[csss.size()];
final String[] css_url = new String[csss.size()];
@ -432,13 +266,13 @@ public class SolrScheme extends ConfigurationSet {
css_url[c] = url;
c++;
}
addSolr(solrdoc, Field.csscount_i, css_tag.length);
if (css_tag.length > 0) addSolr(solrdoc, Field.css_tag_txt, css_tag);
if (css_url.length > 0) addSolr(solrdoc, Field.css_url_txt, css_url);
addSolr(solrdoc, SolrField.csscount_i, css_tag.length);
if (css_tag.length > 0) addSolr(solrdoc, SolrField.css_tag_txt, css_tag);
if (css_url.length > 0) addSolr(solrdoc, SolrField.css_url_txt, css_url);
}
// Scripts
if (isEmpty() || contains(Field.scripts_txt.name())) {
if (isEmpty() || contains(SolrField.scripts_txt.name())) {
final Set<MultiProtocolURI> scriptss = html.getScript();
final String[] scripts = new String[scriptss.size()];
c = 0;
@ -447,12 +281,12 @@ public class SolrScheme extends ConfigurationSet {
ouboundLinks.remove(url);
scripts[c++] = url.toNormalform(false, false, false, false);
}
addSolr(solrdoc, Field.scriptscount_i, scripts.length);
if (scripts.length > 0) addSolr(solrdoc, Field.scripts_txt, scripts);
addSolr(solrdoc, SolrField.scriptscount_i, scripts.length);
if (scripts.length > 0) addSolr(solrdoc, SolrField.scripts_txt, scripts);
}
// Frames
if (isEmpty() || contains(Field.frames_txt.name())) {
if (isEmpty() || contains(SolrField.frames_txt.name())) {
final Set<MultiProtocolURI> framess = html.getFrames();
final String[] frames = new String[framess.size()];
c = 0;
@ -461,12 +295,12 @@ public class SolrScheme extends ConfigurationSet {
ouboundLinks.remove(url);
frames[c++] = url.toNormalform(false, false, false, false);
}
addSolr(solrdoc, Field.framesscount_i, frames.length);
if (frames.length > 0) addSolr(solrdoc, Field.frames_txt, frames);
addSolr(solrdoc, SolrField.framesscount_i, frames.length);
if (frames.length > 0) addSolr(solrdoc, SolrField.frames_txt, frames);
}
// IFrames
if (isEmpty() || contains(Field.iframes_txt.name())) {
if (isEmpty() || contains(SolrField.iframes_txt.name())) {
final Set<MultiProtocolURI> iframess = html.getIFrames();
final String[] iframes = new String[iframess.size()];
c = 0;
@ -475,33 +309,33 @@ public class SolrScheme extends ConfigurationSet {
ouboundLinks.remove(url);
iframes[c++] = url.toNormalform(false, false, false, false);
}
addSolr(solrdoc, Field.iframesscount_i, iframes.length);
if (iframes.length > 0) addSolr(solrdoc, Field.iframes_txt, iframes);
addSolr(solrdoc, SolrField.iframesscount_i, iframes.length);
if (iframes.length > 0) addSolr(solrdoc, SolrField.iframes_txt, iframes);
}
// flash embedded
addSolr(solrdoc, Field.flash_b, html.containsFlash());
addSolr(solrdoc, SolrField.flash_b, html.containsFlash());
// generic evaluation pattern
for (final String model: html.getEvaluationModelNames()) {
if (isEmpty() || contains("ext_" + model + "_txt")) {
final String[] scorenames = html.getEvaluationModelScoreNames(model);
if (scorenames.length > 0) {
addSolr(solrdoc, Field.valueOf("ext_" + model + "_txt"), scorenames);
addSolr(solrdoc, Field.valueOf("ext_" + model + "_val"), html.getEvaluationModelScoreCounts(model, scorenames));
addSolr(solrdoc, SolrField.valueOf("ext_" + model + "_txt"), scorenames);
addSolr(solrdoc, SolrField.valueOf("ext_" + model + "_val"), html.getEvaluationModelScoreCounts(model, scorenames));
}
}
}
// response time
addSolr(solrdoc, Field.responsetime_i, header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"));
addSolr(solrdoc, SolrField.responsetime_i, header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"));
}
// list all links
final Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors();
c = 0;
if (isEmpty() || contains(Field.inboundlinkscount_i.name())) addSolr(solrdoc, Field.inboundlinkscount_i, inboundLinks.size());
if (isEmpty() || contains(Field.inboundlinksnofollowcount_i.name())) addSolr(solrdoc, Field.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount());
if (isEmpty() || contains(SolrField.inboundlinkscount_i.name())) addSolr(solrdoc, SolrField.inboundlinkscount_i, inboundLinks.size());
if (isEmpty() || contains(SolrField.inboundlinksnofollowcount_i.name())) addSolr(solrdoc, SolrField.inboundlinksnofollowcount_i, yacydoc.inboundLinkNofollowCount());
final String[] inboundlinksTag = new String[inboundLinks.size()];
final String[] inboundlinksURLProtocol = new String[inboundLinks.size()];
final String[] inboundlinksURLStub = new String[inboundLinks.size()];
@ -528,17 +362,17 @@ public class SolrScheme extends ConfigurationSet {
((text.length() > 0) ? text : "") + "</a>";
c++;
}
if (isEmpty() || contains(Field.inboundlinks_tag_txt.name())) addSolr(solrdoc, Field.inboundlinks_tag_txt, inboundlinksTag);
if (isEmpty() || contains(Field.inboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.inboundlinks_protocol_txt, protocolList2indexedList(inboundlinksURLProtocol));
if (isEmpty() || contains(Field.inboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.inboundlinks_urlstub_txt, inboundlinksURLStub);
if (isEmpty() || contains(Field.inboundlinks_name_txt.name())) addSolr(solrdoc, Field.inboundlinks_name_txt, inboundlinksName);
if (isEmpty() || contains(Field.inboundlinks_rel_txt.name())) addSolr(solrdoc, Field.inboundlinks_rel_txt, inboundlinksRel);
if (isEmpty() || contains(Field.inboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.inboundlinks_relflags_txt, relEval(inboundlinksRel));
if (isEmpty() || contains(Field.inboundlinks_text_txt.name())) addSolr(solrdoc, Field.inboundlinks_text_txt, inboundlinksText);
if (isEmpty() || contains(SolrField.inboundlinks_tag_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_tag_txt, inboundlinksTag);
if (isEmpty() || contains(SolrField.inboundlinks_protocol_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_protocol_txt, protocolList2indexedList(inboundlinksURLProtocol));
if (isEmpty() || contains(SolrField.inboundlinks_urlstub_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_urlstub_txt, inboundlinksURLStub);
if (isEmpty() || contains(SolrField.inboundlinks_name_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_name_txt, inboundlinksName);
if (isEmpty() || contains(SolrField.inboundlinks_rel_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_rel_txt, inboundlinksRel);
if (isEmpty() || contains(SolrField.inboundlinks_relflags_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_relflags_txt, relEval(inboundlinksRel));
if (isEmpty() || contains(SolrField.inboundlinks_text_txt.name())) addSolr(solrdoc, SolrField.inboundlinks_text_txt, inboundlinksText);
c = 0;
if (isEmpty() || contains(Field.outboundlinkscount_i.name())) addSolr(solrdoc, Field.outboundlinkscount_i, ouboundLinks.size());
if (isEmpty() || contains(Field.outboundlinksnofollowcount_i.name())) addSolr(solrdoc, Field.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount());
if (isEmpty() || contains(SolrField.outboundlinkscount_i.name())) addSolr(solrdoc, SolrField.outboundlinkscount_i, ouboundLinks.size());
if (isEmpty() || contains(SolrField.outboundlinksnofollowcount_i.name())) addSolr(solrdoc, SolrField.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount());
final String[] outboundlinksTag = new String[ouboundLinks.size()];
final String[] outboundlinksURLProtocol = new String[ouboundLinks.size()];
final String[] outboundlinksURLStub = new String[ouboundLinks.size()];
@ -565,24 +399,24 @@ public class SolrScheme extends ConfigurationSet {
((text.length() > 0) ? text : "") + "</a>";
c++;
}
if (isEmpty() || contains(Field.outboundlinks_tag_txt.name())) addSolr(solrdoc, Field.outboundlinks_tag_txt, outboundlinksTag);
if (isEmpty() || contains(Field.outboundlinks_protocol_txt.name())) addSolr(solrdoc, Field.outboundlinks_protocol_txt, protocolList2indexedList(outboundlinksURLProtocol));
if (isEmpty() || contains(Field.outboundlinks_urlstub_txt.name())) addSolr(solrdoc, Field.outboundlinks_urlstub_txt, outboundlinksURLStub);
if (isEmpty() || contains(Field.outboundlinks_name_txt.name())) addSolr(solrdoc, Field.outboundlinks_name_txt, outboundlinksName);
if (isEmpty() || contains(Field.outboundlinks_rel_txt.name())) addSolr(solrdoc, Field.outboundlinks_rel_txt, outboundlinksRel);
if (isEmpty() || contains(Field.outboundlinks_relflags_txt.name())) addSolr(solrdoc, Field.outboundlinks_relflags_txt, relEval(inboundlinksRel));
if (isEmpty() || contains(Field.outboundlinks_text_txt.name())) addSolr(solrdoc, Field.outboundlinks_text_txt, outboundlinksText);
if (isEmpty() || contains(SolrField.outboundlinks_tag_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_tag_txt, outboundlinksTag);
if (isEmpty() || contains(SolrField.outboundlinks_protocol_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_protocol_txt, protocolList2indexedList(outboundlinksURLProtocol));
if (isEmpty() || contains(SolrField.outboundlinks_urlstub_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_urlstub_txt, outboundlinksURLStub);
if (isEmpty() || contains(SolrField.outboundlinks_name_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_name_txt, outboundlinksName);
if (isEmpty() || contains(SolrField.outboundlinks_rel_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_rel_txt, outboundlinksRel);
// the rel flags of the outbound links must be computed from the outbound rel values (the inbound array was passed here by copy-paste mistake)
if (isEmpty() || contains(SolrField.outboundlinks_relflags_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_relflags_txt, relEval(outboundlinksRel));
if (isEmpty() || contains(SolrField.outboundlinks_text_txt.name())) addSolr(solrdoc, SolrField.outboundlinks_text_txt, outboundlinksText);
// charset
addSolr(solrdoc, Field.charset_s, yacydoc.getCharset());
addSolr(solrdoc, SolrField.charset_s, yacydoc.getCharset());
// coordinates
if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
addSolr(solrdoc, Field.lon_coordinate, yacydoc.lon());
addSolr(solrdoc, Field.lat_coordinate, yacydoc.lat());
addSolr(solrdoc, SolrField.lon_coordinate, yacydoc.lon());
addSolr(solrdoc, SolrField.lat_coordinate, yacydoc.lat());
}
addSolr(solrdoc, Field.httpstatus_i, 200);
addSolr(solrdoc, SolrField.httpstatus_i, 200);
return solrdoc;
}

@ -502,7 +502,7 @@ public class SnippetProcess {
sd = sdl.get(0);
}
if (sd != null) {
solrContent = this.solr.getScheme().solrGetText(sd);
solrContent = Switchboard.getSwitchboard().solrScheme.solrGetText(sd);
}
}

Loading…
Cancel
Save