diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index 0bec57872..ed6392b50 100644 --- a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -141,7 +141,7 @@ attr_images ## number of images, int imagescount_i -## flag that shows if a swf file is linked, boolen +## flag that shows if a swf file is linked, boolean flash_b ## list of all links to frames, textgen diff --git a/defaults/yacy.init b/defaults/yacy.init index aa8e7f955..f962a24db 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -1029,4 +1029,4 @@ federated.service.yacy.indexing.enabled = true federated.service.solr.indexing.enabled = false federated.service.solr.indexing.url = http://127.0.0.1:8983/solr federated.service.solr.indexing.charding = MODULO_HOST_MD5 -federated.service.solr.indexing.scheme = SolrCellExtended +federated.service.solr.indexing.schemefile = solr.keys.default.list diff --git a/htroot/IndexFederated_p.html b/htroot/IndexFederated_p.html index 6888725db..86f735e18 100644 --- a/htroot/IndexFederated_p.html +++ b/htroot/IndexFederated_p.html @@ -21,6 +21,7 @@ You can just switch on or off this index. If you switch it off, you will not be able to search with YaCy any more. +
@@ -55,9 +56,26 @@
Charding Method
Scheme
-
+
+
+

Index Scheme

+ + + + + + + #{scheme}# + + + + + + #{/scheme}# +
ActiveAttributeComment
#[key]##[comment]#
+
diff --git a/htroot/IndexFederated_p.java b/htroot/IndexFederated_p.java index 54b39fe0d..627093db0 100644 --- a/htroot/IndexFederated_p.java +++ b/htroot/IndexFederated_p.java @@ -11,25 +11,27 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . */ +import java.io.File; import java.io.IOException; +import java.util.Iterator; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.services.federated.solr.SolrChardingConnector; import net.yacy.cora.services.federated.solr.SolrChardingSelection; import net.yacy.cora.services.federated.solr.SolrScheme; +import net.yacy.cora.storage.ConfigurationSet; import net.yacy.kelondro.logging.Log; - import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -39,47 +41,65 @@ public class IndexFederated_p { public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { // return variable that accumulates replacements final serverObjects prop = new serverObjects(); - Switchboard sb = (Switchboard) env; + final Switchboard sb = (Switchboard) env; if (post != null && post.containsKey("set")) { // yacy env.setConfig("federated.service.yacy.indexing.enabled", post.getBoolean("yacy.indexing.enabled", false)); - + // solr - boolean solrWasOn = env.getConfigBool("federated.service.solr.indexing.enabled", true); - boolean solrIsOnAfterwards = post.getBoolean("solr.indexing.enabled", false); + final boolean solrWasOn = env.getConfigBool("federated.service.solr.indexing.enabled", true); + final boolean solrIsOnAfterwards = post.getBoolean("solr.indexing.enabled", false); env.setConfig("federated.service.solr.indexing.enabled", solrIsOnAfterwards); env.setConfig("federated.service.solr.indexing.url", post.get("solr.indexing.url", env.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr"))); env.setConfig("federated.service.solr.indexing.charding", post.get("solr.indexing.charding", env.getConfig("federated.service.solr.indexing.charding", "modulo-host-md5"))); - env.setConfig("federated.service.solr.indexing.scheme", post.get("solr.indexing.scheme", env.getConfig("federated.service.solr.indexing.scheme", "SolrCellExtended"))); + env.setConfig("federated.service.solr.indexing.schemefile", post.get("solr.indexing.schemefile", env.getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list"))); if (solrWasOn && !solrIsOnAfterwards) { // switch off sb.solrConnector.close(); sb.solrConnector = null; } - + if (!solrWasOn && solrIsOnAfterwards) { // switch on - String solrurls = sb.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr"); - boolean usesolr = sb.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0; + final String solrurls = sb.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr"); + final boolean usesolr = sb.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0; + final SolrScheme scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/solr.keys.default.list")); try { - sb.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, SolrScheme.SolrCellExtended, SolrChardingSelection.Method.MODULO_HOST_MD5) : null; - } catch (IOException e) { + sb.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null; + } catch (final IOException e) { Log.logException(e); sb.solrConnector = null; } } + + // read index scheme table flags + final SolrScheme scheme = sb.solrConnector.getScheme(); + final Iterator i = scheme.allIterator(); + ConfigurationSet.Entry entry; + while (i.hasNext()) { + entry = i.next(); + final String v = post.get("scheme_" + entry.key()); + final boolean c = v != null && v.equals("checked"); + try { + if (entry.enabled()) { + if (!c) scheme.disable(entry.key()); + } else { + if (c) scheme.enable(entry.key()); + } + } catch (final IOException e) {} + } } - + // show solr host table if (sb.solrConnector == null) { prop.put("table", 0); } else { prop.put("table", 1); try { - long[] size = sb.solrConnector.getSizeList(); - String[] urls = sb.solrConnector.getAdminInterfaceList(); + final long[] size = sb.solrConnector.getSizeList(); + final String[] urls = sb.solrConnector.getAdminInterfaceList(); boolean dark = false; for (int i = 0; i < size.length; i++) { prop.put("table_list_" + i + "_dark", dark ? 1 : 0); dark = !dark; @@ -87,18 +107,34 @@ public class IndexFederated_p { prop.put("table_list_" + i + "_size", size[i]); } prop.put("table_list", size.length); - } catch (IOException e) { + + // write scheme + final SolrScheme scheme = sb.solrConnector.getScheme(); + final Iterator i = scheme.allIterator(); + int c = 0; + dark = false; + ConfigurationSet.Entry entry; + while (i.hasNext()) { + entry = i.next(); + prop.put("scheme_" + c + "_dark", dark ? 1 : 0); dark = !dark; + prop.put("scheme_" + c + "_checked", scheme.contains(entry.key()) ? 1 : 0); + prop.putHTML("scheme_" + c + "_key", entry.key()); + prop.putHTML("scheme_" + c + "_comment", scheme.commentHeadline(entry.key())); + c++; + } + prop.put("scheme", c); + } catch (final IOException e) { Log.logException(e); prop.put("table", 0); } } - + // fill attribute fields prop.put("yacy.indexing.enabled.checked", env.getConfigBool("federated.service.yacy.indexing.enabled", true) ? 1 : 0); prop.put("solr.indexing.enabled.checked", env.getConfigBool("federated.service.solr.indexing.enabled", false) ? 1 : 0); prop.put("solr.indexing.url", env.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr")); prop.put("solr.indexing.charding", env.getConfig("federated.service.solr.indexing.charding", "modulo-host-md5")); - prop.put("solr.indexing.scheme", env.getConfig("federated.service.solr.indexing.scheme", "SolrCellExtended")); + prop.put("solr.indexing.schemefile", env.getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list")); // return rewrite properties return prop; diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 91c6e8334..6657def71 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -560,11 +560,16 @@ public final class Switchboard extends serverSwitch { this.log.logConfig("Parser: Initializing Mime Type deny list"); TextParser.setDenyMime(getConfig(SwitchboardConstants.PARSER_MIME_DENY, "")); + // prepare a solr index profile switch list + final File solrWorkProfile = new File(getDataPath(), "DATA/SETTINGS/solr.keys.default.list"); + if (!solrWorkProfile.exists()) FileUtils.copy(new File("defaults/solr.keys.list"), solrWorkProfile); + final SolrScheme scheme = new SolrScheme(solrWorkProfile); + // set up the solr interface final String solrurls = getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr"); final boolean usesolr = getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0; try { - this.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, SolrScheme.SolrCellExtended, SolrChardingSelection.Method.MODULO_HOST_MD5) : null; + this.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null; } catch (final IOException e) { Log.logException(e); this.solrConnector = null; diff --git a/source/net/yacy/cora/services/federated/solr/SolrChardingConnector.java b/source/net/yacy/cora/services/federated/solr/SolrChardingConnector.java index 34be34b52..d4287a2a0 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrChardingConnector.java +++ b/source/net/yacy/cora/services/federated/solr/SolrChardingConnector.java @@ -55,6 +55,10 @@ public class SolrChardingConnector { this.scheme = scheme; } + public SolrScheme getScheme() { + return this.scheme; + } + public void close() { for (final SolrSingleConnector connector: this.connectors) connector.close(); } diff --git a/source/net/yacy/cora/services/federated/solr/SolrScheme.java b/source/net/yacy/cora/services/federated/solr/SolrScheme.java index 98dda52ca..7b548221a 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrScheme.java +++ b/source/net/yacy/cora/services/federated/solr/SolrScheme.java @@ -11,12 +11,12 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . @@ -25,211 +25,271 @@ package net.yacy.cora.services.federated.solr; +import java.io.File; import java.net.InetAddress; import java.util.Collection; +import java.util.Date; import java.util.Map; import java.util.Properties; import java.util.Set; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.ResponseHeader; +import net.yacy.cora.storage.ConfigurationSet; import net.yacy.document.Document; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.cora.document.MultiProtocolURI; + import org.apache.solr.common.SolrInputDocument; -public enum SolrScheme { +public class SolrScheme extends ConfigurationSet { + + /** + * initialize with an empty ConfigurationSet which will cause that all the index + * attributes are used + */ + public SolrScheme() { + super(); + } + + /** + * initialize the scheme with a given configuration file + * the configuration file simply contains a list of lines with keywords + * @param configurationFile + */ + public SolrScheme(final File configurationFile) { + super(configurationFile); + } - SolrCell, - SolrCellExtended, - DublinCore; + private void addSolr(final SolrInputDocument solrdoc, final String key, final String value) { + if (isEmpty() || contains(key)) solrdoc.setField(key, value); + } - - public SolrInputDocument yacy2solr(String id, ResponseHeader header, Document document) { - if (this == SolrCellExtended) return yacy2solrSolrCellExtended(id, header, document); - return null; + private void addSolr(final SolrInputDocument solrdoc, final String key, final Date value) { + if (isEmpty() || contains(key)) solrdoc.setField(key, value); } - - public static SolrInputDocument yacy2solrSolrCellExtended(String id, ResponseHeader header, Document yacydoc) { + + private void addSolr(final SolrInputDocument solrdoc, final String key, final int value) { + if (isEmpty() || contains(key)) solrdoc.setField(key, value); + } + + private void addSolr(final SolrInputDocument solrdoc, final String key, final String[] value) { + if (isEmpty() || contains(key)) solrdoc.setField(key, value); + } + + private void addSolr(final SolrInputDocument solrdoc, final String key, final float value) { + if (isEmpty() || contains(key)) solrdoc.setField(key, value); + } + + private void addSolr(final SolrInputDocument solrdoc, final String key, final boolean value) { + if (isEmpty() || contains(key)) solrdoc.setField(key, value); + } + + private void addSolr(final SolrInputDocument solrdoc, final String key, final String value, final float boost) { + if (isEmpty() || contains(key)) solrdoc.setField(key, value, boost); + } + + public SolrInputDocument yacy2solr(final String id, final ResponseHeader header, final Document yacydoc) { // we user the SolrCell design as index scheme - SolrInputDocument solrdoc = new SolrInputDocument(); - DigestURI digestURI = new DigestURI(yacydoc.dc_source()); - solrdoc.addField("failreason_t", ""); // overwrite a possible fail reason (in case that there was a fail reason before) - solrdoc.addField("id", id); - solrdoc.addField("sku", digestURI.toNormalform(true, false), 3.0f); - InetAddress address = Domains.dnsResolve(digestURI.getHost()); - if (address != null) solrdoc.addField("ip_s", address.getHostAddress()); - if (digestURI.getHost() != null) solrdoc.addField("host_s", digestURI.getHost()); - solrdoc.addField("title", yacydoc.dc_title()); - solrdoc.addField("author", yacydoc.dc_creator()); - solrdoc.addField("description", yacydoc.dc_description()); - solrdoc.addField("content_type", yacydoc.dc_format()); - solrdoc.addField("last_modified", header.lastModified()); - solrdoc.addField("keywords", yacydoc.dc_subject(' ')); - String content = UTF8.String(yacydoc.getTextBytes()); - solrdoc.addField("text_t", content); - int contentwc = content.split(" ").length; - solrdoc.addField("wordcount_i", contentwc); + final SolrInputDocument solrdoc = new SolrInputDocument(); + final DigestURI digestURI = new DigestURI(yacydoc.dc_source()); + addSolr(solrdoc, "failreason_t", ""); // overwrite a possible fail reason (in case that there was a fail reason before) + addSolr(solrdoc, "id", id); + addSolr(solrdoc, "sku", digestURI.toNormalform(true, false), 3.0f); + final InetAddress address = Domains.dnsResolve(digestURI.getHost()); + if (address != null) addSolr(solrdoc, "ip_s", address.getHostAddress()); + if (digestURI.getHost() != null) addSolr(solrdoc, "host_s", digestURI.getHost()); + addSolr(solrdoc, "title", yacydoc.dc_title()); + addSolr(solrdoc, "author", yacydoc.dc_creator()); + addSolr(solrdoc, "description", yacydoc.dc_description()); + addSolr(solrdoc, "content_type", yacydoc.dc_format()); + addSolr(solrdoc, "last_modified", header.lastModified()); + addSolr(solrdoc, "keywords", yacydoc.dc_subject(' ')); + final String content = UTF8.String(yacydoc.getTextBytes()); + addSolr(solrdoc, "text_t", content); + if (contains("wordcount_i")) { + final int contentwc = content.split(" ").length; + addSolr(solrdoc, "wordcount_i", contentwc); + } // path elements of link - String path = digestURI.getPath(); - if (path != null) { - String[] paths = path.split("/"); - if (paths.length > 0) solrdoc.addField("attr_paths", paths); + final String path = digestURI.getPath(); + if (path != null && contains("attr_paths")) { + final String[] paths = path.split("/"); + if (paths.length > 0) addSolr(solrdoc, "attr_paths", paths); } - + // list all links - Map alllinks = yacydoc.getAnchors(); + final Map alllinks = yacydoc.getAnchors(); int c = 0; - String[] inboundlinks = new String[yacydoc.inboundLinkCount()]; - solrdoc.addField("inboundlinkscount_i", inboundlinks.length); - for (MultiProtocolURI url: yacydoc.inboundLinks()) { - Properties p = alllinks.get(url); - String name = p.getProperty("name", ""); - String rel = p.getProperty("rel", ""); - inboundlinks[c++] = - "" + - ((name.length() > 0) ? name : "") + ""; + addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount()); + if (contains("attr_inboundlinks")) { + final String[] inboundlinks = new String[yacydoc.inboundLinkCount()]; + for (final MultiProtocolURI url: yacydoc.inboundLinks()) { + final Properties p = alllinks.get(url); + final String name = p.getProperty("name", ""); + final String rel = p.getProperty("rel", ""); + inboundlinks[c++] = + "" + + ((name.length() > 0) ? name : "") + ""; + } + addSolr(solrdoc, "attr_inboundlinks", inboundlinks); } - solrdoc.addField("attr_inboundlinks", inboundlinks); c = 0; - String[] outboundlinks = new String[yacydoc.outboundLinkCount()]; - solrdoc.addField("outboundlinkscount_i", outboundlinks.length); - for (MultiProtocolURI url: yacydoc.outboundLinks()) { - Properties p = alllinks.get(url); - String name = p.getProperty("name", ""); - String rel = p.getProperty("rel", ""); - outboundlinks[c++] = - "" + - ((name.length() > 0) ? name : "") + ""; + final String[] outboundlinks = new String[yacydoc.outboundLinkCount()]; + if (contains("attr_outboundlinks")) { + addSolr(solrdoc, "outboundlinkscount_i", outboundlinks.length); + for (final MultiProtocolURI url: yacydoc.outboundLinks()) { + final Properties p = alllinks.get(url); + final String name = p.getProperty("name", ""); + final String rel = p.getProperty("rel", ""); + outboundlinks[c++] = + "" + + ((name.length() > 0) ? name : "") + ""; + } + addSolr(solrdoc, "attr_outboundlinks", outboundlinks); } - solrdoc.addField("attr_outboundlinks", outboundlinks); - // charset - solrdoc.addField("charset_s", yacydoc.getCharset()); + addSolr(solrdoc, "charset_s", yacydoc.getCharset()); // coordinates if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) { - solrdoc.addField("lon_coordinate", yacydoc.lon()); - solrdoc.addField("lat_coordinate", yacydoc.lat()); + addSolr(solrdoc, "lon_coordinate", yacydoc.lon()); + addSolr(solrdoc, "lat_coordinate", yacydoc.lat()); } - solrdoc.addField("httpstatus_i", 200); - Object parser = yacydoc.getParserObject(); + addSolr(solrdoc, "httpstatus_i", 200); + final Object parser = yacydoc.getParserObject(); if (parser instanceof ContentScraper) { - ContentScraper html = (ContentScraper) parser; - + final ContentScraper html = (ContentScraper) parser; + // header tags int h = 0; int f = 1; for (int i = 1; i <= 6; i++) { - String[] hs = html.getHeadlines(i); + final String[] hs = html.getHeadlines(i); h = h | (hs.length > 0 ? f : 0); f = f * 2; - solrdoc.addField("attr_h" + i, hs); + addSolr(solrdoc, "attr_h" + i, hs); } - solrdoc.addField("htags_i", h); + addSolr(solrdoc, "htags_i", h); // meta tags - Map metas = html.getMetas(); - String robots = metas.get("robots"); - if (robots != null) solrdoc.addField("metarobots_t", robots); - String generator = metas.get("generator"); - if (generator != null) solrdoc.addField("metagenerator_t", generator); - + final Map metas = html.getMetas(); + final String robots = metas.get("robots"); + if (robots != null) addSolr(solrdoc, "metarobots_t", robots); + final String generator = metas.get("generator"); + if (generator != null) addSolr(solrdoc, "metagenerator_t", generator); + // bold, italic - String[] bold = html.getBold(); - solrdoc.addField("boldcount_i", bold.length); + final String[] bold = html.getBold(); + addSolr(solrdoc, "boldcount_i", bold.length); if (bold.length > 0) { - solrdoc.addField("attr_bold", bold); - solrdoc.addField("attr_boldcount", html.getBoldCount(bold)); + addSolr(solrdoc, "attr_bold", bold); + if (contains("attr_boldcount")) { + addSolr(solrdoc, "attr_boldcount", html.getBoldCount(bold)); + } } - String[] italic = html.getItalic(); - solrdoc.addField("italiccount_i", italic.length); + final String[] italic = html.getItalic(); + addSolr(solrdoc, "italiccount_i", italic.length); if (italic.length > 0) { - solrdoc.addField("attr_italic", italic); - solrdoc.addField("attr_italiccount", html.getItalicCount(italic)); + addSolr(solrdoc, "attr_italic", italic); + if (contains("attr_italiccount")) { + addSolr(solrdoc, "attr_italiccount", html.getItalicCount(italic)); + } } - String[] li = html.getLi(); - solrdoc.addField("licount_i", li.length); - if (li.length > 0) solrdoc.addField("attr_li", li); - + final String[] li = html.getLi(); + addSolr(solrdoc, "licount_i", li.length); + if (li.length > 0) addSolr(solrdoc, "attr_li", li); + // images - Collection imagesc = html.getImages().values(); - String[] images = new String[imagesc.size()]; - c = 0; - for (ImageEntry ie: imagesc) images[c++] = ie.toString(); - solrdoc.addField("imagescount_i", images.length); - if (images.length > 0) solrdoc.addField("attr_images", images); + if (contains("attr_images")) { + final Collection imagesc = html.getImages().values(); + final String[] images = new String[imagesc.size()]; + c = 0; + for (final ImageEntry ie: imagesc) images[c++] = ie.toString(); + addSolr(solrdoc, "imagescount_i", images.length); + if (images.length > 0) addSolr(solrdoc, "attr_images", images); + } // style sheets - Map csss = html.getCSS(); - String[] css = new String[csss.size()]; - c = 0; - for (Map.Entry entry: csss.entrySet()) { - css[c++] = - ""; + if (contains("attr_css")) { + final Map csss = html.getCSS(); + final String[] css = new String[csss.size()]; + c = 0; + for (final Map.Entry entry: csss.entrySet()) { + css[c++] = + ""; + } + addSolr(solrdoc, "csscount_i", css.length); + if (css.length > 0) addSolr(solrdoc, "attr_css", css); } - solrdoc.addField("csscount_i", css.length); - if (css.length > 0) solrdoc.addField("attr_css", css); - + // Scripts - Set scriptss = html.getScript(); - String[] scripts = new String[scriptss.size()]; - c = 0; - for (MultiProtocolURI url: scriptss) { - scripts[c++] = url.toNormalform(false, false, false, false); + if (contains("attr_scripts")) { + final Set scriptss = html.getScript(); + final String[] scripts = new String[scriptss.size()]; + c = 0; + for (final MultiProtocolURI url: scriptss) { + scripts[c++] = url.toNormalform(false, false, false, false); + } + addSolr(solrdoc, "scriptscount_i", scripts.length); + if (scripts.length > 0) addSolr(solrdoc, "attr_scripts", scripts); } - solrdoc.addField("scriptscount_i", scripts.length); - if (scripts.length > 0) solrdoc.addField("attr_scripts", scripts); - + // Frames - Set framess = html.getFrames(); - String[] frames = new String[framess.size()]; - c = 0; - for (MultiProtocolURI entry: framess) { - frames[c++] = entry.toNormalform(false, false, false, false); + if (contains("attr_frames")) { + final Set framess = html.getFrames(); + final String[] frames = new String[framess.size()]; + c = 0; + for (final MultiProtocolURI entry: framess) { + frames[c++] = entry.toNormalform(false, false, false, false); + } + addSolr(solrdoc, "framesscount_i", frames.length); + if (frames.length > 0) addSolr(solrdoc, "attr_frames", frames); } - solrdoc.addField("framesscount_i", frames.length); - if (frames.length > 0) solrdoc.addField("attr_frames", frames); - + // IFrames - Set iframess = html.getIFrames(); - String[] iframes = new String[iframess.size()]; - c = 0; - for (MultiProtocolURI entry: iframess) { - iframes[c++] = entry.toNormalform(false, false, false, false); + if (contains("attr_iframes")) { + final Set iframess = html.getIFrames(); + final String[] iframes = new String[iframess.size()]; + c = 0; + for (final MultiProtocolURI entry: iframess) { + iframes[c++] = entry.toNormalform(false, false, false, false); + } + addSolr(solrdoc, "iframesscount_i", iframes.length); + if (iframes.length > 0) addSolr(solrdoc, "attr_iframes", iframes); } - solrdoc.addField("iframesscount_i", iframes.length); - if (iframes.length > 0) solrdoc.addField("attr_iframes", iframes); - + // flash embedded - solrdoc.addField("flash_b", html.containsFlash()); - + addSolr(solrdoc, "flash_b", html.containsFlash()); + // generic evaluation pattern - for (String model: html.getEvaluationModelNames()) { - String[] scorenames = html.getEvaluationModelScoreNames(model); - if (scorenames.length > 0) { - solrdoc.addField("attr_" + model, scorenames); - solrdoc.addField("attr_" + model + "count", html.getEvaluationModelScoreCounts(model, scorenames)); + for (final String model: html.getEvaluationModelNames()) { + if (contains("attr_" + model)) { + final String[] scorenames = html.getEvaluationModelScoreNames(model); + if (scorenames.length > 0) { + addSolr(solrdoc, "attr_" + model, scorenames); + addSolr(solrdoc, "attr_" + model + "count", html.getEvaluationModelScoreCounts(model, scorenames)); + } } } - + // response time - solrdoc.addField("responsetime_i", header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0")); + addSolr(solrdoc, "responsetime_i", header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0")); } return solrdoc; } - - + + /* * standard solr scheme diff --git a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java b/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java index 82e549c28..44867c73b 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java +++ b/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java @@ -11,12 +11,12 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . @@ -34,6 +34,13 @@ import java.util.List; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; +import net.yacy.cora.document.ASCII; +import net.yacy.cora.protocol.Domains; +import net.yacy.cora.protocol.ResponseHeader; +import net.yacy.document.Document; +import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.logging.Log; + import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrServer; import org.apache.solr.client.solrj.SolrServerException; @@ -42,38 +49,31 @@ import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrInputDocument; -import net.yacy.cora.document.ASCII; -import net.yacy.cora.protocol.Domains; -import net.yacy.cora.protocol.ResponseHeader; -import net.yacy.document.Document; -import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.logging.Log; - public class SolrSingleConnector { - private String solrurl; + private final String solrurl; private SolrServer server; - private SolrScheme scheme; - + private final SolrScheme scheme; + private final static int transmissionQueueCount = 4; // allow concurrent http sessions to solr private final static int transmissionQueueSize = 50; // number of documents that are collected until a commit is sent - private Worker[] transmissionWorker; // the transmission workers to solr - private BlockingQueue[] transmissionQueue; // the queues quere documents are collected + private final Worker[] transmissionWorker; // the transmission workers to solr + private final BlockingQueue[] transmissionQueue; // the queues quere documents are collected private int transmissionRoundRobinCounter; // a rount robin counter for the transmission queues - + @SuppressWarnings("unchecked") - public SolrSingleConnector(String url, SolrScheme scheme) throws IOException { + public SolrSingleConnector(final String url, final SolrScheme scheme) throws IOException { this.solrurl = url; this.scheme = scheme; - transmissionRoundRobinCounter = 0; + this.transmissionRoundRobinCounter = 0; this.transmissionQueue = new ArrayBlockingQueue[transmissionQueueCount]; for (int i = 0; i < transmissionQueueCount; i++) { this.transmissionQueue[i] = new ArrayBlockingQueue(transmissionQueueSize); } try { this.server = new SolrHTTPClient(this.solrurl); - } catch (MalformedURLException e) { + } catch (final MalformedURLException e) { throw new IOException("bad connector url: " + this.solrurl); } this.transmissionWorker = new Worker[transmissionQueueCount]; @@ -86,7 +86,7 @@ public class SolrSingleConnector { private class Worker extends Thread { boolean shallRun; int idx; - public Worker(int i) { + public Worker(final int i) { this.idx = i; this.shallRun = true; } @@ -95,86 +95,86 @@ public class SolrSingleConnector { } public void run() { while (this.shallRun) { - if (transmissionQueue[idx].size() > 0) { + if (SolrSingleConnector.this.transmissionQueue[this.idx].size() > 0) { try { - flushTransmissionQueue(idx); - } catch (IOException e) { + flushTransmissionQueue(this.idx); + } catch (final IOException e) { Log.logSevere("SolrSingleConnector", "flush Transmission failed in worker", e); continue; } } else { - try {Thread.sleep(1000);} catch (InterruptedException e) {} + try {Thread.sleep(1000);} catch (final InterruptedException e) {} } } try { - flushTransmissionQueue(idx); - } catch (IOException e) {} + flushTransmissionQueue(this.idx); + } catch (final IOException e) {} } } - + public void close() { for (int i = 0; i < transmissionQueueCount; i++) { if (this.transmissionWorker[i].isAlive()) { this.transmissionWorker[i].pleaseStop(); - try {this.transmissionWorker[i].join();} catch (InterruptedException e) {} + try {this.transmissionWorker[i].join();} catch (final InterruptedException e) {} } } for (int i = 0; i < transmissionQueueCount; i++) { try { flushTransmissionQueue(i); - } catch (IOException e) {} + } catch (final IOException e) {} } } - + /** * delete everything in the solr index * @throws IOException */ public void clear() throws IOException { try { - server.deleteByQuery("*:*"); - server.commit(); - } catch (SolrServerException e) { + this.server.deleteByQuery("*:*"); + this.server.commit(); + } catch (final SolrServerException e) { throw new IOException(e); } } - - public void delete(String id) throws IOException { + + public void delete(final String id) throws IOException { try { - server.deleteById(id); - } catch (SolrServerException e) { + this.server.deleteById(id); + } catch (final SolrServerException e) { throw new IOException(e); } } - - public void delete(List ids) throws IOException { + + public void delete(final List ids) throws IOException { try { - server.deleteById(ids); - } catch (SolrServerException e) { + this.server.deleteById(ids); + } catch (final SolrServerException e) { throw new IOException(e); } } - - public void add(File file, String solrId) throws IOException { - ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract"); + + public void add(final File file, final String solrId) throws IOException { + final ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract"); up.addFile(file); up.setParam("literal.id", solrId); up.setParam("uprefix", "attr_"); up.setParam("fmap.content", "attr_content"); //up.setAction(AbstractUpdateRequest.ACTION.COMMIT, true, true); try { - server.request(up); - server.commit(); - } catch (SolrServerException e) { + this.server.request(up); + this.server.commit(); + } catch (final SolrServerException e) { throw new IOException(e); } } - - public void add(String id, ResponseHeader header, Document doc) throws IOException { + + public void add(final String id, final ResponseHeader header, final Document doc) throws IOException { add(this.scheme.yacy2solr(id, header, doc)); } - protected void add(SolrInputDocument solrdoc) throws IOException { + protected void add(final SolrInputDocument solrdoc) throws IOException { int thisrrc = this.transmissionRoundRobinCounter; int nextrrc = thisrrc++; if (nextrrc >= transmissionQueueCount) nextrrc = 0; @@ -183,81 +183,81 @@ public class SolrSingleConnector { this.transmissionQueue[thisrrc].offer(solrdoc); } else { if (this.transmissionQueue[thisrrc].size() > 0) flushTransmissionQueue(thisrrc); - Collection docs = new ArrayList(); + final Collection docs = new ArrayList(); docs.add(solrdoc); addSolr(docs); } } - - protected void addSolr(Collection docs) throws IOException { + + protected void addSolr(final Collection docs) throws IOException { try { - server.add(docs); - server.commit(); - /* To immediately commit after adding documents, you could use: + this.server.add(docs); + this.server.commit(); + /* To immediately commit after adding documents, you could use: UpdateRequest req = new UpdateRequest(); req.setAction( UpdateRequest.ACTION.COMMIT, false, false ); req.add( docs ); UpdateResponse rsp = req.process( server ); */ - } catch (SolrServerException e) { + } catch (final SolrServerException e) { throw new IOException(e); } } - - public void err(DigestURI digestURI, String failReason, int httpstatus) throws IOException { - - SolrInputDocument solrdoc = new SolrInputDocument(); + + public void err(final DigestURI digestURI, final String failReason, final int httpstatus) throws IOException { + + final SolrInputDocument solrdoc = new SolrInputDocument(); solrdoc.addField("id", ASCII.String(digestURI.hash())); solrdoc.addField("sku", digestURI.toNormalform(true, false), 3.0f); - InetAddress address = Domains.dnsResolve(digestURI.getHost()); + final InetAddress address = Domains.dnsResolve(digestURI.getHost()); if (address != null) solrdoc.addField("ip_s", address.getHostAddress()); if (digestURI.getHost() != null) solrdoc.addField("host_s", digestURI.getHost()); // path elements of link - String path = digestURI.getPath(); + final String path = digestURI.getPath(); if (path != null) { - String[] paths = path.split("/"); + final String[] paths = path.split("/"); if (paths.length > 0) solrdoc.addField("attr_paths", paths); } solrdoc.addField("failreason_t", failReason); solrdoc.addField("httpstatus_i", httpstatus); - + add(solrdoc); } - - private void flushTransmissionQueue(int idx) throws IOException { - Collection c = new ArrayList(); + + private void flushTransmissionQueue(final int idx) throws IOException { + final Collection c = new ArrayList(); while (this.transmissionQueue[idx].size() > 0) { try { c.add(this.transmissionQueue[idx].take()); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { continue; } } addSolr(c); } - - + + /** * get a query result from solr * to get all results set the query String to "*:*" * @param querystring * @throws IOException */ - public SolrDocumentList get(String querystring, int offset, int count) throws IOException { + public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException { // construct query - SolrQuery query = new SolrQuery(); + final SolrQuery query = new SolrQuery(); query.setQuery(querystring); query.setRows(count); query.setStart(offset); query.addSortField( "price", SolrQuery.ORDER.asc ); - + // query the server //SearchResult result = new SearchResult(count); try { - QueryResponse rsp = server.query( query ); - SolrDocumentList docs = rsp.getResults(); + final QueryResponse rsp = this.server.query( query ); + final SolrDocumentList docs = rsp.getResults(); return docs; // add the docs into the YaCy search result container /* @@ -265,22 +265,22 @@ public class SolrSingleConnector { result.put(element) } */ - } catch (SolrServerException e) { + } catch (final SolrServerException e) { throw new IOException(e); } - + //return result; } - - public static void main(String args[]) { + + public static void main(final String args[]) { SolrSingleConnector solr; try { - solr = new SolrSingleConnector("http://127.0.0.1:8983/solr", SolrScheme.SolrCellExtended); + solr = new SolrSingleConnector("http://127.0.0.1:8983/solr", new SolrScheme()); solr.clear(); - File exampleDir = new File("/Data/workspace2/yacy/test/parsertest/"); + final File exampleDir = new File("/Data/workspace2/yacy/test/parsertest/"); long t, t0, a = 0; int c = 0; - for (String s: exampleDir.list()) { + for (final String s: exampleDir.list()) { if (s.startsWith(".")) continue; t = System.currentTimeMillis(); solr.add(new File(exampleDir, s), s); @@ -290,9 +290,9 @@ public class SolrSingleConnector { System.out.println("pushed file " + s + " to solr, " + t0 + " milliseconds"); } System.out.println("pushed " + c + " files in " + a + " milliseconds, " + (a / c) + " milliseconds average; " + (60000 / a * c) + " PPM"); - } catch (IOException e) { + } catch (final IOException e) { e.printStackTrace(); } } - + } diff --git a/source/net/yacy/cora/storage/ConfigurationSet.java b/source/net/yacy/cora/storage/ConfigurationSet.java index 6ad6ce12e..3f2f1d176 100644 --- a/source/net/yacy/cora/storage/ConfigurationSet.java +++ b/source/net/yacy/cora/storage/ConfigurationSet.java @@ -40,6 +40,11 @@ import java.util.Set; * the list may contain lines with one keyword, comment lines, empty lines and out-commented keyword lines * when an attribute is changed here, the list is stored again with the original formatting * + * the syntax of configuration files: + * - all lines beginning with '##' are comments + * - all non-empty lines not beginning with '#' are keyword lines + * - all lines beginning with '#' and where the second character is not '#' are commented-out keyword lines + * * @author Michael Christen */ public class ConfigurationSet extends AbstractSet implements Set { @@ -47,6 +52,11 @@ public class ConfigurationSet extends AbstractSet implements Set private final File file; private String[] lines; + public ConfigurationSet() { + this.file = null; + this.lines = new String[0]; + } + public ConfigurationSet(final File file) { this.file = file; try { @@ -62,11 +72,18 @@ public class ConfigurationSet extends AbstractSet implements Set } } + @Override + public boolean isEmpty() { + // a shortcut to a fast 'true' in case that we initialized the class without a configuration file + return this.lines == null || this.lines.length == 0 || super.isEmpty(); + } + /** * save the configuration back to the file * @throws IOException */ private void commit() throws IOException { + if (this.file == null) return; final BufferedWriter writer = new BufferedWriter(new FileWriter(this.file)); for (final String s: this.lines) { writer.write(s);