- added an index profile editor in the /indexFederated_p.html servlet for solr indexes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7811 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 214ea005cf
commit b6f09a475d

@ -141,7 +141,7 @@ attr_images
## number of images, int
imagescount_i
## flag that shows if a swf file is linked, boolen
## flag that shows if a swf file is linked, boolean
flash_b
## list of all links to frames, textgen

@ -1029,4 +1029,4 @@ federated.service.yacy.indexing.enabled = true
federated.service.solr.indexing.enabled = false
federated.service.solr.indexing.url = http://127.0.0.1:8983/solr
federated.service.solr.indexing.charding = MODULO_HOST_MD5
federated.service.solr.indexing.scheme = SolrCellExtended
federated.service.solr.indexing.schemefile = solr.keys.default.list

@ -21,6 +21,7 @@
</legend>
You can just switch on or off this index. If you switch it off, you will not be able to search with YaCy any more.
</fieldset>
<input type="submit" name="set" value="Set" />
<fieldset>
<legend>
@ -55,9 +56,26 @@
<dt class="TableCellDark">Charding Method</dt>
<dd><input type="text" size="50" maxlength="50" value="#[solr.indexing.charding]#" name="solr.indexing.charding" id="solr.indexing.charding" disabled="disabled"/></dd>
<dt class="TableCellDark">Scheme</dt>
<dd><input type="text" size="50" maxlength="50" value="#[solr.indexing.scheme]#" name="solr.indexing.scheme" id="solr.indexing.scheme" disabled="disabled"/></dd>
<dd><input type="text" size="50" maxlength="50" value="#[solr.indexing.schemefile]#" name="solr.indexing.schemefile" id="solr.indexing.schemefile" disabled="disabled"/></dd>
</dl>
</div>
<div>
<h3>Index Scheme</h3>
<table class="sortable" border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">
<td>Active</td>
<td>Attribute</td>
<td>Comment</td>
</tr>
#{scheme}#
<tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
<td align="center"><input type="checkbox" name="scheme_#[key]#" value="checked" #(checked)#::checked="checked"#(/checked)#/></td>
<td align="left">#[key]#</td>
<td align="left">#[comment]#</td>
</tr>
#{/scheme}#
</table>
</div>
</fieldset>
<input type="submit" name="set" value="Set" />
</form>

@ -11,25 +11,27 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.solr.SolrChardingConnector;
import net.yacy.cora.services.federated.solr.SolrChardingSelection;
import net.yacy.cora.services.federated.solr.SolrScheme;
import net.yacy.cora.storage.ConfigurationSet;
import net.yacy.kelondro.logging.Log;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -39,47 +41,65 @@ public class IndexFederated_p {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final serverObjects prop = new serverObjects();
Switchboard sb = (Switchboard) env;
final Switchboard sb = (Switchboard) env;
if (post != null && post.containsKey("set")) {
// yacy
env.setConfig("federated.service.yacy.indexing.enabled", post.getBoolean("yacy.indexing.enabled", false));
// solr
boolean solrWasOn = env.getConfigBool("federated.service.solr.indexing.enabled", true);
boolean solrIsOnAfterwards = post.getBoolean("solr.indexing.enabled", false);
final boolean solrWasOn = env.getConfigBool("federated.service.solr.indexing.enabled", true);
final boolean solrIsOnAfterwards = post.getBoolean("solr.indexing.enabled", false);
env.setConfig("federated.service.solr.indexing.enabled", solrIsOnAfterwards);
env.setConfig("federated.service.solr.indexing.url", post.get("solr.indexing.url", env.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr")));
env.setConfig("federated.service.solr.indexing.charding", post.get("solr.indexing.charding", env.getConfig("federated.service.solr.indexing.charding", "modulo-host-md5")));
env.setConfig("federated.service.solr.indexing.scheme", post.get("solr.indexing.scheme", env.getConfig("federated.service.solr.indexing.scheme", "SolrCellExtended")));
env.setConfig("federated.service.solr.indexing.schemefile", post.get("solr.indexing.schemefile", env.getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list")));
if (solrWasOn && !solrIsOnAfterwards) {
// switch off
sb.solrConnector.close();
sb.solrConnector = null;
}
if (!solrWasOn && solrIsOnAfterwards) {
// switch on
String solrurls = sb.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
boolean usesolr = sb.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0;
final String solrurls = sb.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
final boolean usesolr = sb.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0;
final SolrScheme scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/solr.keys.default.list"));
try {
sb.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, SolrScheme.SolrCellExtended, SolrChardingSelection.Method.MODULO_HOST_MD5) : null;
} catch (IOException e) {
sb.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null;
} catch (final IOException e) {
Log.logException(e);
sb.solrConnector = null;
}
}
// read index scheme table flags
final SolrScheme scheme = sb.solrConnector.getScheme();
final Iterator<ConfigurationSet.Entry> i = scheme.allIterator();
ConfigurationSet.Entry entry;
while (i.hasNext()) {
entry = i.next();
final String v = post.get("scheme_" + entry.key());
final boolean c = v != null && v.equals("checked");
try {
if (entry.enabled()) {
if (!c) scheme.disable(entry.key());
} else {
if (c) scheme.enable(entry.key());
}
} catch (final IOException e) {}
}
}
// show solr host table
if (sb.solrConnector == null) {
prop.put("table", 0);
} else {
prop.put("table", 1);
try {
long[] size = sb.solrConnector.getSizeList();
String[] urls = sb.solrConnector.getAdminInterfaceList();
final long[] size = sb.solrConnector.getSizeList();
final String[] urls = sb.solrConnector.getAdminInterfaceList();
boolean dark = false;
for (int i = 0; i < size.length; i++) {
prop.put("table_list_" + i + "_dark", dark ? 1 : 0); dark = !dark;
@ -87,18 +107,34 @@ public class IndexFederated_p {
prop.put("table_list_" + i + "_size", size[i]);
}
prop.put("table_list", size.length);
} catch (IOException e) {
// write scheme
final SolrScheme scheme = sb.solrConnector.getScheme();
final Iterator<ConfigurationSet.Entry> i = scheme.allIterator();
int c = 0;
dark = false;
ConfigurationSet.Entry entry;
while (i.hasNext()) {
entry = i.next();
prop.put("scheme_" + c + "_dark", dark ? 1 : 0); dark = !dark;
prop.put("scheme_" + c + "_checked", scheme.contains(entry.key()) ? 1 : 0);
prop.putHTML("scheme_" + c + "_key", entry.key());
prop.putHTML("scheme_" + c + "_comment", scheme.commentHeadline(entry.key()));
c++;
}
prop.put("scheme", c);
} catch (final IOException e) {
Log.logException(e);
prop.put("table", 0);
}
}
// fill attribute fields
prop.put("yacy.indexing.enabled.checked", env.getConfigBool("federated.service.yacy.indexing.enabled", true) ? 1 : 0);
prop.put("solr.indexing.enabled.checked", env.getConfigBool("federated.service.solr.indexing.enabled", false) ? 1 : 0);
prop.put("solr.indexing.url", env.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr"));
prop.put("solr.indexing.charding", env.getConfig("federated.service.solr.indexing.charding", "modulo-host-md5"));
prop.put("solr.indexing.scheme", env.getConfig("federated.service.solr.indexing.scheme", "SolrCellExtended"));
prop.put("solr.indexing.schemefile", env.getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list"));
// return rewrite properties
return prop;

@ -560,11 +560,16 @@ public final class Switchboard extends serverSwitch {
this.log.logConfig("Parser: Initializing Mime Type deny list");
TextParser.setDenyMime(getConfig(SwitchboardConstants.PARSER_MIME_DENY, ""));
// prepare a solr index profile switch list
final File solrWorkProfile = new File(getDataPath(), "DATA/SETTINGS/solr.keys.default.list");
if (!solrWorkProfile.exists()) FileUtils.copy(new File("defaults/solr.keys.list"), solrWorkProfile);
final SolrScheme scheme = new SolrScheme(solrWorkProfile);
// set up the solr interface
final String solrurls = getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
final boolean usesolr = getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0;
try {
this.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, SolrScheme.SolrCellExtended, SolrChardingSelection.Method.MODULO_HOST_MD5) : null;
this.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null;
} catch (final IOException e) {
Log.logException(e);
this.solrConnector = null;

@ -55,6 +55,10 @@ public class SolrChardingConnector {
this.scheme = scheme;
}
/**
 * Gives access to the index scheme held by this connector.
 *
 * @return the current value of the {@code scheme} field
 */
public SolrScheme getScheme() {
    return this.scheme;
}
/**
 * Closes every {@link SolrSingleConnector} held in {@code connectors},
 * one after the other in iteration order.
 */
public void close() {
    for (final SolrSingleConnector single : this.connectors) {
        single.close();
    }
}

@ -11,12 +11,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -25,211 +25,271 @@
package net.yacy.cora.services.federated.solr;
import java.io.File;
import java.net.InetAddress;
import java.util.Collection;
import java.util.Date;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.storage.ConfigurationSet;
import net.yacy.document.Document;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.cora.document.MultiProtocolURI;
import org.apache.solr.common.SolrInputDocument;
public enum SolrScheme {
public class SolrScheme extends ConfigurationSet {
/**
 * Creates a scheme on top of an empty ConfigurationSet.
 * An empty set is meant to select all index attributes: the
 * {@code addSolr} helpers of this class write a field whenever
 * the set is empty or contains the field's key.
 */
public SolrScheme() {
    super();
}
/**
 * Creates a scheme from a configuration file.
 * The configuration file simply contains a list of lines with keywords.
 *
 * @param configurationFile the file that is handed to the ConfigurationSet constructor
 */
public SolrScheme(final File configurationFile) {
    super(configurationFile);
}
SolrCell,
SolrCellExtended,
DublinCore;
/**
 * Sets a String field on the Solr document, but only if this scheme
 * is empty or contains the given key; otherwise nothing is written.
 *
 * @param solrdoc the document to write into
 * @param key the field name, also used for the scheme lookup
 * @param value the field value
 */
private void addSolr(final SolrInputDocument solrdoc, final String key, final String value) {
    final boolean keyPermitted = isEmpty() || contains(key);
    if (!keyPermitted) {
        return;
    }
    solrdoc.setField(key, value);
}
public SolrInputDocument yacy2solr(String id, ResponseHeader header, Document document) {
if (this == SolrCellExtended) return yacy2solrSolrCellExtended(id, header, document);
return null;
/**
 * Sets a Date field on the Solr document, but only if this scheme
 * is empty or contains the given key; otherwise nothing is written.
 *
 * @param solrdoc the document to write into
 * @param key the field name, also used for the scheme lookup
 * @param value the field value
 */
private void addSolr(final SolrInputDocument solrdoc, final String key, final Date value) {
    final boolean keyPermitted = isEmpty() || contains(key);
    if (!keyPermitted) {
        return;
    }
    solrdoc.setField(key, value);
}
public static SolrInputDocument yacy2solrSolrCellExtended(String id, ResponseHeader header, Document yacydoc) {
/**
 * Sets an int field on the Solr document, but only if this scheme
 * is empty or contains the given key; otherwise nothing is written.
 *
 * @param solrdoc the document to write into
 * @param key the field name, also used for the scheme lookup
 * @param value the field value
 */
private void addSolr(final SolrInputDocument solrdoc, final String key, final int value) {
    final boolean keyPermitted = isEmpty() || contains(key);
    if (!keyPermitted) {
        return;
    }
    solrdoc.setField(key, value);
}
/**
 * Sets a String-array field on the Solr document, but only if this scheme
 * is empty or contains the given key; otherwise nothing is written.
 *
 * @param solrdoc the document to write into
 * @param key the field name, also used for the scheme lookup
 * @param value the field values
 */
private void addSolr(final SolrInputDocument solrdoc, final String key, final String[] value) {
    final boolean keyPermitted = isEmpty() || contains(key);
    if (!keyPermitted) {
        return;
    }
    solrdoc.setField(key, value);
}
/**
 * Sets a float field on the Solr document, but only if this scheme
 * is empty or contains the given key; otherwise nothing is written.
 *
 * @param solrdoc the document to write into
 * @param key the field name, also used for the scheme lookup
 * @param value the field value
 */
private void addSolr(final SolrInputDocument solrdoc, final String key, final float value) {
    final boolean keyPermitted = isEmpty() || contains(key);
    if (!keyPermitted) {
        return;
    }
    solrdoc.setField(key, value);
}
/**
 * Sets a boolean field on the Solr document, but only if this scheme
 * is empty or contains the given key; otherwise nothing is written.
 *
 * @param solrdoc the document to write into
 * @param key the field name, also used for the scheme lookup
 * @param value the field value
 */
private void addSolr(final SolrInputDocument solrdoc, final String key, final boolean value) {
    final boolean keyPermitted = isEmpty() || contains(key);
    if (!keyPermitted) {
        return;
    }
    solrdoc.setField(key, value);
}
/**
 * Sets a boosted String field on the Solr document, but only if this scheme
 * is empty or contains the given key; otherwise nothing is written.
 *
 * @param solrdoc the document to write into
 * @param key the field name, also used for the scheme lookup
 * @param value the field value
 * @param boost the boost passed through to {@code setField}
 */
private void addSolr(final SolrInputDocument solrdoc, final String key, final String value, final float boost) {
    final boolean keyPermitted = isEmpty() || contains(key);
    if (!keyPermitted) {
        return;
    }
    solrdoc.setField(key, value, boost);
}
public SolrInputDocument yacy2solr(final String id, final ResponseHeader header, final Document yacydoc) {
// we use the SolrCell design as index scheme
SolrInputDocument solrdoc = new SolrInputDocument();
DigestURI digestURI = new DigestURI(yacydoc.dc_source());
solrdoc.addField("failreason_t", ""); // overwrite a possible fail reason (in case that there was a fail reason before)
solrdoc.addField("id", id);
solrdoc.addField("sku", digestURI.toNormalform(true, false), 3.0f);
InetAddress address = Domains.dnsResolve(digestURI.getHost());
if (address != null) solrdoc.addField("ip_s", address.getHostAddress());
if (digestURI.getHost() != null) solrdoc.addField("host_s", digestURI.getHost());
solrdoc.addField("title", yacydoc.dc_title());
solrdoc.addField("author", yacydoc.dc_creator());
solrdoc.addField("description", yacydoc.dc_description());
solrdoc.addField("content_type", yacydoc.dc_format());
solrdoc.addField("last_modified", header.lastModified());
solrdoc.addField("keywords", yacydoc.dc_subject(' '));
String content = UTF8.String(yacydoc.getTextBytes());
solrdoc.addField("text_t", content);
int contentwc = content.split(" ").length;
solrdoc.addField("wordcount_i", contentwc);
final SolrInputDocument solrdoc = new SolrInputDocument();
final DigestURI digestURI = new DigestURI(yacydoc.dc_source());
addSolr(solrdoc, "failreason_t", ""); // overwrite a possible fail reason (in case that there was a fail reason before)
addSolr(solrdoc, "id", id);
addSolr(solrdoc, "sku", digestURI.toNormalform(true, false), 3.0f);
final InetAddress address = Domains.dnsResolve(digestURI.getHost());
if (address != null) addSolr(solrdoc, "ip_s", address.getHostAddress());
if (digestURI.getHost() != null) addSolr(solrdoc, "host_s", digestURI.getHost());
addSolr(solrdoc, "title", yacydoc.dc_title());
addSolr(solrdoc, "author", yacydoc.dc_creator());
addSolr(solrdoc, "description", yacydoc.dc_description());
addSolr(solrdoc, "content_type", yacydoc.dc_format());
addSolr(solrdoc, "last_modified", header.lastModified());
addSolr(solrdoc, "keywords", yacydoc.dc_subject(' '));
final String content = UTF8.String(yacydoc.getTextBytes());
addSolr(solrdoc, "text_t", content);
if (contains("wordcount_i")) {
final int contentwc = content.split(" ").length;
addSolr(solrdoc, "wordcount_i", contentwc);
}
// path elements of link
String path = digestURI.getPath();
if (path != null) {
String[] paths = path.split("/");
if (paths.length > 0) solrdoc.addField("attr_paths", paths);
final String path = digestURI.getPath();
if (path != null && contains("attr_paths")) {
final String[] paths = path.split("/");
if (paths.length > 0) addSolr(solrdoc, "attr_paths", paths);
}
// list all links
Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors();
final Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors();
int c = 0;
String[] inboundlinks = new String[yacydoc.inboundLinkCount()];
solrdoc.addField("inboundlinkscount_i", inboundlinks.length);
for (MultiProtocolURI url: yacydoc.inboundLinks()) {
Properties p = alllinks.get(url);
String name = p.getProperty("name", "");
String rel = p.getProperty("rel", "");
inboundlinks[c++] =
"<a href=\"" + url.toNormalform(false, false) + "\"" +
((rel.toLowerCase().equals("nofollow")) ? " rel=\"nofollow\"" : "") +
">" +
((name.length() > 0) ? name : "") + "</a>";
addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount());
if (contains("attr_inboundlinks")) {
final String[] inboundlinks = new String[yacydoc.inboundLinkCount()];
for (final MultiProtocolURI url: yacydoc.inboundLinks()) {
final Properties p = alllinks.get(url);
final String name = p.getProperty("name", "");
final String rel = p.getProperty("rel", "");
inboundlinks[c++] =
"<a href=\"" + url.toNormalform(false, false) + "\"" +
((rel.toLowerCase().equals("nofollow")) ? " rel=\"nofollow\"" : "") +
">" +
((name.length() > 0) ? name : "") + "</a>";
}
addSolr(solrdoc, "attr_inboundlinks", inboundlinks);
}
solrdoc.addField("attr_inboundlinks", inboundlinks);
c = 0;
String[] outboundlinks = new String[yacydoc.outboundLinkCount()];
solrdoc.addField("outboundlinkscount_i", outboundlinks.length);
for (MultiProtocolURI url: yacydoc.outboundLinks()) {
Properties p = alllinks.get(url);
String name = p.getProperty("name", "");
String rel = p.getProperty("rel", "");
outboundlinks[c++] =
"<a href=\"" + url.toNormalform(false, false) + "\"" +
((rel.toLowerCase().equals("nofollow")) ? " rel=\"nofollow\"" : "") +
">" +
((name.length() > 0) ? name : "") + "</a>";
final String[] outboundlinks = new String[yacydoc.outboundLinkCount()];
if (contains("attr_outboundlinks")) {
addSolr(solrdoc, "outboundlinkscount_i", outboundlinks.length);
for (final MultiProtocolURI url: yacydoc.outboundLinks()) {
final Properties p = alllinks.get(url);
final String name = p.getProperty("name", "");
final String rel = p.getProperty("rel", "");
outboundlinks[c++] =
"<a href=\"" + url.toNormalform(false, false) + "\"" +
((rel.toLowerCase().equals("nofollow")) ? " rel=\"nofollow\"" : "") +
">" +
((name.length() > 0) ? name : "") + "</a>";
}
addSolr(solrdoc, "attr_outboundlinks", outboundlinks);
}
solrdoc.addField("attr_outboundlinks", outboundlinks);
// charset
solrdoc.addField("charset_s", yacydoc.getCharset());
addSolr(solrdoc, "charset_s", yacydoc.getCharset());
// coordinates
if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
solrdoc.addField("lon_coordinate", yacydoc.lon());
solrdoc.addField("lat_coordinate", yacydoc.lat());
addSolr(solrdoc, "lon_coordinate", yacydoc.lon());
addSolr(solrdoc, "lat_coordinate", yacydoc.lat());
}
solrdoc.addField("httpstatus_i", 200);
Object parser = yacydoc.getParserObject();
addSolr(solrdoc, "httpstatus_i", 200);
final Object parser = yacydoc.getParserObject();
if (parser instanceof ContentScraper) {
ContentScraper html = (ContentScraper) parser;
final ContentScraper html = (ContentScraper) parser;
// header tags
int h = 0;
int f = 1;
for (int i = 1; i <= 6; i++) {
String[] hs = html.getHeadlines(i);
final String[] hs = html.getHeadlines(i);
h = h | (hs.length > 0 ? f : 0);
f = f * 2;
solrdoc.addField("attr_h" + i, hs);
addSolr(solrdoc, "attr_h" + i, hs);
}
solrdoc.addField("htags_i", h);
addSolr(solrdoc, "htags_i", h);
// meta tags
Map<String, String> metas = html.getMetas();
String robots = metas.get("robots");
if (robots != null) solrdoc.addField("metarobots_t", robots);
String generator = metas.get("generator");
if (generator != null) solrdoc.addField("metagenerator_t", generator);
final Map<String, String> metas = html.getMetas();
final String robots = metas.get("robots");
if (robots != null) addSolr(solrdoc, "metarobots_t", robots);
final String generator = metas.get("generator");
if (generator != null) addSolr(solrdoc, "metagenerator_t", generator);
// bold, italic
String[] bold = html.getBold();
solrdoc.addField("boldcount_i", bold.length);
final String[] bold = html.getBold();
addSolr(solrdoc, "boldcount_i", bold.length);
if (bold.length > 0) {
solrdoc.addField("attr_bold", bold);
solrdoc.addField("attr_boldcount", html.getBoldCount(bold));
addSolr(solrdoc, "attr_bold", bold);
if (contains("attr_boldcount")) {
addSolr(solrdoc, "attr_boldcount", html.getBoldCount(bold));
}
}
String[] italic = html.getItalic();
solrdoc.addField("italiccount_i", italic.length);
final String[] italic = html.getItalic();
addSolr(solrdoc, "italiccount_i", italic.length);
if (italic.length > 0) {
solrdoc.addField("attr_italic", italic);
solrdoc.addField("attr_italiccount", html.getItalicCount(italic));
addSolr(solrdoc, "attr_italic", italic);
if (contains("attr_italiccount")) {
addSolr(solrdoc, "attr_italiccount", html.getItalicCount(italic));
}
}
String[] li = html.getLi();
solrdoc.addField("licount_i", li.length);
if (li.length > 0) solrdoc.addField("attr_li", li);
final String[] li = html.getLi();
addSolr(solrdoc, "licount_i", li.length);
if (li.length > 0) addSolr(solrdoc, "attr_li", li);
// images
Collection<ImageEntry> imagesc = html.getImages().values();
String[] images = new String[imagesc.size()];
c = 0;
for (ImageEntry ie: imagesc) images[c++] = ie.toString();
solrdoc.addField("imagescount_i", images.length);
if (images.length > 0) solrdoc.addField("attr_images", images);
if (contains("attr_images")) {
final Collection<ImageEntry> imagesc = html.getImages().values();
final String[] images = new String[imagesc.size()];
c = 0;
for (final ImageEntry ie: imagesc) images[c++] = ie.toString();
addSolr(solrdoc, "imagescount_i", images.length);
if (images.length > 0) addSolr(solrdoc, "attr_images", images);
}
// style sheets
Map<MultiProtocolURI, String> csss = html.getCSS();
String[] css = new String[csss.size()];
c = 0;
for (Map.Entry<MultiProtocolURI, String> entry: csss.entrySet()) {
css[c++] =
"<link rel=\"stylesheet\" type=\"text/css\" media=\"" + entry.getValue() + "\"" +
" href=\""+ entry.getKey().toNormalform(false, false, false, false) + "\" />";
if (contains("attr_css")) {
final Map<MultiProtocolURI, String> csss = html.getCSS();
final String[] css = new String[csss.size()];
c = 0;
for (final Map.Entry<MultiProtocolURI, String> entry: csss.entrySet()) {
css[c++] =
"<link rel=\"stylesheet\" type=\"text/css\" media=\"" + entry.getValue() + "\"" +
" href=\""+ entry.getKey().toNormalform(false, false, false, false) + "\" />";
}
addSolr(solrdoc, "csscount_i", css.length);
if (css.length > 0) addSolr(solrdoc, "attr_css", css);
}
solrdoc.addField("csscount_i", css.length);
if (css.length > 0) solrdoc.addField("attr_css", css);
// Scripts
Set<MultiProtocolURI> scriptss = html.getScript();
String[] scripts = new String[scriptss.size()];
c = 0;
for (MultiProtocolURI url: scriptss) {
scripts[c++] = url.toNormalform(false, false, false, false);
if (contains("attr_scripts")) {
final Set<MultiProtocolURI> scriptss = html.getScript();
final String[] scripts = new String[scriptss.size()];
c = 0;
for (final MultiProtocolURI url: scriptss) {
scripts[c++] = url.toNormalform(false, false, false, false);
}
addSolr(solrdoc, "scriptscount_i", scripts.length);
if (scripts.length > 0) addSolr(solrdoc, "attr_scripts", scripts);
}
solrdoc.addField("scriptscount_i", scripts.length);
if (scripts.length > 0) solrdoc.addField("attr_scripts", scripts);
// Frames
Set<MultiProtocolURI> framess = html.getFrames();
String[] frames = new String[framess.size()];
c = 0;
for (MultiProtocolURI entry: framess) {
frames[c++] = entry.toNormalform(false, false, false, false);
if (contains("attr_frames")) {
final Set<MultiProtocolURI> framess = html.getFrames();
final String[] frames = new String[framess.size()];
c = 0;
for (final MultiProtocolURI entry: framess) {
frames[c++] = entry.toNormalform(false, false, false, false);
}
addSolr(solrdoc, "framesscount_i", frames.length);
if (frames.length > 0) addSolr(solrdoc, "attr_frames", frames);
}
solrdoc.addField("framesscount_i", frames.length);
if (frames.length > 0) solrdoc.addField("attr_frames", frames);
// IFrames
Set<MultiProtocolURI> iframess = html.getIFrames();
String[] iframes = new String[iframess.size()];
c = 0;
for (MultiProtocolURI entry: iframess) {
iframes[c++] = entry.toNormalform(false, false, false, false);
if (contains("attr_iframes")) {
final Set<MultiProtocolURI> iframess = html.getIFrames();
final String[] iframes = new String[iframess.size()];
c = 0;
for (final MultiProtocolURI entry: iframess) {
iframes[c++] = entry.toNormalform(false, false, false, false);
}
addSolr(solrdoc, "iframesscount_i", iframes.length);
if (iframes.length > 0) addSolr(solrdoc, "attr_iframes", iframes);
}
solrdoc.addField("iframesscount_i", iframes.length);
if (iframes.length > 0) solrdoc.addField("attr_iframes", iframes);
// flash embedded
solrdoc.addField("flash_b", html.containsFlash());
addSolr(solrdoc, "flash_b", html.containsFlash());
// generic evaluation pattern
for (String model: html.getEvaluationModelNames()) {
String[] scorenames = html.getEvaluationModelScoreNames(model);
if (scorenames.length > 0) {
solrdoc.addField("attr_" + model, scorenames);
solrdoc.addField("attr_" + model + "count", html.getEvaluationModelScoreCounts(model, scorenames));
for (final String model: html.getEvaluationModelNames()) {
if (contains("attr_" + model)) {
final String[] scorenames = html.getEvaluationModelScoreNames(model);
if (scorenames.length > 0) {
addSolr(solrdoc, "attr_" + model, scorenames);
addSolr(solrdoc, "attr_" + model + "count", html.getEvaluationModelScoreCounts(model, scorenames));
}
}
}
// response time
solrdoc.addField("responsetime_i", header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"));
addSolr(solrdoc, "responsetime_i", header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"));
}
return solrdoc;
}
/*
* standard solr scheme

@ -11,12 +11,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -34,6 +34,13 @@ import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
@ -42,38 +49,31 @@ import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
public class SolrSingleConnector {
private String solrurl;
private final String solrurl;
private SolrServer server;
private SolrScheme scheme;
private final SolrScheme scheme;
private final static int transmissionQueueCount = 4; // allow concurrent http sessions to solr
private final static int transmissionQueueSize = 50; // number of documents that are collected until a commit is sent
private Worker[] transmissionWorker; // the transmission workers to solr
private BlockingQueue<SolrInputDocument>[] transmissionQueue; // the queues where documents are collected
private final Worker[] transmissionWorker; // the transmission workers to solr
private final BlockingQueue<SolrInputDocument>[] transmissionQueue; // the queues where documents are collected
private int transmissionRoundRobinCounter; // a round robin counter for the transmission queues
@SuppressWarnings("unchecked")
public SolrSingleConnector(String url, SolrScheme scheme) throws IOException {
public SolrSingleConnector(final String url, final SolrScheme scheme) throws IOException {
this.solrurl = url;
this.scheme = scheme;
transmissionRoundRobinCounter = 0;
this.transmissionRoundRobinCounter = 0;
this.transmissionQueue = new ArrayBlockingQueue[transmissionQueueCount];
for (int i = 0; i < transmissionQueueCount; i++) {
this.transmissionQueue[i] = new ArrayBlockingQueue<SolrInputDocument>(transmissionQueueSize);
}
try {
this.server = new SolrHTTPClient(this.solrurl);
} catch (MalformedURLException e) {
} catch (final MalformedURLException e) {
throw new IOException("bad connector url: " + this.solrurl);
}
this.transmissionWorker = new Worker[transmissionQueueCount];
@ -86,7 +86,7 @@ public class SolrSingleConnector {
private class Worker extends Thread {
boolean shallRun;
int idx;
public Worker(int i) {
public Worker(final int i) {
this.idx = i;
this.shallRun = true;
}
@ -95,86 +95,86 @@ public class SolrSingleConnector {
}
public void run() {
while (this.shallRun) {
if (transmissionQueue[idx].size() > 0) {
if (SolrSingleConnector.this.transmissionQueue[this.idx].size() > 0) {
try {
flushTransmissionQueue(idx);
} catch (IOException e) {
flushTransmissionQueue(this.idx);
} catch (final IOException e) {
Log.logSevere("SolrSingleConnector", "flush Transmission failed in worker", e);
continue;
}
} else {
try {Thread.sleep(1000);} catch (InterruptedException e) {}
try {Thread.sleep(1000);} catch (final InterruptedException e) {}
}
}
try {
flushTransmissionQueue(idx);
} catch (IOException e) {}
flushTransmissionQueue(this.idx);
} catch (final IOException e) {}
}
}
public void close() {
for (int i = 0; i < transmissionQueueCount; i++) {
if (this.transmissionWorker[i].isAlive()) {
this.transmissionWorker[i].pleaseStop();
try {this.transmissionWorker[i].join();} catch (InterruptedException e) {}
try {this.transmissionWorker[i].join();} catch (final InterruptedException e) {}
}
}
for (int i = 0; i < transmissionQueueCount; i++) {
try {
flushTransmissionQueue(i);
} catch (IOException e) {}
} catch (final IOException e) {}
}
}
/**
* delete everything in the solr index
* @throws IOException
*/
public void clear() throws IOException {
try {
server.deleteByQuery("*:*");
server.commit();
} catch (SolrServerException e) {
this.server.deleteByQuery("*:*");
this.server.commit();
} catch (final SolrServerException e) {
throw new IOException(e);
}
}
public void delete(String id) throws IOException {
public void delete(final String id) throws IOException {
try {
server.deleteById(id);
} catch (SolrServerException e) {
this.server.deleteById(id);
} catch (final SolrServerException e) {
throw new IOException(e);
}
}
public void delete(List<String> ids) throws IOException {
public void delete(final List<String> ids) throws IOException {
try {
server.deleteById(ids);
} catch (SolrServerException e) {
this.server.deleteById(ids);
} catch (final SolrServerException e) {
throw new IOException(e);
}
}
public void add(File file, String solrId) throws IOException {
ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract");
public void add(final File file, final String solrId) throws IOException {
final ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract");
up.addFile(file);
up.setParam("literal.id", solrId);
up.setParam("uprefix", "attr_");
up.setParam("fmap.content", "attr_content");
//up.setAction(AbstractUpdateRequest.ACTION.COMMIT, true, true);
try {
server.request(up);
server.commit();
} catch (SolrServerException e) {
this.server.request(up);
this.server.commit();
} catch (final SolrServerException e) {
throw new IOException(e);
}
}
public void add(String id, ResponseHeader header, Document doc) throws IOException {
public void add(final String id, final ResponseHeader header, final Document doc) throws IOException {
add(this.scheme.yacy2solr(id, header, doc));
}
/**
 * Hand a solr input document to the transmission machinery: depending on a condition that
 * is not visible in this excerpt, the document is either offered to the transmission queue
 * selected by the round-robin counter, or the selected queue is flushed and the document
 * is sent to the server directly.
 * @param solrdoc the document to be added to the index
 * @throws IOException if flushing the queue or the direct transmission fails
 */
protected void add(SolrInputDocument solrdoc) throws IOException {
protected void add(final SolrInputDocument solrdoc) throws IOException {
int thisrrc = this.transmissionRoundRobinCounter;
// NOTE(review): post-increment assigns the OLD value of thisrrc to nextrrc and then
// increments thisrrc. The wrap-around check below therefore tests the old value, while
// thisrrc (used as array index further down) may have become equal to
// transmissionQueueCount. "thisrrc + 1" looks like what was intended - confirm against
// the lines elided by the hunk marker below.
int nextrrc = thisrrc++;
if (nextrrc >= transmissionQueueCount) nextrrc = 0;
@ -183,81 +183,81 @@ public class SolrSingleConnector {
this.transmissionQueue[thisrrc].offer(solrdoc);
} else {
// send what is already queued first, then send this document on its own
if (this.transmissionQueue[thisrrc].size() > 0) flushTransmissionQueue(thisrrc);
Collection<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();
final Collection<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();
docs.add(solrdoc);
addSolr(docs);
}
}
protected void addSolr(Collection<SolrInputDocument> docs) throws IOException {
protected void addSolr(final Collection<SolrInputDocument> docs) throws IOException {
try {
server.add(docs);
server.commit();
/* To immediately commit after adding documents, you could use:
this.server.add(docs);
this.server.commit();
/* To immediately commit after adding documents, you could use:
UpdateRequest req = new UpdateRequest();
req.setAction( UpdateRequest.ACTION.COMMIT, false, false );
req.add( docs );
UpdateResponse rsp = req.process( server );
*/
} catch (SolrServerException e) {
} catch (final SolrServerException e) {
throw new IOException(e);
}
}
public void err(DigestURI digestURI, String failReason, int httpstatus) throws IOException {
SolrInputDocument solrdoc = new SolrInputDocument();
public void err(final DigestURI digestURI, final String failReason, final int httpstatus) throws IOException {
final SolrInputDocument solrdoc = new SolrInputDocument();
solrdoc.addField("id", ASCII.String(digestURI.hash()));
solrdoc.addField("sku", digestURI.toNormalform(true, false), 3.0f);
InetAddress address = Domains.dnsResolve(digestURI.getHost());
final InetAddress address = Domains.dnsResolve(digestURI.getHost());
if (address != null) solrdoc.addField("ip_s", address.getHostAddress());
if (digestURI.getHost() != null) solrdoc.addField("host_s", digestURI.getHost());
// path elements of link
String path = digestURI.getPath();
final String path = digestURI.getPath();
if (path != null) {
String[] paths = path.split("/");
final String[] paths = path.split("/");
if (paths.length > 0) solrdoc.addField("attr_paths", paths);
}
solrdoc.addField("failreason_t", failReason);
solrdoc.addField("httpstatus_i", httpstatus);
add(solrdoc);
}
private void flushTransmissionQueue(int idx) throws IOException {
Collection<SolrInputDocument> c = new ArrayList<SolrInputDocument>();
private void flushTransmissionQueue(final int idx) throws IOException {
final Collection<SolrInputDocument> c = new ArrayList<SolrInputDocument>();
while (this.transmissionQueue[idx].size() > 0) {
try {
c.add(this.transmissionQueue[idx].take());
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
continue;
}
}
addSolr(c);
}
/**
* get a query result from solr
* to get all results set the query String to "*:*"
* @param querystring the query that is passed to solr
* @param offset the index of the first result to return (passed to setStart)
* @param count the maximum number of results to return (passed to setRows)
* @return the list of documents from the query response
* @throws IOException if the solr server reports a failure
*/
public SolrDocumentList get(String querystring, int offset, int count) throws IOException {
public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException {
// construct query
SolrQuery query = new SolrQuery();
final SolrQuery query = new SolrQuery();
query.setQuery(querystring);
query.setRows(count);
query.setStart(offset);
// NOTE(review): results are always sorted ascending by a field named "price"; no such
// field is written anywhere in this excerpt - confirm that the index scheme defines it
query.addSortField( "price", SolrQuery.ORDER.asc );
// query the server
//SearchResult result = new SearchResult(count);
try {
QueryResponse rsp = server.query( query );
SolrDocumentList docs = rsp.getResults();
final QueryResponse rsp = this.server.query( query );
final SolrDocumentList docs = rsp.getResults();
return docs;
// add the docs into the YaCy search result container
/*
@ -265,22 +265,22 @@ public class SolrSingleConnector {
result.put(element)
}
*/
} catch (SolrServerException e) {
} catch (final SolrServerException e) {
// callers only deal with IOException; the solr exception is kept as cause
throw new IOException(e);
}
//return result;
}
/**
 * Manual test driver: connects to a solr server at http://127.0.0.1:8983/solr, clears the
 * index, pushes every file of a hard-coded example directory whose name does not start
 * with "." and prints timing information.
 * @param args not used in the visible part of this method
 */
public static void main(String args[]) {
public static void main(final String args[]) {
SolrSingleConnector solr;
try {
solr = new SolrSingleConnector("http://127.0.0.1:8983/solr", SolrScheme.SolrCellExtended);
solr = new SolrSingleConnector("http://127.0.0.1:8983/solr", new SolrScheme());
solr.clear();
File exampleDir = new File("/Data/workspace2/yacy/test/parsertest/");
final File exampleDir = new File("/Data/workspace2/yacy/test/parsertest/");
long t, t0, a = 0;
int c = 0;
// NOTE(review): File.list() returns null when the path is not a directory or cannot be
// read; the loop below would then fail with a NullPointerException
for (String s: exampleDir.list()) {
for (final String s: exampleDir.list()) {
if (s.startsWith(".")) continue;
t = System.currentTimeMillis();
solr.add(new File(exampleDir, s), s);
@ -290,9 +290,9 @@ public class SolrSingleConnector {
System.out.println("pushed file " + s + " to solr, " + t0 + " milliseconds");
}
// NOTE(review): both divisions are integer divisions; "a / c" throws ArithmeticException
// when c is 0 and "60000 / a" when a is 0 (presumably c and a are updated in the lines
// elided above - verify that neither can stay 0, e.g. for an empty directory)
System.out.println("pushed " + c + " files in " + a + " milliseconds, " + (a / c) + " milliseconds average; " + (60000 / a * c) + " PPM");
} catch (IOException e) {
} catch (final IOException e) {
e.printStackTrace();
}
}
}

@ -40,6 +40,11 @@ import java.util.Set;
* the list may contain lines with one keyword, comment lines, empty lines and commented-out keyword lines
* when an attribute is changed here, the list is stored again with the original formatting
*
* the syntax of configuration files:
* - all lines beginning with '##' are comments
* - all non-empty lines not beginning with '#' are keyword lines
* - all lines beginning with '#' and where the second character is not '#' are commented-out keyword lines
*
* @author Michael Christen
*/
public class ConfigurationSet extends AbstractSet<String> implements Set<String> {
@ -47,6 +52,11 @@ public class ConfigurationSet extends AbstractSet<String> implements Set<String>
private final File file;
private String[] lines;
/**
 * create a configuration set without a backing file; it starts with no lines at all
 */
public ConfigurationSet() {
    this.lines = new String[0];
    this.file = null;
}
public ConfigurationSet(final File file) {
this.file = file;
try {
@ -62,11 +72,18 @@ public class ConfigurationSet extends AbstractSet<String> implements Set<String>
}
}
@Override
public boolean isEmpty() {
    // without any lines there cannot be a keyword: answer 'true' right away
    // (this is the case when the class was initialized without a configuration file)
    if (this.lines == null) return true;
    if (this.lines.length == 0) return true;
    // otherwise let the superclass decide
    return super.isEmpty();
}
/**
* save the configuration back to the file
* @throws IOException
*/
private void commit() throws IOException {
if (this.file == null) return;
final BufferedWriter writer = new BufferedWriter(new FileWriter(this.file));
for (final String s: this.lines) {
writer.write(s);

Loading…
Cancel
Save