- added counting of links with noindex tag for solr index

- bugfixes for solr index

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7820 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 528b59e078
commit 2d4bb139d3

@ -75,15 +75,21 @@ wordcount_i
## internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen ## internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
attr_inboundlinks attr_inboundlinks
## number of inbound links, int ## total number of inbound links, int
inboundlinkscount_i inboundlinkscount_i
## number of inbound links with noindex tag, int
inboundlinksnoindexcount_i
## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen ## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
attr_outboundlinks attr_outboundlinks
## number of external links, int ## total number of external links, int
outboundlinkscount_i outboundlinkscount_i
## number of external links with noindex tag, int
outboundlinksnoindexcount_i
## h1 header, textgen ## h1 header, textgen
attr_h1 attr_h1

@ -61,11 +61,13 @@ public class IndexFederated_p {
sb.solrConnector = null; sb.solrConnector = null;
} }
final String schemename = sb.getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list");
final SolrScheme scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/" + schemename));
if (!solrWasOn && solrIsOnAfterwards) { if (!solrWasOn && solrIsOnAfterwards) {
// switch on // switch on
final String solrurls = sb.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr"); final String solrurls = sb.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
final boolean usesolr = sb.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0; final boolean usesolr = sb.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0;
final SolrScheme scheme = new SolrScheme(new File(env.getDataPath(), "DATA/SETTINGS/solr.keys.default.list"));
try { try {
sb.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null; sb.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null;
} catch (final IOException e) { } catch (final IOException e) {
@ -75,7 +77,6 @@ public class IndexFederated_p {
} }
// read index scheme table flags // read index scheme table flags
final SolrScheme scheme = sb.solrConnector.getScheme();
final Iterator<ConfigurationSet.Entry> i = scheme.allIterator(); final Iterator<ConfigurationSet.Entry> i = scheme.allIterator();
ConfigurationSet.Entry entry; ConfigurationSet.Entry entry;
while (i.hasNext()) { while (i.hasNext()) {

@ -562,7 +562,8 @@ public final class Switchboard extends serverSwitch {
// prepare a solr index profile switch list // prepare a solr index profile switch list
final File solrBackupProfile = new File("defaults/solr.keys.list"); final File solrBackupProfile = new File("defaults/solr.keys.list");
final File solrWorkProfile = new File(getDataPath(), "DATA/SETTINGS/solr.keys.default.list"); final String schemename = getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list");
final File solrWorkProfile = new File(getDataPath(), "DATA/SETTINGS/" + schemename);
if (!solrWorkProfile.exists()) FileUtils.copy(solrBackupProfile, solrWorkProfile); if (!solrWorkProfile.exists()) FileUtils.copy(solrBackupProfile, solrWorkProfile);
final SolrScheme backupScheme = new SolrScheme(solrBackupProfile); final SolrScheme backupScheme = new SolrScheme(solrBackupProfile);
final SolrScheme workingScheme = new SolrScheme(solrWorkProfile); final SolrScheme workingScheme = new SolrScheme(solrWorkProfile);

@ -111,14 +111,14 @@ public class SolrScheme extends ConfigurationSet {
addSolr(solrdoc, "keywords", yacydoc.dc_subject(' ')); addSolr(solrdoc, "keywords", yacydoc.dc_subject(' '));
final String content = UTF8.String(yacydoc.getTextBytes()); final String content = UTF8.String(yacydoc.getTextBytes());
addSolr(solrdoc, "text_t", content); addSolr(solrdoc, "text_t", content);
if (contains("wordcount_i")) { if (isEmpty() || contains("wordcount_i")) {
final int contentwc = content.split(" ").length; final int contentwc = content.split(" ").length;
addSolr(solrdoc, "wordcount_i", contentwc); addSolr(solrdoc, "wordcount_i", contentwc);
} }
// path elements of link // path elements of link
final String path = digestURI.getPath(); final String path = digestURI.getPath();
if (path != null && contains("attr_paths")) { if (path != null && (isEmpty() || contains("attr_paths"))) {
final String[] paths = path.split("/"); final String[] paths = path.split("/");
if (paths.length > 0) addSolr(solrdoc, "attr_paths", paths); if (paths.length > 0) addSolr(solrdoc, "attr_paths", paths);
} }
@ -126,8 +126,9 @@ public class SolrScheme extends ConfigurationSet {
// list all links // list all links
final Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors(); final Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors();
int c = 0; int c = 0;
addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount()); if (isEmpty() || contains("inboundlinkscount_i")) addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount());
if (contains("attr_inboundlinks")) { if (isEmpty() || contains("inboundlinksnoindexcount_i")) addSolr(solrdoc, "inboundlinksnoindexcount_i", yacydoc.inboundLinkNoindexCount());
if (isEmpty() || contains("attr_inboundlinks")) {
final String[] inboundlinks = new String[yacydoc.inboundLinkCount()]; final String[] inboundlinks = new String[yacydoc.inboundLinkCount()];
for (final MultiProtocolURI url: yacydoc.inboundLinks()) { for (final MultiProtocolURI url: yacydoc.inboundLinks()) {
final Properties p = alllinks.get(url); final Properties p = alllinks.get(url);
@ -135,23 +136,24 @@ public class SolrScheme extends ConfigurationSet {
final String rel = p.getProperty("rel", ""); final String rel = p.getProperty("rel", "");
inboundlinks[c++] = inboundlinks[c++] =
"<a href=\"" + url.toNormalform(false, false) + "\"" + "<a href=\"" + url.toNormalform(false, false) + "\"" +
((rel.toLowerCase().equals("nofollow")) ? " rel=\"nofollow\"" : "") + (rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
">" + ">" +
((name.length() > 0) ? name : "") + "</a>"; ((name.length() > 0) ? name : "") + "</a>";
} }
addSolr(solrdoc, "attr_inboundlinks", inboundlinks); addSolr(solrdoc, "attr_inboundlinks", inboundlinks);
} }
c = 0; c = 0;
final String[] outboundlinks = new String[yacydoc.outboundLinkCount()]; if (isEmpty() || contains("outboundlinkscount_i")) addSolr(solrdoc, "outboundlinkscount_i", yacydoc.outboundLinkCount());
if (contains("attr_outboundlinks")) { if (isEmpty() || contains("outboundlinksnoindexcount_i")) addSolr(solrdoc, "outboundlinksnoindexcount_i", yacydoc.outboundLinkNoindexCount());
addSolr(solrdoc, "outboundlinkscount_i", outboundlinks.length); if (isEmpty() || contains("attr_outboundlinks")) {
final String[] outboundlinks = new String[yacydoc.outboundLinkCount()];
for (final MultiProtocolURI url: yacydoc.outboundLinks()) { for (final MultiProtocolURI url: yacydoc.outboundLinks()) {
final Properties p = alllinks.get(url); final Properties p = alllinks.get(url);
final String name = p.getProperty("name", ""); final String name = p.getProperty("name", "");
final String rel = p.getProperty("rel", ""); final String rel = p.getProperty("rel", "");
outboundlinks[c++] = outboundlinks[c++] =
"<a href=\"" + url.toNormalform(false, false) + "\"" + "<a href=\"" + url.toNormalform(false, false) + "\"" +
((rel.toLowerCase().equals("nofollow")) ? " rel=\"nofollow\"" : "") + (rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
">" + ">" +
((name.length() > 0) ? name : "") + "</a>"; ((name.length() > 0) ? name : "") + "</a>";
} }
@ -196,7 +198,7 @@ public class SolrScheme extends ConfigurationSet {
addSolr(solrdoc, "boldcount_i", bold.length); addSolr(solrdoc, "boldcount_i", bold.length);
if (bold.length > 0) { if (bold.length > 0) {
addSolr(solrdoc, "attr_bold", bold); addSolr(solrdoc, "attr_bold", bold);
if (contains("attr_boldcount")) { if (isEmpty() || contains("attr_boldcount")) {
addSolr(solrdoc, "attr_boldcount", html.getBoldCount(bold)); addSolr(solrdoc, "attr_boldcount", html.getBoldCount(bold));
} }
} }
@ -204,7 +206,7 @@ public class SolrScheme extends ConfigurationSet {
addSolr(solrdoc, "italiccount_i", italic.length); addSolr(solrdoc, "italiccount_i", italic.length);
if (italic.length > 0) { if (italic.length > 0) {
addSolr(solrdoc, "attr_italic", italic); addSolr(solrdoc, "attr_italic", italic);
if (contains("attr_italiccount")) { if (isEmpty() || contains("attr_italiccount")) {
addSolr(solrdoc, "attr_italiccount", html.getItalicCount(italic)); addSolr(solrdoc, "attr_italiccount", html.getItalicCount(italic));
} }
} }
@ -213,7 +215,7 @@ public class SolrScheme extends ConfigurationSet {
if (li.length > 0) addSolr(solrdoc, "attr_li", li); if (li.length > 0) addSolr(solrdoc, "attr_li", li);
// images // images
if (contains("attr_images")) { if (isEmpty() || contains("attr_images")) {
final Collection<ImageEntry> imagesc = html.getImages().values(); final Collection<ImageEntry> imagesc = html.getImages().values();
final String[] images = new String[imagesc.size()]; final String[] images = new String[imagesc.size()];
c = 0; c = 0;
@ -223,7 +225,7 @@ public class SolrScheme extends ConfigurationSet {
} }
// style sheets // style sheets
if (contains("attr_css")) { if (isEmpty() || contains("attr_css")) {
final Map<MultiProtocolURI, String> csss = html.getCSS(); final Map<MultiProtocolURI, String> csss = html.getCSS();
final String[] css = new String[csss.size()]; final String[] css = new String[csss.size()];
c = 0; c = 0;
@ -237,7 +239,7 @@ public class SolrScheme extends ConfigurationSet {
} }
// Scripts // Scripts
if (contains("attr_scripts")) { if (isEmpty() || contains("attr_scripts")) {
final Set<MultiProtocolURI> scriptss = html.getScript(); final Set<MultiProtocolURI> scriptss = html.getScript();
final String[] scripts = new String[scriptss.size()]; final String[] scripts = new String[scriptss.size()];
c = 0; c = 0;
@ -249,7 +251,7 @@ public class SolrScheme extends ConfigurationSet {
} }
// Frames // Frames
if (contains("attr_frames")) { if (isEmpty() || contains("attr_frames")) {
final Set<MultiProtocolURI> framess = html.getFrames(); final Set<MultiProtocolURI> framess = html.getFrames();
final String[] frames = new String[framess.size()]; final String[] frames = new String[framess.size()];
c = 0; c = 0;
@ -261,7 +263,7 @@ public class SolrScheme extends ConfigurationSet {
} }
// IFrames // IFrames
if (contains("attr_iframes")) { if (isEmpty() || contains("attr_iframes")) {
final Set<MultiProtocolURI> iframess = html.getIFrames(); final Set<MultiProtocolURI> iframess = html.getIFrames();
final String[] iframes = new String[iframess.size()]; final String[] iframes = new String[iframess.size()];
c = 0; c = 0;
@ -277,7 +279,7 @@ public class SolrScheme extends ConfigurationSet {
// generic evaluation pattern // generic evaluation pattern
for (final String model: html.getEvaluationModelNames()) { for (final String model: html.getEvaluationModelNames()) {
if (contains("attr_" + model)) { if (isEmpty() || contains("attr_" + model)) {
final String[] scorenames = html.getEvaluationModelScoreNames(model); final String[] scorenames = html.getEvaluationModelScoreNames(model);
if (scorenames.length > 0) { if (scorenames.length > 0) {
addSolr(solrdoc, "attr_" + model, scorenames); addSolr(solrdoc, "attr_" + model, scorenames);

@ -403,13 +403,15 @@ dc_rights
for (final Map.Entry<MultiProtocolURI, Properties> entry: this.anchors.entrySet()) { for (final Map.Entry<MultiProtocolURI, Properties> entry: this.anchors.entrySet()) {
url = entry.getKey(); url = entry.getKey();
if (url == null) continue; if (url == null) continue;
final boolean noindex = entry.getValue().getProperty("rel", "").toLowerCase().indexOf("noindex") >= 0;
final boolean nofollow = entry.getValue().getProperty("rel", "").toLowerCase().indexOf("nofollow") >= 0;
if ((thishost == null && url.getHost() == null) || if ((thishost == null && url.getHost() == null) ||
((thishost != null && url.getHost() != null) && ((thishost != null && url.getHost() != null) &&
(url.getHost().endsWith(thishost) || (url.getHost().endsWith(thishost) ||
(thishost.startsWith("www.") && url.getHost().endsWith(thishost.substring(4)))))) { (thishost.startsWith("www.") && url.getHost().endsWith(thishost.substring(4)))))) {
this.inboundlinks.put(url, "anchor"); this.inboundlinks.put(url, "anchor" + (noindex ? " noindex" : "") + (nofollow ? " nofollow" : ""));
} else { } else {
this.outboundlinks.put(url, "anchor"); this.outboundlinks.put(url, "anchor" + (noindex ? " noindex" : "") + (nofollow ? " nofollow" : ""));
} }
u = url.toNormalform(true, false); u = url.toNormalform(true, false);
final String name = entry.getValue().getProperty("name", ""); final String name = entry.getValue().getProperty("name", "");
@ -605,6 +607,26 @@ dc_rights
return (this.outboundlinks == null) ? 0 : this.outboundlinks.size(); return (this.outboundlinks == null) ? 0 : this.outboundlinks.size();
} }
/**
 * Counts the inbound links whose stored tag string contains "noindex".
 *
 * If the inbound link table has not been built yet, resortLinks() is
 * called first (presumably it populates the table; verify against its
 * definition). When the table is still missing afterwards, the count is 0.
 *
 * @return number of entries in the inbound link table tagged with "noindex"
 */
public int inboundLinkNoindexCount() {
    if (this.inboundlinks == null) {
        resortLinks();
        if (this.inboundlinks == null) {
            return 0;
        }
    }
    int noindexLinks = 0;
    for (final String linkTag : this.inboundlinks.values()) {
        // substring match: the tag may carry several markers in one string
        if (linkTag.indexOf("noindex") >= 0) {
            noindexLinks++;
        }
    }
    return noindexLinks;
}
/**
 * Counts the outbound links whose stored tag string contains "noindex".
 *
 * If the outbound link table has not been built yet, resortLinks() is
 * called first (presumably it populates the table; verify against its
 * definition). When the table is still missing afterwards, the count is 0.
 *
 * @return number of entries in the outbound link table tagged with "noindex"
 */
public int outboundLinkNoindexCount() {
    if (this.outboundlinks == null) {
        resortLinks();
        if (this.outboundlinks == null) {
            return 0;
        }
    }
    int noindexLinks = 0;
    for (final String linkTag : this.outboundlinks.values()) {
        // substring match: the tag may carry several markers in one string
        if (linkTag.indexOf("noindex") >= 0) {
            noindexLinks++;
        }
    }
    return noindexLinks;
}
public Set<MultiProtocolURI> inboundLinks() { public Set<MultiProtocolURI> inboundLinks() {
if (this.inboundlinks == null) resortLinks(); if (this.inboundlinks == null) resortLinks();
return (this.inboundlinks == null) ? null : this.inboundlinks.keySet(); return (this.inboundlinks == null) ? null : this.inboundlinks.keySet();

Loading…
Cancel
Save