diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index ed6392b50..bb33c44ac 100644 --- a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -111,6 +111,9 @@ attr_paths ## host of the url, string host_s +## url inside the canonical link element, string +canonical_s + ## all texts in <li> tags, textgen
attr_li diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 6657def71..7da70867a 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -561,15 +561,21 @@ public final class Switchboard extends serverSwitch { TextParser.setDenyMime(getConfig(SwitchboardConstants.PARSER_MIME_DENY, "")); // prepare a solr index profile switch list + final File solrBackupProfile = new File("defaults/solr.keys.list"); final File solrWorkProfile = new File(getDataPath(), "DATA/SETTINGS/solr.keys.default.list"); - if (!solrWorkProfile.exists()) FileUtils.copy(new File("defaults/solr.keys.list"), solrWorkProfile); - final SolrScheme scheme = new SolrScheme(solrWorkProfile); + if (!solrWorkProfile.exists()) FileUtils.copy(solrBackupProfile, solrWorkProfile); + final SolrScheme backupScheme = new SolrScheme(solrBackupProfile); + final SolrScheme workingScheme = new SolrScheme(solrWorkProfile); + + // update the working scheme with the backup scheme. This is necessary to include new features. + // new features are always activated by default + // set up the solr interface final String solrurls = getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr"); final boolean usesolr = getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0; try { - this.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null; + this.solrConnector = (usesolr) ? 
new SolrChardingConnector(solrurls, workingScheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null; } catch (final IOException e) { Log.logException(e); this.solrConnector = null; diff --git a/source/net/yacy/cora/services/federated/solr/SolrScheme.java b/source/net/yacy/cora/services/federated/solr/SolrScheme.java index 7b548221a..7d6bd5513 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrScheme.java +++ b/source/net/yacy/cora/services/federated/solr/SolrScheme.java @@ -181,6 +181,9 @@ public class SolrScheme extends ConfigurationSet { } addSolr(solrdoc, "htags_i", h); + // canonical tag + if (html.getCanonical() != null) addSolr(solrdoc, "canonical_s", html.getCanonical().toNormalform(false, false)); + // meta tags final Map<String, String> metas = html.getMetas(); final String robots = metas.get("robots"); diff --git a/source/net/yacy/cora/storage/ConfigurationSet.java b/source/net/yacy/cora/storage/ConfigurationSet.java index 3f2f1d176..3232fad1a 100644 --- a/source/net/yacy/cora/storage/ConfigurationSet.java +++ b/source/net/yacy/cora/storage/ConfigurationSet.java @@ -72,6 +72,7 @@ public class ConfigurationSet extends AbstractSet implements Set } } + @Override public boolean isEmpty() { // a shortcut to a fast 'true' in case that we initialized the class without a configuration file diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 908447d22..6e292d4cd 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -125,6 +125,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { private CharBuffer content; private final EventListenerList htmlFilterEventListeners; private float lon, lat; + private MultiProtocolURI canonical; /** * {@link MultiProtocolURI} to the favicon that belongs to the document @@ -167,6 +168,7 @@ public class ContentScraper extends AbstractScraper implements 
Scraper { this.lon = 0.0f; this.lat = 0.0f; this.evaluationScores.match(Element.url, root.toNormalform(false, false)); + this.canonical = null; } public void scrapeText(final char[] newtext, final String insideTag) { @@ -345,6 +347,10 @@ public class ContentScraper extends AbstractScraper implements Scraper { final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1); this.images.put(ie.url(), ie); this.favicon = newLink; + } else if (rel.equalsIgnoreCase("canonical")) { + final Properties p = new Properties(); p.put("name", this.title); + this.anchors.put(newLink, p); + this.canonical = newLink; } else if (rel.equalsIgnoreCase("alternate") && type.equalsIgnoreCase("application/rss+xml")) { this.rss.put(newLink, linktitle); } else if (rel.equalsIgnoreCase("stylesheet") && type.equalsIgnoreCase("text/css")) { @@ -599,6 +605,10 @@ public class ContentScraper extends AbstractScraper implements Scraper { return this.script; } + public MultiProtocolURI getCanonical() { + return this.canonical; + } + /** * get all images * @return a map of