Added parsing of the canonical link element to the HTML parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7812 6c8d7289-2bf4-0310-a012-ef5d649a1542
14 years ago · bda3eec0ff
parent b6f09a475d
commit bda3eec0ff
5 changed files with 26 additions and 3 deletions
--- a/defaults/solr.keys.list
+++ b/defaults/solr.keys.list
@ -111,6 +111,9 @@ attr_paths
 ## host of the url, string
 host_s

+## url inside the canonical link element, string
+canonical_s
+
 ## all texts in <li> tags, textgen
 attr_li

--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@ -561,15 +561,21 @@ public final class Switchboard extends serverSwitch {
        TextParser.setDenyMime(getConfig(SwitchboardConstants.PARSER_MIME_DENY, ""));

        // prepare a solr index profile switch list
+        final File solrBackupProfile = new File("defaults/solr.keys.list");
        final File solrWorkProfile = new File(getDataPath(), "DATA/SETTINGS/solr.keys.default.list");
-        if (!solrWorkProfile.exists()) FileUtils.copy(new File("defaults/solr.keys.list"), solrWorkProfile);
-        final SolrScheme scheme = new SolrScheme(solrWorkProfile);
+        if (!solrWorkProfile.exists()) FileUtils.copy(solrBackupProfile, solrWorkProfile);
+        final SolrScheme backupScheme = new SolrScheme(solrBackupProfile);
+        final SolrScheme workingScheme = new SolrScheme(solrWorkProfile);
+
+        // TODO: update the working scheme with the backup scheme. This is necessary to include new features;
+        // new features are always activated by default. (Not implemented yet: backupScheme is loaded but not merged here.)
+

        // set up the solr interface
        final String solrurls = getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
        final boolean usesolr = getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0;
        try {
-            this.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null;
+            this.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, workingScheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null;
        } catch (final IOException e) {
            Log.logException(e);
            this.solrConnector = null;
--- a/source/net/yacy/cora/services/federated/solr/SolrScheme.java
+++ b/source/net/yacy/cora/services/federated/solr/SolrScheme.java
@ -181,6 +181,9 @@ public class SolrScheme extends ConfigurationSet {
            }
            addSolr(solrdoc, "htags_i", h);

+            // canonical link element (<link rel="canonical">)
+            if (html.getCanonical() != null) addSolr(solrdoc, "canonical_s", html.getCanonical().toNormalform(false, false));
+
            // meta tags
            final Map<String, String> metas = html.getMetas();
            final String robots = metas.get("robots");
--- a/source/net/yacy/cora/storage/ConfigurationSet.java
+++ b/source/net/yacy/cora/storage/ConfigurationSet.java
@ -72,6 +72,7 @@ public class ConfigurationSet extends AbstractSet<String> implements Set<String>
        }
    }

+
    @Override
    public boolean isEmpty() {
        // a shortcut to a fast 'true' in case that we initialized the class without a configuration file
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -125,6 +125,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
    private CharBuffer content;
    private final EventListenerList htmlFilterEventListeners;
    private float lon, lat;
+    private MultiProtocolURI canonical;

    /**
     * {@link MultiProtocolURI} to the favicon that belongs to the document
@ -167,6 +168,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        this.lon = 0.0f;
        this.lat = 0.0f;
        this.evaluationScores.match(Element.url, root.toNormalform(false, false));
+        this.canonical = null;
    }

    public void scrapeText(final char[] newtext, final String insideTag) {
@ -345,6 +347,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                    final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1);
                    this.images.put(ie.url(), ie);
                    this.favicon = newLink;
+                } else if (rel.equalsIgnoreCase("canonical")) {
+                    final Properties p = new Properties(); p.put("name", this.title);
+                    this.anchors.put(newLink, p);
+                    this.canonical = newLink;
                } else if (rel.equalsIgnoreCase("alternate") && type.equalsIgnoreCase("application/rss+xml")) {
                    this.rss.put(newLink, linktitle);
                } else if (rel.equalsIgnoreCase("stylesheet") && type.equalsIgnoreCase("text/css")) {
@ -599,6 +605,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        return this.script;
    }

+    public MultiProtocolURI getCanonical() {
+        return this.canonical;
+    }
+
    /**
     * get all images
     * @return a map of <urlhash, ImageEntry>