From 508a81b86ca2b11a7bde0175fe247c915b8bd418 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 28 Jun 2012 13:27:45 +0200 Subject: [PATCH] added solr field 'refresh_s' which stores the refresh url contained in the meta-refresh html header field. --- defaults/solr.keys.list | 3 +++ .../document/parser/html/ContentScraper.java | 2 +- .../yacy/search/index/SolrConfiguration.java | 18 ++++++++++++++++++ source/net/yacy/search/index/SolrField.java | 8 ++++++++ 4 files changed, 30 insertions(+), 1 deletion(-) diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index 3394a7fd6..05fde7e4a 100644 --- a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -184,6 +184,9 @@ host_s ## url inside the canonical link element, string canonical_s +## link from the url property inside the refresh link element, string +refresh_s + ## all texts in
<li> tags, textgen li_txt diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 4b4618598..3ddd7859d 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -807,7 +807,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { final int pos = s.indexOf(';'); if (pos < 0) return EMPTY_STRING; - s = s.substring(pos + 1); + s = s.substring(pos + 1).trim(); if (s.toLowerCase().startsWith("url=")) return s.substring(4).trim(); return EMPTY_STRING; } diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index 59179ac4f..fab3e4a68 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ b/source/net/yacy/search/index/SolrConfiguration.java @@ -354,6 +354,24 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable } } + // meta refresh tag + if (isEmpty() || contains(SolrField.refresh_s.name())) { + String refresh = html.getRefreshPath(); + if (refresh != null && refresh.length() > 0) { + MultiProtocolURI refreshURL; + try { + refreshURL = refresh.startsWith("http") ? 
new MultiProtocolURI(html.getRefreshPath()) : new MultiProtocolURI(digestURI, html.getRefreshPath()); + if (refreshURL != null) { + inboundLinks.remove(refreshURL); + ouboundLinks.remove(refreshURL); + addSolr(solrdoc, SolrField.refresh_s, refreshURL.toNormalform(false, false)); + } + } catch (MalformedURLException e) { + addSolr(solrdoc, SolrField.refresh_s, refresh); + } + } + } + // flash embedded if (isEmpty() || contains(SolrField.flash_b.name())) { MultiProtocolURI[] flashURLs = html.getFlash(); diff --git a/source/net/yacy/search/index/SolrField.java b/source/net/yacy/search/index/SolrField.java index 77713800c..3fcbfd721 100644 --- a/source/net/yacy/search/index/SolrField.java +++ b/source/net/yacy/search/index/SolrField.java @@ -82,6 +82,7 @@ public enum SolrField implements net.yacy.cora.services.federated.solr.SolrField h6_txt(SolrType.text_general, true, true, true, "h6 header"), htags_i(SolrType.integer, true, true, "binary pattern for the existance of h1..h6 headlines"), canonical_s(SolrType.string, true, true, "url inside the canonical link element"), + refresh_s(SolrType.string, true, true, "link from the url property inside the refresh link element"), metagenerator_t(SolrType.text_general, true, true, "content of tag"), boldcount_i(SolrType.integer, true, true, "total number of occurrences of or "), bold_txt(SolrType.text_general, true, true, true, "all texts inside of or tags. no doubles. listed in the order of number of occurrences in decreasing order"), @@ -150,6 +151,7 @@ public enum SolrField implements net.yacy.cora.services.federated.solr.SolrField * Returns the YaCy default or (if available) custom field name for Solr * @return SolrFieldname String */ + @Override public final String getSolrFieldName() { return (this.solrFieldName == null ? 
this.name() : this.solrFieldName); } @@ -167,26 +169,32 @@ public enum SolrField implements net.yacy.cora.services.federated.solr.SolrField } } + @Override public final SolrType getType() { return this.type; } + @Override public final boolean isIndexed() { return this.indexed; } + @Override public final boolean isStored() { return this.stored; } + @Override public final boolean isMultiValued() { return this.multiValued; } + @Override public final boolean isOmitNorms() { return this.omitNorms; } + @Override public final String getComment() { return this.comment; }