added parsing of canonical link element to html parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7812 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent b6f09a475d
commit bda3eec0ff

@ -111,6 +111,9 @@ attr_paths
## host of the url, string
host_s
## url inside the canonical link element, string
canonical_s
## all texts in <li> tags, textgen
attr_li

@ -561,15 +561,21 @@ public final class Switchboard extends serverSwitch {
TextParser.setDenyMime(getConfig(SwitchboardConstants.PARSER_MIME_DENY, ""));
// prepare a solr index profile switch list
final File solrBackupProfile = new File("defaults/solr.keys.list");
final File solrWorkProfile = new File(getDataPath(), "DATA/SETTINGS/solr.keys.default.list");
if (!solrWorkProfile.exists()) FileUtils.copy(new File("defaults/solr.keys.list"), solrWorkProfile);
final SolrScheme scheme = new SolrScheme(solrWorkProfile);
if (!solrWorkProfile.exists()) FileUtils.copy(solrBackupProfile, solrWorkProfile);
final SolrScheme backupScheme = new SolrScheme(solrBackupProfile);
final SolrScheme workingScheme = new SolrScheme(solrWorkProfile);
// update the working scheme with the backup scheme. This is necessary to include new features.
// new features are always activated by default
// set up the solr interface
final String solrurls = getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr");
final boolean usesolr = getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0;
try {
this.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, scheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null;
this.solrConnector = (usesolr) ? new SolrChardingConnector(solrurls, workingScheme, SolrChardingSelection.Method.MODULO_HOST_MD5) : null;
} catch (final IOException e) {
Log.logException(e);
this.solrConnector = null;

@ -181,6 +181,9 @@ public class SolrScheme extends ConfigurationSet {
}
addSolr(solrdoc, "htags_i", h);
// canonical tag
if (html.getCanonical() != null) addSolr(solrdoc, "canonical_s", html.getCanonical().toNormalform(false, false));
// meta tags
final Map<String, String> metas = html.getMetas();
final String robots = metas.get("robots");

@ -72,6 +72,7 @@ public class ConfigurationSet extends AbstractSet<String> implements Set<String>
}
}
@Override
public boolean isEmpty() {
// a shortcut to a fast 'true' in case that we initialized the class without a configuration file

@ -125,6 +125,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private CharBuffer content;
private final EventListenerList htmlFilterEventListeners;
private float lon, lat;
private MultiProtocolURI canonical;
/**
* {@link MultiProtocolURI} to the favicon that belongs to the document
@ -167,6 +168,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.lon = 0.0f;
this.lat = 0.0f;
this.evaluationScores.match(Element.url, root.toNormalform(false, false));
this.canonical = null;
}
public void scrapeText(final char[] newtext, final String insideTag) {
@ -345,6 +347,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1);
this.images.put(ie.url(), ie);
this.favicon = newLink;
} else if (rel.equalsIgnoreCase("canonical")) {
final Properties p = new Properties(); p.put("name", this.title);
this.anchors.put(newLink, p);
this.canonical = newLink;
} else if (rel.equalsIgnoreCase("alternate") && type.equalsIgnoreCase("application/rss+xml")) {
this.rss.put(newLink, linktitle);
} else if (rel.equalsIgnoreCase("stylesheet") && type.equalsIgnoreCase("text/css")) {
@ -599,6 +605,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return this.script;
}
/**
 * get the url of the canonical link element
 * @return the link target recorded for a link with rel="canonical",
 *         or null if no such link was scraped
 */
public MultiProtocolURI getCanonical() {
    final MultiProtocolURI canonicalLink = this.canonical;
    return canonicalLink;
}
/**
* get all images
* @return a map of <urlhash, ImageEntry>

Loading…
Cancel
Save