adding the canonical tag to crawl queues

pull/1/head
Michael Peter Christen 12 years ago
parent 40c5ee47c1
commit e6f361f474

@@ -23,7 +23,6 @@ import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import javax.servlet.ServletException;

@@ -868,7 +868,9 @@ dc_rights
if (parser instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) parser;
String refresh = html.getRefreshPath();
if (refresh != null && refresh.length() > 0)try {result.put(new DigestURI(refresh), "refresh");} catch (MalformedURLException e) {}
if (refresh != null && refresh.length() > 0) try {result.put(new DigestURI(refresh), "refresh");} catch (MalformedURLException e) {}
DigestURI canonical = html.getCanonical();
if (canonical != null) result.put(canonical, "canonical");
}
}
return result;

@@ -490,9 +490,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size());
Map<DigestURI, Properties> alllinks = document.getAnchors();
Map<DigestURI, ImageEntry> images = new HashMap<DigestURI, ImageEntry>();
int c = 0;
final Object parser = document.getParserObject();
Map<DigestURI, ImageEntry> images = new HashMap<DigestURI, ImageEntry>();
if (parser instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) parser;
images = html.getImages();

Loading…
Cancel
Save