added a sitemap entry parser and loader for sitemaps

(the loader recurses if a sitemap refers to another sitemap)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7295 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 790e0b1894
commit 4c72885cba

@ -25,20 +25,12 @@
package de.anomic.crawler; package de.anomic.crawler;
import java.io.InputStream;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.util.Date; import java.util.Date;
import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.document.parser.sitemapParser; import net.yacy.document.parser.sitemapParser;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.io.ByteCountInputStream;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Request;
import de.anomic.search.Segments; import de.anomic.search.Segments;
@ -60,45 +52,16 @@ public class SitemapImporter extends Thread {
} }
public void run() { public void run() {
// download document
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, MultiProtocolURI.yacybotUserAgent);
final HTTPClient client = new HTTPClient();
client.setTimout(5000);
client.setHeader(requestHeader.entrySet());
try { try {
try { logger.logInfo("Start parsing sitemap file " + this.siteMapURL);
client.GET(siteMapURL.toString()); sitemapParser.SitemapReader parser = sitemapParser.parse(this.siteMapURL);
if (client.getStatusCode() != 200) { for (sitemapParser.URLEntry entry: parser) process(entry);
logger.logWarning("Unable to download the sitemap file " + this.siteMapURL +
"\nServer returned status: " + client.getHttpResponse().getStatusLine());
return;
}
// get some metadata
final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
final String contentMimeType = header.mime();
InputStream contentStream = client.getContentstream();
if (contentMimeType != null && (contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip"))) {
if (logger.isFine()) logger.logFine("Sitemap file has mimetype " + contentMimeType);
contentStream = new GZIPInputStream(contentStream);
}
final ByteCountInputStream counterStream = new ByteCountInputStream(contentStream, null);
// parse it
logger.logInfo("Start parsing sitemap file " + this.siteMapURL + "\n\tMimeType: " + contentMimeType + "\n\tLength: " + header.getContentLength());
sitemapParser.SitemapReader parser = sitemapParser.parse(counterStream);
for (sitemapParser.SitemapEntry entry: parser) process(entry);
} finally {
client.finish();
}
} catch (final Exception e) { } catch (final Exception e) {
logger.logWarning("Unable to parse sitemap file " + this.siteMapURL, e); logger.logWarning("Unable to parse sitemap file " + this.siteMapURL, e);
} }
} }
public void process(sitemapParser.SitemapEntry entry) { public void process(sitemapParser.URLEntry entry) {
// get the url hash // get the url hash
byte[] nexturlhash = null; byte[] nexturlhash = null;

@ -132,8 +132,9 @@ public final class robotsParser {
continue lineparser; continue lineparser;
} }
// parse sitemap // parse sitemap; if there are several sitemaps then take the first url
if (lineUpper.startsWith(ROBOTS_SITEMAP)) { // TODO: support for multiple sitemaps
if (lineUpper.startsWith(ROBOTS_SITEMAP) && (sitemap == null || sitemap.length() == 0)) {
pos = line.indexOf(' '); pos = line.indexOf(' ');
if (pos != -1) { if (pos != -1) {
sitemap = line.substring(pos).trim(); sitemap = line.substring(pos).trim();

@ -29,6 +29,7 @@ import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.zip.GZIPInputStream;
import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.ParserConfigurationException;
@ -38,13 +39,20 @@ import org.w3c.dom.Element;
import org.w3c.dom.Node; import org.w3c.dom.Node;
import org.w3c.dom.NodeList; import org.w3c.dom.NodeList;
import org.xml.sax.SAXException; import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.ByteCountInputStream;
import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.DateFormatter;
public class sitemapParser extends AbstractParser implements Parser { public class sitemapParser extends AbstractParser implements Parser {
@ -67,7 +75,7 @@ public class sitemapParser extends AbstractParser implements Parser {
List<Document> docs = new ArrayList<Document>(); List<Document> docs = new ArrayList<Document>();
MultiProtocolURI uri; MultiProtocolURI uri;
Document doc; Document doc;
for (SitemapEntry item: sitemap) try { for (URLEntry item: sitemap) try {
uri = new MultiProtocolURI(item.loc); uri = new MultiProtocolURI(item.loc);
doc = new Document( doc = new Document(
uri, uri,
@ -95,6 +103,37 @@ public class sitemapParser extends AbstractParser implements Parser {
return da; return da;
} }
/**
 * Downloads the sitemap document at the given URL and parses it.
 *
 * <p>The request is sent with the YaCy bot user agent. When the response mime type is
 * {@code application/x-gzip} or {@code application/gzip}, the body is decompressed
 * before it is handed to the parser. The HTTP client is always released through
 * {@code finish()}, whether this method returns normally or throws.
 *
 * @param sitemapURL location of the sitemap document to download
 * @return the reader produced by {@link #parse(InputStream)} for the downloaded content
 * @throws IOException if the server does not answer with status 200, or if reading,
 *         decompressing or parsing the response fails
 */
public static SitemapReader parse(final DigestURI sitemapURL) throws IOException {
    // download document
    final RequestHeader requestHeader = new RequestHeader();
    requestHeader.put(HeaderFramework.USER_AGENT, MultiProtocolURI.yacybotUserAgent);
    final HTTPClient client = new HTTPClient();
    client.setTimout(5000); // presumably milliseconds -- confirm against HTTPClient
    client.setHeader(requestHeader.entrySet());
    try {
        client.GET(sitemapURL.toString());
        if (client.getStatusCode() != 200) {
            throw new IOException("Unable to download the sitemap file " + sitemapURL +
                    "\nServer returned status: " + client.getHttpResponse().getStatusLine());
        }

        // get some metadata
        final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
        final String contentMimeType = header.mime();

        // gzip is detected from the mime type only; a missing mime type is treated as plain content
        InputStream contentStream = client.getContentstream();
        if (contentMimeType != null && (contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip"))) {
            contentStream = new GZIPInputStream(contentStream);
        }
        final ByteCountInputStream counterStream = new ByteCountInputStream(contentStream, null);

        // no catch clause needed: an IOException raised here propagates to the caller unchanged
        return sitemapParser.parse(counterStream);
    } finally {
        // release the connection on every exit path
        client.finish();
    }
}
public static SitemapReader parse(InputStream stream) throws IOException { public static SitemapReader parse(InputStream stream) throws IOException {
return new SitemapReader(stream); return new SitemapReader(stream);
} }
@ -104,37 +143,61 @@ public class sitemapParser extends AbstractParser implements Parser {
* http://www.sitemaps.org/schemas/sitemap/0.9 * http://www.sitemaps.org/schemas/sitemap/0.9
* http://www.google.com/schemas/sitemap/0.84 * http://www.google.com/schemas/sitemap/0.84
*/ */
public static class SitemapReader extends ArrayList<SitemapEntry> { public static class SitemapReader extends ArrayList<URLEntry> {
private static final long serialVersionUID = 1337L; private static final long serialVersionUID = 1337L;
public SitemapReader(InputStream source) throws IOException { public SitemapReader(InputStream source) throws IOException {
org.w3c.dom.Document doc; org.w3c.dom.Document doc;
try { doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(source); } try { doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(source); }
catch (ParserConfigurationException e) { throw new IOException (e); } catch (ParserConfigurationException e) { throw new IOException (e); }
catch (SAXParseException e) { throw new IOException (e); }
catch (SAXException e) { throw new IOException (e); } catch (SAXException e) { throw new IOException (e); }
NodeList nodes = doc.getElementsByTagName("url"); NodeList SitemapNodes = doc.getElementsByTagName("sitemap");
for (int i = 0; i < nodes.getLength(); i++) for (int i = 0; i < SitemapNodes.getLength(); i++) {
this.add(new SitemapEntry((Element) nodes.item(i))); String url = new SitemapEntry((Element) SitemapNodes.item(i)).url();
if (url != null && url.length() > 0) {
try {
SitemapReader r = parse(new DigestURI(url));
for (URLEntry ue: r) this.add(ue);
} catch (IOException e) {}
}
}
NodeList urlEntryNodes = doc.getElementsByTagName("url");
for (int i = 0; i < urlEntryNodes.getLength(); i++) {
this.add(new URLEntry((Element) urlEntryNodes.item(i)));
}
} }
public String toString() { public String toString() {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
for (SitemapEntry entry: this) sb.append(entry.toString()); for (URLEntry entry: this) sb.append(entry.toString());
return sb.toString(); return sb.toString();
} }
} }
public static class SitemapEntry { public static class URLEntry {
public String loc, lastmod, changefreq, priority; public String loc, lastmod, changefreq, priority;
public SitemapEntry(Element element) { public URLEntry(Element element) {
loc = val(element, "loc", ""); loc = val(element, "loc", "");
lastmod = val(element, "lastmod", ""); lastmod = val(element, "lastmod", "");
changefreq = val(element, "changefreq", ""); changefreq = val(element, "changefreq", "");
priority = val(element, "priority", ""); priority = val(element, "priority", "");
} }
private String val(Element parent, String label, String dflt) { public String url() {
Element e = (Element) parent.getElementsByTagName(label).item(0); return this.loc;
if (e == null) return dflt; }
Node child = e.getFirstChild(); public Date lastmod(Date dflt) {
return (child instanceof CharacterData) ? ((CharacterData) child).getData() : dflt; try {
return DateFormatter.parseISO8601(lastmod);
} catch (final ParseException e) {
return dflt;
}
}
}
public static class SitemapEntry {
public String loc, lastmod;
public SitemapEntry(Element element) {
loc = val(element, "loc", "");
lastmod = val(element, "lastmod", "");
} }
public String url() { public String url() {
return this.loc; return this.loc;
@ -148,4 +211,10 @@ public class sitemapParser extends AbstractParser implements Parser {
} }
} }
/**
 * Reads the character data of the first descendant element with the given tag name.
 *
 * @param parent element whose descendants are searched
 * @param label  tag name to look up
 * @param dflt   value returned when no such element exists or when its first child
 *               is not character data (for example, when the element is empty)
 * @return the data of the first child node of the first matching element, or {@code dflt}
 */
private static String val(Element parent, String label, String dflt) {
    final Element match = (Element) parent.getElementsByTagName(label).item(0);
    if (match == null) {
        return dflt;
    }
    // only the first child is inspected; later sibling nodes are ignored
    final Node firstChild = match.getFirstChild();
    if (firstChild instanceof CharacterData) {
        return ((CharacterData) firstChild).getData();
    }
    return dflt;
}
} }

Loading…
Cancel
Save