added a sitemap entry parser and loader for sitemaps

(the loader recurses when a sitemap refers to another sitemap, i.e. when the target is a sitemap index)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7295 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 14 years ago
parent 790e0b1894
commit 4c72885cba
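
For reference, the recursion mentioned in the commit message is triggered by a sitemap index, which lists other sitemap files instead of page URLs. A minimal example following the sitemaps.org 0.9 schema already referenced in sitemapParser (hostname and file names are placeholders):

<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap>
    <loc>http://www.example.com/sitemap-products.xml.gz</loc>
    <lastmod>2010-10-24</lastmod>
  </sitemap>
  <sitemap>
    <loc>http://www.example.com/sitemap-news.xml</loc>
  </sitemap>
</sitemapindex>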

@@ -25,20 +25,12 @@
package de.anomic.crawler;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.document.parser.sitemapParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.io.ByteCountInputStream;
import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.retrieval.Request;
import de.anomic.search.Segments;
@@ -60,45 +52,16 @@ public class SitemapImporter extends Thread {
}
public void run() {
// download document
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, MultiProtocolURI.yacybotUserAgent);
final HTTPClient client = new HTTPClient();
client.setTimout(5000);
client.setHeader(requestHeader.entrySet());
try {
try {
client.GET(siteMapURL.toString());
if (client.getStatusCode() != 200) {
logger.logWarning("Unable to download the sitemap file " + this.siteMapURL +
"\nServer returned status: " + client.getHttpResponse().getStatusLine());
return;
}
// get some metadata
final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
final String contentMimeType = header.mime();
InputStream contentStream = client.getContentstream();
if (contentMimeType != null && (contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip"))) {
if (logger.isFine()) logger.logFine("Sitemap file has mimetype " + contentMimeType);
contentStream = new GZIPInputStream(contentStream);
}
final ByteCountInputStream counterStream = new ByteCountInputStream(contentStream, null);
// parse it
logger.logInfo("Start parsing sitemap file " + this.siteMapURL + "\n\tMimeType: " + contentMimeType + "\n\tLength: " + header.getContentLength());
sitemapParser.SitemapReader parser = sitemapParser.parse(counterStream);
for (sitemapParser.SitemapEntry entry: parser) process(entry);
} finally {
client.finish();
}
logger.logInfo("Start parsing sitemap file " + this.siteMapURL);
sitemapParser.SitemapReader parser = sitemapParser.parse(this.siteMapURL);
for (sitemapParser.URLEntry entry: parser) process(entry);
} catch (final Exception e) {
logger.logWarning("Unable to parse sitemap file " + this.siteMapURL, e);
}
}
public void process(sitemapParser.SitemapEntry entry) {
public void process(sitemapParser.URLEntry entry) {
// get the url hash
byte[] nexturlhash = null;

@@ -132,8 +132,9 @@ public final class robotsParser {
continue lineparser;
}
// parse sitemap
if (lineUpper.startsWith(ROBOTS_SITEMAP)) {
// parse sitemap; if there are several sitemaps then take the first url
// TODO: support for multiple sitemaps
if (lineUpper.startsWith(ROBOTS_SITEMAP) && (sitemap == null || sitemap.length() == 0)) {
pos = line.indexOf(' ');
if (pos != -1) {
sitemap = line.substring(pos).trim();
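
For context on the robotsParser hunk above: Sitemap is a widely used robots.txt extension, and with the new guard only the first occurrence is kept, as noted in the TODO. A hypothetical robots.txt with a placeholder host:

User-agent: *
Disallow: /private/
Sitemap: http://www.example.com/sitemap.xml
Sitemap: http://www.example.com/sitemap-news.xml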

@@ -29,6 +29,7 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.zip.GZIPInputStream;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
@@ -38,13 +39,20 @@ import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.ByteCountInputStream;
import net.yacy.kelondro.util.DateFormatter;
public class sitemapParser extends AbstractParser implements Parser {
@@ -67,7 +75,7 @@ public class sitemapParser extends AbstractParser implements Parser {
List<Document> docs = new ArrayList<Document>();
MultiProtocolURI uri;
Document doc;
for (SitemapEntry item: sitemap) try {
for (URLEntry item: sitemap) try {
uri = new MultiProtocolURI(item.loc);
doc = new Document(
uri,
@@ -95,6 +103,37 @@ public class sitemapParser extends AbstractParser implements Parser {
return da;
}
public static SitemapReader parse(final DigestURI sitemapURL) throws IOException {
// download document
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, MultiProtocolURI.yacybotUserAgent);
final HTTPClient client = new HTTPClient();
client.setTimout(5000);
client.setHeader(requestHeader.entrySet());
try {
client.GET(sitemapURL.toString());
if (client.getStatusCode() != 200) {
throw new IOException("Unable to download the sitemap file " + sitemapURL +
"\nServer returned status: " + client.getHttpResponse().getStatusLine());
}
// get some metadata
final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
final String contentMimeType = header.mime();
InputStream contentStream = client.getContentstream();
if (contentMimeType != null && (contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip"))) {
contentStream = new GZIPInputStream(contentStream);
}
final ByteCountInputStream counterStream = new ByteCountInputStream(contentStream, null);
return sitemapParser.parse(counterStream);
} catch (IOException e) {
throw e;
} finally {
client.finish();
}
}
public static SitemapReader parse(InputStream stream) throws IOException {
return new SitemapReader(stream);
}
@@ -104,38 +143,44 @@ public class sitemapParser extends AbstractParser implements Parser {
* http://www.sitemaps.org/schemas/sitemap/0.9
* http://www.google.com/schemas/sitemap/0.84
*/
public static class SitemapReader extends ArrayList<SitemapEntry> {
public static class SitemapReader extends ArrayList<URLEntry> {
private static final long serialVersionUID = 1337L;
public SitemapReader(InputStream source) throws IOException {
org.w3c.dom.Document doc;
try { doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(source); }
catch (ParserConfigurationException e) { throw new IOException (e); }
catch (SAXParseException e) { throw new IOException (e); }
catch (SAXException e) { throw new IOException (e); }
NodeList nodes = doc.getElementsByTagName("url");
for (int i = 0; i < nodes.getLength(); i++)
this.add(new SitemapEntry((Element) nodes.item(i)));
NodeList SitemapNodes = doc.getElementsByTagName("sitemap");
for (int i = 0; i < SitemapNodes.getLength(); i++) {
String url = new SitemapEntry((Element) SitemapNodes.item(i)).url();
if (url != null && url.length() > 0) {
try {
SitemapReader r = parse(new DigestURI(url));
for (URLEntry ue: r) this.add(ue);
} catch (IOException e) {}
}
}
NodeList urlEntryNodes = doc.getElementsByTagName("url");
for (int i = 0; i < urlEntryNodes.getLength(); i++) {
this.add(new URLEntry((Element) urlEntryNodes.item(i)));
}
}
public String toString() {
StringBuilder sb = new StringBuilder();
for (SitemapEntry entry: this) sb.append(entry.toString());
for (URLEntry entry: this) sb.append(entry.toString());
return sb.toString();
}
}
public static class SitemapEntry {
public static class URLEntry {
public String loc, lastmod, changefreq, priority;
public SitemapEntry(Element element) {
public URLEntry(Element element) {
loc = val(element, "loc", "");
lastmod = val(element, "lastmod", "");
changefreq = val(element, "changefreq", "");
priority = val(element, "priority", "");
}
private String val(Element parent, String label, String dflt) {
Element e = (Element) parent.getElementsByTagName(label).item(0);
if (e == null) return dflt;
Node child = e.getFirstChild();
return (child instanceof CharacterData) ? ((CharacterData) child).getData() : dflt;
}
public String url() {
return this.loc;
}
@@ -148,4 +193,28 @@ public class sitemapParser extends AbstractParser implements Parser {
}
}
public static class SitemapEntry {
public String loc, lastmod;
public SitemapEntry(Element element) {
loc = val(element, "loc", "");
lastmod = val(element, "lastmod", "");
}
public String url() {
return this.loc;
}
public Date lastmod(Date dflt) {
try {
return DateFormatter.parseISO8601(lastmod);
} catch (final ParseException e) {
return dflt;
}
}
}
private static String val(Element parent, String label, String dflt) {
Element e = (Element) parent.getElementsByTagName(label).item(0);
if (e == null) return dflt;
Node child = e.getFirstChild();
return (child instanceof CharacterData) ? ((CharacterData) child).getData() : dflt;
}
}
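
A minimal, hypothetical usage sketch (not part of this commit) showing the new entry point: parse(DigestURI) downloads the document, unwraps gzip if needed and, if the target is a sitemap index, recursively loads the referenced sitemaps into one flat URLEntry list. The sitemap URL below is a placeholder.

import java.io.IOException;
import net.yacy.document.parser.sitemapParser;
import net.yacy.kelondro.data.meta.DigestURI;

public class SitemapDemo {
    public static void main(String[] args) throws IOException {
        // placeholder URL; point this at a real sitemap or sitemap index
        final DigestURI sitemapURL = new DigestURI("http://www.example.com/sitemap.xml");
        // parse() fetches, optionally gunzips and parses the document;
        // entries from nested sitemaps are merged into the same reader
        final sitemapParser.SitemapReader reader = sitemapParser.parse(sitemapURL);
        for (final sitemapParser.URLEntry entry : reader) {
            System.out.println(entry.url());
        }
    }
}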
