- removed old parser
- removed old importer framework (was only used by removed old parser)
- added a new sitemap parser in parser framework
- linked new parser with parser access in old sitemap processing routines

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7126 6c8d7289-2bf4-0310-a012-ef5d649a1542
parent b73ea6581d
commit 114bdd8ba7
@@ -1,105 +0,0 @@
package de.anomic.crawler;

public abstract class AbstractImporter extends Thread implements Importer {

    private int jobID = -1;
    private String jobType;
    private boolean stopped = false;
    private boolean paused = false;
    private long globalStart = System.currentTimeMillis();
    protected long globalEnd;
    private long globalPauseLast;
    private long globalPauseDuration;
    private String error;

    public AbstractImporter(final String theJobType) {
        this.jobType = theJobType;
        // set a more verbose thread name; note that jobID is still -1 here
        // and setJobID() does not rename the thread afterwards
        this.setName("IMPORT_" + this.jobType + "_" + this.jobID);
    }

    public String getError() {
        return this.error;
    }

    public void startIt() {
        this.start();
    }

    public void stopIt() throws InterruptedException {
        this.stopped = true;
        this.continueIt(); // wake the worker if it is blocked in a pause
        this.join();
    }

    public void pauseIt() {
        synchronized(this) {
            this.globalPauseLast = System.currentTimeMillis();
            this.paused = true;
        }
    }

    public void continueIt() {
        synchronized(this) {
            if (this.paused) {
                this.globalPauseDuration += System.currentTimeMillis() - this.globalPauseLast;
                this.paused = false;
                this.notifyAll();
            }
        }
    }

    public boolean isPaused() {
        synchronized(this) {
            return this.paused;
        }
    }

    protected boolean isAborted() {
        synchronized(this) {
            if (this.paused) {
                try {
                    this.wait();
                } catch (final InterruptedException e) {
                    // ignore; the stop/interrupt state is evaluated below
                }
            }
        }
        return (this.stopped) || Thread.currentThread().isInterrupted();
    }

    public boolean isStopped() {
        return !this.isAlive();
    }

    public int getJobID() {
        return this.jobID;
    }

    public void setJobID(final int id) {
        if (this.jobID != -1) throw new IllegalStateException("job ID already assigned");
        this.jobID = id;
    }

    public long getTotalRuntime() {
        return (this.globalEnd == 0)
                ? System.currentTimeMillis() - (this.globalStart + this.globalPauseDuration)
                : this.globalEnd - (this.globalStart + this.globalPauseDuration);
    }

    public long getElapsedTime() {
        if (this.paused) {
            this.globalPauseDuration += System.currentTimeMillis() - this.globalPauseLast;
            this.globalPauseLast = System.currentTimeMillis();
        }
        return isStopped()
                ? this.globalEnd - (this.globalStart + this.globalPauseDuration)
                : System.currentTimeMillis() - (this.globalStart + this.globalPauseDuration);
    }

    public String getJobType() {
        return this.jobType;
    }

    public abstract long getEstimatedTime();
    public abstract String getJobName();
    public abstract int getProcessingStatusPercent();

}
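To make the removed life cycle concrete: a worker subclass polls isAborted(), which blocks while the job is paused, and a controller drives the state with pauseIt()/continueIt()/stopIt(). A minimal sketch follows; the class name and work loop are illustrative assumptions, not part of this commit.

package de.anomic.crawler;

// Hypothetical importer, for illustration only.
public class DemoImporter extends AbstractImporter {

    public DemoImporter() {
        super("DEMO");
    }

    @Override
    public void run() {
        while (!isAborted()) {      // blocks while paused; true once stopIt() was called
            // ... process one work unit ...
        }
        this.globalEnd = System.currentTimeMillis();
    }

    public long getEstimatedTime() { return 0; }
    public String getJobName() { return "demo"; }
    public int getProcessingStatusPercent() { return 0; }
    public String getStatus() { return ""; }    // required by the Importer interface
}

// Driving it:
//   DemoImporter job = new DemoImporter();
//   job.startIt();     // starts the thread
//   job.pauseIt();     // the worker blocks inside isAborted()
//   job.continueIt();  // wakes it up again
//   job.stopIt();      // sets the stop flag, resumes if paused, and joins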
@@ -1,25 +0,0 @@
package de.anomic.crawler;

public interface Importer {

    // functions to pause and continue importing
    public boolean isPaused();
    public void pauseIt();
    public void continueIt();
    public void stopIt() throws InterruptedException;
    public boolean isStopped();

    // getting status information
    public long getTotalRuntime();
    public long getElapsedTime();
    public long getEstimatedTime();
    public int getProcessingStatusPercent();

    public int getJobID();
    public void setJobID(int id);
    public String getJobName();
    public String getJobType();
    public String getError();
    public String getStatus();

    public void startIt();
}
@@ -1,104 +0,0 @@
package de.anomic.crawler;

import java.util.Vector;

import net.yacy.kelondro.logging.Log;

public class ImporterManager {

    public final Vector<Importer> finishedJobs;
    public final ThreadGroup runningJobs;
    public int currMaxJobNr;

    public ImporterManager() {
        this.finishedJobs = new Vector<Importer>();
        this.runningJobs = new ThreadGroup("ImporterThreads");
        this.currMaxJobNr = 0;
    }

    public int generateUniqueJobID() {
        int jobID;
        synchronized(this.runningJobs) {
            jobID = this.currMaxJobNr;
            this.currMaxJobNr++;
        }
        return jobID;
    }

    public Importer[] getRunningImporter() {
        // over-allocate because activeCount() is only an estimate
        final Thread[] importThreads = new Thread[this.runningJobs.activeCount() * 2];
        final int activeCount = this.runningJobs.enumerate(importThreads);
        final Importer[] importers = new Importer[activeCount];
        for (int i = 0; i < activeCount; i++) {
            importers[i] = (Importer) importThreads[i];
        }
        return importers;
    }

    public Importer[] getFinishedImporter() {
        return this.finishedJobs.toArray(new Importer[this.finishedJobs.size()]);
    }

    public Importer getImporterByID(final int jobID) {
        final Thread[] importThreads = new Thread[this.runningJobs.activeCount() * 2];
        // fill the array with the currently running importer threads
        final int activeCount = this.runningJobs.enumerate(importThreads);
        for (int i = 0; i < activeCount; i++) {
            final Importer currThread = (Importer) importThreads[i];
            if (currThread.getJobID() == jobID) {
                return currThread;
            }
        }
        return null;
    }

    /**
     * Can be used to close all still running importer threads,
     * e.g. on server shutdown
     */
    public void close() {
        /* clear the finished thread list */
        this.finishedJobs.clear();

        /* wait for all threads to finish */
        int threadCount = this.runningJobs.activeCount();
        final Thread[] threadList = new Thread[threadCount];
        threadCount = this.runningJobs.enumerate(threadList);

        if (threadCount == 0) return;

        final Log log = new Log("DB-IMPORT");
        try {
            // try to gracefully stop all still running sessions ...
            log.logInfo("Signaling shutdown to " + threadCount + " remaining dbImporter threads ...");
            for (final Thread currentThread : threadList) {
                if (currentThread.isAlive()) {
                    ((Importer) currentThread).stopIt();
                }
            }

            // wait a few ms for the session objects to continue processing
            try { Thread.sleep(500); } catch (final InterruptedException ex) {}

            // interrupt all still running or pooled threads ...
            log.logInfo("Sending interruption signal to " + runningJobs.activeCount() + " remaining dbImporter threads ...");
            runningJobs.interrupt();

            // we need to use a timeout here because of missing interruptible session threads ...
            if (log.isFine()) log.logFine("Waiting for " + runningJobs.activeCount() + " remaining dbImporter threads to finish shutdown ...");
            int currentThreadIdx = 0;
            for (final Thread currentThread : threadList) {
                if (currentThread.isAlive()) {
                    if (log.isFine()) log.logFine("Waiting for dbImporter thread '" + currentThread.getName() + "' [" + currentThreadIdx++ + "] to finish shutdown.");
                    try { currentThread.join(500); } catch (final InterruptedException ex) {}
                }
            }

            log.logInfo("Shutdown of remaining dbImporter threads finished.");
        } catch (final Exception e) {
            log.logSevere("Unexpected error while trying to shutdown all remaining dbImporter threads.", e);
        }
    }

}
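A usage sketch for the manager, reusing the hypothetical DemoImporter from above. One caveat visible in the code: getRunningImporter() and getImporterByID() enumerate the runningJobs thread group, but AbstractImporter as shown never passes that group to its Thread constructor, so concrete importers presumably had to arrange their group membership themselves.

// Hypothetical driver, for illustration only.
ImporterManager manager = new ImporterManager();
Importer job = new DemoImporter();              // hypothetical subclass, see above
job.setJobID(manager.generateUniqueJobID());    // unique ID; throws if assigned twice
job.startIt();
// ...
Importer found = manager.getImporterByID(job.getJobID());
manager.close();                                // graceful stop, then interrupt, then join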
@@ -1,358 +0,0 @@
// SitemapParser.java
// ------------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2007
//
// this file is contributed by Martin Thelian
// last major change: $LastChangedDate$ by $LastChangedBy$
// Revision: $LastChangedRevision$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.data;

import java.io.InputStream;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.Date;
import java.util.zip.GZIPInputStream;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.io.ByteCountInputStream;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;

/**
 * Class to parse a sitemap file.<br>
 * An example sitemap file is depicted below:<br>
 *
 * <pre>
 * <?xml version="1.0" encoding="UTF-8"?>
 * <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
 *    <url>
 *       <loc>http://www.example.com/</loc>
 *       <lastmod>2005-01-01</lastmod>
 *       <changefreq>monthly</changefreq>
 *       <priority>0.8</priority>
 *    </url>
 * </urlset>
 * </pre>
 *
 * A real example can be found here: http://www.xt-service.de/sitemap.xml
 * An example robots.txt containing a sitemap URL: http://notepad.emaillink.de/robots.txt
 *
 * @see Protocol at sitemaps.org <a href="http://www.sitemaps.org/protocol.php">http://www.sitemaps.org/protocol.php</a>
 * @see Protocol at google.com <a href="https://www.google.com/webmasters/tools/docs/en/protocol.html">https://www.google.com/webmasters/tools/docs/en/protocol.html</a>
 */
public class SitemapParser extends DefaultHandler {
    public static final String XMLNS_SITEMAPS_ORG = "http://www.sitemaps.org/schemas/sitemap/0.9";
    public static final String XMLNS_SITEMAPS_GOOGLE = "http://www.google.com/schemas/sitemap/0.84";

    public static final String SITEMAP_XMLNS = "xmlns";
    public static final String SITEMAP_URLSET = "urlset";
    public static final String SITEMAP_URL = "url";
    public static final String SITEMAP_URL_LOC = "loc";
    public static final String SITEMAP_URL_LASTMOD = "lastmod";
    public static final String SITEMAP_URL_CHANGEFREQ = "changefreq";
    public static final String SITEMAP_URL_PRIORITY = "priority";

    /**
     * The crawling profile used to parse the URLs contained in the sitemap file
     */
    private CrawlProfile crawlingProfile = null;

    /**
     * Name of the current XML element
     */
    private String currentElement = null;

    /**
     * Number of bytes of the sitemap file processed so far
     */
    private int streamCounter = 0;

    /**
     * The total length of the sitemap file
     */
    private long contentLength;

    /**
     * The number of URLs processed so far
     */
    private int urlCounter = 0;

    /**
     * the logger
     */
    private static final Log logger = new Log("SITEMAP");

    /**
     * The location of the sitemap file
     */
    private DigestURI siteMapURL = null;

    /**
     * The next URL to enqueue
     */
    private String nextURL = null;

    /**
     * last modification date of the {@link #nextURL}
     */
    private Date lastMod = null;

    private final Switchboard sb;

    public SitemapParser(final Switchboard sb, final DigestURI sitemap, final CrawlProfile theCrawlingProfile) {
        assert sitemap != null;
        this.sb = sb;
        this.siteMapURL = sitemap;

        if (theCrawlingProfile == null) {
            // create a new profile
            this.crawlingProfile = createProfile(this.siteMapURL.getHost(), this.siteMapURL);
        } else {
            // use an existing profile
            this.crawlingProfile = theCrawlingProfile;
        }
    }

    /**
     * Function to download and parse the sitemap file
     */
    public void parse() {
        // download document
        final RequestHeader requestHeader = new RequestHeader();
        requestHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent);
        final HTTPClient client = new HTTPClient();
        client.setTimout(5000);
        client.setHeader(requestHeader.entrySet());
        try {
            try {
                client.GET(siteMapURL.toString());
                if (client.getStatusCode() != 200) {
                    logger.logWarning("Unable to download the sitemap file " + this.siteMapURL +
                            "\nServer returned status: " + client.getHttpResponse().getStatusLine());
                    return;
                }

                // getting some metadata
                final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
                final String contentMimeType = header.mime();
                this.contentLength = header.getContentLength();

                InputStream contentStream = client.getContentstream();
                if ((contentMimeType != null) &&
                    (contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip"))) {
                    if (logger.isFine()) logger.logFine("Sitemap file has mimetype " + contentMimeType);
                    contentStream = new GZIPInputStream(contentStream);
                }

                final ByteCountInputStream counterStream = new ByteCountInputStream(contentStream, null);
                // parse it
                logger.logInfo("Start parsing sitemap file " + this.siteMapURL + "\n\tMimeType: " + contentMimeType +
                        "\n\tLength: " + this.contentLength);
                final SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
                saxParser.parse(counterStream, this);
                streamCounter += counterStream.getCount();
            } finally {
                client.finish();
            }
        } catch (final Exception e) {
            logger.logWarning("Unable to parse sitemap file " + this.siteMapURL, e);
        }
    }

    /**
     * @return the total length of the sitemap file in bytes or <code>-1</code> if the length is unknown
     */
    public long getTotalLength() {
        return this.contentLength;
    }

    /**
     * @return the amount of bytes of the sitemap file that were downloaded so far
     */
    public long getProcessedLength() {
        return streamCounter;
    }

    /**
     * @return the amount of URLs that were successfully enqueued so far
     */
    public long getUrlcount() {
        return this.urlCounter;
    }

    /**
     * @param localName local name
     * @param qName qualified name
     * @see DefaultHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
     */
    @Override
    public void startElement(final String namespaceURI, final String localName, final String qName, final Attributes attrs) throws SAXException {
        this.currentElement = qName;

        // testing if the namespace is known
        if (qName.equalsIgnoreCase(SITEMAP_URLSET)) {
            final String namespace = attrs.getValue(SITEMAP_XMLNS);
            if ((namespace == null) ||
                ((!namespace.equals(XMLNS_SITEMAPS_ORG)) && (!namespace.equals(XMLNS_SITEMAPS_GOOGLE))))
                throw new SAXException("Unknown sitemap namespace: " + namespace);
        }
    }

    /**
     * @param localName local name
     * @param qName qualified name
     * @throws SAXException
     * @see DefaultHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
     */
    @Override
    public void endElement(final String namespaceURI, final String localName, final String qName) throws SAXException {
        this.currentElement = "";

        if (qName.equalsIgnoreCase(SITEMAP_URL)) {
            if (this.nextURL == null)
                return;

            // get the url hash
            byte[] nexturlhash = null;
            DigestURI url = null;
            try {
                url = new DigestURI(this.nextURL, null);
                nexturlhash = url.hash();
            } catch (final MalformedURLException e1) {
                // ignored; handled by the null check below
            }
            if (url == null) {
                // malformed location, nothing to enqueue
                return;
            }

            // check if the url is known and needs to be recrawled
            if (this.lastMod != null) {
                final String dbocc = this.sb.urlExists(Segments.Process.LOCALCRAWLING, nexturlhash);
                if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
                    // the url was already loaded. we need to check the date
                    final URIMetadataRow oldEntry = this.sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(nexturlhash, null, 0);
                    if (oldEntry != null) {
                        final Date modDate = oldEntry.moddate();
                        // skip the URL if the stored copy is not older than the sitemap's lastmod
                        if (modDate.after(this.lastMod))
                            return;
                    }
                }
            }

            // the URL needs to be crawled
            this.sb.crawlStacker.enqueueEntry(new Request(
                    this.sb.peers.mySeed().hash.getBytes(),
                    url,
                    null, // this.siteMapURL.toString(),
                    this.nextURL,
                    new Date(),
                    this.crawlingProfile.handle(),
                    0,
                    0,
                    0
                    ));
            logger.logInfo("New URL '" + this.nextURL + "' added for crawling.");
            this.urlCounter++;
        }
    }

    @Override
    public void characters(final char[] buf, final int offset, final int len) throws SAXException {
        if (this.currentElement.equalsIgnoreCase(SITEMAP_URL_LOC)) {
            // TODO: we need to decode the URL here
            this.nextURL = (new String(buf, offset, len)).trim();
            // a prefix test for "http" also covers "https"
            if (!this.nextURL.startsWith("http")) {
                logger.logInfo("The url '" + this.nextURL + "' has a wrong format. Ignoring it.");
                this.nextURL = null;
            }
        } else if (this.currentElement.equalsIgnoreCase(SITEMAP_URL_LASTMOD)) {
            final String dateStr = new String(buf, offset, len);
            try {
                this.lastMod = DateFormatter.parseISO8601(dateStr);
            } catch (final ParseException e) {
                logger.logInfo("Unable to parse datestring '" + dateStr + "'");
            }
        }
    }

    private CrawlProfile createProfile(final String domainName, final DigestURI sitemapURL) {
        CrawlProfile p = new CrawlProfile(
                domainName, sitemapURL,
                // crawling filter
                CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
                // depth
                0,
                // force recrawling
                0,
                // disable auto-dom-filter
                -1, -1,
                // allow crawling of dynamic URLs
                true,
                // index text + media
                true, true,
                // don't store downloaded pages to Web Cache
                false,
                // store to TX cache
                true,
                // remote indexing disabled
                false,
                // exclude stop-words
                true, true, true,
                CrawlProfile.CacheStrategy.IFFRESH);
        this.sb.crawler.profilesActiveCrawls.put(p.handle().getBytes(), p);
        return p;
    }
}
@@ -0,0 +1,151 @@
/**
 *  sitemapParser.java
 *  Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
 *  First released 08.09.2010 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.document.parser;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.CharacterData;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.util.DateFormatter;

public class sitemapParser extends AbstractParser implements Parser {

    public sitemapParser() {
        super("sitemap Parser");
        // unfortunately sitemap files have neither a mime type nor a typical file extension.
        //SUPPORTED_EXTENSIONS.add("php");
        //SUPPORTED_EXTENSIONS.add("xml");
    }

    public Document[] parse(MultiProtocolURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException {
        SitemapReader sitemap;
        try {
            sitemap = new SitemapReader(source);
        } catch (IOException e) {
            throw new Parser.Failure("Load error: " + e.getMessage(), url);
        }

        List<Document> docs = new ArrayList<Document>();
        for (SitemapEntry item : sitemap) {
            try {
                MultiProtocolURI uri = new MultiProtocolURI(item.loc);
                Document doc = new Document(
                        uri,
                        TextParser.mimeOf(url),
                        charset,
                        null,
                        null,
                        "",
                        "",
                        "",
                        new String[0],
                        "",
                        null,
                        null,
                        null,
                        new HashMap<MultiProtocolURI, ImageEntry>(),
                        false);
                docs.add(doc);
            } catch (MalformedURLException e) {
                // skip entries whose <loc> cannot be parsed as a URL
                continue;
            }
        }

        Document[] da = new Document[docs.size()];
        docs.toArray(da);
        return da;
    }

    public static SitemapReader parse(InputStream stream) throws IOException {
        return new SitemapReader(stream);
    }

    /**
     * for schemas see:
     * http://www.sitemaps.org/schemas/sitemap/0.9
     * http://www.google.com/schemas/sitemap/0.84
     */
    public static class SitemapReader extends ArrayList<SitemapEntry> {
        private static final long serialVersionUID = 1337L;
        public SitemapReader(InputStream source) throws IOException {
            org.w3c.dom.Document doc;
            try { doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(source); }
            catch (ParserConfigurationException e) { throw new IOException(e); }
            catch (SAXException e) { throw new IOException(e); }
            NodeList nodes = doc.getElementsByTagName("url");
            for (int i = 0; i < nodes.getLength(); i++)
                this.add(new SitemapEntry((Element) nodes.item(i)));
        }
        @Override
        public String toString() {
            StringBuilder sb = new StringBuilder();
            for (SitemapEntry entry : this) sb.append(entry.toString());
            return sb.toString();
        }
    }

    public static class SitemapEntry {
        public String loc, lastmod, changefreq, priority;
        public SitemapEntry(Element element) {
            loc = val(element, "loc", "");
            lastmod = val(element, "lastmod", "");
            changefreq = val(element, "changefreq", "");
            priority = val(element, "priority", "");
        }
        private String val(Element parent, String label, String dflt) {
            Element e = (Element) parent.getElementsByTagName(label).item(0);
            if (e == null) return dflt;
            Node child = e.getFirstChild();
            return (child instanceof CharacterData) ? ((CharacterData) child).getData() : dflt;
        }
        public String url() {
            return this.loc;
        }
        public Date lastmod(Date dflt) {
            try {
                return DateFormatter.parseISO8601(lastmod);
            } catch (final ParseException e) {
                return dflt;
            }
        }
    }

}
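A short usage sketch for the new reader; the driver class and file path are illustrative assumptions, and the static parse(InputStream) entry point is presumably what the commit message means by linking the new parser into the old sitemap processing routines:

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;

import net.yacy.document.parser.sitemapParser;

// Hypothetical driver, for illustration only.
public class SitemapDemo {
    public static void main(String[] args) throws IOException {
        InputStream in = new FileInputStream("sitemap.xml"); // placeholder path
        try {
            // read the whole sitemap into a list of entries
            sitemapParser.SitemapReader reader = sitemapParser.parse(in);
            for (sitemapParser.SitemapEntry entry : reader) {
                System.out.println(entry.url() + " lastmod=" + entry.lastmod(new Date()));
            }
        } finally {
            in.close();
        }
    }
}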