From 114bdd8ba790ed49bf9c0992a20f27af50bd6940 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Wed, 8 Sep 2010 14:13:15 +0000
Subject: [PATCH] fixed old sitemap importer which was not able to parse urls
 containing post elements

- removed old parser
- removed old importer framework (was only used by removed old parser)
- added a new sitemap parser in parser framework
- linked new parser with parser access in old sitemap processing routines

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7126 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/Crawler_p.java                         |   8 +-
 .../de/anomic/crawler/AbstractImporter.java   | 105 -----
 source/de/anomic/crawler/Importer.java        |  25 --
 source/de/anomic/crawler/ImporterManager.java | 104 -----
 source/de/anomic/crawler/SitemapImporter.java | 162 +++++---
 source/de/anomic/data/SitemapParser.java      | 358 ------------------
 source/de/anomic/search/Switchboard.java      |   8 +-
 .../net/yacy/document/parser/pdfParser.java   |   4 +-
 .../yacy/document/parser/sitemapParser.java   | 151 ++++++++
 9 files changed, 260 insertions(+), 665 deletions(-)
 delete mode 100644 source/de/anomic/crawler/AbstractImporter.java
 delete mode 100644 source/de/anomic/crawler/Importer.java
 delete mode 100644 source/de/anomic/crawler/ImporterManager.java
 delete mode 100644 source/de/anomic/data/SitemapParser.java
 create mode 100644 source/net/yacy/document/parser/sitemapParser.java

diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index a6cc74311..79c0abf76 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -451,11 +451,9 @@ public class Crawler_p {
                     sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
                     // create a new sitemap importer
-                    final SitemapImporter importerThread = new SitemapImporter(sb, sb.dbImportManager, new DigestURI(sitemapURLStr, null), pe);
-                    if (importerThread != null) {
-                        importerThread.setJobID(sb.dbImportManager.generateUniqueJobID());
-                        importerThread.startIt();
-                    }
+                    final SitemapImporter importer = new SitemapImporter(sb, new DigestURI(sitemapURLStr, null), pe);
+                    importer.start();
+
                 } catch (final Exception e) {
                     // mist
                     prop.put("info", "6");//Error with url

diff --git a/source/de/anomic/crawler/AbstractImporter.java b/source/de/anomic/crawler/AbstractImporter.java
deleted file mode 100644
index 57e9fccdd..000000000
--- a/source/de/anomic/crawler/AbstractImporter.java
+++ /dev/null
@@ -1,105 +0,0 @@
-package de.anomic.crawler;
-
-public abstract class AbstractImporter extends Thread implements Importer {
-
-    private int jobID = -1;
-    private String jobType;
-    private boolean stopped = false;
-    private boolean paused = false;
-    private long globalStart = System.currentTimeMillis();
-    protected long globalEnd;
-    private long globalPauseLast;
-    private long globalPauseDuration;
-    private String error;
-
-    public AbstractImporter(final String theJobType) {
-        this.jobType = theJobType;
-
-        // initializing the logger and setting a more verbose thread name
-        this.setName("IMPORT_" + this.jobType + "_" + this.jobID);
-    }
-
-    public String getError() {
-        return this.error;
-    }
-
-    public void startIt() {
-        this.start();
-    }
-
-    public void stopIt() throws InterruptedException {
-        this.stopped = true;
-        this.continueIt();
-        this.join();
-    }
-
-    public void pauseIt() {
-        synchronized(this) {
-            this.globalPauseLast = System.currentTimeMillis();
-            this.paused = true;
-        }
-    }
-
-    public void continueIt() {
-        synchronized(this) {
-            if (this.paused) {
-                this.globalPauseDuration += System.currentTimeMillis()-this.globalPauseLast;
-                this.paused = false;
-                this.notifyAll();
-            }
-        }
-    }
-
-    public boolean isPaused() {
-        synchronized(this) {
-            return this.paused;
-        }
-    }
-
-    protected boolean isAborted() {
-        synchronized(this) {
-            if (this.paused) {
-                try {
-                    this.wait();
-                }
-                catch (final InterruptedException e){}
-            }
-        }
-
-        return (this.stopped) || Thread.currentThread().isInterrupted();
-    }
-
-    public boolean isStopped() {
-        return !this.isAlive();
-    }
-
-    public int getJobID() {
-        return this.jobID;
-    }
-
-    public void setJobID(final int id) {
-        if (this.jobID != -1) throw new IllegalStateException("job ID already assigned");
-        this.jobID = id;
-    }
-
-    public long getTotalRuntime() {
-        return (this.globalEnd == 0)?System.currentTimeMillis()-(this.globalStart+this.globalPauseDuration):this.globalEnd-(this.globalStart+this.globalPauseDuration);
-    }
-
-    public long getElapsedTime() {
-        if(this.paused) {
-            this.globalPauseDuration += System.currentTimeMillis()-this.globalPauseLast;
-            this.globalPauseLast = System.currentTimeMillis();
-        }
-        return isStopped()?this.globalEnd-(this.globalStart+this.globalPauseDuration):System.currentTimeMillis()-(this.globalStart+this.globalPauseDuration);
-    }
-
-    public String getJobType() {
-        return this.jobType;
-    }
-
-    public abstract long getEstimatedTime();
-    public abstract String getJobName();
-    public abstract int getProcessingStatusPercent();
-
-}

diff --git a/source/de/anomic/crawler/Importer.java b/source/de/anomic/crawler/Importer.java
deleted file mode 100644
index 07f54108e..000000000
--- a/source/de/anomic/crawler/Importer.java
+++ /dev/null
@@ -1,25 +0,0 @@
-package de.anomic.crawler;
-
-public interface Importer {
-
-    // functions to pause and continue importing
-    public boolean isPaused();
-    public void pauseIt();
-    public void continueIt();
-    public void stopIt() throws InterruptedException;
-    public boolean isStopped();
-
-    // getting status information
-    public long getTotalRuntime();
-    public long getElapsedTime();
-    public long getEstimatedTime();
-    public int getProcessingStatusPercent();
-
-    public int getJobID();
-    public void setJobID(int id);
-    public String getJobName();
-    public String getJobType();
-    public String getError();
-    public String getStatus();
-    public void startIt();
-}

diff --git a/source/de/anomic/crawler/ImporterManager.java b/source/de/anomic/crawler/ImporterManager.java
deleted file mode 100644
index e4582a23a..000000000
--- a/source/de/anomic/crawler/ImporterManager.java
+++ /dev/null
@@ -1,104 +0,0 @@
-package de.anomic.crawler;
-
-import java.util.Vector;
-
-import net.yacy.kelondro.logging.Log;
-
-
-public class ImporterManager {
-
-    public final Vector<Importer> finishedJobs;
-    public final ThreadGroup runningJobs;
-    public int currMaxJobNr;
-
-    public ImporterManager() {
-        this.finishedJobs = new Vector<Importer>();
-        this.runningJobs = new ThreadGroup("ImporterThreads");
-        this.currMaxJobNr = 0;
-    }
-
-    public int generateUniqueJobID() {
-        int jobID;
-        synchronized(this.runningJobs) {
-            jobID = this.currMaxJobNr;
-            this.currMaxJobNr++;
-        }
-        return jobID;
-    }
-
-    public Importer[] getRunningImporter() {
-        final Thread[] importThreads = new Thread[this.runningJobs.activeCount()*2];
-        final int activeCount = this.runningJobs.enumerate(importThreads);
-        final Importer[] importers = new Importer[activeCount];
-        for (int i = 0; i < activeCount; i++) {
-            importers[i] = (Importer) importThreads[i];
-        }
-        return importers;
-    }
-
-    public Importer[] getFinishedImporter() {
-        return this.finishedJobs.toArray(new Importer[this.finishedJobs.size()]);
-    }
-
-    public Importer getImporterByID(final int jobID) {
-
-        final Thread[] importThreads = new Thread[this.runningJobs.activeCount()*2];
-
-        for(final Thread importThread : importThreads) {
-            final Importer currThread = (Importer) importThread;
-            if (currThread.getJobID() == jobID) {
-                return currThread;
-            }
-        }
-        return null;
-    }
-
-    /**
-     * Can be used to close all still running importer threads
-     * e.g. on server shutdown
-     */
-    public void close() {
-        /* clear the finished thread list */
-        this.finishedJobs.clear();
-
-        /* waiting for all threads to finish */
-        int threadCount = this.runningJobs.activeCount();
-        final Thread[] threadList = new Thread[threadCount];
-        threadCount = this.runningJobs.enumerate(threadList);
-
-        if (threadCount == 0) return;
-
-        final Log log = new Log("DB-IMPORT");
-        try {
-            // trying to gracefull stop all still running sessions ...
-            log.logInfo("Signaling shutdown to " + threadCount + " remaining dbImporter threads ...");
-            for (final Thread currentThread : threadList) {
-                if (currentThread.isAlive()) {
-                    ((Importer)currentThread).stopIt();
-                }
-            }
-
-            // waiting a few ms for the session objects to continue processing
-            try { Thread.sleep(500); } catch (final InterruptedException ex) {}
-
-            // interrupting all still running or pooled threads ...
-            log.logInfo("Sending interruption signal to " + runningJobs.activeCount() + " remaining dbImporter threads ...");
-            runningJobs.interrupt();
-
-            // we need to use a timeout here because of missing interruptable session threads ...
-            if (log.isFine()) log.logFine("Waiting for " + runningJobs.activeCount() + " remaining dbImporter threads to finish shutdown ...");
-            int currentThreadIdx = 0;
-            for (final Thread currentThread : threadList) {
-                if (currentThread.isAlive()) {
-                    if (log.isFine()) log.logFine("Waiting for dbImporter thread '" + currentThread.getName() + "' [" + currentThreadIdx++ + "] to finish shutdown.");
-                    try { currentThread.join(500); } catch (final InterruptedException ex) {}
-                }
-            }
-
-            log.logInfo("Shutdown of remaining dbImporter threads finished.");
-        } catch (final Exception e) {
-            log.logSevere("Unexpected error while trying to shutdown all remaining dbImporter threads.",e);
-        }
-    }
-
-}

diff --git a/source/de/anomic/crawler/SitemapImporter.java b/source/de/anomic/crawler/SitemapImporter.java
index ebe502952..874bf544e 100644
--- a/source/de/anomic/crawler/SitemapImporter.java
+++ b/source/de/anomic/crawler/SitemapImporter.java
@@ -25,75 +25,117 @@
 package de.anomic.crawler;
 
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.util.Date;
+import java.util.zip.GZIPInputStream;
+
+import net.yacy.cora.protocol.HeaderFramework;
+import net.yacy.cora.protocol.RequestHeader;
+import net.yacy.cora.protocol.ResponseHeader;
+import net.yacy.cora.protocol.http.HTTPClient;
+import net.yacy.document.parser.sitemapParser;
 import net.yacy.kelondro.data.meta.DigestURI;
-import de.anomic.data.SitemapParser;
+import net.yacy.kelondro.data.meta.URIMetadataRow;
+import net.yacy.kelondro.io.ByteCountInputStream;
+import net.yacy.kelondro.logging.Log;
+import de.anomic.crawler.retrieval.HTTPLoader;
+import de.anomic.crawler.retrieval.Request;
+import de.anomic.search.Segments;
 import de.anomic.search.Switchboard;
 
-public class SitemapImporter extends AbstractImporter implements Importer {
+public class SitemapImporter extends Thread {
+
+    private CrawlProfile crawlingProfile = null;
+    private static final Log logger = new Log("SITEMAP");
+    private DigestURI siteMapURL = null;
+    private final Switchboard sb;
+
+    public SitemapImporter(final Switchboard sb, final DigestURI sitemapURL, final CrawlProfile profileEntry) {
+        assert sitemapURL != null;
+        this.sb = sb;
+        this.siteMapURL = sitemapURL;
+        assert profileEntry != null;
+        this.crawlingProfile = profileEntry;
+    }
 
-    private final SitemapParser parser;
-    private final DigestURI sitemapURL;
-    private final ImporterManager superviser;
-
-    public SitemapImporter(final Switchboard sb, final ImporterManager importManager, final DigestURI sitemapURL, final CrawlProfile profileEntry) throws ImporterException {
-        super("sitemap");
-        this.superviser = importManager;
+    public void run() {
+        // download document
+        final RequestHeader requestHeader = new RequestHeader();
+        requestHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent);
+        final HTTPClient client = new HTTPClient();
+        client.setTimout(5000);
+        client.setHeader(requestHeader.entrySet());
         try {
-            // getting the sitemap URL
-            this.sitemapURL = sitemapURL;
-
-            // creating the sitemap parser
-            this.parser = new SitemapParser(sb, this.sitemapURL, profileEntry);
+            try {
+                client.GET(siteMapURL.toString());
+                if (client.getStatusCode() != 200) {
+                    logger.logWarning("Unable to download the sitemap file " + this.siteMapURL +
+                            "\nServer returned status: " + client.getHttpResponse().getStatusLine());
+                    return;
+                }
+
+                // get some metadata
+                final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
+                final String contentMimeType = header.mime();
+
+                InputStream contentStream = client.getContentstream();
+                if (contentMimeType != null && (contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip"))) {
+                    if (logger.isFine()) logger.logFine("Sitemap file has mimetype " + contentMimeType);
+                    contentStream = new GZIPInputStream(contentStream);
+                }
+
+                final ByteCountInputStream counterStream = new ByteCountInputStream(contentStream, null);
+                // parse it
+                logger.logInfo("Start parsing sitemap file " + this.siteMapURL + "\n\tMimeType: " + contentMimeType + "\n\tLength: " + header.getContentLength());
+                sitemapParser.SitemapReader parser = sitemapParser.parse(counterStream);
+                for (sitemapParser.SitemapEntry entry: parser) process(entry);
+            } finally {
+                client.finish();
+            }
         } catch (final Exception e) {
-            throw new ImporterException("Unable to initialize Importer",e);
+            logger.logWarning("Unable to parse sitemap file " + this.siteMapURL, e);
         }
     }
-
-
-
-    public long getEstimatedTime() {
-        final long t = getElapsedTime();
-        final int p = getProcessingStatusPercent();
-        return (p==0)?0:(t/p)*(100-p);
-    }
-
-    /**
-     * @see Importer#getJobName()
-     */
-    public String getJobName() {
-        return this.sitemapURL.toString();
-    }
 
+    public void process(sitemapParser.SitemapEntry entry) {
 
-    /**
-     * @see Importer#getProcessingStatusPercent()
-     */
-    public int getProcessingStatusPercent() {
-        if (this.parser == null) return 0;
-
-        final long total = this.parser.getTotalLength();
-        final long processed = this.parser.getProcessedLength();
-
-        if (total <= 1) return 0;
-        return (int) ((processed*100)/ total);
-    }
+        // get the url hash
+        byte[] nexturlhash = null;
+        DigestURI url = null;
+        try {
+            url = new DigestURI(entry.url(), null);
+            nexturlhash = url.hash();
+        } catch (final MalformedURLException e1) {
+        }
 
-    /**
-     * @see Importer#getStatus()
-     */
-    public String getStatus() {
-        final StringBuilder theStatus = new StringBuilder();
-
-        theStatus.append("#URLs=").append((this.parser==null)?0:this.parser.getUrlcount());
-
-        return theStatus.toString();
-    }
-
-    public void run() {
-        try {
-            this.parser.parse();
-        } finally {
-            this.globalEnd = System.currentTimeMillis();
-            this.superviser.finishedJobs.add(this);
-        }
-    }
+        // check if the url is known and needs to be recrawled
+        Date lastMod = entry.lastmod(null);
+        if (lastMod != null) {
+            final String dbocc = this.sb.urlExists(Segments.Process.LOCALCRAWLING, nexturlhash);
+            if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
+                // the url was already loaded. we need to check the date
+                final URIMetadataRow oldEntry = this.sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(nexturlhash, null, 0);
+                if (oldEntry != null) {
+                    final Date modDate = oldEntry.moddate();
+                    // the stored document is not older than the sitemap's lastmod date: no recrawl necessary
+                    if (modDate.after(lastMod)) return;
+                }
+            }
+        }
+
+        // URL needs to be crawled
+        this.sb.crawlStacker.enqueueEntry(new Request(
+                this.sb.peers.mySeed().hash.getBytes(),
+                url,
+                null, // this.siteMapURL.toString(),
+                entry.url(),
+                new Date(),
+                this.crawlingProfile.handle(),
+                0,
+                0,
+                0
+                ));
+        logger.logInfo("New URL '" + entry.url() + "' added for loading.");
+    }
 }

diff --git a/source/de/anomic/data/SitemapParser.java b/source/de/anomic/data/SitemapParser.java
deleted file mode 100644
index 172eb1a57..000000000
--- a/source/de/anomic/data/SitemapParser.java
+++ /dev/null
@@ -1,358 +0,0 @@
-// SitemapParser.java
-// ------------------------
-// part of YaCy
-// (C) by Michael Peter Christen; mc@yacy.net
-// first published on http://www.anomic.de
-// Frankfurt, Germany, 2007
-//
-// this file is contributed by Martin Thelian
-// last major change: $LastChangedDate$ by $LastChangedBy$
-// Revision: $LastChangedRevision$
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

-package de.anomic.data;
-
-import java.io.InputStream;
-import java.net.MalformedURLException;
-import java.text.ParseException;
-import java.util.Date;
-import java.util.zip.GZIPInputStream;
-
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
-
-import net.yacy.cora.protocol.HeaderFramework;
-import net.yacy.cora.protocol.RequestHeader;
-import net.yacy.cora.protocol.ResponseHeader;
-import net.yacy.cora.protocol.http.HTTPClient;
-import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.data.meta.URIMetadataRow;
-import net.yacy.kelondro.io.ByteCountInputStream;
-import net.yacy.kelondro.logging.Log;
-import net.yacy.kelondro.util.DateFormatter;
-
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-import de.anomic.crawler.CrawlProfile;
-import de.anomic.crawler.retrieval.HTTPLoader;
-import de.anomic.crawler.retrieval.Request;
-//import de.anomic.http.client.Client;
-//import de.anomic.http.server.ResponseContainer;
-import de.anomic.search.Segments;
-import de.anomic.search.Switchboard;
-
-/**
- * Class to parse a sitemap file.
- * An example sitemap file is depicted below:
- *
- * <pre>
- * <?xml version="1.0" encoding="UTF-8"?>
- * <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
- *    <url>
- *       <loc>http://www.example.com/</loc>
- *       <lastmod>2005-01-01</lastmod>
- *       <changefreq>monthly</changefreq>
- *       <priority>0.8</priority>
- *    </url>
- * </urlset> 
- * 
- * </pre>
- *
- * A real example can be found here: http://www.xt-service.de/sitemap.xml An example robots.txt containing a sitemap
- * URL: http://notepad.emaillink.de/robots.txt
- *
- * @see Protocol at sitemaps.org http://www.sitemaps.org/protocol.php
- * @see Protocol at google.com https://www.google.com/webmasters/tools/docs/en/protocol.html
- */
-public class SitemapParser extends DefaultHandler {
-    public static final String XMLNS_SITEMAPS_ORG = "http://www.sitemaps.org/schemas/sitemap/0.9";
-    public static final String XMLNS_SITEMAPS_GOOGLE = "http://www.google.com/schemas/sitemap/0.84";
-
-    public static final String SITEMAP_XMLNS = "xmlns";
-    public static final String SITEMAP_URLSET = "urlset";
-    public static final String SITEMAP_URL = "url";
-    public static final String SITEMAP_URL_LOC = "loc";
-    public static final String SITEMAP_URL_LASTMOD = "lastmod";
-    public static final String SITEMAP_URL_CHANGEFREQ = "changefreq";
-    public static final String SITEMAP_URL_PRIORITY = "priority";
-
-    /**
-     * The crawling profile used to parse the URLs contained in the sitemap file
-     */
-    private CrawlProfile crawlingProfile = null;
-
-    /**
-     * Name of the current XML element
-     */
-    private String currentElement = null;
-
-    /**
-     * A special stream to count how many bytes were processed so far
-     */
-    private int streamCounter = 0;
-
-    /**
-     * The total length of the sitemap file
-     */
-    private long contentLength;
-
-    /**
-     * The amount of urls processes so far
-     */
-    private int urlCounter = 0;
-
-    /**
-     * the logger
-     */
-    private static final Log logger = new Log("SITEMAP");
-
-    /**
-     * The location of the sitemap file
-     */
-    private DigestURI siteMapURL = null;
-
-    /**
-     * The next URL to enqueue
-     */
-    private String nextURL = null;
-
-    /**
-     * last modification date of the {@link #nextURL}
-     */
-    private Date lastMod = null;
-    private final Switchboard sb;
-
-    public SitemapParser(final Switchboard sb, final DigestURI sitemap, final CrawlProfile theCrawlingProfile) {
-        assert sitemap != null;
-        this.sb = sb;
-        this.siteMapURL = sitemap;
-
-        if (theCrawlingProfile == null) {
-            // create a new profile
-            this.crawlingProfile = createProfile(this.siteMapURL.getHost(), this.siteMapURL);
-        } else {
-            // use an existing profile
-            this.crawlingProfile = theCrawlingProfile;
-        }
-    }
-
-    /**
-     * Function to download and parse the sitemap file
-     */
-    public void parse() {
-        // download document
-        final RequestHeader requestHeader = new RequestHeader();
-        requestHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent);
-//        final Client client = new Client(5000, requestHeader);
-//        ResponseContainer res = null;
-        final HTTPClient client = new HTTPClient();
-        client.setTimout(5000);
-        client.setHeader(requestHeader.entrySet());
-        try {
-//            res = client.GET(siteMapURL.toString());
-//            if (res.getStatusCode() != 200) {
-//                logger.logWarning("Unable to download the sitemap file " + this.siteMapURL +
-//                        "\nServer returned status: " + res.getStatusLine());
-//                return;
-//            }
-            try {
-                client.GET(siteMapURL.toString());
-                if (client.getStatusCode() != 200) {
-                    logger.logWarning("Unable to download the sitemap file " + this.siteMapURL +
-                            "\nServer returned status: " + client.getHttpResponse().getStatusLine());
-                    return;
-                }
-
-                // getting some metadata
-//                final String contentMimeType = res.getResponseHeader().mime();
-//                this.contentLength = res.getResponseHeader().getContentLength();
-                final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
-                final String contentMimeType = header.mime();
-                this.contentLength = header.getContentLength();
-
-//            try {
-//                InputStream contentStream = res.getDataAsStream();
-                InputStream contentStream = client.getContentstream();
-                if ((contentMimeType != null) &&
-                        (contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip"))) {
-                    if (logger.isFine()) logger.logFine("Sitemap file has mimetype " + contentMimeType);
-                    contentStream = new GZIPInputStream(contentStream);
-                }
-
-                final ByteCountInputStream counterStream = new ByteCountInputStream(contentStream, null);
-                // parse it
-                logger.logInfo("Start parsing sitemap file " + this.siteMapURL + "\n\tMimeType: " + contentMimeType +
-                        "\n\tLength: " + this.contentLength);
-                final SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
-                saxParser.parse(counterStream, this);
-                streamCounter += counterStream.getCount();
-            } finally {
-//                res.closeStream();
-                client.finish();
-            }
-        } catch (final Exception e) {
-            logger.logWarning("Unable to parse sitemap file " + this.siteMapURL, e);
-//        } finally {
-//            if (res != null) {
-//                // release connection
-//                res.closeStream();
-//            }
-        }
-    }
-
-    /**
-     * @return the total length of the sitemap file in bytes or -1 if the length is unknown
-     */
-    public long getTotalLength() {
-        return this.contentLength;
-    }
-
-    /**
-     * @return the amount of bytes of the sitemap file that were downloaded so far
-     */
-    public long getProcessedLength() {
-        return streamCounter;
-    }
-
-    /**
-     * @return the amount of URLs that were successfully enqueued so far
-     */
-    public long getUrlcount() {
-        return this.urlCounter;
-    }
-
-    /**
-     * @param localName local name
-     * @param qName qualified name
-     * @see DefaultHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
-     */
-    @Override
-    public void startElement(final String namespaceURI, final String localName, final String qName, final Attributes attrs) throws SAXException {
-        this.currentElement = qName;
-
-        // testing if the namespace is known
-        if (qName.equalsIgnoreCase(SITEMAP_URLSET)) {
-            final String namespace = attrs.getValue(SITEMAP_XMLNS);
-            if ((namespace == null) ||
-                    ((!namespace.equals(XMLNS_SITEMAPS_ORG)) && (!namespace.equals(XMLNS_SITEMAPS_GOOGLE))))
-                throw new SAXException("Unknown sitemap namespace: " + namespace);
-        }
-    }
-
-    /**
-     * @param localName local name
-     * @param qName qualified name
-     * @throws SAXException
-     * @see DefaultHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
-     */
-    @Override
-    public void endElement(final String namespaceURI, final String localName, final String qName) throws SAXException {
-        this.currentElement = "";
-
-        if (qName.equalsIgnoreCase(SITEMAP_URL)) {
-            if (this.nextURL == null)
-                return;
-
-            // get the url hash
-            byte[] nexturlhash = null;
-            DigestURI url = null;
-            try {
-                url = new DigestURI(this.nextURL, null);
-                nexturlhash = url.hash();
-            } catch (final MalformedURLException e1) {
-            }
-
-            // check if the url is known and needs to be recrawled
-            if (this.lastMod != null) {
-                final String dbocc = this.sb.urlExists(Segments.Process.LOCALCRAWLING, nexturlhash);
-                if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
-                    // the url was already loaded. we need to check the date
-                    final URIMetadataRow oldEntry = this.sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(nexturlhash, null, 0);
-                    if (oldEntry != null) {
-                        final Date modDate = oldEntry.moddate();
-                        // check if modDate is null
-                        if (modDate.after(this.lastMod))
-                            return;
-                    }
-                }
-            }
-
-            // URL needs to crawled
-            this.sb.crawlStacker.enqueueEntry(new Request(
-                    this.sb.peers.mySeed().hash.getBytes(),
-                    url,
-                    null, // this.siteMapURL.toString(),
-                    this.nextURL,
-                    new Date(),
-                    this.crawlingProfile.handle(),
-                    0,
-                    0,
-                    0
-                    ));
-            logger.logInfo("New URL '" + this.nextURL + "' added for crawling.");
-            this.urlCounter++;
-        }
-    }
-
-    @Override
-    public void characters(final char[] buf, final int offset, final int len) throws SAXException {
-        if (this.currentElement.equalsIgnoreCase(SITEMAP_URL_LOC)) {
-            // TODO: we need to decode the URL here
-            this.nextURL = (new String(buf, offset, len)).trim();
-            if (!this.nextURL.startsWith("http") && !this.nextURL.startsWith("https")) {
-                logger.logInfo("The url '" + this.nextURL + "' has a wrong format. Ignore it.");
-                this.nextURL = null;
-            }
-        } else if (this.currentElement.equalsIgnoreCase(SITEMAP_URL_LASTMOD)) {
-            final String dateStr = new String(buf, offset, len);
-            try {
-                this.lastMod = DateFormatter.parseISO8601(dateStr);
-            } catch (final ParseException e) {
-                logger.logInfo("Unable to parse datestring '" + dateStr + "'");
-            }
-        }
-    }
-
-    private CrawlProfile createProfile(final String domainName, final DigestURI sitemapURL) {
-        CrawlProfile p = new CrawlProfile(
-                domainName, sitemapURL,
-                // crawling Filter
-                CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
-                // Depth
-                0,
-                // force recrawling
-                0,
-                // disable Auto-Dom-Filter
-                -1, -1,
-                // allow crawling of dynamic URLs
-                true,
-                // index text + media
-                true, true,
-                // don't store downloaded pages to Web Cache
-                false,
-                // store to TX cache
-                true,
-                // remote Indexing disabled
-                false,
-                // exclude stop-words
-                true, true, true,
-                CrawlProfile.CacheStrategy.IFFRESH);
-        this.sb.crawler.profilesActiveCrawls.put(p.handle().getBytes(), p);
-        return p;
-    }
-}

diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java
index f221d46b7..902f907ec 100644
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -116,7 +116,6 @@ import de.anomic.crawler.CrawlProfile;
 import de.anomic.crawler.CrawlQueues;
 import de.anomic.crawler.CrawlStacker;
 import de.anomic.crawler.CrawlSwitchboard;
-import de.anomic.crawler.ImporterManager;
 import de.anomic.crawler.NoticedURL;
 import de.anomic.crawler.ResourceObserver;
 import de.anomic.crawler.ResultImages;
@@ -216,7 +215,6 @@ public final class Switchboard extends serverSwitch {
     public  userDB userDB;
     public  bookmarksDB bookmarksDB;
     public  WebStructureGraph webStructure;
-    public  ImporterManager dbImportManager;
     public  ArrayList localSearches; // array of search result properties as HashMaps
     public  ArrayList remoteSearches; // array of search result properties as HashMaps
     public  ConcurrentHashMap<String, TreeSet<Long>> localSearchTracker, remoteSearchTracker; // mappings from requesting host to a TreeSet of Long(access time)
@@ -672,7 +670,6 @@ public final class Switchboard extends serverSwitch {
         //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/security/news/foren/go.shtml?read=1&msg_id=7301419&forum_id=72721"), query, true);
         //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/kiosk/archiv/ct/2003/4/20"), query, true, 260);
-        this.dbImportManager = new ImporterManager();
         this.trail = new ArrayList();
 
         log.logConfig("Finished Switchboard Initialization");
@@ -1152,7 +1149,6 @@ public final class Switchboard extends serverSwitch {
         indexingAnalysisProcessor.awaitShutdown(12000);
         indexingStorageProcessor.awaitShutdown(12000);
         crawlStacker.close();
-        this.dbImportManager.close();
         // de.anomic.http.client.Client.closeAllConnections();
         wikiDB.close();
         blogDB.close();
@@ -1526,6 +1522,7 @@ public final class Switchboard extends serverSwitch {
             ConnectionInfo.cleanUp();
 
             // do transmission of CR-files
+            /*
             checkInterruption();
             int count = rankingOwnDistribution.size() / 100;
             if (count == 0) count = 1;
@@ -1534,6 +1531,7 @@ public final class Switchboard extends serverSwitch {
                 rankingOwnDistribution.transferRanking(count);
                 rankingOtherDistribution.transferRanking(1);
             }
+            */
 
             // clean up delegated stack
             checkInterruption();
@@ -1753,7 +1751,7 @@ public final class Switchboard extends serverSwitch {
             try {
                 // parse the document
                 documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), b);
-                assert(documents != null) : "Unexpected error. Parser returned null.";
+                if (documents == null) throw new Parser.Failure("Parser returned null.", response.url());
             } catch (final Parser.Failure e) {
                 this.log.logWarning("Unable to parse the resource '" + response.url() + "'. " + e.getMessage());
                 addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), e.getMessage());

diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java
index 307d96274..a813d6fee 100644
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@@ -135,10 +135,10 @@ public class pdfParser extends AbstractParser implements Parser {
         } finally {
             try {pdfDoc.close();} catch (IOException e) {}
         }
+        pdfDoc = null;
 
         String[] docKeywords = null;
         if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
-
         if (docTitle == null) docTitle = docSubject;
 
         byte[] contentBytes;
@@ -147,8 +147,6 @@ public class pdfParser extends AbstractParser implements Parser {
         } catch (UnsupportedEncodingException e) {
             Log.logException(e);
             throw new Parser.Failure(e.getMessage(), location);
-        } finally {
-            try {pdfDoc.close();} catch (IOException e) {}
         }
 
         // clear resources in pdfbox. they say that is resolved but it's not. see:

diff --git a/source/net/yacy/document/parser/sitemapParser.java b/source/net/yacy/document/parser/sitemapParser.java
new file mode 100644
index 000000000..d89397acf
--- /dev/null
+++ b/source/net/yacy/document/parser/sitemapParser.java
@@ -0,0 +1,151 @@
+/**
+ * sitemapParser.java
+ * Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
+ * First released 08.09.2010 at http://yacy.net
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + + +package net.yacy.document.parser; + +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.List; + +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; + +import org.w3c.dom.CharacterData; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; + +import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.document.AbstractParser; +import net.yacy.document.Document; +import net.yacy.document.Parser; +import net.yacy.document.TextParser; +import net.yacy.document.parser.html.ImageEntry; +import net.yacy.kelondro.util.DateFormatter; + +public class sitemapParser extends AbstractParser implements Parser { + + public sitemapParser() { + super("RSS Parser"); + // unfortunately sitemap files have neither a mime type nor a typical file extension. + //SUPPORTED_EXTENSIONS.add("php"); + //SUPPORTED_EXTENSIONS.add("xml"); + } + + public Document[] parse(MultiProtocolURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException { + SitemapReader sitemap; + try { + sitemap = new SitemapReader(source); + } catch (IOException e) { + throw new Parser.Failure("Load error:" + e.getMessage(), url); + } + + List docs = new ArrayList(); + MultiProtocolURI uri; + Document doc; + for (SitemapEntry item: sitemap) try { + uri = new MultiProtocolURI(item.loc); + doc = new Document( + uri, + TextParser.mimeOf(url), + charset, + null, + null, + "", + "", + "", + new String[0], + "", + null, + null, + null, + new HashMap(), + false); + docs.add(doc); + } catch (MalformedURLException e) { + continue; + } + + Document[] da = new Document[docs.size()]; + docs.toArray(da); + return da; + } + + public static SitemapReader parse(InputStream stream) throws IOException { + return new SitemapReader(stream); + } + + /** + * for schemas see: + * http://www.sitemaps.org/schemas/sitemap/0.9 + * http://www.google.com/schemas/sitemap/0.84 + */ + public static class SitemapReader extends ArrayList { + private static final long serialVersionUID = 1337L; + public SitemapReader(InputStream source) throws IOException { + org.w3c.dom.Document doc; + try { doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(source); } + catch (ParserConfigurationException e) { throw new IOException (e); } + catch (SAXException e) { throw new IOException (e); } + NodeList nodes = doc.getElementsByTagName("url"); + for (int i = 0; i < nodes.getLength(); i++) + this.add(new SitemapEntry((Element) nodes.item(i))); + } + public String toString() { + StringBuilder sb = new StringBuilder(); + for (SitemapEntry entry: this) sb.append(entry.toString()); + return sb.toString(); + } + } + + public static class SitemapEntry { + public String loc, lastmod, changefreq, priority; + public SitemapEntry(Element element) { + loc = val(element, "loc", ""); + lastmod = val(element, "lastmod", ""); + changefreq = val(element, "changefreq", ""); + priority = val(element, "priority", ""); + } + private String val(Element parent, String label, String dflt) { + Element e = (Element) parent.getElementsByTagName(label).item(0); + if (e == null) return dflt; + Node child = 
+            Node child = e.getFirstChild();
+            return (child instanceof CharacterData) ? ((CharacterData) child).getData() : dflt;
+        }
+        public String url() {
+            return this.loc;
+        }
+        public Date lastmod(Date dflt) {
+            try {
+                return DateFormatter.parseISO8601(lastmod);
+            } catch (final ParseException e) {
+                return dflt;
+            }
+        }
+    }
+
+}
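
For reference, the new parser introduced by this patch can also be driven standalone, without going through SitemapImporter. A minimal sketch, assuming a sitemap document available as a local file (the file name, the demo class and the printed output are illustrative only, not part of this patch):

    import java.io.FileInputStream;
    import java.io.InputStream;
    import java.util.Date;

    import net.yacy.document.parser.sitemapParser;

    public class SitemapReaderDemo {
        public static void main(final String[] args) throws Exception {
            // read the sitemap from a local file instead of via HTTPClient
            final InputStream source = new FileInputStream("sitemap.xml");
            try {
                // sitemapParser.parse(...) returns a SitemapReader,
                // a list of all <url> entries found in the document
                final sitemapParser.SitemapReader sitemap = sitemapParser.parse(source);
                for (final sitemapParser.SitemapEntry entry : sitemap) {
                    // url() returns the <loc> content; lastmod(null) falls back
                    // to null when the date is missing or cannot be parsed
                    final Date lastMod = entry.lastmod(null);
                    System.out.println(entry.url() + " lastmod=" + lastMod);
                }
            } finally {
                source.close();
            }
        }
    }

This mirrors what SitemapImporter.run() does after the download: parse the stream once, then iterate over the entries and decide per URL whether it needs to be (re-)loaded.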