- removed old parser
- removed old importer framework (was only used by removed old parser)
- added a new sitemap parser in parser framework
- linked new parser with parser access in old sitemap processing routines

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7126 6c8d7289-2bf4-0310-a012-ef5d649a1542
parent b73ea6581d
commit 114bdd8ba7
@@ -1,105 +0,0 @@
package de.anomic.crawler;

public abstract class AbstractImporter extends Thread implements Importer {

    private int jobID = -1;
    private String jobType;
    private boolean stopped = false;
    private boolean paused = false;
    private long globalStart = System.currentTimeMillis();
    protected long globalEnd;
    private long globalPauseLast;
    private long globalPauseDuration;
    private String error;

    public AbstractImporter(final String theJobType) {
        this.jobType = theJobType;
        // set a more verbose thread name; note that jobID is still -1 here
        // and setJobID() does not rename the thread afterwards
        this.setName("IMPORT_" + this.jobType + "_" + this.jobID);
    }

    public String getError() {
        return this.error;
    }

    public void startIt() {
        this.start();
    }

    public void stopIt() throws InterruptedException {
        this.stopped = true;
        this.continueIt(); // wake the worker if it is blocked in a pause
        this.join();
    }

    public void pauseIt() {
        synchronized(this) {
            this.globalPauseLast = System.currentTimeMillis();
            this.paused = true;
        }
    }

    public void continueIt() {
        synchronized(this) {
            if (this.paused) {
                this.globalPauseDuration += System.currentTimeMillis() - this.globalPauseLast;
                this.paused = false;
                this.notifyAll();
            }
        }
    }

    public boolean isPaused() {
        synchronized(this) {
            return this.paused;
        }
    }

    protected boolean isAborted() {
        synchronized(this) {
            if (this.paused) {
                try {
                    this.wait();
                } catch (final InterruptedException e) {
                    // ignore; the stop/interrupt state is evaluated below
                }
            }
        }
        return (this.stopped) || Thread.currentThread().isInterrupted();
    }

    public boolean isStopped() {
        return !this.isAlive();
    }

    public int getJobID() {
        return this.jobID;
    }

    public void setJobID(final int id) {
        if (this.jobID != -1) throw new IllegalStateException("job ID already assigned");
        this.jobID = id;
    }

    public long getTotalRuntime() {
        return (this.globalEnd == 0)
                ? System.currentTimeMillis() - (this.globalStart + this.globalPauseDuration)
                : this.globalEnd - (this.globalStart + this.globalPauseDuration);
    }

    public long getElapsedTime() {
        if (this.paused) {
            this.globalPauseDuration += System.currentTimeMillis() - this.globalPauseLast;
            this.globalPauseLast = System.currentTimeMillis();
        }
        return isStopped()
                ? this.globalEnd - (this.globalStart + this.globalPauseDuration)
                : System.currentTimeMillis() - (this.globalStart + this.globalPauseDuration);
    }

    public String getJobType() {
        return this.jobType;
    }

    public abstract long getEstimatedTime();
    public abstract String getJobName();
    public abstract int getProcessingStatusPercent();

}
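To make the removed life cycle concrete: a worker subclass polls isAborted(), which blocks while the job is paused, and a controller drives the state with pauseIt()/continueIt()/stopIt(). A minimal sketch follows; the class name and work loop are illustrative assumptions, not part of this commit.

package de.anomic.crawler;

// Hypothetical importer, for illustration only.
public class DemoImporter extends AbstractImporter {

    public DemoImporter() {
        super("DEMO");
    }

    @Override
    public void run() {
        while (!isAborted()) {      // blocks while paused; true once stopIt() was called
            // ... process one work unit ...
        }
        this.globalEnd = System.currentTimeMillis();
    }

    public long getEstimatedTime() { return 0; }
    public String getJobName() { return "demo"; }
    public int getProcessingStatusPercent() { return 0; }
    public String getStatus() { return ""; }    // required by the Importer interface
}

// Driving it:
//   DemoImporter job = new DemoImporter();
//   job.startIt();     // starts the thread
//   job.pauseIt();     // the worker blocks inside isAborted()
//   job.continueIt();  // wakes it up again
//   job.stopIt();      // sets the stop flag, resumes if paused, and joins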
@@ -1,25 +0,0 @@
package de.anomic.crawler;

public interface Importer {

    // functions to pause and continue importing
    public boolean isPaused();
    public void pauseIt();
    public void continueIt();
    public void stopIt() throws InterruptedException;
    public boolean isStopped();

    // getting status information
    public long getTotalRuntime();
    public long getElapsedTime();
    public long getEstimatedTime();
    public int getProcessingStatusPercent();

    public int getJobID();
    public void setJobID(int id);
    public String getJobName();
    public String getJobType();
    public String getError();
    public String getStatus();

    public void startIt();
}
@@ -1,104 +0,0 @@
package de.anomic.crawler;

import java.util.Vector;

import net.yacy.kelondro.logging.Log;

public class ImporterManager {

    public final Vector<Importer> finishedJobs;
    public final ThreadGroup runningJobs;
    public int currMaxJobNr;

    public ImporterManager() {
        this.finishedJobs = new Vector<Importer>();
        this.runningJobs = new ThreadGroup("ImporterThreads");
        this.currMaxJobNr = 0;
    }

    public int generateUniqueJobID() {
        int jobID;
        synchronized(this.runningJobs) {
            jobID = this.currMaxJobNr;
            this.currMaxJobNr++;
        }
        return jobID;
    }

    public Importer[] getRunningImporter() {
        // over-allocate because activeCount() is only an estimate
        final Thread[] importThreads = new Thread[this.runningJobs.activeCount() * 2];
        final int activeCount = this.runningJobs.enumerate(importThreads);
        final Importer[] importers = new Importer[activeCount];
        for (int i = 0; i < activeCount; i++) {
            importers[i] = (Importer) importThreads[i];
        }
        return importers;
    }

    public Importer[] getFinishedImporter() {
        return this.finishedJobs.toArray(new Importer[this.finishedJobs.size()]);
    }

    public Importer getImporterByID(final int jobID) {
        final Thread[] importThreads = new Thread[this.runningJobs.activeCount() * 2];
        // fill the array with the currently running importer threads
        final int activeCount = this.runningJobs.enumerate(importThreads);
        for (int i = 0; i < activeCount; i++) {
            final Importer currThread = (Importer) importThreads[i];
            if (currThread.getJobID() == jobID) {
                return currThread;
            }
        }
        return null;
    }

    /**
     * Can be used to close all still running importer threads,
     * e.g. on server shutdown
     */
    public void close() {
        /* clear the finished thread list */
        this.finishedJobs.clear();

        /* wait for all threads to finish */
        int threadCount = this.runningJobs.activeCount();
        final Thread[] threadList = new Thread[threadCount];
        threadCount = this.runningJobs.enumerate(threadList);

        if (threadCount == 0) return;

        final Log log = new Log("DB-IMPORT");
        try {
            // try to gracefully stop all still running sessions ...
            log.logInfo("Signaling shutdown to " + threadCount + " remaining dbImporter threads ...");
            for (final Thread currentThread : threadList) {
                if (currentThread.isAlive()) {
                    ((Importer) currentThread).stopIt();
                }
            }

            // wait a few ms for the session objects to continue processing
            try { Thread.sleep(500); } catch (final InterruptedException ex) {}

            // interrupt all still running or pooled threads ...
            log.logInfo("Sending interruption signal to " + runningJobs.activeCount() + " remaining dbImporter threads ...");
            runningJobs.interrupt();

            // we need to use a timeout here because of missing interruptible session threads ...
            if (log.isFine()) log.logFine("Waiting for " + runningJobs.activeCount() + " remaining dbImporter threads to finish shutdown ...");
            int currentThreadIdx = 0;
            for (final Thread currentThread : threadList) {
                if (currentThread.isAlive()) {
                    if (log.isFine()) log.logFine("Waiting for dbImporter thread '" + currentThread.getName() + "' [" + currentThreadIdx++ + "] to finish shutdown.");
                    try { currentThread.join(500); } catch (final InterruptedException ex) {}
                }
            }

            log.logInfo("Shutdown of remaining dbImporter threads finished.");
        } catch (final Exception e) {
            log.logSevere("Unexpected error while trying to shutdown all remaining dbImporter threads.", e);
        }
    }

}
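A usage sketch for the manager, reusing the hypothetical DemoImporter from above. One caveat visible in the code: getRunningImporter() and getImporterByID() enumerate the runningJobs thread group, but AbstractImporter as shown never passes that group to its Thread constructor, so concrete importers presumably had to arrange their group membership themselves.

// Hypothetical driver, for illustration only.
ImporterManager manager = new ImporterManager();
Importer job = new DemoImporter();              // hypothetical subclass, see above
job.setJobID(manager.generateUniqueJobID());    // unique ID; throws if assigned twice
job.startIt();
// ...
Importer found = manager.getImporterByID(job.getJobID());
manager.close();                                // graceful stop, then interrupt, then join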
@@ -1,358 +0,0 @@
// SitemapParser.java
// ------------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2007
//
// this file is contributed by Martin Thelian
// last major change: $LastChangedDate$ by $LastChangedBy$
// Revision: $LastChangedRevision$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.data;

import java.io.InputStream;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.Date;
import java.util.zip.GZIPInputStream;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.io.ByteCountInputStream;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;

/**
 * Class to parse a sitemap file.<br>
 * An example sitemap file is depicted below:<br>
 *
 * <pre>
 * <?xml version="1.0" encoding="UTF-8"?>
 * <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
 *    <url>
 *       <loc>http://www.example.com/</loc>
 *       <lastmod>2005-01-01</lastmod>
 *       <changefreq>monthly</changefreq>
 *       <priority>0.8</priority>
 *    </url>
 * </urlset>
 * </pre>
 *
 * A real example can be found here: http://www.xt-service.de/sitemap.xml
 * An example robots.txt containing a sitemap URL: http://notepad.emaillink.de/robots.txt
 *
 * @see Protocol at sitemaps.org <a href="http://www.sitemaps.org/protocol.php">http://www.sitemaps.org/protocol.php</a>
 * @see Protocol at google.com <a href="https://www.google.com/webmasters/tools/docs/en/protocol.html">https://www.google.com/webmasters/tools/docs/en/protocol.html</a>
 */
public class SitemapParser extends DefaultHandler {
    public static final String XMLNS_SITEMAPS_ORG = "http://www.sitemaps.org/schemas/sitemap/0.9";
    public static final String XMLNS_SITEMAPS_GOOGLE = "http://www.google.com/schemas/sitemap/0.84";

    public static final String SITEMAP_XMLNS = "xmlns";
    public static final String SITEMAP_URLSET = "urlset";
    public static final String SITEMAP_URL = "url";
    public static final String SITEMAP_URL_LOC = "loc";
    public static final String SITEMAP_URL_LASTMOD = "lastmod";
    public static final String SITEMAP_URL_CHANGEFREQ = "changefreq";
    public static final String SITEMAP_URL_PRIORITY = "priority";

    /**
     * The crawling profile used to parse the URLs contained in the sitemap file
     */
    private CrawlProfile crawlingProfile = null;

    /**
     * Name of the current XML element
     */
    private String currentElement = null;

    /**
     * Number of bytes of the sitemap file processed so far
     */
    private int streamCounter = 0;

    /**
     * The total length of the sitemap file
     */
    private long contentLength;

    /**
     * The number of URLs processed so far
     */
    private int urlCounter = 0;

    /**
     * the logger
     */
    private static final Log logger = new Log("SITEMAP");

    /**
     * The location of the sitemap file
     */
    private DigestURI siteMapURL = null;

    /**
     * The next URL to enqueue
     */
    private String nextURL = null;

    /**
     * last modification date of the {@link #nextURL}
     */
    private Date lastMod = null;

    private final Switchboard sb;

    public SitemapParser(final Switchboard sb, final DigestURI sitemap, final CrawlProfile theCrawlingProfile) {
        assert sitemap != null;
        this.sb = sb;
        this.siteMapURL = sitemap;

        if (theCrawlingProfile == null) {
            // create a new profile
            this.crawlingProfile = createProfile(this.siteMapURL.getHost(), this.siteMapURL);
        } else {
            // use an existing profile
            this.crawlingProfile = theCrawlingProfile;
        }
    }

    /**
     * Function to download and parse the sitemap file
     */
    public void parse() {
        // download document
        final RequestHeader requestHeader = new RequestHeader();
        requestHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent);
        final HTTPClient client = new HTTPClient();
        client.setTimout(5000);
        client.setHeader(requestHeader.entrySet());
        try {
            try {
                client.GET(siteMapURL.toString());
                if (client.getStatusCode() != 200) {
                    logger.logWarning("Unable to download the sitemap file " + this.siteMapURL +
                            "\nServer returned status: " + client.getHttpResponse().getStatusLine());
                    return;
                }

                // getting some metadata
                final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
                final String contentMimeType = header.mime();
                this.contentLength = header.getContentLength();

                InputStream contentStream = client.getContentstream();
                if ((contentMimeType != null) &&
                    (contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip"))) {
                    if (logger.isFine()) logger.logFine("Sitemap file has mimetype " + contentMimeType);
                    contentStream = new GZIPInputStream(contentStream);
                }

                final ByteCountInputStream counterStream = new ByteCountInputStream(contentStream, null);
                // parse it
                logger.logInfo("Start parsing sitemap file " + this.siteMapURL + "\n\tMimeType: " + contentMimeType +
                        "\n\tLength: " + this.contentLength);
                final SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
                saxParser.parse(counterStream, this);
                streamCounter += counterStream.getCount();
            } finally {
                client.finish();
            }
        } catch (final Exception e) {
            logger.logWarning("Unable to parse sitemap file " + this.siteMapURL, e);
        }
    }

    /**
     * @return the total length of the sitemap file in bytes or <code>-1</code> if the length is unknown
     */
    public long getTotalLength() {
        return this.contentLength;
    }

    /**
     * @return the amount of bytes of the sitemap file that were downloaded so far
     */
    public long getProcessedLength() {
        return streamCounter;
    }

    /**
     * @return the amount of URLs that were successfully enqueued so far
     */
    public long getUrlcount() {
        return this.urlCounter;
    }

    /**
     * @param localName local name
     * @param qName qualified name
     * @see DefaultHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
     */
    @Override
    public void startElement(final String namespaceURI, final String localName, final String qName, final Attributes attrs) throws SAXException {
        this.currentElement = qName;

        // testing if the namespace is known
        if (qName.equalsIgnoreCase(SITEMAP_URLSET)) {
            final String namespace = attrs.getValue(SITEMAP_XMLNS);
            if ((namespace == null) ||
                ((!namespace.equals(XMLNS_SITEMAPS_ORG)) && (!namespace.equals(XMLNS_SITEMAPS_GOOGLE))))
                throw new SAXException("Unknown sitemap namespace: " + namespace);
        }
    }

    /**
     * @param localName local name
     * @param qName qualified name
     * @throws SAXException
     * @see DefaultHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
     */
    @Override
    public void endElement(final String namespaceURI, final String localName, final String qName) throws SAXException {
        this.currentElement = "";

        if (qName.equalsIgnoreCase(SITEMAP_URL)) {
            if (this.nextURL == null)
                return;

            // get the url hash
            byte[] nexturlhash = null;
            DigestURI url = null;
            try {
                url = new DigestURI(this.nextURL, null);
                nexturlhash = url.hash();
            } catch (final MalformedURLException e1) {
                // ignored; handled by the null check below
            }
            if (url == null) {
                // malformed location, nothing to enqueue
                return;
            }

            // check if the url is known and needs to be recrawled
            if (this.lastMod != null) {
                final String dbocc = this.sb.urlExists(Segments.Process.LOCALCRAWLING, nexturlhash);
                if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
                    // the url was already loaded. we need to check the date
                    final URIMetadataRow oldEntry = this.sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(nexturlhash, null, 0);
                    if (oldEntry != null) {
                        final Date modDate = oldEntry.moddate();
                        // skip the URL if the stored copy is not older than the sitemap's lastmod
                        if (modDate.after(this.lastMod))
                            return;
                    }
                }
            }

            // the URL needs to be crawled
            this.sb.crawlStacker.enqueueEntry(new Request(
                    this.sb.peers.mySeed().hash.getBytes(),
                    url,
                    null, // this.siteMapURL.toString(),
                    this.nextURL,
                    new Date(),
                    this.crawlingProfile.handle(),
                    0,
                    0,
                    0
                    ));
            logger.logInfo("New URL '" + this.nextURL + "' added for crawling.");
            this.urlCounter++;
        }
    }

    @Override
    public void characters(final char[] buf, final int offset, final int len) throws SAXException {
        if (this.currentElement.equalsIgnoreCase(SITEMAP_URL_LOC)) {
            // TODO: we need to decode the URL here
            this.nextURL = (new String(buf, offset, len)).trim();
            // a prefix test for "http" also covers "https"
            if (!this.nextURL.startsWith("http")) {
                logger.logInfo("The url '" + this.nextURL + "' has a wrong format. Ignoring it.");
                this.nextURL = null;
            }
        } else if (this.currentElement.equalsIgnoreCase(SITEMAP_URL_LASTMOD)) {
            final String dateStr = new String(buf, offset, len);
            try {
                this.lastMod = DateFormatter.parseISO8601(dateStr);
            } catch (final ParseException e) {
                logger.logInfo("Unable to parse datestring '" + dateStr + "'");
            }
        }
    }

    private CrawlProfile createProfile(final String domainName, final DigestURI sitemapURL) {
        CrawlProfile p = new CrawlProfile(
                domainName, sitemapURL,
                // crawling filter
                CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
                // depth
                0,
                // force recrawling
                0,
                // disable auto-dom-filter
                -1, -1,
                // allow crawling of dynamic URLs
                true,
                // index text + media
                true, true,
                // don't store downloaded pages to Web Cache
                false,
                // store to TX cache
                true,
                // remote indexing disabled
                false,
                // exclude stop-words
                true, true, true,
                CrawlProfile.CacheStrategy.IFFRESH);
        this.sb.crawler.profilesActiveCrawls.put(p.handle().getBytes(), p);
        return p;
    }
}
@@ -0,0 +1,151 @@
/**
 *  sitemapParser.java
 *  Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
 *  First released 08.09.2010 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.document.parser;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.CharacterData;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.util.DateFormatter;

public class sitemapParser extends AbstractParser implements Parser {

    public sitemapParser() {
        super("sitemap Parser");
        // unfortunately sitemap files have neither a mime type nor a typical file extension.
        //SUPPORTED_EXTENSIONS.add("php");
        //SUPPORTED_EXTENSIONS.add("xml");
    }

    public Document[] parse(MultiProtocolURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException {
        SitemapReader sitemap;
        try {
            sitemap = new SitemapReader(source);
        } catch (IOException e) {
            throw new Parser.Failure("Load error: " + e.getMessage(), url);
        }

        List<Document> docs = new ArrayList<Document>();
        for (SitemapEntry item : sitemap) {
            try {
                MultiProtocolURI uri = new MultiProtocolURI(item.loc);
                Document doc = new Document(
                        uri,
                        TextParser.mimeOf(url),
                        charset,
                        null,
                        null,
                        "",
                        "",
                        "",
                        new String[0],
                        "",
                        null,
                        null,
                        null,
                        new HashMap<MultiProtocolURI, ImageEntry>(),
                        false);
                docs.add(doc);
            } catch (MalformedURLException e) {
                // skip entries whose <loc> cannot be parsed as a URL
                continue;
            }
        }

        Document[] da = new Document[docs.size()];
        docs.toArray(da);
        return da;
    }

    public static SitemapReader parse(InputStream stream) throws IOException {
        return new SitemapReader(stream);
    }

    /**
     * for schemas see:
     * http://www.sitemaps.org/schemas/sitemap/0.9
     * http://www.google.com/schemas/sitemap/0.84
     */
    public static class SitemapReader extends ArrayList<SitemapEntry> {
        private static final long serialVersionUID = 1337L;
        public SitemapReader(InputStream source) throws IOException {
            org.w3c.dom.Document doc;
            try { doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(source); }
            catch (ParserConfigurationException e) { throw new IOException(e); }
            catch (SAXException e) { throw new IOException(e); }
            NodeList nodes = doc.getElementsByTagName("url");
            for (int i = 0; i < nodes.getLength(); i++)
                this.add(new SitemapEntry((Element) nodes.item(i)));
        }
        @Override
        public String toString() {
            StringBuilder sb = new StringBuilder();
            for (SitemapEntry entry : this) sb.append(entry.toString());
            return sb.toString();
        }
    }

    public static class SitemapEntry {
        public String loc, lastmod, changefreq, priority;
        public SitemapEntry(Element element) {
            loc = val(element, "loc", "");
            lastmod = val(element, "lastmod", "");
            changefreq = val(element, "changefreq", "");
            priority = val(element, "priority", "");
        }
        private String val(Element parent, String label, String dflt) {
            Element e = (Element) parent.getElementsByTagName(label).item(0);
            if (e == null) return dflt;
            Node child = e.getFirstChild();
            return (child instanceof CharacterData) ? ((CharacterData) child).getData() : dflt;
        }
        public String url() {
            return this.loc;
        }
        public Date lastmod(Date dflt) {
            try {
                return DateFormatter.parseISO8601(lastmod);
            } catch (final ParseException e) {
                return dflt;
            }
        }
    }

}
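A short usage sketch for the new reader; the driver class and file path are illustrative assumptions, and the static parse(InputStream) entry point is presumably what the commit message means by linking the new parser into the old sitemap processing routines:

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;

import net.yacy.document.parser.sitemapParser;

// Hypothetical driver, for illustration only.
public class SitemapDemo {
    public static void main(String[] args) throws IOException {
        InputStream in = new FileInputStream("sitemap.xml"); // placeholder path
        try {
            // read the whole sitemap into a list of entries
            sitemapParser.SitemapReader reader = sitemapParser.parse(in);
            for (sitemapParser.SitemapEntry entry : reader) {
                System.out.println(entry.url() + " lastmod=" + entry.lastmod(new Date()));
            }
        } finally {
            in.close();
        }
    }
}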