yacy_search_server/source/de/anomic/crawler/SitemapImporter.java

//SitemapImporter.java 
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@yacy.net
//first published on http://www.anomic.de
//Frankfurt, Germany, 2007
//
//this file was contributed by Martin Thelian
//last major change: $LastChangedDate$ by $LastChangedBy$
//Revision: $LastChangedRevision$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

package de.anomic.crawler;

import java.net.MalformedURLException;
import java.util.Date;

import net.yacy.cora.document.UTF8;
import net.yacy.document.parser.sitemapParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.retrieval.Request;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;

/**
 * Thread that parses one sitemap and hands the URLs it lists to the crawl stacker.
 *
 * <p>{@link #run()} iterates over the entries produced by {@code sitemapParser.parse}
 * and calls {@link #process(sitemapParser.URLEntry)} for each of them. Entries whose
 * URL cannot be parsed are logged and skipped so that a single bad entry does not
 * abort the import of the remaining ones.</p>
 */
public class SitemapImporter extends Thread {

    private static final Log logger = new Log("SITEMAP");

    /** crawl profile whose handle is attached to every enqueued request */
    private final CrawlProfile crawlingProfile;

    /** location of the sitemap that is parsed by this thread */
    private final DigestURI siteMapURL;

    private final Switchboard sb;

    /**
     * Creates an importer for a single sitemap.
     *
     * @param sb           switchboard used to look up known URLs and to enqueue new requests
     * @param sitemapURL   location of the sitemap; must not be null
     * @param profileEntry crawl profile used for the enqueued requests; must not be null
     */
    public SitemapImporter(final Switchboard sb, final DigestURI sitemapURL, final CrawlProfile profileEntry) {
        assert sitemapURL != null;
        assert profileEntry != null;
        this.sb = sb;
        this.siteMapURL = sitemapURL;
        this.crawlingProfile = profileEntry;
    }

    /**
     * Parses the sitemap and processes every entry found in it.
     * Any exception raised while parsing or processing is logged as a warning
     * and ends the import; it is never propagated out of the thread.
     */
    @Override
    public void run() {
        try {
            logger.logInfo("Start parsing sitemap file " + this.siteMapURL);
            final sitemapParser.SitemapReader parser = sitemapParser.parse(this.siteMapURL);
            for (final sitemapParser.URLEntry entry : parser) {
                process(entry);
            }
        } catch (final Exception e) {
            logger.logWarning("Unable to parse sitemap file " + this.siteMapURL, e);
        }
    }

    /**
     * Enqueues the URL of one sitemap entry for crawling unless it can be skipped.
     *
     * <p>The entry is skipped when its URL is malformed, or when the entry carries a
     * last-modification date, the URL is reported as "loaded" and the stored
     * modification date is after the date given in the sitemap.</p>
     *
     * @param entry the sitemap entry to process
     */
    public void process(sitemapParser.URLEntry entry) {

        // get the url and its hash; without a valid url there is nothing to enqueue
        final DigestURI url;
        final byte[] nexturlhash;
        try {
            url = new DigestURI(entry.url());
            nexturlhash = url.hash();
        } catch (final MalformedURLException e) {
            // skip only this entry instead of continuing with a null url and a null hash
            logger.logWarning("Skipping malformed sitemap URL '" + entry.url() + "'", e);
            return;
        }

        // check if the url is known and needs to be recrawled
        final Date lastMod = entry.lastmod(null);
        if (lastMod != null) {
            final String dbocc = this.sb.urlExists(Segments.Process.LOCALCRAWLING, nexturlhash);
            if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
                // the url was already loaded. we need to check the date
                final URIMetadataRow oldEntry = this.sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(nexturlhash);
                if (oldEntry != null) {
                    final Date modDate = oldEntry.moddate();
                    // a missing stored date is treated as "unknown", so the url is crawled again
                    if ((modDate != null) && (modDate.after(lastMod))) {
                        return;
                    }
                }
            }
        }

        // URL needs to be crawled
        this.sb.crawlStacker.enqueueEntry(new Request(
                UTF8.getBytes(this.sb.peers.mySeed().hash),
                url,
                null, // this.siteMapURL.toString(),
                entry.url(),
                entry.lastmod(new Date()),
                this.crawlingProfile.handle(),
                0,
                0,
                0,
                0
                ));
        logger.logInfo("New URL '" + entry.url() + "' added for loading.");
    }
}
refactoring: moved importer classes to crawler and plasma package git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4770 6c8d7289-2bf4-0310-a012-ef5d649a1542 17 years ago			`//SitemapImporter.java`
) Bookmarks: Ajax icon is displayed while loading title ) First version of a sitemap parser added - currently only autodetection of sitemap files is supported *) DB-Import restructured - pause/resume should work again now git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3666 6c8d7289-2bf4-0310-a012-ef5d649a1542 18 years ago			`//------------------------`
			`//part of YaCy`
- removed superfluous copyright statement - updated my email address git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5011 6c8d7289-2bf4-0310-a012-ef5d649a1542 17 years ago			`//(C) by Michael Peter Christen; mc@yacy.net`
) Bookmarks: Ajax icon is displayed while loading title ) First version of a sitemap parser added - currently only autodetection of sitemap files is supported *) DB-Import restructured - pause/resume should work again now git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3666 6c8d7289-2bf4-0310-a012-ef5d649a1542 18 years ago			`//first published on http://www.anomic.de`
			`//Frankfurt, Germany, 2007`
			`//`
			`//this file was contributed by Martin Thelian`
			`//last major change: $LastChangedDate$ by $LastChangedBy$`
			`//Revision: $LastChangedRevision$`
			`//`
			`//This program is free software; you can redistribute it and/or modify`
			`//it under the terms of the GNU General Public License as published by`
			`//the Free Software Foundation; either version 2 of the License, or`
			`//(at your option) any later version.`
			`//`
			`//This program is distributed in the hope that it will be useful,`
			`//but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`//GNU General Public License for more details.`
			`//`
			`//You should have received a copy of the GNU General Public License`
			`//along with this program; if not, write to the Free Software`
			`//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA`

refactoring: moved importer classes to crawler and plasma package git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4770 6c8d7289-2bf4-0310-a012-ef5d649a1542 17 years ago			`package de.anomic.crawler;`
) Bookmarks: Ajax icon is displayed while loading title ) First version of a sitemap parser added - currently only autodetection of sitemap files is supported *) DB-Import restructured - pause/resume should work again now git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3666 6c8d7289-2bf4-0310-a012-ef5d649a1542 18 years ago
fixed old sitemap importer which was not able to parse urls containing post elements - removed old parser - removed old importer framework (was only used by removed old parser) - added a new sitemap parser in parser framework - linked new parser with parser access in old sitemap processing routines git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7126 6c8d7289-2bf4-0310-a012-ef5d649a1542 14 years ago			`import java.net.MalformedURLException;`
			`import java.util.Date;`

more UTF8 getBytes() performance hacks git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7649 6c8d7289-2bf4-0310-a012-ef5d649a1542 14 years ago			`import net.yacy.cora.document.UTF8;`
fixed old sitemap importer which was not able to parse urls containing post elements - removed old parser - removed old importer framework (was only used by removed old parser) - added a new sitemap parser in parser framework - linked new parser with parser access in old sitemap processing routines git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7126 6c8d7289-2bf4-0310-a012-ef5d649a1542 14 years ago			`import net.yacy.document.parser.sitemapParser;`
refactoring git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6398 6c8d7289-2bf4-0310-a012-ef5d649a1542 15 years ago			`import net.yacy.kelondro.data.meta.DigestURI;`
fixed old sitemap importer which was not able to parse urls containing post elements - removed old parser - removed old importer framework (was only used by removed old parser) - added a new sitemap parser in parser framework - linked new parser with parser access in old sitemap processing routines git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7126 6c8d7289-2bf4-0310-a012-ef5d649a1542 14 years ago			`import net.yacy.kelondro.data.meta.URIMetadataRow;`
			`import net.yacy.kelondro.logging.Log;`
			`import de.anomic.crawler.retrieval.Request;`
			`import de.anomic.search.Segments;`
refactoring: - removed the plasma package. The name of that package came from a very early pre-version of YaCy, even before YaCy was named AnomicHTTPProxy. The Proxy project introduced search for cache contents using class files that had been developed during the plasma project. Information from 2002 about plasma can be found here: http://web.archive.org/web/20020802110827/http://anomic.de/AnomicPlasma/index.html We stil have one class that comes mostly unchanged from the plasma project, the Condenser class. But this is now part of the document package and all other classes in the plasma package can be assigned to other packages. - cleaned up the http package: better structure of that class and clean isolation of server and client classes. The old HTCache becomes part of the client sub-package of http. - because the plasmaSwitchboard is now part of the search package all servlets had to be touched to declare a different package source. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6232 6c8d7289-2bf4-0310-a012-ef5d649a1542 16 years ago			`import de.anomic.search.Switchboard;`
) Bookmarks: Ajax icon is displayed while loading title ) First version of a sitemap parser added - currently only autodetection of sitemap files is supported *) DB-Import restructured - pause/resume should work again now git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3666 6c8d7289-2bf4-0310-a012-ef5d649a1542 18 years ago
fixed old sitemap importer which was not able to parse urls containing post elements - removed old parser - removed old importer framework (was only used by removed old parser) - added a new sitemap parser in parser framework - linked new parser with parser access in old sitemap processing routines git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7126 6c8d7289-2bf4-0310-a012-ef5d649a1542 14 years ago			`public class SitemapImporter extends Thread {`

			`private CrawlProfile crawlingProfile = null;`
			`private static final Log logger = new Log("SITEMAP");`
			`private DigestURI siteMapURL = null;`
			`private final Switchboard sb;`

			`public SitemapImporter(final Switchboard sb, final DigestURI sitemapURL, final CrawlProfile profileEntry) {`
			`assert sitemapURL != null;`
			`this.sb = sb;`
			`this.siteMapURL = sitemapURL;`
			`assert profileEntry != null;`
			`this.crawlingProfile = profileEntry;`
			`}`
) Bookmarks: Ajax icon is displayed while loading title ) First version of a sitemap parser added - currently only autodetection of sitemap files is supported *) DB-Import restructured - pause/resume should work again now git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3666 6c8d7289-2bf4-0310-a012-ef5d649a1542 18 years ago
fixed old sitemap importer which was not able to parse urls containing post elements - removed old parser - removed old importer framework (was only used by removed old parser) - added a new sitemap parser in parser framework - linked new parser with parser access in old sitemap processing routines git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7126 6c8d7289-2bf4-0310-a012-ef5d649a1542 14 years ago			`public void run() {`
refactoring: moved importer classes to crawler and plasma package git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4770 6c8d7289-2bf4-0310-a012-ef5d649a1542 17 years ago			`try {`
added a sitemap entry parser and loader for sitemaps (a recursion if a sitemap refers to another sitemap) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7295 6c8d7289-2bf4-0310-a012-ef5d649a1542 14 years ago			`logger.logInfo("Start parsing sitemap file " + this.siteMapURL);`
			`sitemapParser.SitemapReader parser = sitemapParser.parse(this.siteMapURL);`
			`for (sitemapParser.URLEntry entry: parser) process(entry);`
added final where possible git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5030 6c8d7289-2bf4-0310-a012-ef5d649a1542 17 years ago			`} catch (final Exception e) {`
fixed old sitemap importer which was not able to parse urls containing post elements - removed old parser - removed old importer framework (was only used by removed old parser) - added a new sitemap parser in parser framework - linked new parser with parser access in old sitemap processing routines git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7126 6c8d7289-2bf4-0310-a012-ef5d649a1542 14 years ago			`logger.logWarning("Unable to parse sitemap file " + this.siteMapURL, e);`
refactoring: moved importer classes to crawler and plasma package git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4770 6c8d7289-2bf4-0310-a012-ef5d649a1542 17 years ago			`}`
			`}`
) Bookmarks: Ajax icon is displayed while loading title ) First version of a sitemap parser added - currently only autodetection of sitemap files is supported *) DB-Import restructured - pause/resume should work again now git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3666 6c8d7289-2bf4-0310-a012-ef5d649a1542 18 years ago
added a sitemap entry parser and loader for sitemaps (a recursion if a sitemap refers to another sitemap) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7295 6c8d7289-2bf4-0310-a012-ef5d649a1542 14 years ago			`public void process(sitemapParser.URLEntry entry) {`
) Bookmarks: Ajax icon is displayed while loading title ) First version of a sitemap parser added - currently only autodetection of sitemap files is supported *) DB-Import restructured - pause/resume should work again now git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3666 6c8d7289-2bf4-0310-a012-ef5d649a1542 18 years ago
fixed old sitemap importer which was not able to parse urls containing post elements - removed old parser - removed old importer framework (was only used by removed old parser) - added a new sitemap parser in parser framework - linked new parser with parser access in old sitemap processing routines git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7126 6c8d7289-2bf4-0310-a012-ef5d649a1542 14 years ago			`// get the url hash`
			`byte[] nexturlhash = null;`
			`DigestURI url = null;`
			`try {`
* add a bit documentation to DigestURI, use DigestURI(string) instead of DigestURI(string, null) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7276 6c8d7289-2bf4-0310-a012-ef5d649a1542 14 years ago			`url = new DigestURI(entry.url());`
fixed old sitemap importer which was not able to parse urls containing post elements - removed old parser - removed old importer framework (was only used by removed old parser) - added a new sitemap parser in parser framework - linked new parser with parser access in old sitemap processing routines git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7126 6c8d7289-2bf4-0310-a012-ef5d649a1542 14 years ago			`nexturlhash = url.hash();`
			`} catch (final MalformedURLException e1) {`
			`}`
) Bookmarks: Ajax icon is displayed while loading title ) First version of a sitemap parser added - currently only autodetection of sitemap files is supported *) DB-Import restructured - pause/resume should work again now git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3666 6c8d7289-2bf4-0310-a012-ef5d649a1542 18 years ago
fixed old sitemap importer which was not able to parse urls containing post elements - removed old parser - removed old importer framework (was only used by removed old parser) - added a new sitemap parser in parser framework - linked new parser with parser access in old sitemap processing routines git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7126 6c8d7289-2bf4-0310-a012-ef5d649a1542 14 years ago			`// check if the url is known and needs to be recrawled`
			`Date lastMod = entry.lastmod(null);`
			`if (lastMod != null) {`
			`final String dbocc = this.sb.urlExists(Segments.Process.LOCALCRAWLING, nexturlhash);`
			`if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {`
			`// the url was already loaded. we need to check the date`
code cleanup git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7713 6c8d7289-2bf4-0310-a012-ef5d649a1542 14 years ago			`final URIMetadataRow oldEntry = this.sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(nexturlhash);`
fixed old sitemap importer which was not able to parse urls containing post elements - removed old parser - removed old importer framework (was only used by removed old parser) - added a new sitemap parser in parser framework - linked new parser with parser access in old sitemap processing routines git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7126 6c8d7289-2bf4-0310-a012-ef5d649a1542 14 years ago			`if (oldEntry != null) {`
			`final Date modDate = oldEntry.moddate();`
			`// check if modDate is null`
			`if (modDate.after(lastMod)) return;`
			`}`
			`}`
			`}`

			`// URL needs to crawled`
			`this.sb.crawlStacker.enqueueEntry(new Request(`
more UTF8 getBytes() performance hacks git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7649 6c8d7289-2bf4-0310-a012-ef5d649a1542 14 years ago			`UTF8.getBytes(this.sb.peers.mySeed().hash),`
fixed old sitemap importer which was not able to parse urls containing post elements - removed old parser - removed old importer framework (was only used by removed old parser) - added a new sitemap parser in parser framework - linked new parser with parser access in old sitemap processing routines git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7126 6c8d7289-2bf4-0310-a012-ef5d649a1542 14 years ago			`url,`
			`null, // this.siteMapURL.toString(),`
			`entry.url(),`
redesign of the SortStack and SortStore classes: created a WeakPriorityBlockingQueue as special implementation of a PriorityBlockingQueue with a weak object binding. - better abstraction of ordering technique - fixed some bugs according to result numbering (distinguish different counters in Queue) - fixed a ordering bug in post-ranking (ordering was decreased instead of increased) - reversed ordering numbering using a reversed ordering. The higher the ranking number the better (now). git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7128 6c8d7289-2bf4-0310-a012-ef5d649a1542 14 years ago			`entry.lastmod(new Date()),`
fixed old sitemap importer which was not able to parse urls containing post elements - removed old parser - removed old importer framework (was only used by removed old parser) - added a new sitemap parser in parser framework - linked new parser with parser access in old sitemap processing routines git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7126 6c8d7289-2bf4-0310-a012-ef5d649a1542 14 years ago			`this.crawlingProfile.handle(),`
			`0,`
			`0,`
enhanced crawler: - added a new queue 'noload' which can be filled with urls where it is already known that the content cannot be loaded. This may be because there is no parser available or the file is too big - the noload queue is emptied with the parser process which indexes the file names only - the 'start from file' functionality now also reads from ftp crawler git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7368 6c8d7289-2bf4-0310-a012-ef5d649a1542 14 years ago			`0,`
fixed old sitemap importer which was not able to parse urls containing post elements - removed old parser - removed old importer framework (was only used by removed old parser) - added a new sitemap parser in parser framework - linked new parser with parser access in old sitemap processing routines git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7126 6c8d7289-2bf4-0310-a012-ef5d649a1542 14 years ago			`0`
			`));`
			`logger.logInfo("New URL '" + entry.url() + "' added for loading.");`
			`}`
) Bookmarks: Ajax icon is displayed while loading title ) First version of a sitemap parser added - currently only autodetection of sitemap files is supported *) DB-Import restructured - pause/resume should work again now git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3666 6c8d7289-2bf4-0310-a012-ef5d649a1542 18 years ago			`}`