fixed the old sitemap importer, which was not able to parse URLs containing post elements

- removed the old parser
- removed the old importer framework (it was only used by the removed parser)
- added a new sitemap parser to the parser framework
- hooked the new parser into the existing sitemap processing routines (see the usage sketch below)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7126 6c8d7289-2bf4-0310-a012-ef5d649a1542
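
A minimal usage sketch of the new parser, built only from the API added in this commit (sitemapParser.parse(InputStream), SitemapEntry.url(), SitemapEntry.lastmod(Date)). The wrapper class, the local file name and the FileInputStream are illustrative stand-ins; the reworked SitemapImporter feeds the parser from the (possibly gzip-decoded) HTTP response stream instead.

import java.io.FileInputStream;
import java.io.InputStream;
import java.util.Date;

import net.yacy.document.parser.sitemapParser;

public class SitemapParseSketch {
    public static void main(final String[] args) throws Exception {
        // open a sitemap document; the importer uses the downloaded HTTP content stream here
        final InputStream in = new FileInputStream("sitemap.xml");
        try {
            // read all <url> entries of the sitemap into a list
            final sitemapParser.SitemapReader reader = sitemapParser.parse(in);
            for (final sitemapParser.SitemapEntry entry : reader) {
                final String loc = entry.url();           // content of the <loc> element
                final Date lastmod = entry.lastmod(null); // parsed <lastmod>, or null if missing/unparsable
                System.out.println(loc + " (lastmod: " + lastmod + ")");
            }
        } finally {
            in.close();
        }
    }
}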
orbiter 15 years ago
parent b73ea6581d
commit 114bdd8ba7

@@ -451,11 +451,9 @@ public class Crawler_p {
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
// create a new sitemap importer
final SitemapImporter importerThread = new SitemapImporter(sb, sb.dbImportManager, new DigestURI(sitemapURLStr, null), pe);
if (importerThread != null) {
importerThread.setJobID(sb.dbImportManager.generateUniqueJobID());
importerThread.startIt();
}
final SitemapImporter importer = new SitemapImporter(sb, new DigestURI(sitemapURLStr, null), pe);
importer.start();
} catch (final Exception e) {
// mist (German for "darn")
prop.put("info", "6");//Error with url

@@ -1,105 +0,0 @@
package de.anomic.crawler;
public abstract class AbstractImporter extends Thread implements Importer {
private int jobID = -1;
private String jobType;
private boolean stopped = false;
private boolean paused = false;
private long globalStart = System.currentTimeMillis();
protected long globalEnd;
private long globalPauseLast;
private long globalPauseDuration;
private String error;
public AbstractImporter(final String theJobType) {
this.jobType = theJobType;
// set a more verbose thread name for this importer
this.setName("IMPORT_" + this.jobType + "_" + this.jobID);
}
public String getError() {
return this.error;
}
public void startIt() {
this.start();
}
public void stopIt() throws InterruptedException {
this.stopped = true;
this.continueIt();
this.join();
}
public void pauseIt() {
synchronized(this) {
this.globalPauseLast = System.currentTimeMillis();
this.paused = true;
}
}
public void continueIt() {
synchronized(this) {
if (this.paused) {
this.globalPauseDuration += System.currentTimeMillis()-this.globalPauseLast;
this.paused = false;
this.notifyAll();
}
}
}
public boolean isPaused() {
synchronized(this) {
return this.paused;
}
}
protected boolean isAborted() {
synchronized(this) {
if (this.paused) {
try {
this.wait();
}
catch (final InterruptedException e){}
}
}
return (this.stopped) || Thread.currentThread().isInterrupted();
}
public boolean isStopped() {
return !this.isAlive();
}
public int getJobID() {
return this.jobID;
}
public void setJobID(final int id) {
if (this.jobID != -1) throw new IllegalStateException("job ID already assigned");
this.jobID = id;
}
public long getTotalRuntime() {
return (this.globalEnd == 0)?System.currentTimeMillis()-(this.globalStart+this.globalPauseDuration):this.globalEnd-(this.globalStart+this.globalPauseDuration);
}
public long getElapsedTime() {
if(this.paused) {
this.globalPauseDuration += System.currentTimeMillis()-this.globalPauseLast;
this.globalPauseLast = System.currentTimeMillis();
}
return isStopped()?this.globalEnd-(this.globalStart+this.globalPauseDuration):System.currentTimeMillis()-(this.globalStart+this.globalPauseDuration);
}
public String getJobType() {
return this.jobType;
}
public abstract long getEstimatedTime();
public abstract String getJobName();
public abstract int getProcessingStatusPercent();
}

@@ -1,25 +0,0 @@
package de.anomic.crawler;
public interface Importer {
// functions to pause and continue importing
public boolean isPaused();
public void pauseIt();
public void continueIt();
public void stopIt() throws InterruptedException;
public boolean isStopped();
// getting status information
public long getTotalRuntime();
public long getElapsedTime();
public long getEstimatedTime();
public int getProcessingStatusPercent();
public int getJobID();
public void setJobID(int id);
public String getJobName();
public String getJobType();
public String getError();
public String getStatus();
public void startIt();
}

@@ -1,104 +0,0 @@
package de.anomic.crawler;
import java.util.Vector;
import net.yacy.kelondro.logging.Log;
public class ImporterManager {
public final Vector<Importer> finishedJobs;
public final ThreadGroup runningJobs;
public int currMaxJobNr;
public ImporterManager() {
this.finishedJobs = new Vector<Importer>();
this.runningJobs = new ThreadGroup("ImporterThreads");
this.currMaxJobNr = 0;
}
public int generateUniqueJobID() {
int jobID;
synchronized(this.runningJobs) {
jobID = this.currMaxJobNr;
this.currMaxJobNr++;
}
return jobID;
}
public Importer[] getRunningImporter() {
final Thread[] importThreads = new Thread[this.runningJobs.activeCount()*2];
final int activeCount = this.runningJobs.enumerate(importThreads);
final Importer[] importers = new Importer[activeCount];
for (int i = 0; i < activeCount; i++) {
importers[i] = (Importer) importThreads[i];
}
return importers;
}
public Importer[] getFinishedImporter() {
return this.finishedJobs.toArray(new Importer[this.finishedJobs.size()]);
}
public Importer getImporterByID(final int jobID) {
final Thread[] importThreads = new Thread[this.runningJobs.activeCount()*2];
for(final Thread importThread : importThreads) {
final Importer currThread = (Importer) importThread;
if (currThread.getJobID() == jobID) {
return currThread;
}
}
return null;
}
/**
* Can be used to close all still running importer threads
* e.g. on server shutdown
*/
public void close() {
/* clear the finished thread list */
this.finishedJobs.clear();
/* waiting for all threads to finish */
int threadCount = this.runningJobs.activeCount();
final Thread[] threadList = new Thread[threadCount];
threadCount = this.runningJobs.enumerate(threadList);
if (threadCount == 0) return;
final Log log = new Log("DB-IMPORT");
try {
// trying to gracefully stop all still running sessions ...
log.logInfo("Signaling shutdown to " + threadCount + " remaining dbImporter threads ...");
for (final Thread currentThread : threadList) {
if (currentThread.isAlive()) {
((Importer)currentThread).stopIt();
}
}
// waiting a few ms for the session objects to continue processing
try { Thread.sleep(500); } catch (final InterruptedException ex) {}
// interrupting all still running or pooled threads ...
log.logInfo("Sending interruption signal to " + runningJobs.activeCount() + " remaining dbImporter threads ...");
runningJobs.interrupt();
// we need to use a timeout here because of missing interruptable session threads ...
if (log.isFine()) log.logFine("Waiting for " + runningJobs.activeCount() + " remaining dbImporter threads to finish shutdown ...");
int currentThreadIdx = 0;
for (final Thread currentThread : threadList) {
if (currentThread.isAlive()) {
if (log.isFine()) log.logFine("Waiting for dbImporter thread '" + currentThread.getName() + "' [" + currentThreadIdx++ + "] to finish shutdown.");
try { currentThread.join(500); } catch (final InterruptedException ex) {}
}
}
log.logInfo("Shutdown of remaining dbImporter threads finished.");
} catch (final Exception e) {
log.logSevere("Unexpected error while trying to shutdown all remaining dbImporter threads.",e);
}
}
}

@@ -25,75 +25,117 @@
package de.anomic.crawler;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.zip.GZIPInputStream;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.document.parser.sitemapParser;
import net.yacy.kelondro.data.meta.DigestURI;
import de.anomic.data.SitemapParser;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.io.ByteCountInputStream;
import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
public class SitemapImporter extends AbstractImporter implements Importer {
public class SitemapImporter extends Thread {
private CrawlProfile crawlingProfile = null;
private static final Log logger = new Log("SITEMAP");
private DigestURI siteMapURL = null;
private final Switchboard sb;
public SitemapImporter(final Switchboard sb, final DigestURI sitemapURL, final CrawlProfile profileEntry) {
assert sitemapURL != null;
this.sb = sb;
this.siteMapURL = sitemapURL;
assert profileEntry != null;
this.crawlingProfile = profileEntry;
}
private final SitemapParser parser;
private final DigestURI sitemapURL;
private final ImporterManager superviser;
public SitemapImporter(final Switchboard sb, final ImporterManager importManager, final DigestURI sitemapURL, final CrawlProfile profileEntry) throws ImporterException {
super("sitemap");
this.superviser = importManager;
public void run() {
// download document
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent);
final HTTPClient client = new HTTPClient();
client.setTimout(5000);
client.setHeader(requestHeader.entrySet());
try {
// getting the sitemap URL
this.sitemapURL = sitemapURL;
// creating the sitemap parser
this.parser = new SitemapParser(sb, this.sitemapURL, profileEntry);
try {
client.GET(siteMapURL.toString());
if (client.getStatusCode() != 200) {
logger.logWarning("Unable to download the sitemap file " + this.siteMapURL +
"\nServer returned status: " + client.getHttpResponse().getStatusLine());
return;
}
// get some metadata
final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
final String contentMimeType = header.mime();
InputStream contentStream = client.getContentstream();
if (contentMimeType != null && (contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip"))) {
if (logger.isFine()) logger.logFine("Sitemap file has mimetype " + contentMimeType);
contentStream = new GZIPInputStream(contentStream);
}
final ByteCountInputStream counterStream = new ByteCountInputStream(contentStream, null);
// parse it
logger.logInfo("Start parsing sitemap file " + this.siteMapURL + "\n\tMimeType: " + contentMimeType + "\n\tLength: " + header.getContentLength());
sitemapParser.SitemapReader parser = sitemapParser.parse(counterStream);
for (sitemapParser.SitemapEntry entry: parser) process(entry);
} finally {
client.finish();
}
} catch (final Exception e) {
throw new ImporterException("Unable to initialize Importer",e);
logger.logWarning("Unable to parse sitemap file " + this.siteMapURL, e);
}
}
public long getEstimatedTime() {
final long t = getElapsedTime();
final int p = getProcessingStatusPercent();
return (p==0)?0:(t/p)*(100-p);
}
/**
* @see Importer#getJobName()
*/
public String getJobName() {
return this.sitemapURL.toString();
}
public void process(sitemapParser.SitemapEntry entry) {
/**
* @see Importer#getProcessingStatusPercent()
*/
public int getProcessingStatusPercent() {
if (this.parser == null) return 0;
final long total = this.parser.getTotalLength();
final long processed = this.parser.getProcessedLength();
if (total <= 1) return 0;
return (int) ((processed*100)/ total);
}
// get the url hash
byte[] nexturlhash = null;
DigestURI url = null;
try {
url = new DigestURI(entry.url(), null);
nexturlhash = url.hash();
} catch (final MalformedURLException e1) {
}
/**
* @see Importer#getStatus()
*/
public String getStatus() {
final StringBuilder theStatus = new StringBuilder();
theStatus.append("#URLs=").append((this.parser==null)?0:this.parser.getUrlcount());
return theStatus.toString();
}
public void run() {
try {
this.parser.parse();
} finally {
this.globalEnd = System.currentTimeMillis();
this.superviser.finishedJobs.add(this);
}
}
// check if the url is known and needs to be recrawled
Date lastMod = entry.lastmod(null);
if (lastMod != null) {
final String dbocc = this.sb.urlExists(Segments.Process.LOCALCRAWLING, nexturlhash);
if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
// the url was already loaded. we need to check the date
final URIMetadataRow oldEntry = this.sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(nexturlhash, null, 0);
if (oldEntry != null) {
final Date modDate = oldEntry.moddate();
// skip this URL if the already loaded copy is newer than the sitemap's lastmod date
if (modDate.after(lastMod)) return;
}
}
}
// URL needs to be crawled
this.sb.crawlStacker.enqueueEntry(new Request(
this.sb.peers.mySeed().hash.getBytes(),
url,
null, // this.siteMapURL.toString(),
entry.url(),
new Date(),
this.crawlingProfile.handle(),
0,
0,
0
));
logger.logInfo("New URL '" + entry.url() + "' added for loading.");
}
}

@@ -1,358 +0,0 @@
// SitemapParser.java
// ------------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2007
//
// this file is contributed by Martin Thelian
// last major change: $LastChangedDate$ by $LastChangedBy$
// Revision: $LastChangedRevision$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.data;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.Date;
import java.util.zip.GZIPInputStream;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.io.ByteCountInputStream;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
//import de.anomic.http.client.Client;
//import de.anomic.http.server.ResponseContainer;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
/**
* Class to parse a sitemap file.<br>
* An example sitemap file is depicted below:<br>
*
* <pre>
* &lt;?xml version=&quot;1.0&quot; encoding=&quot;UTF-8&quot;?&gt;
* &lt;urlset xmlns=&quot;http://www.sitemaps.org/schemas/sitemap/0.9&quot;&gt;
* &lt;url&gt;
* &lt;loc&gt;http://www.example.com/&lt;/loc&gt;
* &lt;lastmod&gt;2005-01-01&lt;/lastmod&gt;
* &lt;changefreq&gt;monthly&lt;/changefreq&gt;
* &lt;priority&gt;0.8&lt;/priority&gt;
* &lt;/url&gt;
* &lt;/urlset&gt;
* </pre>
*
* A real example can be found here: http://www.xt-service.de/sitemap.xml An example robots.txt containing a sitemap
* URL: http://notepad.emaillink.de/robots.txt
*
* @see Protocol at sitemaps.org <a href="http://www.sitemaps.org/protocol.php">http://www.sitemaps.org/protocol.php</a>
* @see Protocol at google.com <a
* href="https://www.google.com/webmasters/tools/docs/en/protocol.html">https://www.google.com/webmasters/tools/docs/en/protocol.html</a>
*/
public class SitemapParser extends DefaultHandler {
public static final String XMLNS_SITEMAPS_ORG = "http://www.sitemaps.org/schemas/sitemap/0.9";
public static final String XMLNS_SITEMAPS_GOOGLE = "http://www.google.com/schemas/sitemap/0.84";
public static final String SITEMAP_XMLNS = "xmlns";
public static final String SITEMAP_URLSET = "urlset";
public static final String SITEMAP_URL = "url";
public static final String SITEMAP_URL_LOC = "loc";
public static final String SITEMAP_URL_LASTMOD = "lastmod";
public static final String SITEMAP_URL_CHANGEFREQ = "changefreq";
public static final String SITEMAP_URL_PRIORITY = "priority";
/**
* The crawling profile used to parse the URLs contained in the sitemap file
*/
private CrawlProfile crawlingProfile = null;
/**
* Name of the current XML element
*/
private String currentElement = null;
/**
* A special stream to count how many bytes were processed so far
*/
private int streamCounter = 0;
/**
* The total length of the sitemap file
*/
private long contentLength;
/**
* The number of URLs processed so far
*/
private int urlCounter = 0;
/**
* the logger
*/
private static final Log logger = new Log("SITEMAP");
/**
* The location of the sitemap file
*/
private DigestURI siteMapURL = null;
/**
* The next URL to enqueue
*/
private String nextURL = null;
/**
* last modification date of the {@link #nextURL}
*/
private Date lastMod = null;
private final Switchboard sb;
public SitemapParser(final Switchboard sb, final DigestURI sitemap, final CrawlProfile theCrawlingProfile) {
assert sitemap != null;
this.sb = sb;
this.siteMapURL = sitemap;
if (theCrawlingProfile == null) {
// create a new profile
this.crawlingProfile = createProfile(this.siteMapURL.getHost(), this.siteMapURL);
} else {
// use an existing profile
this.crawlingProfile = theCrawlingProfile;
}
}
/**
* Function to download and parse the sitemap file
*/
public void parse() {
// download document
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent);
// final Client client = new Client(5000, requestHeader);
// ResponseContainer res = null;
final HTTPClient client = new HTTPClient();
client.setTimout(5000);
client.setHeader(requestHeader.entrySet());
try {
// res = client.GET(siteMapURL.toString());
// if (res.getStatusCode() != 200) {
// logger.logWarning("Unable to download the sitemap file " + this.siteMapURL +
// "\nServer returned status: " + res.getStatusLine());
// return;
// }
try {
client.GET(siteMapURL.toString());
if (client.getStatusCode() != 200) {
logger.logWarning("Unable to download the sitemap file " + this.siteMapURL +
"\nServer returned status: " + client.getHttpResponse().getStatusLine());
return;
}
// getting some metadata
// final String contentMimeType = res.getResponseHeader().mime();
// this.contentLength = res.getResponseHeader().getContentLength();
final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
final String contentMimeType = header.mime();
this.contentLength = header.getContentLength();
// try {
// InputStream contentStream = res.getDataAsStream();
InputStream contentStream = client.getContentstream();
if ((contentMimeType != null) &&
(contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip"))) {
if (logger.isFine()) logger.logFine("Sitemap file has mimetype " + contentMimeType);
contentStream = new GZIPInputStream(contentStream);
}
final ByteCountInputStream counterStream = new ByteCountInputStream(contentStream, null);
// parse it
logger.logInfo("Start parsing sitemap file " + this.siteMapURL + "\n\tMimeType: " + contentMimeType +
"\n\tLength: " + this.contentLength);
final SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
saxParser.parse(counterStream, this);
streamCounter += counterStream.getCount();
} finally {
// res.closeStream();
client.finish();
}
} catch (final Exception e) {
logger.logWarning("Unable to parse sitemap file " + this.siteMapURL, e);
// } finally {
// if (res != null) {
// // release connection
// res.closeStream();
// }
}
}
/**
* @return the total length of the sitemap file in bytes or <code>-1</code> if the length is unknown
*/
public long getTotalLength() {
return this.contentLength;
}
/**
* @return the number of bytes of the sitemap file that have been downloaded so far
*/
public long getProcessedLength() {
return streamCounter;
}
/**
* @return the number of URLs that have been successfully enqueued so far
*/
public long getUrlcount() {
return this.urlCounter;
}
/**
* @param localName local name
* @param qName qualified name
* @see DefaultHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
*/
@Override
public void startElement(final String namespaceURI, final String localName, final String qName, final Attributes attrs) throws SAXException {
this.currentElement = qName;
// testing if the namespace is known
if (qName.equalsIgnoreCase(SITEMAP_URLSET)) {
final String namespace = attrs.getValue(SITEMAP_XMLNS);
if ((namespace == null) ||
((!namespace.equals(XMLNS_SITEMAPS_ORG)) && (!namespace.equals(XMLNS_SITEMAPS_GOOGLE))))
throw new SAXException("Unknown sitemap namespace: " + namespace);
}
}
/**
* @param localName local name
* @param qName qualified name
* @throws SAXException
* @see DefaultHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
*/
@Override
public void endElement(final String namespaceURI, final String localName, final String qName) throws SAXException {
this.currentElement = "";
if (qName.equalsIgnoreCase(SITEMAP_URL)) {
if (this.nextURL == null)
return;
// get the url hash
byte[] nexturlhash = null;
DigestURI url = null;
try {
url = new DigestURI(this.nextURL, null);
nexturlhash = url.hash();
} catch (final MalformedURLException e1) {
}
// check if the url is known and needs to be recrawled
if (this.lastMod != null) {
final String dbocc = this.sb.urlExists(Segments.Process.LOCALCRAWLING, nexturlhash);
if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
// the url was already loaded. we need to check the date
final URIMetadataRow oldEntry = this.sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(nexturlhash, null, 0);
if (oldEntry != null) {
final Date modDate = oldEntry.moddate();
// skip this URL if the already loaded copy is newer than the sitemap's lastmod date
if (modDate.after(this.lastMod))
return;
}
}
}
// URL needs to be crawled
this.sb.crawlStacker.enqueueEntry(new Request(
this.sb.peers.mySeed().hash.getBytes(),
url,
null, // this.siteMapURL.toString(),
this.nextURL,
new Date(),
this.crawlingProfile.handle(),
0,
0,
0
));
logger.logInfo("New URL '" + this.nextURL + "' added for crawling.");
this.urlCounter++;
}
}
@Override
public void characters(final char[] buf, final int offset, final int len) throws SAXException {
if (this.currentElement.equalsIgnoreCase(SITEMAP_URL_LOC)) {
// TODO: we need to decode the URL here
this.nextURL = (new String(buf, offset, len)).trim();
if (!this.nextURL.startsWith("http") && !this.nextURL.startsWith("https")) {
logger.logInfo("The url '" + this.nextURL + "' has a wrong format. Ignore it.");
this.nextURL = null;
}
} else if (this.currentElement.equalsIgnoreCase(SITEMAP_URL_LASTMOD)) {
final String dateStr = new String(buf, offset, len);
try {
this.lastMod = DateFormatter.parseISO8601(dateStr);
} catch (final ParseException e) {
logger.logInfo("Unable to parse datestring '" + dateStr + "'");
}
}
}
private CrawlProfile createProfile(final String domainName, final DigestURI sitemapURL) {
CrawlProfile p = new CrawlProfile(
domainName, sitemapURL,
// crawling Filter
CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
// Depth
0,
// force recrawling
0,
// disable Auto-Dom-Filter
-1, -1,
// allow crawling of dynamic URLs
true,
// index text + media
true, true,
// don't store downloaded pages to Web Cache
false,
// store to TX cache
true,
// remote Indexing disabled
false,
// exclude stop-words
true, true, true,
CrawlProfile.CacheStrategy.IFFRESH);
this.sb.crawler.profilesActiveCrawls.put(p.handle().getBytes(), p);
return p;
}
}

@@ -116,7 +116,6 @@ import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlQueues;
import de.anomic.crawler.CrawlStacker;
import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.crawler.ImporterManager;
import de.anomic.crawler.NoticedURL;
import de.anomic.crawler.ResourceObserver;
import de.anomic.crawler.ResultImages;
@@ -216,7 +215,6 @@ public final class Switchboard extends serverSwitch {
public userDB userDB;
public bookmarksDB bookmarksDB;
public WebStructureGraph webStructure;
public ImporterManager dbImportManager;
public ArrayList<QueryParams> localSearches; // array of search result properties as HashMaps
public ArrayList<QueryParams> remoteSearches; // array of search result properties as HashMaps
public ConcurrentHashMap<String, TreeSet<Long>> localSearchTracker, remoteSearchTracker; // mappings from requesting host to a TreeSet of Long(access time)
@@ -672,7 +670,6 @@ public final class Switchboard extends serverSwitch {
//plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/security/news/foren/go.shtml?read=1&msg_id=7301419&forum_id=72721"), query, true);
//plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/kiosk/archiv/ct/2003/4/20"), query, true, 260);
this.dbImportManager = new ImporterManager();
this.trail = new ArrayList<String>();
log.logConfig("Finished Switchboard Initialization");
@@ -1152,7 +1149,6 @@ public final class Switchboard extends serverSwitch {
indexingAnalysisProcessor.awaitShutdown(12000);
indexingStorageProcessor.awaitShutdown(12000);
crawlStacker.close();
this.dbImportManager.close();
// de.anomic.http.client.Client.closeAllConnections();
wikiDB.close();
blogDB.close();
@@ -1526,6 +1522,7 @@ public final class Switchboard extends serverSwitch {
ConnectionInfo.cleanUp();
// do transmission of CR-files
/*
checkInterruption();
int count = rankingOwnDistribution.size() / 100;
if (count == 0) count = 1;
@@ -1534,6 +1531,7 @@ public final class Switchboard extends serverSwitch {
rankingOwnDistribution.transferRanking(count);
rankingOtherDistribution.transferRanking(1);
}
*/
// clean up delegated stack
checkInterruption();
@@ -1753,7 +1751,7 @@ public final class Switchboard extends serverSwitch {
try {
// parse the document
documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), b);
assert(documents != null) : "Unexpected error. Parser returned null.";
if (documents == null) throw new Parser.Failure("Parser returned null.", response.url());
} catch (final Parser.Failure e) {
this.log.logWarning("Unable to parse the resource '" + response.url() + "'. " + e.getMessage());
addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), e.getMessage());

@@ -135,10 +135,10 @@ public class pdfParser extends AbstractParser implements Parser {
} finally {
try {pdfDoc.close();} catch (IOException e) {}
}
pdfDoc = null;
String[] docKeywords = null;
if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
if (docTitle == null) docTitle = docSubject;
byte[] contentBytes;
@@ -147,8 +147,6 @@ public class pdfParser extends AbstractParser implements Parser {
} catch (UnsupportedEncodingException e) {
Log.logException(e);
throw new Parser.Failure(e.getMessage(), location);
} finally {
try {pdfDoc.close();} catch (IOException e) {}
}
// clear resources in pdfbox. they say that is resolved but it's not. see:

@@ -0,0 +1,151 @@
/**
* sitemapParser.java
* Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 08.09.2010 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.parser;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.CharacterData;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.util.DateFormatter;
public class sitemapParser extends AbstractParser implements Parser {
public sitemapParser() {
super("RSS Parser");
// unfortunately sitemap files have neither a mime type nor a typical file extension.
//SUPPORTED_EXTENSIONS.add("php");
//SUPPORTED_EXTENSIONS.add("xml");
}
public Document[] parse(MultiProtocolURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException {
SitemapReader sitemap;
try {
sitemap = new SitemapReader(source);
} catch (IOException e) {
throw new Parser.Failure("Load error:" + e.getMessage(), url);
}
List<Document> docs = new ArrayList<Document>();
MultiProtocolURI uri;
Document doc;
for (SitemapEntry item: sitemap) try {
uri = new MultiProtocolURI(item.loc);
doc = new Document(
uri,
TextParser.mimeOf(url),
charset,
null,
null,
"",
"",
"",
new String[0],
"",
null,
null,
null,
new HashMap<MultiProtocolURI, ImageEntry>(),
false);
docs.add(doc);
} catch (MalformedURLException e) {
continue;
}
Document[] da = new Document[docs.size()];
docs.toArray(da);
return da;
}
public static SitemapReader parse(InputStream stream) throws IOException {
return new SitemapReader(stream);
}
/**
* for schemas see:
* http://www.sitemaps.org/schemas/sitemap/0.9
* http://www.google.com/schemas/sitemap/0.84
*/
public static class SitemapReader extends ArrayList<SitemapEntry> {
private static final long serialVersionUID = 1337L;
public SitemapReader(InputStream source) throws IOException {
org.w3c.dom.Document doc;
try { doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(source); }
catch (ParserConfigurationException e) { throw new IOException (e); }
catch (SAXException e) { throw new IOException (e); }
NodeList nodes = doc.getElementsByTagName("url");
for (int i = 0; i < nodes.getLength(); i++)
this.add(new SitemapEntry((Element) nodes.item(i)));
}
public String toString() {
StringBuilder sb = new StringBuilder();
for (SitemapEntry entry: this) sb.append(entry.toString());
return sb.toString();
}
}
public static class SitemapEntry {
public String loc, lastmod, changefreq, priority;
public SitemapEntry(Element element) {
loc = val(element, "loc", "");
lastmod = val(element, "lastmod", "");
changefreq = val(element, "changefreq", "");
priority = val(element, "priority", "");
}
private String val(Element parent, String label, String dflt) {
Element e = (Element) parent.getElementsByTagName(label).item(0);
if (e == null) return dflt;
Node child = e.getFirstChild();
return (child instanceof CharacterData) ? ((CharacterData) child).getData() : dflt;
}
public String url() {
return this.loc;
}
public Date lastmod(Date dflt) {
try {
return DateFormatter.parseISO8601(lastmod);
} catch (final ParseException e) {
return dflt;
}
}
}
}