You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
yacy_search_server/source/net/yacy/crawler/FileCrawlStarterTask.java

182 lines
6.7 KiB

// FileCrawlStarterTask.java
// ---------------------------
// Copyright 2016 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.crawler;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Writer;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.util.FileUtils;
/**
* A task used to trigger crawl starts from a file (HTML or any other supported
* text file) containing anchor links. It does not wait for the whole file to be
* parsed before sending anchor links to the crawl stacker and thus can handle
* files with many links.
*
* @author luccioman
*/
public class FileCrawlStarterTask extends Thread {

    private final static ConcurrentLog log = new ConcurrentLog(FileCrawlStarterTask.class.getSimpleName());

    /** A text file containing crawl start links (may be null when crawlingFileContent is provided) */
    private final File crawlingFile;

    /** Alternative to crawlingFile : holds file content (may be null when crawlingFile is provided) */
    private final String crawlingFileContent;

    /** Content scraper that will scrape file content */
    private final ContentScraper scraper;

    /** Active crawl profile */
    private final CrawlProfile profile;

    /**
     * CrawlStacker instance : will receive anchor links used as crawl starting
     * points
     */
    private final CrawlStacker crawlStacker;

    /** Hash of the peer initiating the crawl */
    private final byte[] initiatorHash;

    /**
     * Constructor. When crawlingFileContent is null or empty and crawlingFile
     * is not null, the file is checked immediately so that access errors are
     * reported synchronously to the caller rather than later from the task
     * thread.
     *
     * @param crawlingFile
     *            a text file containing crawl start links (alternatively,
     *            crawlingFileContent parameter can be used)
     * @param crawlingFileContent
     *            content of a text file containing crawl start links
     *            (alternatively, crawlingFile parameter can be used)
     * @param scraper
     *            ContentScraper instance used to scrape links from the file
     *            (must not be null)
     * @param profile
     *            active crawl profile (must not be null)
     * @param crawlStacker
     *            CrawlStacker instance : will receive anchor links used as
     *            crawl starting points (must not be null)
     * @param initiatorHash
     *            Hash of the peer initiating the crawl (must not be null)
     * @throws IllegalArgumentException
     *             when both crawlingFile and crawlingFileContent are null, or
     *             when scraper, profile, crawlStacker or initiatorHash is null
     * @throws FileNotFoundException
     *             when crawlingFileContent is null or empty and crawlingFile
     *             does not exist or is not a regular file
     * @throws IOException
     *             when crawlingFileContent is null or empty and crawlingFile
     *             can not be read
     */
    public FileCrawlStarterTask(final File crawlingFile, final String crawlingFileContent, final ContentScraper scraper,
            final CrawlProfile profile, final CrawlStacker crawlStacker, final byte[] initiatorHash)
            throws IllegalArgumentException, FileNotFoundException, IOException {
        super(FileCrawlStarterTask.class.getSimpleName());
        if (crawlingFile == null && crawlingFileContent == null) {
            throw new IllegalArgumentException(
                    "At least one of crawlingFile or crawlingFileContent parameter must not be null");
        }
        if ((crawlingFileContent == null || crawlingFileContent.isEmpty()) && crawlingFile != null) {
            /*
             * Let's check now whether the crawlingFile exists and can be read
             * so the error can be synchronously reported to the caller
             */
            if (!crawlingFile.exists()) {
                throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " does not exists");
            }
            if (!crawlingFile.isFile()) {
                throw new FileNotFoundException(crawlingFile.getAbsolutePath() + " exists but is not a regular file");
            }
            if (!crawlingFile.canRead()) {
                throw new IOException("Can not read : " + crawlingFile.getAbsolutePath());
            }
        }
        this.crawlingFile = crawlingFile;
        this.crawlingFileContent = crawlingFileContent;
        if (scraper == null) {
            throw new IllegalArgumentException("scraper parameter must not be null");
        }
        this.scraper = scraper;
        if (profile == null) {
            throw new IllegalArgumentException("profile parameter must not be null");
        }
        this.profile = profile;
        if (crawlStacker == null) {
            throw new IllegalArgumentException("crawlStacker parameter must not be null");
        }
        this.crawlStacker = crawlStacker;
        if (initiatorHash == null) {
            throw new IllegalArgumentException("initiatorHash parameter must not be null");
        }
        this.initiatorHash = initiatorHash;
    }

    /**
     * Describes the crawl start source for log messages. The crawlingFile may
     * legitimately be null when the task was created from a content string
     * only, so it must not be dereferenced unconditionally.
     *
     * @return the absolute path of the crawlingFile when there is one,
     *         otherwise a fixed placeholder text
     */
    private String describeSource() {
        if (this.crawlingFile != null) {
            return this.crawlingFile.getAbsolutePath();
        }
        return "(content provided as a string)";
    }

    /**
     * Run the content scraping on the file and once detected push any anchor
     * link to the crawlStacker. The crawlingFile is read when it is not null
     * and exists at the time this method runs; otherwise the
     * crawlingFileContent string is used. Errors are logged and never
     * propagated out of this method.
     */
    @Override
    public void run() {
        /*
         * This is the listener which makes possible the push of links to the
         * crawl stacker without waiting the complete end of content scraping
         */
        CrawlStarterFromSraper anchorListener = new CrawlStarterFromSraper(this.crawlStacker, this.initiatorHash,
                this.profile, true);
        this.scraper.registerHtmlFilterEventListener(anchorListener);
        final Writer writer = new TransformerWriter(null, null, this.scraper, null, false);
        FileInputStream inStream = null;
        try {
            if (this.crawlingFile != null && this.crawlingFile.exists()) {
                inStream = new FileInputStream(this.crawlingFile);
                FileUtils.copy(inStream, writer);
            } else {
                FileUtils.copy(this.crawlingFileContent, writer);
            }
            writer.close();
        } catch (IOException e) {
            /* describeSource() is used as crawlingFile is null when only content was provided */
            log.severe("Error parsing the crawlingFile " + describeSource(), e);
        } catch (IllegalCrawlProfileException e) {
            /* We should get here when the crawl is stopped manually before termination */
            log.info("Parsing crawlingFile " + describeSource() + " terminated. Crawl profile "
                    + this.profile.handle() + " is no more active.");
        } catch (Exception e) {
            /*
             * Other errors are likely to occur when the crawl is interrupted :
             * still log this at warning level to avoid polluting regular error
             * log level
             */
            log.warn("Error parsing the crawlingFile " + describeSource(), e);
        } finally {
            if (inStream != null) {
                try {
                    inStream.close();
                } catch (IOException e) {
                    /* inStream is only opened from a non null crawlingFile */
                    log.warn("Could not close crawlingFile : " + describeSource());
                }
            }
        }
    }
}