- reviewed the remote search process and improved data structures and synchronization in several places

- removed concurrency overhead for small numbers of index normalizations, as they occur during remote search (see the consumer sketch after the WordReferenceVars hunk below)
- removed the 'load only parseable' constraint for snippet fetching: some resources have no URL file extension and were therefore treated as unparseable and unsearchable, even though they may become parseable after loading, once their MIME type is known (a sketch of the new flow follows below)
- this partly fixes the problems reported at http://forum.yacy-websuche.de/viewtopic.php?p=20300#p20300, but more changes are necessary to get all expected search results
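
The snippet-fetch change means parseability is now decided after the response arrives, when the MIME type is known, instead of from the URL file extension. A minimal sketch of the intended flow, using only method names that appear in the hunks below; the wiring around them is illustrative, not the actual snippet-fetch implementation, and the getMimeType() accessor on Response is an assumption (the removed HTTPLoader code read the type via res.getResponseHeader().mime()):

    // load unconditionally; URLs without a parseable file extension are no longer rejected up front
    Response response = sb.loader.load(request, maxFileSize);

    // decide parseability from the response MIME type instead of the URL extension;
    // TextParser.supports returns null when the content is supported, else an error string
    String supportError = TextParser.supports(request.url(), response.getMimeType()); // getMimeType() assumed
    if (supportError == null) {
        // the content turned out to be parseable: proceed with snippet extraction and search
    }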

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6926 6c8d7289-2bf4-0310-a012-ef5d649a1542
Author: orbiter (15 years ago)
Commit: 87087f12fe, parent 7ddb70e7c6

@@ -563,7 +563,7 @@ public class CrawlQueues {
             try {
                 request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
                 final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
-                Response response = sb.loader.load(request, true, maxFileSize);
+                Response response = sb.loader.load(request, maxFileSize);
                 if (response == null) {
                     request.setStatus("error", WorkflowJob.STATUS_FINISHED);
                     if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");

@@ -28,7 +28,6 @@ import java.io.IOException;
 import java.util.Date;
 import net.yacy.cora.document.MultiProtocolURI;
-import net.yacy.document.TextParser;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.repository.Blacklist;

@@ -75,14 +74,14 @@ public final class HTTPLoader {
         this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000);
     }

-    public Response load(final Request entry, final boolean acceptOnlyParseable, long maxFileSize) throws IOException {
+    public Response load(final Request entry, long maxFileSize) throws IOException {
         long start = System.currentTimeMillis();
-        Response doc = load(entry, acceptOnlyParseable, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize);
+        Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize);
         Latency.update(entry.url(), System.currentTimeMillis() - start);
         return doc;
     }

-    private Response load(final Request request, boolean acceptOnlyParseable, final int retryCount, final long maxFileSize) throws IOException {
+    private Response load(final Request request, final int retryCount, final long maxFileSize) throws IOException {
         if (retryCount < 0) {
             sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded");

@@ -96,15 +95,6 @@ public final class HTTPLoader {
         final boolean ssl = request.url().getProtocol().equals("https");
         if (port < 0) port = (ssl) ? 443 : 80;

-        // if not the right file type then reject file
-        if (acceptOnlyParseable) {
-            String supportError = TextParser.supportsExtension(request.url());
-            if (supportError != null) {
-                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, supportError);
-                throw new IOException("REJECTED WRONG EXTENSION TYPE: " + supportError);
-            }
-        }

         // check if url is in blacklist
         final String hostlow = host.toLowerCase();
         if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {

@@ -138,15 +128,6 @@ public final class HTTPLoader {
             if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
                 // the transfer is ok

-                if (acceptOnlyParseable) {
-                    // if the response has not the right file type then reject file
-                    String supportError = TextParser.supports(request.url(), res.getResponseHeader().mime());
-                    if (supportError != null) {
-                        sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, supportError);
-                        throw new IOException("REJECTED WRONG MIME TYPE, mime = " + res.getResponseHeader().mime() + ": " + supportError);
-                    }
-                }

                 // we write the new cache entry to file system directly
                 res.setAccountingName("CRAWLER");
                 final byte[] responseBody = res.getData();

@@ -202,7 +183,7 @@ public final class HTTPLoader {
                     // retry crawling with new url
                     request.redirectURL(redirectionUrl);
-                    return load(request, acceptOnlyParseable, retryCount - 1, maxFileSize);
+                    return load(request, retryCount - 1, maxFileSize);
                 }
             } else {
                 // if the response has not the right response type then reject file

@@ -367,7 +367,7 @@ public final class Switchboard extends serverSwitch {
                 indexSegments.segment(Segments.Process.LOCALCRAWLING),
                 peers,
                 true,
-                30000);
+                10000);

         // set up local robots.txt
         this.robotstxtConfig = RobotsTxtConfig.init(this);

@@ -894,7 +894,7 @@ public final class Switchboard extends serverSwitch {
                 indexSegments.segment(Segments.Process.LOCALCRAWLING),
                 peers,
                 true,
-                30000);
+                10000);

         // create new web structure
         this.webStructure = new WebStructureGraph(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", CRDistribution.CR_OWN), new File(queuesRoot, "webStructure.map"));

@@ -368,7 +368,7 @@ public final class yacyClient {
     }

     @SuppressWarnings("unchecked")
-    public static String[] search(
+    public static int search(
             final yacySeed mySeed,
             final String wordhashes,
             final String excludehashes,

@@ -392,7 +392,6 @@ public final class yacyClient {
             final Bitfield constraint
     ) {
         // send a search request to peer with remote Hash
-        // this mainly converts the words into word hashes

         // INPUT:
         // iam : complete seed of the requesting peer

@@ -437,7 +436,7 @@ public final class yacyClient {
         } catch (final IOException e) {
             yacyCore.log.logInfo("SEARCH failed, Peer: " + target.hash + ":" + target.getName() + " (" + e.getMessage() + "), score=" + target.selectscore);
             //yacyCore.peerActions.peerDeparture(target, "search request to peer created io exception: " + e.getMessage());
-            return null;
+            return -1;
         }

         if (result == null || result.isEmpty()) {

@@ -447,7 +446,7 @@ public final class yacyClient {
                     + target.getName()
                     + " (zero response), score="
                     + target.selectscore);
-            return null;
+            return -1;
         }

         // compute all computation times
@@ -468,14 +467,14 @@ public final class yacyClient {
         // now create a plasmaIndex out of this result
         // System.out.println("yacyClient: " + ((urlhashes.length() == 0) ? "primary" : "secondary")+ " search result = " + result.toString()); // debug

-        int results = 0, joincount = 0;
+        int urlcount = 0, joincount = 0;
         try {
-            results = Integer.parseInt(result.get("count"));
-            joincount = Integer.parseInt(result.get("joincount"));
+            joincount = Integer.parseInt(result.get("joincount")); // the complete number of hits at remote site
+            urlcount = Integer.parseInt(result.get("count")); // the number of hits that are returned in the result list
         } catch (final NumberFormatException e) {
             yacyCore.log.logInfo("SEARCH failed FROM " + target.hash + ":" + target.getName() + ", wrong output format: " + e.getMessage());
             //yacyCore.peerActions.peerDeparture(target, "search request to peer created number format exception");
-            return null;
+            return -1;
         }
         // System.out.println("***result count " + results);

@@ -488,14 +487,13 @@ public final class yacyClient {
                 container[i] = ReferenceContainer.emptyContainer(Segment.wordReferenceFactory, wordhashes.substring(i * Word.commonHashLength, (i + 1) * Word.commonHashLength).getBytes(), count);
             } catch (RowSpaceExceededException e) {
                 Log.logException(e);
-                return null;
+                return -1;
             }
         }

         // insert results to containers
         URIMetadataRow urlEntry;
-        final String[] urls = new String[results];
-        for (int n = 0; n < results; n++) {
+        for (int n = 0; n < urlcount; n++) {
             // get one single search result
             urlEntry = URIMetadataRow.importEntry(result.get("resource" + n));
             if (urlEntry == null) continue;
@@ -504,27 +502,26 @@ public final class yacyClient {
             final URIMetadataRow.Components metadata = urlEntry.metadata();
             if (metadata == null) continue;
             if (blacklist.isListed(Blacklist.BLACKLIST_SEARCH, metadata.url())) {
-                if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search (client): filtered blacklisted url " + metadata.url() + " from peer " + target.getName());
+                if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search: filtered blacklisted url " + metadata.url() + " from peer " + target.getName());
                 continue; // block with backlist
             }
             final String urlRejectReason = Switchboard.getSwitchboard().crawlStacker.urlInAcceptedDomain(metadata.url());
             if (urlRejectReason != null) {
-                if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search (client): rejected url '" + metadata.url() + "' (" + urlRejectReason + ") from peer " + target.getName());
+                if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search: rejected url '" + metadata.url() + "' (" + urlRejectReason + ") from peer " + target.getName());
                 continue; // reject url outside of our domain
             }

             // save the url entry
-            Reference entry;
-            if (urlEntry.word() == null) {
-                if (yacyCore.log.isWarning()) yacyCore.log.logWarning("remote search (client): no word attached from peer " + target.getName() + ", version " + target.getVersion());
+            Reference entry = urlEntry.word();
+            if (entry == null) {
+                if (yacyCore.log.isWarning()) yacyCore.log.logWarning("remote search: no word attached from peer " + target.getName() + ", version " + target.getVersion());
                 continue; // no word attached
             }

             // the search-result-url transports all the attributes of word indexes
-            entry = urlEntry.word();
             if (!Base64Order.enhancedCoder.equal(entry.metadataHash(), urlEntry.hash())) {
-                if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search (client): url-hash " + new String(urlEntry.hash()) + " does not belong to word-attached-hash " + new String(entry.metadataHash()) + "; url = " + metadata.url() + " from peer " + target.getName());
+                yacyCore.log.logInfo("remote search: url-hash " + new String(urlEntry.hash()) + " does not belong to word-attached-hash " + new String(entry.metadataHash()) + "; url = " + metadata.url() + " from peer " + target.getName());
                 continue; // spammed
             }
@@ -554,23 +551,30 @@ public final class yacyClient {
                         break;
                     }
                 }
-                // store url hash for statistics
-                urls[n] = new String(urlEntry.hash());
             }

             // store remote result to local result container
             synchronized (containerCache) {
                 // insert one container into the search result buffer
-                containerCache.add(container[0], false, joincount); // one is enough
+                containerCache.add(container[0], false, joincount); // one is enough, only the references are used, not the word
+            }

-                // integrate remote topwords
-                final String references = result.get("references");
-                yacyCore.log.logInfo("remote search (client): peer " + target.getName() + " sent references " + references);
-                if (references != null) {
-                    // add references twice, so they can be counted (must have at least 2 entries)
-                    containerCache.addTopic(references.split(","));
-                    containerCache.addTopic(references.split(","));
+            // insert the containers to the index
+            for (ReferenceContainer<WordReference> c: container) try {
+                indexSegment.termIndex().add(c);
+            } catch (Exception e) {
+                Log.logException(e);
+            }
+            yacyCore.log.logInfo("remote search: peer " + target.getName() + " sent " + container[0].size() + "/" + joincount + " references for joined word queries");
+
+            // integrate remote top-words/topics
+            final String references = result.get("references");
+            if (references != null && references.length() > 0) {
+                yacyCore.log.logInfo("remote search: peer " + target.getName() + " sent topics: " + references);
+                // add references twice, so they can be counted (must have at least 2 entries)
+                String[] rs = references.split(",");
+                synchronized (containerCache) {
+                    containerCache.addTopic(rs);
+                    containerCache.addTopic(rs);
                 }
             }
@@ -592,7 +596,7 @@ public final class yacyClient {
                         ci = new ByteBuffer(entry.getValue().getBytes("UTF-8"));
                     } catch (UnsupportedEncodingException e) {
                         Log.logException(e);
-                        return null;
+                        return -1;
                     }
                     //System.out.println("DEBUG-ABSTRACTFETCH: for word hash " + wordhash + " received " + ci.toString());
                     ReferenceContainer.decompressIndex(singleAbstract, ci, target.hash);

@@ -600,14 +604,8 @@ public final class yacyClient {
                     }
                 }
             }
+            if (abstractCache.size() > 0) yacyCore.log.logInfo("remote search: peer " + target.getName() + " sent " + abstractCache.size() + " index abstracts");
         }

-        // insert the containers to the index
-        for (int m = 0; m < words; m++) try {
-            indexSegment.termIndex().add(container[m]);
-        } catch (Exception e) {
-            Log.logException(e);
-        }

         // generate statistics
         long searchtime;
@@ -617,7 +615,7 @@ public final class yacyClient {
             searchtime = totalrequesttime;
         }
         if (yacyCore.log.isFine()) yacyCore.log.logFine("SEARCH "
-                + results
+                + urlcount
                 + " URLS FROM "
                 + target.hash
                 + ":"

@@ -627,7 +625,7 @@ public final class yacyClient {
                 + ", searchtime=" + searchtime + ", netdelay="
                 + (totalrequesttime - searchtime) + ", references="
                 + result.get("references"));
-        return urls;
+        return urlcount;
     }

     public static Map<String, String> permissionMessage(final yacySeedDB seedDB, final String targetHash) {
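
Since search now returns an int instead of a String[] of url hashes, callers can distinguish a failed request (-1) from an empty but successful response (0) and no longer need the hash array just to count links. A hedged sketch of the new calling convention, with the parameter list abbreviated; the real call site is in the yacySearch hunk below:

    int urlcount = yacyClient.search(/* parameters as declared above */);
    if (urlcount >= 0) {
        // success: urlcount result entries were received and merged into containerCache and the index
        peers.mySeed().incRI(urlcount);
        peers.mySeed().incRU(urlcount);
    } else {
        // -1 signals a failed request: I/O error, empty response, or malformed result
    }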

@@ -56,7 +56,7 @@ public class yacySearch extends Thread {
     final private Map<String, TreeMap<String, String>> abstractCache;
     final private Blacklist blacklist;
     final private yacySeed targetPeer;
-    private String[] urls;
+    private int urls;
     private final int count, maxDistance;
     final private RankingProfile rankingProfile;
     final private Pattern prefer, filter;

@@ -103,7 +103,7 @@ public class yacySearch extends Thread {
         this.abstractCache = abstractCache;
         this.blacklist = blacklist;
         this.targetPeer = targetPeer;
-        this.urls = null;
+        this.urls = -1;
         this.count = count;
         this.maxDistance = maxDistance;
         this.rankingProfile = rankingProfile;
@@ -119,13 +119,11 @@ public class yacySearch extends Thread {
                 count, maxDistance, global, partitions,
                 targetPeer, indexSegment, crawlResults, containerCache, abstractCache,
                 blacklist, rankingProfile, constraint);
-        if (urls != null) {
+        if (urls >= 0) {
             // urls is an array of url hashes. this is only used for log output
-            final StringBuilder urllist = new StringBuilder(this.urls.length * 13);
-            for (int i = 0; i < this.urls.length; i++) urllist.append(this.urls[i]).append(' ');
-            yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + urls.length + " links for word hash " + wordhashes + ": " + new String(urllist));
-            peers.mySeed().incRI(urls.length);
-            peers.mySeed().incRU(urls.length);
+            //yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + urls.length + " links for word hash " + wordhashes + ": " + new String(urllist));
+            peers.mySeed().incRI(urls);
+            peers.mySeed().incRU(urls);
         } else {
             yacyCore.log.logInfo("REMOTE SEARCH - no answer from remote peer " + targetPeer.hash + ":" + targetPeer.getName());
         }

@@ -144,7 +142,7 @@ public class yacySearch extends Thread {
     }

     public int links() {
-        return this.urls.length;
+        return this.urls;
     }

     public int count() {
@@ -218,7 +216,8 @@ public class yacySearch extends Thread {
             seed = dhtEnum.next();
             if (seed == null) continue;
             if (seed.matchPeerTags(wordhashes)) {
-                Log.logInfo("PLASMA", "selectPeers/PeerTags: " + seed.hash + ":" + seed.getName() + ", is specialized peer for " + seed.getPeerTags().toString());
+                String specialized = seed.getPeerTags().toString();
+                if (!specialized.equals("[*]")) Log.logInfo("PLASMA", "selectPeers/PeerTags: " + seed.hash + ":" + seed.getName() + ", is specialized peer for " + specialized);
                 regularSeeds.remove(seed.hash);
                 ranking.deleteScore(seed.hash);
                 matchingSeeds.put(seed.hash, seed);
@@ -335,7 +334,7 @@ public class yacySearch extends Thread {
     public static int collectedLinks(final yacySearch[] searchThreads) {
         int links = 0;
         for (int i = 0; i < searchThreads.length; i++) {
-            if (!(searchThreads[i].isAlive())) links += searchThreads[i].urls.length;
+            if (!(searchThreads[i].isAlive()) && searchThreads[i].urls > 0) links += searchThreads[i].urls;
         }
         return links;
     }

@@ -69,6 +69,7 @@ public class htmlParser extends AbstractParser implements Idiom {
         SUPPORTED_EXTENSIONS.add("cfm");
         SUPPORTED_EXTENSIONS.add("asp");
         SUPPORTED_EXTENSIONS.add("aspx");
+        SUPPORTED_EXTENSIONS.add("tex");
         SUPPORTED_EXTENSIONS.add("txt");
         SUPPORTED_EXTENSIONS.add("jsp");
         SUPPORTED_EXTENSIONS.add("pl");

@@ -77,6 +78,7 @@ public class htmlParser extends AbstractParser implements Idiom {
         SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
         SUPPORTED_MIME_TYPES.add("application/xhtml+xml");
         SUPPORTED_MIME_TYPES.add("application/x-httpd-php");
+        SUPPORTED_MIME_TYPES.add("application/x-tex");
         SUPPORTED_MIME_TYPES.add("text/plain");
         SUPPORTED_MIME_TYPES.add("text/sgml");
         SUPPORTED_MIME_TYPES.add("text/csv");

@@ -393,6 +393,16 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
     public static BlockingQueue<WordReferenceVars> transform(ReferenceContainer<WordReference> container) {
         LinkedBlockingQueue<WordReferenceVars> out = new LinkedBlockingQueue<WordReferenceVars>();
+        if (container.size() <= 100) {
+            // transform without concurrency to omit thread creation overhead
+            for (Row.Entry entry: container) try {
+                out.put(new WordReferenceVars(new WordReferenceRow(entry)));
+            } catch (InterruptedException e) {}
+            try {
+                out.put(WordReferenceVars.poison);
+            } catch (InterruptedException e) {}
+            return out;
+        }
         Thread distributor = new TransformDistributor(container, out);
         distributor.start();
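
The new fast path fills the queue synchronously and terminates it with the same poison sentinel the concurrent TransformDistributor path uses, so consumers need no changes. A hedged consumer sketch; the poison-termination convention is inferred from the inserted code above, and poison being accessible outside the class is an assumption:

    BlockingQueue<WordReferenceVars> vars = WordReferenceVars.transform(container);
    WordReferenceVars v;
    try {
        while ((v = vars.take()) != WordReferenceVars.poison) { // poison marks end of stream
            // normalize / rank the reference here
        }
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt(); // restore interrupt status
    }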

@@ -104,7 +104,7 @@ public final class LoaderDispatcher {
             final boolean forText,
             final boolean global,
             final long maxFileSize) throws IOException {
-        return load(request(url, forText, global), forText, maxFileSize);
+        return load(request(url, forText, global), maxFileSize);
     }

     /**

@@ -122,12 +122,12 @@ public final class LoaderDispatcher {
             final boolean global,
             CrawlProfile.CacheStrategy cacheStratgy,
             long maxFileSize) throws IOException {
-        return load(request(url, forText, global), forText, cacheStratgy, maxFileSize);
+        return load(request(url, forText, global), cacheStratgy, maxFileSize);
     }

     public void load(final DigestURI url, CrawlProfile.CacheStrategy cacheStratgy, long maxFileSize, File targetFile) throws IOException {
-        byte[] b = load(request(url, false, true), false, cacheStratgy, maxFileSize).getContent();
+        byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize).getContent();
         if (b == null) throw new IOException("load == null");
         File tmp = new File(targetFile.getAbsolutePath() + ".tmp");

@@ -169,14 +169,14 @@ public final class LoaderDispatcher {
                 0);
     }

-    public Response load(final Request request, final boolean acceptOnlyParseable, long maxFileSize) throws IOException {
+    public Response load(final Request request, long maxFileSize) throws IOException {
         CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
         CrawlProfile.CacheStrategy cacheStrategy = CrawlProfile.CacheStrategy.IFEXIST;
         if (crawlProfile != null) cacheStrategy = crawlProfile.cacheStrategy();
-        return load(request, acceptOnlyParseable, cacheStrategy, maxFileSize);
+        return load(request, cacheStrategy, maxFileSize);
     }

-    public Response load(final Request request, final boolean acceptOnlyParseable, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
+    public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
         // get the protocol of the next URL
         final String protocol = request.url().getProtocol();
         final String host = request.url().getHost();

@@ -258,7 +258,7 @@ public final class LoaderDispatcher {
         // load resource from the internet
         Response response = null;
-        if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable, maxFileSize);
+        if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, maxFileSize);
         if (protocol.equals("ftp")) response = ftpLoader.load(request, true);
         if (protocol.equals("smb")) response = smbLoader.load(request, true);
         if (protocol.equals("file")) response = fileLoader.load(request, true);
