- reviewed the remote search process and improved some data structures and synchronization in several places

- removed concurrency overhead for small numbers of index normalizations, as they occur during remote search; containers with 100 entries or fewer are now transformed without spawning threads (a generic sketch of this cutoff pattern follows the WordReferenceVars hunk below)
- removed the 'load only parseable' constraint for snippet fetching: some resources carry no file extension in their URL and were therefore treated as not parseable and not searchable, although they may turn out to be parseable once loaded, when their MIME type is known (the sketch below the commit metadata illustrates the idea)
- this partially fixes the problems reported at http://forum.yacy-websuche.de/viewtopic.php?p=20300#p20300, but more changes are necessary to get all expected search results

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6926 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 15 years ago
parent 7ddb70e7c6
commit 87087f12fe
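The 'load only parseable' removal is the conceptual core of this commit: parseability used to be guessed from the URL's file extension before the fetch, so extension-less resources were rejected before they could ever be loaded. The following minimal sketch is not YaCy code (MimeAfterLoad and its supports() helper are hypothetical stand-ins; YaCy itself calls TextParser.supports(url, mime) once the response headers are available) and only illustrates deferring the decision until the MIME type is known:

import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class MimeAfterLoad {

    // hypothetical stand-in for TextParser.supports(url, mime); null means "parseable"
    static String supports(String mime) {
        return (mime.startsWith("text/") || mime.contains("html")) ? null : "no parser for " + mime;
    }

    public static void main(String[] args) throws IOException, InterruptedException {
        HttpClient client = HttpClient.newHttpClient();
        // a URL without any file extension; an extension-based check could only guess here
        HttpRequest req = HttpRequest.newBuilder(URI.create("http://example.org/article")).build();
        HttpResponse<byte[]> res = client.send(req, HttpResponse.BodyHandlers.ofByteArray());
        String mime = res.headers().firstValue("Content-Type").orElse("application/octet-stream");
        // decide parseability only now, when the MIME type is actually known
        String error = supports(mime);
        if (error != null) {
            System.out.println("rejected after load: " + error);
        } else {
            System.out.println("parseable, " + res.body().length + " bytes");
        }
    }
}

The trade-off is that an unparseable resource is now fetched before it can be rejected, which costs some bandwidth but no longer drops parseable documents whose URLs lack an extension.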

@@ -563,7 +563,7 @@ public class CrawlQueues {
             try {
                 request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
                 final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
-                Response response = sb.loader.load(request, true, maxFileSize);
+                Response response = sb.loader.load(request, maxFileSize);
                 if (response == null) {
                     request.setStatus("error", WorkflowJob.STATUS_FINISHED);
                     if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");

@@ -28,7 +28,6 @@ import java.io.IOException;
 import java.util.Date;
 import net.yacy.cora.document.MultiProtocolURI;
-import net.yacy.document.TextParser;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.repository.Blacklist;
@@ -75,14 +74,14 @@ public final class HTTPLoader {
         this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000);
     }

-    public Response load(final Request entry, final boolean acceptOnlyParseable, long maxFileSize) throws IOException {
+    public Response load(final Request entry, long maxFileSize) throws IOException {
         long start = System.currentTimeMillis();
-        Response doc = load(entry, acceptOnlyParseable, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize);
+        Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize);
         Latency.update(entry.url(), System.currentTimeMillis() - start);
         return doc;
     }

-    private Response load(final Request request, boolean acceptOnlyParseable, final int retryCount, final long maxFileSize) throws IOException {
+    private Response load(final Request request, final int retryCount, final long maxFileSize) throws IOException {
         if (retryCount < 0) {
             sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded");
@@ -96,15 +95,6 @@ public final class HTTPLoader {
         final boolean ssl = request.url().getProtocol().equals("https");
         if (port < 0) port = (ssl) ? 443 : 80;

-        // if not the right file type then reject file
-        if (acceptOnlyParseable) {
-            String supportError = TextParser.supportsExtension(request.url());
-            if (supportError != null) {
-                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, supportError);
-                throw new IOException("REJECTED WRONG EXTENSION TYPE: " + supportError);
-            }
-        }
-
         // check if url is in blacklist
         final String hostlow = host.toLowerCase();
         if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
@@ -138,15 +128,6 @@ public final class HTTPLoader {
         if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
             // the transfer is ok

-            if (acceptOnlyParseable) {
-                // if the response has not the right file type then reject file
-                String supportError = TextParser.supports(request.url(), res.getResponseHeader().mime());
-                if (supportError != null) {
-                    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, supportError);
-                    throw new IOException("REJECTED WRONG MIME TYPE, mime = " + res.getResponseHeader().mime() + ": " + supportError);
-                }
-            }
-
             // we write the new cache entry to file system directly
             res.setAccountingName("CRAWLER");
             final byte[] responseBody = res.getData();
@@ -202,7 +183,7 @@ public final class HTTPLoader {
                 // retry crawling with new url
                 request.redirectURL(redirectionUrl);
-                return load(request, acceptOnlyParseable, retryCount - 1, maxFileSize);
+                return load(request, retryCount - 1, maxFileSize);
             }
         } else {
             // if the response has not the right response type then reject file

@@ -367,7 +367,7 @@ public final class Switchboard extends serverSwitch {
                 indexSegments.segment(Segments.Process.LOCALCRAWLING),
                 peers,
                 true,
-                30000);
+                10000);

         // set up local robots.txt
         this.robotstxtConfig = RobotsTxtConfig.init(this);
@@ -894,7 +894,7 @@ public final class Switchboard extends serverSwitch {
                 indexSegments.segment(Segments.Process.LOCALCRAWLING),
                 peers,
                 true,
-                30000);
+                10000);

         // create new web structure
         this.webStructure = new WebStructureGraph(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", CRDistribution.CR_OWN), new File(queuesRoot, "webStructure.map"));

@@ -368,7 +368,7 @@ public final class yacyClient {
     }

     @SuppressWarnings("unchecked")
-    public static String[] search(
+    public static int search(
             final yacySeed mySeed,
             final String wordhashes,
             final String excludehashes,
@@ -392,7 +392,6 @@ public final class yacyClient {
             final Bitfield constraint
     ) {
         // send a search request to peer with remote Hash
-        // this mainly converts the words into word hashes
         // INPUT:
         // iam : complete seed of the requesting peer
@@ -437,7 +436,7 @@ public final class yacyClient {
         } catch (final IOException e) {
             yacyCore.log.logInfo("SEARCH failed, Peer: " + target.hash + ":" + target.getName() + " (" + e.getMessage() + "), score=" + target.selectscore);
             //yacyCore.peerActions.peerDeparture(target, "search request to peer created io exception: " + e.getMessage());
-            return null;
+            return -1;
         }

         if (result == null || result.isEmpty()) {
@@ -447,7 +446,7 @@ public final class yacyClient {
                     + target.getName()
                     + " (zero response), score="
                     + target.selectscore);
-            return null;
+            return -1;
         }

         // compute all computation times
@@ -468,14 +467,14 @@ public final class yacyClient {
         // now create a plasmaIndex out of this result
         // System.out.println("yacyClient: " + ((urlhashes.length() == 0) ? "primary" : "secondary")+ " search result = " + result.toString()); // debug
-        int results = 0, joincount = 0;
+        int urlcount = 0, joincount = 0;
         try {
-            results = Integer.parseInt(result.get("count"));
-            joincount = Integer.parseInt(result.get("joincount"));
+            joincount = Integer.parseInt(result.get("joincount")); // the complete number of hits at remote site
+            urlcount = Integer.parseInt(result.get("count")); // the number of hits that are returned in the result list
         } catch (final NumberFormatException e) {
             yacyCore.log.logInfo("SEARCH failed FROM " + target.hash + ":" + target.getName() + ", wrong output format: " + e.getMessage());
             //yacyCore.peerActions.peerDeparture(target, "search request to peer created number format exception");
-            return null;
+            return -1;
         }

         // System.out.println("***result count " + results);
@@ -488,14 +487,13 @@ public final class yacyClient {
                 container[i] = ReferenceContainer.emptyContainer(Segment.wordReferenceFactory, wordhashes.substring(i * Word.commonHashLength, (i + 1) * Word.commonHashLength).getBytes(), count);
             } catch (RowSpaceExceededException e) {
                 Log.logException(e);
-                return null;
+                return -1;
             }
         }

         // insert results to containers
         URIMetadataRow urlEntry;
-        final String[] urls = new String[results];
-        for (int n = 0; n < results; n++) {
+        for (int n = 0; n < urlcount; n++) {
             // get one single search result
             urlEntry = URIMetadataRow.importEntry(result.get("resource" + n));
             if (urlEntry == null) continue;
@@ -504,27 +502,26 @@ public final class yacyClient {
             final URIMetadataRow.Components metadata = urlEntry.metadata();
             if (metadata == null) continue;
             if (blacklist.isListed(Blacklist.BLACKLIST_SEARCH, metadata.url())) {
-                if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search (client): filtered blacklisted url " + metadata.url() + " from peer " + target.getName());
+                if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search: filtered blacklisted url " + metadata.url() + " from peer " + target.getName());
                 continue; // block with backlist
             }

             final String urlRejectReason = Switchboard.getSwitchboard().crawlStacker.urlInAcceptedDomain(metadata.url());
             if (urlRejectReason != null) {
-                if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search (client): rejected url '" + metadata.url() + "' (" + urlRejectReason + ") from peer " + target.getName());
+                if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search: rejected url '" + metadata.url() + "' (" + urlRejectReason + ") from peer " + target.getName());
                 continue; // reject url outside of our domain
             }

             // save the url entry
-            Reference entry;
-            if (urlEntry.word() == null) {
-                if (yacyCore.log.isWarning()) yacyCore.log.logWarning("remote search (client): no word attached from peer " + target.getName() + ", version " + target.getVersion());
+            Reference entry = urlEntry.word();
+            if (entry == null) {
+                if (yacyCore.log.isWarning()) yacyCore.log.logWarning("remote search: no word attached from peer " + target.getName() + ", version " + target.getVersion());
                 continue; // no word attached
             }

             // the search-result-url transports all the attributes of word indexes
-            entry = urlEntry.word();
             if (!Base64Order.enhancedCoder.equal(entry.metadataHash(), urlEntry.hash())) {
-                if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search (client): url-hash " + new String(urlEntry.hash()) + " does not belong to word-attached-hash " + new String(entry.metadataHash()) + "; url = " + metadata.url() + " from peer " + target.getName());
+                yacyCore.log.logInfo("remote search: url-hash " + new String(urlEntry.hash()) + " does not belong to word-attached-hash " + new String(entry.metadataHash()) + "; url = " + metadata.url() + " from peer " + target.getName());
                 continue; // spammed
             }
@@ -554,23 +551,30 @@ public final class yacyClient {
                     break;
                 }
             }
-            // store url hash for statistics
-            urls[n] = new String(urlEntry.hash());
         }

         // store remote result to local result container
         synchronized (containerCache) {
-            // insert one container into the search result buffer
-            containerCache.add(container[0], false, joincount); // one is enough
-
-            // integrate remote topwords
-            final String references = result.get("references");
-            yacyCore.log.logInfo("remote search (client): peer " + target.getName() + " sent references " + references);
-            if (references != null) {
-                // add references twice, so they can be counted (must have at least 2 entries)
-                containerCache.addTopic(references.split(","));
-                containerCache.addTopic(references.split(","));
-            }
+            containerCache.add(container[0], false, joincount); // one is enough, only the references are used, not the word
         }

+        // insert the containers to the index
+        for (ReferenceContainer<WordReference> c: container) try {
+            indexSegment.termIndex().add(c);
+        } catch (Exception e) {
+            Log.logException(e);
+        }
+        yacyCore.log.logInfo("remote search: peer " + target.getName() + " sent " + container[0].size() + "/" + joincount + " references for joined word queries");
+
+        // integrate remote top-words/topics
+        final String references = result.get("references");
+        if (references != null && references.length() > 0) {
+            yacyCore.log.logInfo("remote search: peer " + target.getName() + " sent topics: " + references);
+            // add references twice, so they can be counted (must have at least 2 entries)
+            String[] rs = references.split(",");
+            synchronized (containerCache) {
+                containerCache.addTopic(rs);
+                containerCache.addTopic(rs);
+            }
+        }
@@ -592,7 +596,7 @@ public final class yacyClient {
                         ci = new ByteBuffer(entry.getValue().getBytes("UTF-8"));
                     } catch (UnsupportedEncodingException e) {
                         Log.logException(e);
-                        return null;
+                        return -1;
                     }
                     //System.out.println("DEBUG-ABSTRACTFETCH: for word hash " + wordhash + " received " + ci.toString());
                     ReferenceContainer.decompressIndex(singleAbstract, ci, target.hash);
@@ -600,14 +604,8 @@ public final class yacyClient {
                     }
                 }
             }
+            if (abstractCache.size() > 0) yacyCore.log.logInfo("remote search: peer " + target.getName() + " sent " + abstractCache.size() + " index abstracts");
         }

-        // insert the containers to the index
-        for (int m = 0; m < words; m++) try {
-            indexSegment.termIndex().add(container[m]);
-        } catch (Exception e) {
-            Log.logException(e);
-        }
-
         // generate statistics
         long searchtime;
@@ -617,7 +615,7 @@ public final class yacyClient {
             searchtime = totalrequesttime;
         }
         if (yacyCore.log.isFine()) yacyCore.log.logFine("SEARCH "
-                + results
+                + urlcount
                 + " URLS FROM "
                 + target.hash
                 + ":"
@@ -627,7 +625,7 @@ public final class yacyClient {
                 + ", searchtime=" + searchtime + ", netdelay="
                 + (totalrequesttime - searchtime) + ", references="
                 + result.get("references"));
-        return urls;
+        return urlcount;
     }

     public static Map<String, String> permissionMessage(final yacySeedDB seedDB, final String targetHash) {
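Because the return type of yacyClient.search changed from String[] to int, callers no longer receive the individual url hashes (those were only used for log output and statistics); they receive a count, with -1 reserved for a failed request so that a successful but empty answer (0) stays distinguishable. A sketch of the convention, using a hypothetical handler class whose messages are modeled on the log lines in this diff:

public class SearchResultHandling {

    // -1: request failed; 0: peer answered but sent nothing; n > 0: n URLs received
    static void handle(int urlcount, String peerName) {
        if (urlcount < 0) {
            System.out.println("REMOTE SEARCH - no answer from remote peer " + peerName);
        } else {
            System.out.println("REMOTE SEARCH - remote peer " + peerName + " contributed " + urlcount + " links");
        }
    }

    public static void main(String[] args) {
        handle(-1, "peerA"); // failed request
        handle(17, "peerB"); // successful answer with 17 URLs
    }
}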

@@ -56,7 +56,7 @@ public class yacySearch extends Thread {
     final private Map<String, TreeMap<String, String>> abstractCache;
     final private Blacklist blacklist;
     final private yacySeed targetPeer;
-    private String[] urls;
+    private int urls;
     private final int count, maxDistance;
     final private RankingProfile rankingProfile;
     final private Pattern prefer, filter;
@@ -103,7 +103,7 @@ public class yacySearch extends Thread {
         this.abstractCache = abstractCache;
         this.blacklist = blacklist;
         this.targetPeer = targetPeer;
-        this.urls = null;
+        this.urls = -1;
         this.count = count;
         this.maxDistance = maxDistance;
         this.rankingProfile = rankingProfile;
@@ -119,13 +119,11 @@ public class yacySearch extends Thread {
                 count, maxDistance, global, partitions,
                 targetPeer, indexSegment, crawlResults, containerCache, abstractCache,
                 blacklist, rankingProfile, constraint);
-        if (urls != null) {
-            // urls is an array of url hashes. this is only used for log output
-            final StringBuilder urllist = new StringBuilder(this.urls.length * 13);
-            for (int i = 0; i < this.urls.length; i++) urllist.append(this.urls[i]).append(' ');
-            yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + urls.length + " links for word hash " + wordhashes + ": " + new String(urllist));
-            peers.mySeed().incRI(urls.length);
-            peers.mySeed().incRU(urls.length);
+        if (urls >= 0) {
+            //yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + urls.length + " links for word hash " + wordhashes + ": " + new String(urllist));
+            peers.mySeed().incRI(urls);
+            peers.mySeed().incRU(urls);
         } else {
             yacyCore.log.logInfo("REMOTE SEARCH - no answer from remote peer " + targetPeer.hash + ":" + targetPeer.getName());
         }
@@ -144,7 +142,7 @@ public class yacySearch extends Thread {
     }

     public int links() {
-        return this.urls.length;
+        return this.urls;
     }

     public int count() {
@@ -218,7 +216,8 @@ public class yacySearch extends Thread {
                 seed = dhtEnum.next();
                 if (seed == null) continue;
                 if (seed.matchPeerTags(wordhashes)) {
-                    Log.logInfo("PLASMA", "selectPeers/PeerTags: " + seed.hash + ":" + seed.getName() + ", is specialized peer for " + seed.getPeerTags().toString());
+                    String specialized = seed.getPeerTags().toString();
+                    if (!specialized.equals("[*]")) Log.logInfo("PLASMA", "selectPeers/PeerTags: " + seed.hash + ":" + seed.getName() + ", is specialized peer for " + specialized);
                     regularSeeds.remove(seed.hash);
                     ranking.deleteScore(seed.hash);
                     matchingSeeds.put(seed.hash, seed);
@@ -335,7 +334,7 @@ public class yacySearch extends Thread {
     public static int collectedLinks(final yacySearch[] searchThreads) {
         int links = 0;
         for (int i = 0; i < searchThreads.length; i++) {
-            if (!(searchThreads[i].isAlive())) links += searchThreads[i].urls.length;
+            if (!(searchThreads[i].isAlive()) && searchThreads[i].urls > 0) links += searchThreads[i].urls;
         }
         return links;
     }

@@ -69,6 +69,7 @@ public class htmlParser extends AbstractParser implements Idiom {
         SUPPORTED_EXTENSIONS.add("cfm");
         SUPPORTED_EXTENSIONS.add("asp");
         SUPPORTED_EXTENSIONS.add("aspx");
+        SUPPORTED_EXTENSIONS.add("tex");
         SUPPORTED_EXTENSIONS.add("txt");
         SUPPORTED_EXTENSIONS.add("jsp");
         SUPPORTED_EXTENSIONS.add("pl");
@@ -77,6 +78,7 @@ public class htmlParser extends AbstractParser implements Idiom {
         SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
         SUPPORTED_MIME_TYPES.add("application/xhtml+xml");
         SUPPORTED_MIME_TYPES.add("application/x-httpd-php");
+        SUPPORTED_MIME_TYPES.add("application/x-tex");
         SUPPORTED_MIME_TYPES.add("text/plain");
         SUPPORTED_MIME_TYPES.add("text/sgml");
         SUPPORTED_MIME_TYPES.add("text/csv");

@@ -393,6 +393,16 @@ public class WordReferenceVars extends AbstractReference implements WordReference
     public static BlockingQueue<WordReferenceVars> transform(ReferenceContainer<WordReference> container) {
         LinkedBlockingQueue<WordReferenceVars> out = new LinkedBlockingQueue<WordReferenceVars>();
+        if (container.size() <= 100) {
+            // transform without concurrency to omit thread creation overhead
+            for (Row.Entry entry: container) try {
+                out.put(new WordReferenceVars(new WordReferenceRow(entry)));
+            } catch (InterruptedException e) {}
+            try {
+                out.put(WordReferenceVars.poison);
+            } catch (InterruptedException e) {}
+            return out;
+        }
         Thread distributor = new TransformDistributor(container, out);
         distributor.start();
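The hunk above skips the concurrent transform when a container holds at most 100 entries, because creating the distributor and worker threads then costs more than the transformation itself. The same cutoff in generic form; this is a sketch, not the YaCy implementation (SEQUENTIAL_LIMIT and POISON stand in for the hard-coded 100 and WordReferenceVars.poison):

import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.function.Function;

public class ThresholdTransform {

    static final int SEQUENTIAL_LIMIT = 100;   // assumed cutoff, mirroring the hunk above
    static final Object POISON = new Object(); // end-of-stream marker, like WordReferenceVars.poison

    static <T> BlockingQueue<Object> transform(List<T> input, Function<T, Object> f) {
        BlockingQueue<Object> out = new LinkedBlockingQueue<Object>();
        if (input.size() <= SEQUENTIAL_LIMIT) {
            // small batch: do the work inline and avoid thread creation overhead
            for (T t : input) out.add(f.apply(t));
            out.add(POISON);
            return out;
        }
        // large batch: hand the work to a producer thread and return immediately
        new Thread(() -> {
            for (T t : input) out.add(f.apply(t));
            out.add(POISON);
        }).start();
        return out;
    }
}

Consumers drain the queue until they see the poison object, so both paths look identical from the caller's side; only the thread cost differs.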

@@ -104,7 +104,7 @@ public final class LoaderDispatcher {
             final boolean forText,
             final boolean global,
             final long maxFileSize) throws IOException {
-        return load(request(url, forText, global), forText, maxFileSize);
+        return load(request(url, forText, global), maxFileSize);
     }

     /**
@@ -122,12 +122,12 @@ public final class LoaderDispatcher {
             final boolean global,
             CrawlProfile.CacheStrategy cacheStratgy,
             long maxFileSize) throws IOException {
-        return load(request(url, forText, global), forText, cacheStratgy, maxFileSize);
+        return load(request(url, forText, global), cacheStratgy, maxFileSize);
     }

     public void load(final DigestURI url, CrawlProfile.CacheStrategy cacheStratgy, long maxFileSize, File targetFile) throws IOException {
-        byte[] b = load(request(url, false, true), false, cacheStratgy, maxFileSize).getContent();
+        byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize).getContent();
         if (b == null) throw new IOException("load == null");
         File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
@@ -169,14 +169,14 @@ public final class LoaderDispatcher {
             0);
     }

-    public Response load(final Request request, final boolean acceptOnlyParseable, long maxFileSize) throws IOException {
+    public Response load(final Request request, long maxFileSize) throws IOException {
         CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
         CrawlProfile.CacheStrategy cacheStrategy = CrawlProfile.CacheStrategy.IFEXIST;
         if (crawlProfile != null) cacheStrategy = crawlProfile.cacheStrategy();
-        return load(request, acceptOnlyParseable, cacheStrategy, maxFileSize);
+        return load(request, cacheStrategy, maxFileSize);
     }

-    public Response load(final Request request, final boolean acceptOnlyParseable, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
+    public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
         // get the protocol of the next URL
         final String protocol = request.url().getProtocol();
         final String host = request.url().getHost();
@@ -258,7 +258,7 @@ public final class LoaderDispatcher {
         // load resource from the internet
         Response response = null;
-        if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable, maxFileSize);
+        if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, maxFileSize);
         if (protocol.equals("ftp")) response = ftpLoader.load(request, true);
         if (protocol.equals("smb")) response = smbLoader.load(request, true);
         if (protocol.equals("file")) response = fileLoader.load(request, true);
