From 87087f12fe9544b28ac07198112e454ccc736aa6 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Thu, 17 Jun 2010 11:59:40 +0000
Subject: [PATCH] - scanned the remote search process and enhanced some data
 structures and synchronization here and there

- removed the concurrency overhead for small numbers of index normalizations,
  such as those that occur during remote search
- removed the 'load only parseable' constraint for the snippet fetch: some
  resources carry no URL file extension and were therefore treated as neither
  parseable nor searchable, even though they may turn out to be parseable
  after loading, once their MIME type is known
- this partly fixes some of the problems described in
  http://forum.yacy-websuche.de/viewtopic.php?p=20300#p20300 but more changes
  are necessary to get all expected search results

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6926 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
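Notes:

The removed 'load only parseable' pre-filter rejected a URL before any HTTP
request was made whenever TextParser.supportsExtension() failed, so a resource
without a URL file extension could never become a snippet source. The decision
is now deferred until the response header, and with it the MIME type, is
available. A minimal sketch of such a deferred check follows; it assumes that
LoaderDispatcher.request() is accessible to the caller and that Response
exposes the header MIME type as getMimeType(). The class SnippetSource itself
is hypothetical and not part of this patch:

    import java.io.IOException;

    import de.anomic.crawler.retrieval.Response;
    import net.yacy.document.TextParser;
    import net.yacy.kelondro.data.meta.DigestURI;
    import net.yacy.repository.LoaderDispatcher;

    public class SnippetSource {

        public static Response fetch(final LoaderDispatcher loader, final DigestURI url, final long maxFileSize) throws IOException {
            // load unconditionally: an extension-less URL is not rejected any more
            final Response response = loader.load(loader.request(url, true, false), maxFileSize);
            if (response == null) return null; // no content, e.g. caused by cache policy
            // the same TextParser check that this patch removes from HTTPLoader,
            // but applied only after the MIME type is known
            final String supportError = TextParser.supports(url, response.getMimeType());
            if (supportError != null) return null; // really not parseable, no snippet
            return response;
        }
    }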
 source/de/anomic/crawler/CrawlQueues.java     |  2 +-
 .../anomic/crawler/retrieval/HTTPLoader.java  | 27 +------
 source/de/anomic/search/Switchboard.java      |  4 +-
 source/de/anomic/yacy/yacyClient.java         | 78 +++++++++----------
 source/de/anomic/yacy/yacySearch.java         | 21 +++--
 .../net/yacy/document/parser/htmlParser.java  |  2 +
 .../kelondro/data/word/WordReferenceVars.java | 10 +++
 .../net/yacy/repository/LoaderDispatcher.java | 14 ++--
 8 files changed, 74 insertions(+), 84 deletions(-)

diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java
index aa1abf100..827db3285 100644
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -563,7 +563,7 @@ public class CrawlQueues {
             try {
                 request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
                 final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
-                Response response = sb.loader.load(request, true, maxFileSize);
+                Response response = sb.loader.load(request, maxFileSize);
                 if (response == null) {
                     request.setStatus("error", WorkflowJob.STATUS_FINISHED);
                     if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");
diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java
index 5d2917e0e..32e21bb88 100644
--- a/source/de/anomic/crawler/retrieval/HTTPLoader.java
+++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java
@@ -28,7 +28,6 @@ import java.io.IOException;
 import java.util.Date;
 
 import net.yacy.cora.document.MultiProtocolURI;
-import net.yacy.document.TextParser;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.repository.Blacklist;
@@ -75,14 +74,14 @@ public final class HTTPLoader {
         this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000);
     }
 
-    public Response load(final Request entry, final boolean acceptOnlyParseable, long maxFileSize) throws IOException {
+    public Response load(final Request entry, long maxFileSize) throws IOException {
         long start = System.currentTimeMillis();
-        Response doc = load(entry, acceptOnlyParseable, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize);
+        Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize);
         Latency.update(entry.url(), System.currentTimeMillis() - start);
         return doc;
     }
 
-    private Response load(final Request request, boolean acceptOnlyParseable, final int retryCount, final long maxFileSize) throws IOException {
+    private Response load(final Request request, final int retryCount, final long maxFileSize) throws IOException {
         
         if (retryCount < 0) {
             sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded");
@@ -96,15 +95,6 @@ public final class HTTPLoader {
         final boolean ssl = request.url().getProtocol().equals("https");
         if (port < 0) port = (ssl) ? 443 : 80;
         
-        // if not the right file type then reject file
-        if (acceptOnlyParseable) {
-            String supportError = TextParser.supportsExtension(request.url());
-            if (supportError != null) {
-                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, supportError);
-                throw new IOException("REJECTED WRONG EXTENSION TYPE: " + supportError);
-            }
-        }
-        
         // check if url is in blacklist
         final String hostlow = host.toLowerCase();
         if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
@@ -138,15 +128,6 @@ public final class HTTPLoader {
         if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
             // the transfer is ok
             
-            if (acceptOnlyParseable) {
-                // if the response has not the right file type then reject file
-                String supportError = TextParser.supports(request.url(), res.getResponseHeader().mime());
-                if (supportError != null) {
-                    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, supportError);
-                    throw new IOException("REJECTED WRONG MIME TYPE, mime = " + res.getResponseHeader().mime() + ": " + supportError);
-                }
-            }
-            
             // we write the new cache entry to file system directly
             res.setAccountingName("CRAWLER");
             final byte[] responseBody = res.getData();
@@ -202,7 +183,7 @@ public final class HTTPLoader {
                 
                 // retry crawling with new url
                 request.redirectURL(redirectionUrl);
-                return load(request, acceptOnlyParseable, retryCount - 1, maxFileSize);
+                return load(request, retryCount - 1, maxFileSize);
             }
         } else {
             // if the response has not the right response type then reject file
diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java
index 08a48caf0..5341253b0 100644
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -367,7 +367,7 @@ public final class Switchboard extends serverSwitch {
                 indexSegments.segment(Segments.Process.LOCALCRAWLING),
                 peers,
                 true,
-                30000);
+                10000);
         
         // set up local robots.txt
         this.robotstxtConfig = RobotsTxtConfig.init(this);
@@ -894,7 +894,7 @@ public final class Switchboard extends serverSwitch {
                 indexSegments.segment(Segments.Process.LOCALCRAWLING),
                 peers,
                 true,
-                30000);
+                10000);
         
         // create new web structure
         this.webStructure = new WebStructureGraph(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", CRDistribution.CR_OWN), new File(queuesRoot, "webStructure.map"));
diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java
index 14a3e6c26..70b46b367 100644
--- a/source/de/anomic/yacy/yacyClient.java
+++ b/source/de/anomic/yacy/yacyClient.java
@@ -368,7 +368,7 @@ public final class yacyClient {
     }
 
     @SuppressWarnings("unchecked")
-    public static String[] search(
+    public static int search(
             final yacySeed mySeed,
             final String wordhashes,
             final String excludehashes,
@@ -392,7 +392,6 @@ public final class yacyClient {
             final Bitfield constraint
     ) {
         // send a search request to peer with remote Hash
-        // this mainly converts the words into word hashes
 
         // INPUT:
         // iam        : complete seed of the requesting peer
@@ -437,7 +436,7 @@ public final class yacyClient {
        } catch (final IOException e) {
            yacyCore.log.logInfo("SEARCH failed, Peer: " + target.hash + ":" + target.getName() + " (" + e.getMessage() + "), score=" + target.selectscore);
            //yacyCore.peerActions.peerDeparture(target, "search request to peer created io exception: " + e.getMessage());
-           return null;
+           return -1;
        }
 
        if (result == null || result.isEmpty()) {
@@ -447,7 +446,7 @@ public final class yacyClient {
                    + target.getName()
                    + " (zero response), score="
                    + target.selectscore);
-           return null;
+           return -1;
        }
 
        // compute all computation times
@@ -468,14 +467,14 @@ public final class yacyClient {
 
        // now create a plasmaIndex out of this result
        // System.out.println("yacyClient: " + ((urlhashes.length() == 0) ? "primary" : "secondary")+ " search result = " + result.toString()); // debug
-       int results = 0, joincount = 0;
+       int urlcount = 0, joincount = 0;
        try {
-           results = Integer.parseInt(result.get("count"));
-           joincount = Integer.parseInt(result.get("joincount"));
+           joincount = Integer.parseInt(result.get("joincount")); // the complete number of hits at remote site
+           urlcount = Integer.parseInt(result.get("count")); // the number of hits that are returned in the result list
        } catch (final NumberFormatException e) {
            yacyCore.log.logInfo("SEARCH failed FROM " + target.hash + ":" + target.getName() + ", wrong output format: " + e.getMessage());
            //yacyCore.peerActions.peerDeparture(target, "search request to peer created number format exception");
-           return null;
+           return -1;
        }
 
        // System.out.println("***result count " + results);
@@ -488,14 +487,13 @@ public final class yacyClient {
                container[i] = ReferenceContainer.emptyContainer(Segment.wordReferenceFactory, wordhashes.substring(i * Word.commonHashLength, (i + 1) * Word.commonHashLength).getBytes(), count);
            } catch (RowSpaceExceededException e) {
                Log.logException(e);
-               return null;
+               return -1;
            }
        }
 
        // insert results to containers
        URIMetadataRow urlEntry;
-       final String[] urls = new String[results];
-       for (int n = 0; n < results; n++) {
+       for (int n = 0; n < urlcount; n++) {
            // get one single search result
            urlEntry = URIMetadataRow.importEntry(result.get("resource" + n));
            if (urlEntry == null) continue;
@@ -504,26 +502,25 @@ public final class yacyClient {
            final URIMetadataRow.Components metadata = urlEntry.metadata();
            if (metadata == null) continue;
            if (blacklist.isListed(Blacklist.BLACKLIST_SEARCH, metadata.url())) {
-               if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search (client): filtered blacklisted url " + metadata.url() + " from peer " + target.getName());
+               if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search: filtered blacklisted url " + metadata.url() + " from peer " + target.getName());
                continue; // block with backlist
            }
 
            final String urlRejectReason = Switchboard.getSwitchboard().crawlStacker.urlInAcceptedDomain(metadata.url());
            if (urlRejectReason != null) {
-               if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search (client): rejected url '" + metadata.url() + "' (" + urlRejectReason + ") from peer " + target.getName());
+               if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search: rejected url '" + metadata.url() + "' (" + urlRejectReason + ") from peer " + target.getName());
                continue; // reject url outside of our domain
            }
 
            // save the url entry
-           Reference entry;
-           if (urlEntry.word() == null) {
-               if (yacyCore.log.isWarning()) yacyCore.log.logWarning("remote search (client): no word attached from peer " + target.getName() + ", version " + target.getVersion());
+           Reference entry = urlEntry.word();
+           if (entry == null) {
+               if (yacyCore.log.isWarning()) yacyCore.log.logWarning("remote search: no word attached from peer " + target.getName() + ", version " + target.getVersion());
                continue; // no word attached
            }
 
            // the search-result-url transports all the attributes of word indexes
-           entry = urlEntry.word();
            if (!Base64Order.enhancedCoder.equal(entry.metadataHash(), urlEntry.hash())) {
-               if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search (client): url-hash " + new String(urlEntry.hash()) + " does not belong to word-attached-hash " + new String(entry.metadataHash()) + "; url = " + metadata.url() + " from peer " + target.getName());
+               yacyCore.log.logInfo("remote search: url-hash " + new String(urlEntry.hash()) + " does not belong to word-attached-hash " + new String(entry.metadataHash()) + "; url = " + metadata.url() + " from peer " + target.getName());
                continue; // spammed
            }
@@ -554,23 +551,30 @@ public final class yacyClient {
                    break;
                }
            }
-           
-           // store url hash for statistics
-           urls[n] = new String(urlEntry.hash());
        }
 
        // store remote result to local result container
        synchronized (containerCache) {
            // insert one container into the search result buffer
-           containerCache.add(container[0], false, joincount); // one is enough
-           
-           // integrate remote topwords
-           final String references = result.get("references");
-           yacyCore.log.logInfo("remote search (client): peer " + target.getName() + " sent references " + references);
-           if (references != null) {
-               // add references twice, so they can be counted (must have at least 2 entries)
-               containerCache.addTopic(references.split(","));
-               containerCache.addTopic(references.split(","));
+           containerCache.add(container[0], false, joincount); // one is enough, only the references are used, not the word
+       }
+       // insert the containers to the index
+       for (ReferenceContainer<WordReference> c: container) try {
+           indexSegment.termIndex().add(c);
+       } catch (Exception e) {
+           Log.logException(e);
+       }
+       yacyCore.log.logInfo("remote search: peer " + target.getName() + " sent " + container[0].size() + "/" + joincount + " references for joined word queries");
+
+       // integrate remote top-words/topics
+       final String references = result.get("references");
+       if (references != null && references.length() > 0) {
+           yacyCore.log.logInfo("remote search: peer " + target.getName() + " sent topics: " + references);
+           // add references twice, so they can be counted (must have at least 2 entries)
+           String[] rs = references.split(",");
+           synchronized (containerCache) {
+               containerCache.addTopic(rs);
+               containerCache.addTopic(rs);
            }
        }
 
@@ -592,7 +596,7 @@ public final class yacyClient {
                        ci = new ByteBuffer(entry.getValue().getBytes("UTF-8"));
                    } catch (UnsupportedEncodingException e) {
                        Log.logException(e);
-                       return null;
+                       return -1;
                    }
                    //System.out.println("DEBUG-ABSTRACTFETCH: for word hash " + wordhash + " received " + ci.toString());
                    ReferenceContainer.decompressIndex(singleAbstract, ci, target.hash);
@@ -600,14 +604,8 @@ public final class yacyClient {
                    }
                }
            }
+           if (abstractCache.size() > 0) yacyCore.log.logInfo("remote search: peer " + target.getName() + " sent " + abstractCache.size() + " index abstracts");
        }
-       
-       // insert the containers to the index
-       for (int m = 0; m < words; m++) try {
-           indexSegment.termIndex().add(container[m]);
-       } catch (Exception e) {
-           Log.logException(e);
-       }
 
        // generate statistics
        long searchtime;
@@ -617,7 +615,7 @@ public final class yacyClient {
            searchtime = totalrequesttime;
        }
        if (yacyCore.log.isFine()) yacyCore.log.logFine("SEARCH "
-               + results
+               + urlcount
                + " URLS FROM "
                + target.hash
                + ":"
@@ -627,7 +625,7 @@ public final class yacyClient {
                + ", searchtime=" + searchtime
                + ", netdelay=" + (totalrequesttime - searchtime)
                + ", references=" + result.get("references"));
-       return urls;
+       return urlcount;
    }
 
    public static Map<String, String> permissionMessage(final yacySeedDB seedDB, final String targetHash) {
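Note on the yacyClient.search() change above: the method now returns the
number of URL references accepted from the remote peer, or -1 on failure,
instead of a String[] of URL hashes; the reference containers are added to the
local index and to the containerCache inside search() itself. A minimal sketch
of a caller under the new contract (the argument list is elided; the error
handling mirrors yacySearch.run() in the next diff):

    // >= 0 : number of url references accepted from the remote peer
    //   -1 : request failed (io exception, empty or malformed response)
    final int found = yacyClient.search(/* unchanged argument list */);
    if (found >= 0) {
        peers.mySeed().incRI(found); // received-index statistics, as in yacySearch.run()
        peers.mySeed().incRU(found); // received-url statistics
    } else {
        yacyCore.log.logInfo("REMOTE SEARCH - no answer from remote peer " + targetPeer.hash + ":" + targetPeer.getName());
    }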
diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java
index 1c764e7ff..19d10ca96 100644
--- a/source/de/anomic/yacy/yacySearch.java
+++ b/source/de/anomic/yacy/yacySearch.java
@@ -56,7 +56,7 @@ public class yacySearch extends Thread {
     final private Map<byte[], Map<byte[], String>> abstractCache;
     final private Blacklist blacklist;
     final private yacySeed targetPeer;
-    private String[] urls;
+    private int urls;
     private final int count, maxDistance;
     final private RankingProfile rankingProfile;
     final private Pattern prefer, filter;
@@ -103,7 +103,7 @@ public class yacySearch extends Thread {
         this.abstractCache = abstractCache;
         this.blacklist = blacklist;
         this.targetPeer = targetPeer;
-        this.urls = null;
+        this.urls = -1;
         this.count = count;
         this.maxDistance = maxDistance;
         this.rankingProfile = rankingProfile;
@@ -119,13 +119,11 @@ public class yacySearch extends Thread {
             urls = yacyClient.search(peers.mySeed(), wordhashes, excludehashes, urlhashes, prefer, filter,
                     count, maxDistance, global, partitions, targetPeer, indexSegment, crawlResults, containerCache, abstractCache,
                     blacklist, rankingProfile, constraint);
-            if (urls != null) {
+            if (urls >= 0) {
                 // urls is an array of url hashes. this is only used for log output
-                final StringBuilder urllist = new StringBuilder(this.urls.length * 13);
-                for (int i = 0; i < this.urls.length; i++) urllist.append(this.urls[i]).append(' ');
-                yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + urls.length + " links for word hash " + wordhashes + ": " + new String(urllist));
-                peers.mySeed().incRI(urls.length);
-                peers.mySeed().incRU(urls.length);
+                //yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + urls.length + " links for word hash " + wordhashes + ": " + new String(urllist));
+                peers.mySeed().incRI(urls);
+                peers.mySeed().incRU(urls);
             } else {
                 yacyCore.log.logInfo("REMOTE SEARCH - no answer from remote peer " + targetPeer.hash + ":" + targetPeer.getName());
             }
@@ -144,7 +142,7 @@ public class yacySearch extends Thread {
     }
 
     public int links() {
-        return this.urls.length;
+        return this.urls;
     }
 
     public int count() {
@@ -218,7 +216,8 @@ public class yacySearch extends Thread {
             seed = dhtEnum.next();
             if (seed == null) continue;
             if (seed.matchPeerTags(wordhashes)) {
-                Log.logInfo("PLASMA", "selectPeers/PeerTags: " + seed.hash + ":" + seed.getName() + ", is specialized peer for " + seed.getPeerTags().toString());
+                String specialized = seed.getPeerTags().toString();
+                if (!specialized.equals("[*]")) Log.logInfo("PLASMA", "selectPeers/PeerTags: " + seed.hash + ":" + seed.getName() + ", is specialized peer for " + specialized);
                 regularSeeds.remove(seed.hash);
                 ranking.deleteScore(seed.hash);
                 matchingSeeds.put(seed.hash, seed);
@@ -335,7 +334,7 @@ public class yacySearch extends Thread {
     public static int collectedLinks(final yacySearch[] searchThreads) {
         int links = 0;
         for (int i = 0; i < searchThreads.length; i++) {
-            if (!(searchThreads[i].isAlive())) links += searchThreads[i].urls.length;
+            if (!(searchThreads[i].isAlive()) && searchThreads[i].urls > 0) links += searchThreads[i].urls;
        }
        return links;
    }
diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java
index 0e325b3bd..1883aab80 100644
--- a/source/net/yacy/document/parser/htmlParser.java
+++ b/source/net/yacy/document/parser/htmlParser.java
@@ -69,6 +69,7 @@ public class htmlParser extends AbstractParser implements Idiom {
         SUPPORTED_EXTENSIONS.add("cfm");
         SUPPORTED_EXTENSIONS.add("asp");
         SUPPORTED_EXTENSIONS.add("aspx");
+        SUPPORTED_EXTENSIONS.add("tex");
         SUPPORTED_EXTENSIONS.add("txt");
         SUPPORTED_EXTENSIONS.add("jsp");
         SUPPORTED_EXTENSIONS.add("pl");
@@ -77,6 +78,7 @@ public class htmlParser extends AbstractParser implements Idiom {
         SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
         SUPPORTED_MIME_TYPES.add("application/xhtml+xml");
         SUPPORTED_MIME_TYPES.add("application/x-httpd-php");
+        SUPPORTED_MIME_TYPES.add("application/x-tex");
         SUPPORTED_MIME_TYPES.add("text/plain");
         SUPPORTED_MIME_TYPES.add("text/sgml");
         SUPPORTED_MIME_TYPES.add("text/csv");
diff --git a/source/net/yacy/kelondro/data/word/WordReferenceVars.java b/source/net/yacy/kelondro/data/word/WordReferenceVars.java
index dd02996de..6634ec560 100644
--- a/source/net/yacy/kelondro/data/word/WordReferenceVars.java
+++ b/source/net/yacy/kelondro/data/word/WordReferenceVars.java
@@ -393,6 +393,16 @@
 
     public static BlockingQueue<WordReferenceVars> transform(ReferenceContainer<WordReference> container) {
         LinkedBlockingQueue<WordReferenceVars> out = new LinkedBlockingQueue<WordReferenceVars>();
+        if (container.size() <= 100) {
+            // transform without concurrency to omit thread creation overhead
+            for (Row.Entry entry: container) try {
+                out.put(new WordReferenceVars(new WordReferenceRow(entry)));
+            } catch (InterruptedException e) {}
+            try {
+                out.put(WordReferenceVars.poison);
+            } catch (InterruptedException e) {}
+            return out;
+        }
         Thread distributor = new TransformDistributor(container, out);
         distributor.start();
 
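Note on the fast path above: a reference container that arrives during a
remote search is usually small, and for such inputs the cost of creating and
scheduling the TransformDistributor thread exceeds the transformation work
itself. The general pattern, as a self-contained sketch; the threshold of 100
is the heuristic chosen by this patch, and the class and element types here
are illustrative, not part of YaCy:

    import java.util.Collection;
    import java.util.concurrent.BlockingQueue;
    import java.util.concurrent.LinkedBlockingQueue;

    public class SmallInputFastPath {

        public static final String POISON = "POISON"; // terminator for consumers

        public static BlockingQueue<String> transform(final Collection<Integer> input) {
            final BlockingQueue<String> out = new LinkedBlockingQueue<String>();
            if (input.size() <= 100) {
                // small input: transform inline, omit thread creation overhead
                try {
                    for (final Integer i : input) out.put(String.valueOf(i));
                    out.put(POISON); // a consumer stops when it takes the poison object
                } catch (final InterruptedException e) {}
                return out;
            }
            // large input: the thread creation overhead amortizes over the work
            new Thread() {
                @Override
                public void run() {
                    try {
                        for (final Integer i : input) out.put(String.valueOf(i));
                        out.put(POISON);
                    } catch (final InterruptedException e) {}
                }
            }.start();
            return out;
        }
    }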
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index 77730fd32..a0f8e1148 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -104,7 +104,7 @@ public final class LoaderDispatcher {
             final boolean forText,
             final boolean global,
             final long maxFileSize) throws IOException {
-        return load(request(url, forText, global), forText, maxFileSize);
+        return load(request(url, forText, global), maxFileSize);
     }
 
     /**
@@ -122,12 +122,12 @@ public final class LoaderDispatcher {
             final boolean forText,
             final boolean global,
             CrawlProfile.CacheStrategy cacheStratgy,
             long maxFileSize) throws IOException {
-        return load(request(url, forText, global), forText, cacheStratgy, maxFileSize);
+        return load(request(url, forText, global), cacheStratgy, maxFileSize);
     }
 
     public void load(final DigestURI url, CrawlProfile.CacheStrategy cacheStratgy, long maxFileSize, File targetFile) throws IOException {
-        byte[] b = load(request(url, false, true), false, cacheStratgy, maxFileSize).getContent();
+        byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize).getContent();
         if (b == null) throw new IOException("load == null");
         File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
@@ -169,14 +169,14 @@ public final class LoaderDispatcher {
                 0);
     }
 
-    public Response load(final Request request, final boolean acceptOnlyParseable, long maxFileSize) throws IOException {
+    public Response load(final Request request, long maxFileSize) throws IOException {
         CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
         CrawlProfile.CacheStrategy cacheStrategy = CrawlProfile.CacheStrategy.IFEXIST;
         if (crawlProfile != null) cacheStrategy = crawlProfile.cacheStrategy();
-        return load(request, acceptOnlyParseable, cacheStrategy, maxFileSize);
+        return load(request, cacheStrategy, maxFileSize);
     }
 
-    public Response load(final Request request, final boolean acceptOnlyParseable, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
+    public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
         // get the protocol of the next URL
         final String protocol = request.url().getProtocol();
         final String host = request.url().getHost();
@@ -258,7 +258,7 @@ public final class LoaderDispatcher {
         
         // load resource from the internet
         Response response = null;
-        if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable, maxFileSize);
+        if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, maxFileSize);
         if (protocol.equals("ftp")) response = ftpLoader.load(request, true);
         if (protocol.equals("smb")) response = smbLoader.load(request, true);
         if (protocol.equals("file")) response = fileLoader.load(request, true);
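Note on the LoaderDispatcher API above: callers no longer pass an
acceptOnlyParseable flag; the cache strategy is derived from the request's
crawl profile and parseability is decided after loading. A minimal usage
sketch matching the updated CrawlQueues call site (sb is the Switchboard, as
elsewhere in this patch):

    final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
    final Response response = sb.loader.load(request, maxFileSize); // cache strategy comes from the crawl profile
    if (response == null) {
        // no content; possibly caused by the cache policy, as CrawlQueues logs above
    }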