- reviewed the remote search process and improved some data structures and synchronization in several places

- removed concurrency overhead for small numbers of index normalizations, as they occur during remote search; containers with 100 entries or fewer are now transformed without spawning threads (a generic sketch of this cutoff pattern follows the WordReferenceVars hunk below)
- removed the 'load only parseable' constraint for snippet fetching: some resources carry no file extension in their URL and were therefore treated as not parseable and not searchable, although they may turn out to be parseable once loaded, when their MIME type is known (the sketch below the commit metadata illustrates the idea)
- this partially fixes the problems reported at http://forum.yacy-websuche.de/viewtopic.php?p=20300#p20300, but more changes are necessary to get all expected search results

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6926 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 15 years ago
parent 7ddb70e7c6
commit 87087f12fe
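The 'load only parseable' removal is the conceptual core of this commit: parseability used to be guessed from the URL's file extension before the fetch, so extension-less resources were rejected before they could ever be loaded. The following minimal sketch is not YaCy code (MimeAfterLoad and its supports() helper are hypothetical stand-ins; YaCy itself calls TextParser.supports(url, mime) once the response headers are available) and only illustrates deferring the decision until the MIME type is known:

import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class MimeAfterLoad {

    // hypothetical stand-in for TextParser.supports(url, mime); null means "parseable"
    static String supports(String mime) {
        return (mime.startsWith("text/") || mime.contains("html")) ? null : "no parser for " + mime;
    }

    public static void main(String[] args) throws IOException, InterruptedException {
        HttpClient client = HttpClient.newHttpClient();
        // a URL without any file extension; an extension-based check could only guess here
        HttpRequest req = HttpRequest.newBuilder(URI.create("http://example.org/article")).build();
        HttpResponse<byte[]> res = client.send(req, HttpResponse.BodyHandlers.ofByteArray());
        String mime = res.headers().firstValue("Content-Type").orElse("application/octet-stream");
        // decide parseability only now, when the MIME type is actually known
        String error = supports(mime);
        if (error != null) {
            System.out.println("rejected after load: " + error);
        } else {
            System.out.println("parseable, " + res.body().length + " bytes");
        }
    }
}

The trade-off is that an unparseable resource is now fetched before it can be rejected, which costs some bandwidth but no longer drops parseable documents whose URLs lack an extension.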

@@ -563,7 +563,7 @@ public class CrawlQueues {
             try {
                 request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
                 final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
-                Response response = sb.loader.load(request, true, maxFileSize);
+                Response response = sb.loader.load(request, maxFileSize);
                 if (response == null) {
                     request.setStatus("error", WorkflowJob.STATUS_FINISHED);
                     if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");

@@ -28,7 +28,6 @@ import java.io.IOException;
 import java.util.Date;
 import net.yacy.cora.document.MultiProtocolURI;
-import net.yacy.document.TextParser;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.repository.Blacklist;
@@ -75,14 +74,14 @@ public final class HTTPLoader {
         this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000);
     }

-    public Response load(final Request entry, final boolean acceptOnlyParseable, long maxFileSize) throws IOException {
+    public Response load(final Request entry, long maxFileSize) throws IOException {
         long start = System.currentTimeMillis();
-        Response doc = load(entry, acceptOnlyParseable, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize);
+        Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize);
         Latency.update(entry.url(), System.currentTimeMillis() - start);
         return doc;
     }

-    private Response load(final Request request, boolean acceptOnlyParseable, final int retryCount, final long maxFileSize) throws IOException {
+    private Response load(final Request request, final int retryCount, final long maxFileSize) throws IOException {
         if (retryCount < 0) {
             sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded");
@@ -96,15 +95,6 @@ public final class HTTPLoader {
         final boolean ssl = request.url().getProtocol().equals("https");
         if (port < 0) port = (ssl) ? 443 : 80;

-        // if not the right file type then reject file
-        if (acceptOnlyParseable) {
-            String supportError = TextParser.supportsExtension(request.url());
-            if (supportError != null) {
-                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, supportError);
-                throw new IOException("REJECTED WRONG EXTENSION TYPE: " + supportError);
-            }
-        }
-
         // check if url is in blacklist
         final String hostlow = host.toLowerCase();
         if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
@@ -138,15 +128,6 @@ public final class HTTPLoader {
         if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
             // the transfer is ok

-            if (acceptOnlyParseable) {
-                // if the response has not the right file type then reject file
-                String supportError = TextParser.supports(request.url(), res.getResponseHeader().mime());
-                if (supportError != null) {
-                    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, supportError);
-                    throw new IOException("REJECTED WRONG MIME TYPE, mime = " + res.getResponseHeader().mime() + ": " + supportError);
-                }
-            }
-
             // we write the new cache entry to file system directly
             res.setAccountingName("CRAWLER");
             final byte[] responseBody = res.getData();
@@ -202,7 +183,7 @@ public final class HTTPLoader {
                 // retry crawling with new url
                 request.redirectURL(redirectionUrl);
-                return load(request, acceptOnlyParseable, retryCount - 1, maxFileSize);
+                return load(request, retryCount - 1, maxFileSize);
             }
         } else {
             // if the response has not the right response type then reject file

@@ -367,7 +367,7 @@ public final class Switchboard extends serverSwitch {
                 indexSegments.segment(Segments.Process.LOCALCRAWLING),
                 peers,
                 true,
-                30000);
+                10000);

         // set up local robots.txt
         this.robotstxtConfig = RobotsTxtConfig.init(this);
@@ -894,7 +894,7 @@ public final class Switchboard extends serverSwitch {
                 indexSegments.segment(Segments.Process.LOCALCRAWLING),
                 peers,
                 true,
-                30000);
+                10000);

         // create new web structure
         this.webStructure = new WebStructureGraph(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", CRDistribution.CR_OWN), new File(queuesRoot, "webStructure.map"));

@@ -368,7 +368,7 @@ public final class yacyClient {
     }

     @SuppressWarnings("unchecked")
-    public static String[] search(
+    public static int search(
             final yacySeed mySeed,
             final String wordhashes,
             final String excludehashes,
@@ -392,7 +392,6 @@ public final class yacyClient {
             final Bitfield constraint
     ) {
         // send a search request to peer with remote Hash
-        // this mainly converts the words into word hashes
         // INPUT:
         // iam : complete seed of the requesting peer
@@ -437,7 +436,7 @@ public final class yacyClient {
         } catch (final IOException e) {
             yacyCore.log.logInfo("SEARCH failed, Peer: " + target.hash + ":" + target.getName() + " (" + e.getMessage() + "), score=" + target.selectscore);
             //yacyCore.peerActions.peerDeparture(target, "search request to peer created io exception: " + e.getMessage());
-            return null;
+            return -1;
         }

         if (result == null || result.isEmpty()) {
@@ -447,7 +446,7 @@ public final class yacyClient {
                     + target.getName()
                     + " (zero response), score="
                     + target.selectscore);
-            return null;
+            return -1;
         }

         // compute all computation times
@@ -468,14 +467,14 @@ public final class yacyClient {
         // now create a plasmaIndex out of this result
         // System.out.println("yacyClient: " + ((urlhashes.length() == 0) ? "primary" : "secondary")+ " search result = " + result.toString()); // debug
-        int results = 0, joincount = 0;
+        int urlcount = 0, joincount = 0;
         try {
-            results = Integer.parseInt(result.get("count"));
-            joincount = Integer.parseInt(result.get("joincount"));
+            joincount = Integer.parseInt(result.get("joincount")); // the complete number of hits at remote site
+            urlcount = Integer.parseInt(result.get("count")); // the number of hits that are returned in the result list
         } catch (final NumberFormatException e) {
             yacyCore.log.logInfo("SEARCH failed FROM " + target.hash + ":" + target.getName() + ", wrong output format: " + e.getMessage());
             //yacyCore.peerActions.peerDeparture(target, "search request to peer created number format exception");
-            return null;
+            return -1;
         }

         // System.out.println("***result count " + results);
@@ -488,14 +487,13 @@ public final class yacyClient {
                 container[i] = ReferenceContainer.emptyContainer(Segment.wordReferenceFactory, wordhashes.substring(i * Word.commonHashLength, (i + 1) * Word.commonHashLength).getBytes(), count);
             } catch (RowSpaceExceededException e) {
                 Log.logException(e);
-                return null;
+                return -1;
             }
         }

         // insert results to containers
         URIMetadataRow urlEntry;
-        final String[] urls = new String[results];
-        for (int n = 0; n < results; n++) {
+        for (int n = 0; n < urlcount; n++) {
             // get one single search result
             urlEntry = URIMetadataRow.importEntry(result.get("resource" + n));
             if (urlEntry == null) continue;
@@ -504,27 +502,26 @@ public final class yacyClient {
             final URIMetadataRow.Components metadata = urlEntry.metadata();
             if (metadata == null) continue;
             if (blacklist.isListed(Blacklist.BLACKLIST_SEARCH, metadata.url())) {
-                if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search (client): filtered blacklisted url " + metadata.url() + " from peer " + target.getName());
+                if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search: filtered blacklisted url " + metadata.url() + " from peer " + target.getName());
                 continue; // block with backlist
             }

             final String urlRejectReason = Switchboard.getSwitchboard().crawlStacker.urlInAcceptedDomain(metadata.url());
             if (urlRejectReason != null) {
-                if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search (client): rejected url '" + metadata.url() + "' (" + urlRejectReason + ") from peer " + target.getName());
+                if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search: rejected url '" + metadata.url() + "' (" + urlRejectReason + ") from peer " + target.getName());
                 continue; // reject url outside of our domain
             }

             // save the url entry
-            Reference entry;
-            if (urlEntry.word() == null) {
-                if (yacyCore.log.isWarning()) yacyCore.log.logWarning("remote search (client): no word attached from peer " + target.getName() + ", version " + target.getVersion());
+            Reference entry = urlEntry.word();
+            if (entry == null) {
+                if (yacyCore.log.isWarning()) yacyCore.log.logWarning("remote search: no word attached from peer " + target.getName() + ", version " + target.getVersion());
                 continue; // no word attached
             }

             // the search-result-url transports all the attributes of word indexes
-            entry = urlEntry.word();
             if (!Base64Order.enhancedCoder.equal(entry.metadataHash(), urlEntry.hash())) {
-                if (yacyCore.log.isInfo()) yacyCore.log.logInfo("remote search (client): url-hash " + new String(urlEntry.hash()) + " does not belong to word-attached-hash " + new String(entry.metadataHash()) + "; url = " + metadata.url() + " from peer " + target.getName());
+                yacyCore.log.logInfo("remote search: url-hash " + new String(urlEntry.hash()) + " does not belong to word-attached-hash " + new String(entry.metadataHash()) + "; url = " + metadata.url() + " from peer " + target.getName());
                 continue; // spammed
             }
@@ -554,23 +551,30 @@ public final class yacyClient {
                     break;
                 }
             }
-            // store url hash for statistics
-            urls[n] = new String(urlEntry.hash());
         }

         // store remote result to local result container
         synchronized (containerCache) {
-            // insert one container into the search result buffer
-            containerCache.add(container[0], false, joincount); // one is enough
-
-            // integrate remote topwords
-            final String references = result.get("references");
-            yacyCore.log.logInfo("remote search (client): peer " + target.getName() + " sent references " + references);
-            if (references != null) {
-                // add references twice, so they can be counted (must have at least 2 entries)
-                containerCache.addTopic(references.split(","));
-                containerCache.addTopic(references.split(","));
-            }
+            containerCache.add(container[0], false, joincount); // one is enough, only the references are used, not the word
         }

+        // insert the containers to the index
+        for (ReferenceContainer<WordReference> c: container) try {
+            indexSegment.termIndex().add(c);
+        } catch (Exception e) {
+            Log.logException(e);
+        }
+        yacyCore.log.logInfo("remote search: peer " + target.getName() + " sent " + container[0].size() + "/" + joincount + " references for joined word queries");
+
+        // integrate remote top-words/topics
+        final String references = result.get("references");
+        if (references != null && references.length() > 0) {
+            yacyCore.log.logInfo("remote search: peer " + target.getName() + " sent topics: " + references);
+            // add references twice, so they can be counted (must have at least 2 entries)
+            String[] rs = references.split(",");
+            synchronized (containerCache) {
+                containerCache.addTopic(rs);
+                containerCache.addTopic(rs);
+            }
+        }
@@ -592,7 +596,7 @@ public final class yacyClient {
                         ci = new ByteBuffer(entry.getValue().getBytes("UTF-8"));
                     } catch (UnsupportedEncodingException e) {
                         Log.logException(e);
-                        return null;
+                        return -1;
                     }
                     //System.out.println("DEBUG-ABSTRACTFETCH: for word hash " + wordhash + " received " + ci.toString());
                     ReferenceContainer.decompressIndex(singleAbstract, ci, target.hash);
@@ -600,14 +604,8 @@ public final class yacyClient {
                     }
                 }
             }
+            if (abstractCache.size() > 0) yacyCore.log.logInfo("remote search: peer " + target.getName() + " sent " + abstractCache.size() + " index abstracts");
         }

-        // insert the containers to the index
-        for (int m = 0; m < words; m++) try {
-            indexSegment.termIndex().add(container[m]);
-        } catch (Exception e) {
-            Log.logException(e);
-        }
-
         // generate statistics
         long searchtime;
@@ -617,7 +615,7 @@ public final class yacyClient {
             searchtime = totalrequesttime;
         }
         if (yacyCore.log.isFine()) yacyCore.log.logFine("SEARCH "
-                + results
+                + urlcount
                 + " URLS FROM "
                 + target.hash
                 + ":"
@@ -627,7 +625,7 @@ public final class yacyClient {
                 + ", searchtime=" + searchtime + ", netdelay="
                 + (totalrequesttime - searchtime) + ", references="
                 + result.get("references"));
-        return urls;
+        return urlcount;
     }

     public static Map<String, String> permissionMessage(final yacySeedDB seedDB, final String targetHash) {
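Because the return type of yacyClient.search changed from String[] to int, callers no longer receive the individual url hashes (those were only used for log output and statistics); they receive a count, with -1 reserved for a failed request so that a successful but empty answer (0) stays distinguishable. A sketch of the convention, using a hypothetical handler class whose messages are modeled on the log lines in this diff:

public class SearchResultHandling {

    // -1: request failed; 0: peer answered but sent nothing; n > 0: n URLs received
    static void handle(int urlcount, String peerName) {
        if (urlcount < 0) {
            System.out.println("REMOTE SEARCH - no answer from remote peer " + peerName);
        } else {
            System.out.println("REMOTE SEARCH - remote peer " + peerName + " contributed " + urlcount + " links");
        }
    }

    public static void main(String[] args) {
        handle(-1, "peerA"); // failed request
        handle(17, "peerB"); // successful answer with 17 URLs
    }
}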

@@ -56,7 +56,7 @@ public class yacySearch extends Thread {
     final private Map<String, TreeMap<String, String>> abstractCache;
     final private Blacklist blacklist;
     final private yacySeed targetPeer;
-    private String[] urls;
+    private int urls;
     private final int count, maxDistance;
     final private RankingProfile rankingProfile;
     final private Pattern prefer, filter;
@@ -103,7 +103,7 @@ public class yacySearch extends Thread {
         this.abstractCache = abstractCache;
         this.blacklist = blacklist;
         this.targetPeer = targetPeer;
-        this.urls = null;
+        this.urls = -1;
         this.count = count;
         this.maxDistance = maxDistance;
         this.rankingProfile = rankingProfile;
@@ -119,13 +119,11 @@ public class yacySearch extends Thread {
                 count, maxDistance, global, partitions,
                 targetPeer, indexSegment, crawlResults, containerCache, abstractCache,
                 blacklist, rankingProfile, constraint);
-        if (urls != null) {
-            // urls is an array of url hashes. this is only used for log output
-            final StringBuilder urllist = new StringBuilder(this.urls.length * 13);
-            for (int i = 0; i < this.urls.length; i++) urllist.append(this.urls[i]).append(' ');
-            yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + urls.length + " links for word hash " + wordhashes + ": " + new String(urllist));
-            peers.mySeed().incRI(urls.length);
-            peers.mySeed().incRU(urls.length);
+        if (urls >= 0) {
+            //yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + urls.length + " links for word hash " + wordhashes + ": " + new String(urllist));
+            peers.mySeed().incRI(urls);
+            peers.mySeed().incRU(urls);
         } else {
             yacyCore.log.logInfo("REMOTE SEARCH - no answer from remote peer " + targetPeer.hash + ":" + targetPeer.getName());
         }
@@ -144,7 +142,7 @@ public class yacySearch extends Thread {
     }

     public int links() {
-        return this.urls.length;
+        return this.urls;
     }

     public int count() {
@@ -218,7 +216,8 @@ public class yacySearch extends Thread {
                 seed = dhtEnum.next();
                 if (seed == null) continue;
                 if (seed.matchPeerTags(wordhashes)) {
-                    Log.logInfo("PLASMA", "selectPeers/PeerTags: " + seed.hash + ":" + seed.getName() + ", is specialized peer for " + seed.getPeerTags().toString());
+                    String specialized = seed.getPeerTags().toString();
+                    if (!specialized.equals("[*]")) Log.logInfo("PLASMA", "selectPeers/PeerTags: " + seed.hash + ":" + seed.getName() + ", is specialized peer for " + specialized);
                     regularSeeds.remove(seed.hash);
                     ranking.deleteScore(seed.hash);
                     matchingSeeds.put(seed.hash, seed);
@@ -335,7 +334,7 @@ public class yacySearch extends Thread {
     public static int collectedLinks(final yacySearch[] searchThreads) {
         int links = 0;
         for (int i = 0; i < searchThreads.length; i++) {
-            if (!(searchThreads[i].isAlive())) links += searchThreads[i].urls.length;
+            if (!(searchThreads[i].isAlive()) && searchThreads[i].urls > 0) links += searchThreads[i].urls;
         }
         return links;
     }

@@ -69,6 +69,7 @@ public class htmlParser extends AbstractParser implements Idiom {
         SUPPORTED_EXTENSIONS.add("cfm");
         SUPPORTED_EXTENSIONS.add("asp");
         SUPPORTED_EXTENSIONS.add("aspx");
+        SUPPORTED_EXTENSIONS.add("tex");
         SUPPORTED_EXTENSIONS.add("txt");
         SUPPORTED_EXTENSIONS.add("jsp");
         SUPPORTED_EXTENSIONS.add("pl");
@@ -77,6 +78,7 @@ public class htmlParser extends AbstractParser implements Idiom {
         SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
         SUPPORTED_MIME_TYPES.add("application/xhtml+xml");
         SUPPORTED_MIME_TYPES.add("application/x-httpd-php");
+        SUPPORTED_MIME_TYPES.add("application/x-tex");
         SUPPORTED_MIME_TYPES.add("text/plain");
         SUPPORTED_MIME_TYPES.add("text/sgml");
         SUPPORTED_MIME_TYPES.add("text/csv");

@@ -393,6 +393,16 @@ public class WordReferenceVars extends AbstractReference implements WordReference
     public static BlockingQueue<WordReferenceVars> transform(ReferenceContainer<WordReference> container) {
         LinkedBlockingQueue<WordReferenceVars> out = new LinkedBlockingQueue<WordReferenceVars>();
+        if (container.size() <= 100) {
+            // transform without concurrency to omit thread creation overhead
+            for (Row.Entry entry: container) try {
+                out.put(new WordReferenceVars(new WordReferenceRow(entry)));
+            } catch (InterruptedException e) {}
+            try {
+                out.put(WordReferenceVars.poison);
+            } catch (InterruptedException e) {}
+            return out;
+        }
         Thread distributor = new TransformDistributor(container, out);
         distributor.start();
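The hunk above skips the concurrent transform when a container holds at most 100 entries, because creating the distributor and worker threads then costs more than the transformation itself. The same cutoff in generic form; this is a sketch, not the YaCy implementation (SEQUENTIAL_LIMIT and POISON stand in for the hard-coded 100 and WordReferenceVars.poison):

import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.function.Function;

public class ThresholdTransform {

    static final int SEQUENTIAL_LIMIT = 100;   // assumed cutoff, mirroring the hunk above
    static final Object POISON = new Object(); // end-of-stream marker, like WordReferenceVars.poison

    static <T> BlockingQueue<Object> transform(List<T> input, Function<T, Object> f) {
        BlockingQueue<Object> out = new LinkedBlockingQueue<Object>();
        if (input.size() <= SEQUENTIAL_LIMIT) {
            // small batch: do the work inline and avoid thread creation overhead
            for (T t : input) out.add(f.apply(t));
            out.add(POISON);
            return out;
        }
        // large batch: hand the work to a producer thread and return immediately
        new Thread(() -> {
            for (T t : input) out.add(f.apply(t));
            out.add(POISON);
        }).start();
        return out;
    }
}

Consumers drain the queue until they see the poison object, so both paths look identical from the caller's side; only the thread cost differs.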

@@ -104,7 +104,7 @@ public final class LoaderDispatcher {
             final boolean forText,
             final boolean global,
             final long maxFileSize) throws IOException {
-        return load(request(url, forText, global), forText, maxFileSize);
+        return load(request(url, forText, global), maxFileSize);
     }

     /**
@@ -122,12 +122,12 @@ public final class LoaderDispatcher {
             final boolean global,
             CrawlProfile.CacheStrategy cacheStratgy,
             long maxFileSize) throws IOException {
-        return load(request(url, forText, global), forText, cacheStratgy, maxFileSize);
+        return load(request(url, forText, global), cacheStratgy, maxFileSize);
     }

     public void load(final DigestURI url, CrawlProfile.CacheStrategy cacheStratgy, long maxFileSize, File targetFile) throws IOException {
-        byte[] b = load(request(url, false, true), false, cacheStratgy, maxFileSize).getContent();
+        byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize).getContent();
         if (b == null) throw new IOException("load == null");
         File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
@@ -169,14 +169,14 @@ public final class LoaderDispatcher {
             0);
     }

-    public Response load(final Request request, final boolean acceptOnlyParseable, long maxFileSize) throws IOException {
+    public Response load(final Request request, long maxFileSize) throws IOException {
         CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
         CrawlProfile.CacheStrategy cacheStrategy = CrawlProfile.CacheStrategy.IFEXIST;
         if (crawlProfile != null) cacheStrategy = crawlProfile.cacheStrategy();
-        return load(request, acceptOnlyParseable, cacheStrategy, maxFileSize);
+        return load(request, cacheStrategy, maxFileSize);
     }

-    public Response load(final Request request, final boolean acceptOnlyParseable, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
+    public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
         // get the protocol of the next URL
         final String protocol = request.url().getProtocol();
         final String host = request.url().getHost();
@@ -258,7 +258,7 @@ public final class LoaderDispatcher {
         // load resource from the internet
         Response response = null;
-        if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable, maxFileSize);
+        if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, maxFileSize);
         if (protocol.equals("ftp")) response = ftpLoader.load(request, true);
         if (protocol.equals("smb")) response = smbLoader.load(request, true);
         if (protocol.equals("file")) response = fileLoader.load(request, true);
