- fixes for some problems with the new crawling/caching strategies

- speed enhancements for the cache-only cache policy by using special no-delay rules in the balancer (see the sketch below, ahead of the diff)
- fixed some deadlock and 100% CPU problems in the balancer

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6243 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent 634a01a9a4
commit c0e17de2fb
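The commit message above refers to a new no-delay rule in the Balancer: requests under the cache-only policy, and if-exist requests whose content is already in the web cache, are handed out without the usual per-host politeness delay, while all other requests still wait for Latency.waitingRemaining(). The following standalone Java sketch is an illustration only, not YaCy code; CacheStrategy, cachedAlready and remainingDelay are stand-ins for CrawlProfile.CACHE_STRATEGY_*, Cache.has(url) and Latency.waitingRemaining(...) as they appear in the Balancer hunk further down.

public class NoDelayRuleSketch {

    // stand-in for the CrawlProfile.CACHE_STRATEGY_* constants used in the diff
    enum CacheStrategy { NOCACHE, IFFRESH, IFEXIST, CACHEONLY }

    // cachedAlready stands in for Cache.has(url); remainingDelay for
    // Latency.waitingRemaining(url, minimumLocalDelta, minimumGlobalDelta)
    static long sleepTime(CacheStrategy strategy, boolean cachedAlready, long remainingDelay) {
        if (strategy == CacheStrategy.CACHEONLY) return 0;                // never loads from the network
        if (strategy == CacheStrategy.IFEXIST && cachedAlready) return 0; // will be answered from the cache
        return remainingDelay;                                            // otherwise respect the crawl delay
    }

    public static void main(String[] args) {
        System.out.println(sleepTime(CacheStrategy.CACHEONLY, false, 500L)); // 0
        System.out.println(sleepTime(CacheStrategy.IFEXIST, true, 500L));    // 0
        System.out.println(sleepTime(CacheStrategy.IFFRESH, true, 500L));    // 500
    }
}

The deadlock fix mentioned above shows up in the same file: get() and has() in Balancer lose their synchronized modifier, and the selection loop now tracks a failhash to prevent endless loops when a delayed entry is put back.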

@@ -200,8 +200,8 @@
<td><label for="mustmatch">Policy for usage of Web Cache</label>:</td>
<td>
<input type="radio" name="cachePolicy" value="nocache" />no&nbsp;cache&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="ifexist" />if&nbsp;exist&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="iffresh" checked="checked" />if&nbsp;fresh&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="ifexist" />if&nbsp;exist&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="cacheonly" />cache&nbsp;only
</td>
<td>

@@ -76,6 +76,8 @@ public class ViewImage {
urlString = (url == null) ? null : url.toNormalform(true, true);
}
if (urlString == null) return null;
int width = post.getInt("width", 0);
int height = post.getInt("height", 0);
int maxwidth = post.getInt("maxwidth", 0);

@@ -40,10 +40,24 @@ public class getpageinfo_p {
url = "http://" + url;
}
if (actions.indexOf("title")>=0) {
yacyURL u = null;
try {
final yacyURL u = new yacyURL(url, null);
final ContentScraper scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH);
u = new yacyURL(url, null);
} catch (final MalformedURLException e) {
// fail, do nothing
}
ContentScraper scraper = null;
if (u != null) try {
scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH);
} catch (final IOException e) {
// try again, try harder
try {
scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFEXIST);
} catch (final IOException ee) {
// now thats a fail, do nothing
}
}
if (scraper != null) {
// put the document title
prop.putXML("title", scraper.getTitle());
@@ -54,11 +68,11 @@ public class getpageinfo_p {
final String list[]=scraper.getKeywords();
int count = 0;
for(int i=0;i<list.length;i++){
String tag = list[i];
if (!tag.equals("")) {
prop.putXML("tags_"+count+"_tag", tag);
count++;
}
String tag = list[i];
if (!tag.equals("")) {
prop.putXML("tags_"+count+"_tag", tag);
count++;
}
}
prop.put("tags", count);
// put description
@@ -66,9 +80,6 @@ public class getpageinfo_p {
// put language
Set<String> languages = scraper.getContentLanguages();
prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next());
} catch (final MalformedURLException e) { /* ignore this */
} catch (final IOException e) { /* ignore this */
}
}
if(actions.indexOf("robots")>=0){

@@ -34,6 +34,7 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import de.anomic.crawler.retrieval.Request;
import de.anomic.http.client.Cache;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.index.ObjectIndex;
import de.anomic.kelondro.order.CloneableIterator;
@@ -107,7 +108,7 @@ public class Balancer {
}
}
public synchronized Request get(final String urlhash) throws IOException {
public Request get(final String urlhash) throws IOException {
assert urlhash != null;
if (urlFileIndex == null) return null; // case occurs during shutdown
final Row.Entry entry = urlFileIndex.get(urlhash.getBytes());
@@ -189,7 +190,7 @@ public class Balancer {
return removedCounter;
}
public synchronized boolean has(final String urlhash) {
public boolean has(final String urlhash) {
return urlFileIndex.has(urlhash.getBytes());
}
@@ -305,17 +306,29 @@ public class Balancer {
long sleeptime = 0;
Request crawlEntry = null;
synchronized (this) {
String failhash = null;
while (this.urlFileIndex.size() > 0) {
// first simply take one of the entries in the top list, that should be one without any delay
String result = nextFromDelayed();
if (result == null && this.top.size() > 0) result = top.remove();
String nexthash = nextFromDelayed();
//System.out.println("*** nextFromDelayed=" + nexthash);
if (nexthash == null && this.top.size() > 0) {
nexthash = top.remove();
//System.out.println("*** top.remove()=" + nexthash);
}
// check minimumDelta and if necessary force a sleep
//final int s = urlFileIndex.size();
Row.Entry rowEntry = (result == null) ? null : urlFileIndex.remove(result.getBytes());
Row.Entry rowEntry = (nexthash == null) ? null : urlFileIndex.remove(nexthash.getBytes());
if (rowEntry == null) {
//System.out.println("*** rowEntry=null, nexthash=" + nexthash);
rowEntry = urlFileIndex.removeOne();
result = (rowEntry == null) ? null : new String(rowEntry.getPrimaryKeyBytes());
if (rowEntry == null) {
nexthash = null;
} else {
nexthash = new String(rowEntry.getPrimaryKeyBytes());
//System.out.println("*** rowEntry.getPrimaryKeyBytes()=" + nexthash);
}
}
if (rowEntry == null) {
Log.logWarning("Balancer", "removeOne() failed - size = " + this.size());
@@ -334,18 +347,28 @@ public class Balancer {
return null;
}
// depending on the caching policy we need sleep time to avoid DoS-like situations
sleeptime = (profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_CACHEONLY) ? 0 : Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
sleeptime = (
profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_CACHEONLY ||
(profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_IFEXIST && Cache.has(crawlEntry.url()))
) ? 0 : Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
assert nexthash.equals(new String(rowEntry.getPrimaryKeyBytes())) : "result = " + nexthash + ", rowEntry.getPrimaryKeyBytes() = " + new String(rowEntry.getPrimaryKeyBytes());
assert nexthash.equals(crawlEntry.url().hash()) : "result = " + nexthash + ", crawlEntry.url().hash() = " + crawlEntry.url().hash();
assert result.equals(new String(rowEntry.getPrimaryKeyBytes())) : "result = " + result + ", rowEntry.getPrimaryKeyBytes() = " + new String(rowEntry.getPrimaryKeyBytes());
assert result.equals(crawlEntry.url().hash()) : "result = " + result + ", crawlEntry.url().hash() = " + crawlEntry.url().hash();
if (this.domainStacks.size() <= 1) break;
if (failhash != null && failhash.equals(nexthash)) break; // prevent endless loops
if (delay && sleeptime > 0) {
//System.out.println("*** putback: nexthash=" + nexthash + ", failhash="+failhash);
// put that thing back to omit a delay here
this.delayed.put(new Long(System.currentTimeMillis() + sleeptime + 1), result);
if (!delayed.values().contains(nexthash)) {
//System.out.println("*** delayed +=" + nexthash);
this.delayed.put(new Long(System.currentTimeMillis() + sleeptime + 1), nexthash);
}
this.urlFileIndex.put(rowEntry);
this.domainStacks.remove(result.substring(6));
continue;
this.domainStacks.remove(nexthash.substring(6));
failhash = nexthash;
continue;
}
break;
}

@@ -87,12 +87,12 @@ public class CrawlQueues {
// tests if hash occurrs in any database
// if it exists, the name of the database is returned,
// if it not exists, null is returned
if (noticeURL.existsInStack(hash)) return "crawler";
if (delegatedURL.exists(hash)) return "delegated";
if (errorURL.exists(hash)) return "errors";
for (final crawlWorker worker: workers.values()) {
if (worker.request.url().hash().equals(hash)) return "worker";
}
if (noticeURL.existsInStack(hash)) return "crawler";
return null;
}
@@ -105,8 +105,6 @@ public class CrawlQueues {
public yacyURL getURL(final String urlhash) {
assert urlhash != null;
if (urlhash == null || urlhash.length() == 0) return null;
final Request ne = noticeURL.get(urlhash);
if (ne != null) return ne.url();
ZURL.Entry ee = delegatedURL.getEntry(urlhash);
if (ee != null) return ee.url();
ee = errorURL.getEntry(urlhash);
@@ -114,6 +112,8 @@ public class CrawlQueues {
for (final crawlWorker w: workers.values()) {
if (w.request.url().hash().equals(urlhash)) return w.request.url();
}
final Request ne = noticeURL.get(urlhash);
if (ne != null) return ne.url();
return null;
}

@@ -202,7 +202,7 @@ public class Latency {
}
public void update(long time) {
this.lastacc = System.currentTimeMillis();
this.timeacc += time;
this.timeacc += Math.min(30000, time);
this.count++;
}
public void update() {

@@ -202,7 +202,7 @@ public class RobotsTxt {
int sz = this.robotsTable.size();
addEntry(robotsTxt4Host);
if (this.robotsTable.size() <= sz) {
Log.logSevere("RobotsTxt", "new entry in robots.txt table failed, reseing database");
Log.logSevere("RobotsTxt", "new entry in robots.txt table failed, resetting database");
this.resetDatabase();
addEntry(robotsTxt4Host);
}

@@ -200,7 +200,7 @@ public final class LoaderDispatcher {
// now forget about the cache, nothing there. Try to load the content from the internet
// check access time: this is a double-check (we checked possibly already in the balancer)
// to make shure that we don't DoS the target by mistake
// to make sure that we don't DoS the target by mistake
if (!request.url().isLocal()) {
final Long lastAccess = accessTime.get(host);
long wait = 0;
@@ -214,7 +214,7 @@
}
}
// now it's for shure that we will access the target. Remember the access time
// now it's for sure that we will access the target. Remember the access time
accessTime.put(host, System.currentTimeMillis());
// load resource from the internet

@@ -78,6 +78,8 @@ public class swfParser extends AbstractParser implements Idiom {
String contents = "";
try {
contents = swf2html.convertSWFToHTML(source);
} catch (NegativeArraySizeException e) {
// seen in log
} catch (Exception e) {
// we have seen a lot of OOM errors in the parser...
e.printStackTrace();
