From c0e17de2fbc6da7c4721d6fb038473bd66569177 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Sat, 25 Jul 2009 21:38:57 +0000
Subject: [PATCH] - fixes for some problems with the new crawling/caching strategies

- speed enhancements for the cache-only cache policy by using special no-delay rules in the balancer
- fixed some deadlock and 100%-CPU problems in the balancer

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6243 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/CrawlStart_p.html                      |  2 +-
 htroot/ViewImage.java                         |  2 +
 htroot/api/util/getpageinfo_p.java            | 33 ++++++++-----
 source/de/anomic/crawler/Balancer.java        | 47 ++++++++++++++-----
 source/de/anomic/crawler/CrawlQueues.java     |  6 +--
 source/de/anomic/crawler/Latency.java         |  2 +-
 source/de/anomic/crawler/RobotsTxt.java       |  2 +-
 .../crawler/retrieval/LoaderDispatcher.java   |  4 +-
 .../de/anomic/document/parser/swfParser.java  |  2 +
 9 files changed, 69 insertions(+), 31 deletions(-)

diff --git a/htroot/CrawlStart_p.html b/htroot/CrawlStart_p.html
index 4af4ae456..61e218e53 100644
--- a/htroot/CrawlStart_p.html
+++ b/htroot/CrawlStart_p.html
@@ -200,8 +200,8 @@
     : no cache
-    if exist
     if fresh
+    if exist
     cache only

diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java
index 8303d559f..c73f81b4b 100644
--- a/htroot/ViewImage.java
+++ b/htroot/ViewImage.java
@@ -76,6 +76,8 @@ public class ViewImage {
             urlString = (url == null) ? null : url.toNormalform(true, true);
         }
 
+        if (urlString == null) return null;
+
         int width = post.getInt("width", 0);
         int height = post.getInt("height", 0);
         int maxwidth = post.getInt("maxwidth", 0);

diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java
index d265da13c..2554cc288 100755
--- a/htroot/api/util/getpageinfo_p.java
+++ b/htroot/api/util/getpageinfo_p.java
@@ -40,10 +40,24 @@ public class getpageinfo_p {
             url = "http://" + url;
         }
         if (actions.indexOf("title")>=0) {
+            yacyURL u = null;
             try {
-                final yacyURL u = new yacyURL(url, null);
-                final ContentScraper scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH);
-
+                u = new yacyURL(url, null);
+            } catch (final MalformedURLException e) {
+                // fail, do nothing
+            }
+            ContentScraper scraper = null;
+            if (u != null) try {
+                scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH);
+            } catch (final IOException e) {
+                // try again, try harder
+                try {
+                    scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFEXIST);
+                } catch (final IOException ee) {
+                    // now thats a fail, do nothing
+                }
+            }
+            if (scraper != null) {
                 // put the document title
                 prop.putXML("title", scraper.getTitle());
@@ -54,11 +68,11 @@ public class getpageinfo_p {
                 final String list[]=scraper.getKeywords();
                 int count = 0;
                 for(int i=0;i languages = scraper.getContentLanguages();
                 prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next());
-
-            } catch (final MalformedURLException e) { /* ignore this */
-            } catch (final IOException e) { /* ignore this */
             }
         }
         if(actions.indexOf("robots")>=0){
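The getpageinfo_p change above separates URL parsing from page loading and adds a fallback: the page is first requested with the if-fresh cache policy and, only if that throws an IOException, requested again with the weaker if-exist policy before giving up, presumably because a stale cached copy is still good enough for extracting a title and keywords. A minimal, self-contained sketch of that retry pattern (Loader and CacheStrategy below are illustrative stand-ins, not YaCy's ContentScraper/CrawlProfile API):

import java.io.IOException;

// Sketch of the "try fresh first, then accept any cached copy" fallback
// used by getpageinfo_p above. All types here are illustrative stand-ins.
public final class ScrapeWithFallback {

    enum CacheStrategy { IFFRESH, IFEXIST }

    interface Loader {
        String parse(String url, CacheStrategy strategy) throws IOException;
    }

    /** Returns the parsed page, or null if neither strategy succeeded. */
    static String parseWithFallback(Loader loader, String url) {
        try {
            return loader.parse(url, CacheStrategy.IFFRESH);
        } catch (IOException e) {
            // try again, try harder: accept a stale cache entry rather than nothing
            try {
                return loader.parse(url, CacheStrategy.IFEXIST);
            } catch (IOException ee) {
                return null; // caller must cope with a missing result
            }
        }
    }
}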
"unknown" : languages.iterator().next()); - - } catch (final MalformedURLException e) { /* ignore this */ - } catch (final IOException e) { /* ignore this */ } } if(actions.indexOf("robots")>=0){ diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index 4d976ee30..9db2b1828 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -34,6 +34,7 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedQueue; import de.anomic.crawler.retrieval.Request; +import de.anomic.http.client.Cache; import de.anomic.kelondro.index.Row; import de.anomic.kelondro.index.ObjectIndex; import de.anomic.kelondro.order.CloneableIterator; @@ -107,7 +108,7 @@ public class Balancer { } } - public synchronized Request get(final String urlhash) throws IOException { + public Request get(final String urlhash) throws IOException { assert urlhash != null; if (urlFileIndex == null) return null; // case occurs during shutdown final Row.Entry entry = urlFileIndex.get(urlhash.getBytes()); @@ -189,7 +190,7 @@ public class Balancer { return removedCounter; } - public synchronized boolean has(final String urlhash) { + public boolean has(final String urlhash) { return urlFileIndex.has(urlhash.getBytes()); } @@ -305,17 +306,29 @@ public class Balancer { long sleeptime = 0; Request crawlEntry = null; synchronized (this) { + String failhash = null; while (this.urlFileIndex.size() > 0) { // first simply take one of the entries in the top list, that should be one without any delay - String result = nextFromDelayed(); - if (result == null && this.top.size() > 0) result = top.remove(); + String nexthash = nextFromDelayed(); + //System.out.println("*** nextFromDelayed=" + nexthash); + if (nexthash == null && this.top.size() > 0) { + nexthash = top.remove(); + //System.out.println("*** top.remove()=" + nexthash); + } // check minimumDelta and if necessary force a sleep //final int s = urlFileIndex.size(); - Row.Entry rowEntry = (result == null) ? null : urlFileIndex.remove(result.getBytes()); + Row.Entry rowEntry = (nexthash == null) ? null : urlFileIndex.remove(nexthash.getBytes()); if (rowEntry == null) { + //System.out.println("*** rowEntry=null, nexthash=" + nexthash); rowEntry = urlFileIndex.removeOne(); - result = (rowEntry == null) ? null : new String(rowEntry.getPrimaryKeyBytes()); + if (rowEntry == null) { + nexthash = null; + } else { + nexthash = new String(rowEntry.getPrimaryKeyBytes()); + //System.out.println("*** rowEntry.getPrimaryKeyBytes()=" + nexthash); + } + } if (rowEntry == null) { Log.logWarning("Balancer", "removeOne() failed - size = " + this.size()); @@ -334,18 +347,28 @@ public class Balancer { return null; } // depending on the caching policy we need sleep time to avoid DoS-like situations - sleeptime = (profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_CACHEONLY) ? 0 : Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server + sleeptime = ( + profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_CACHEONLY || + (profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_IFEXIST && Cache.has(crawlEntry.url())) + ) ? 
diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java
index eca4c0962..9e47ced54 100644
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -87,12 +87,12 @@ public class CrawlQueues {
         // tests if hash occurrs in any database
         // if it exists, the name of the database is returned,
         // if it not exists, null is returned
-        if (noticeURL.existsInStack(hash)) return "crawler";
         if (delegatedURL.exists(hash)) return "delegated";
         if (errorURL.exists(hash)) return "errors";
         for (final crawlWorker worker: workers.values()) {
             if (worker.request.url().hash().equals(hash)) return "worker";
         }
+        if (noticeURL.existsInStack(hash)) return "crawler";
         return null;
     }
@@ -105,8 +105,6 @@ public class CrawlQueues {
     public yacyURL getURL(final String urlhash) {
         assert urlhash != null;
         if (urlhash == null || urlhash.length() == 0) return null;
-        final Request ne = noticeURL.get(urlhash);
-        if (ne != null) return ne.url();
         ZURL.Entry ee = delegatedURL.getEntry(urlhash);
         if (ee != null) return ee.url();
         ee = errorURL.getEntry(urlhash);
@@ -114,6 +112,8 @@ public class CrawlQueues {
         for (final crawlWorker w: workers.values()) {
             if (w.request.url().hash().equals(urlhash)) return w.request.url();
         }
+        final Request ne = noticeURL.get(urlhash);
+        if (ne != null) return ne.url();
         return null;
     }

diff --git a/source/de/anomic/crawler/Latency.java b/source/de/anomic/crawler/Latency.java
index 798a22eb7..2d304a935 100644
--- a/source/de/anomic/crawler/Latency.java
+++ b/source/de/anomic/crawler/Latency.java
@@ -202,7 +202,7 @@ public class Latency {
     }
     public void update(long time) {
         this.lastacc = System.currentTimeMillis();
-        this.timeacc += time;
+        this.timeacc += Math.min(30000, time);
         this.count++;
     }
     public void update() {

diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java
index ed07c6bd4..2c0d34cc6 100644
--- a/source/de/anomic/crawler/RobotsTxt.java
+++ b/source/de/anomic/crawler/RobotsTxt.java
@@ -202,7 +202,7 @@ public class RobotsTxt {
         int sz = this.robotsTable.size();
         addEntry(robotsTxt4Host);
         if (this.robotsTable.size() <= sz) {
-            Log.logSevere("RobotsTxt", "new entry in robots.txt table failed, reseing database");
+            Log.logSevere("RobotsTxt", "new entry in robots.txt table failed, resetting database");
            this.resetDatabase();
            addEntry(robotsTxt4Host);
         }
     }
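The one-line Latency change caps a single measured access time at 30 seconds before it enters the accumulator, so one hanging request no longer dominates the average that is derived from timeacc and count. A self-contained illustration of the effect (the class is only a sketch; the field names and the Math.min(30000, time) cap are taken from the patch, the rest is assumed):

// Sketch of the capped latency accumulator from Latency.java above.
public final class HostLatency {

    private long timeacc = 0; // accumulated access time in ms
    private long count = 0;   // number of measured accesses

    public void update(long time) {
        // cap a single measurement at 30s so one stalled request
        // cannot blow up the average used for politeness delays
        this.timeacc += Math.min(30000, time);
        this.count++;
    }

    public long average() {
        return count == 0 ? 0 : timeacc / count;
    }

    public static void main(String[] args) {
        HostLatency l = new HostLatency();
        l.update(200);
        l.update(300);
        l.update(600000); // a 10-minute hang is counted as 30s at most
        System.out.println(l.average()); // ~10166 ms instead of ~200166 ms without the cap
    }
}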
diff --git a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java
index c8251600e..398d89ec9 100644
--- a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java
+++ b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java
@@ -200,7 +200,7 @@ public final class LoaderDispatcher {
         // now forget about the cache, nothing there. Try to load the content from the internet
 
         // check access time: this is a double-check (we checked possibly already in the balancer)
-        // to make shure that we don't DoS the target by mistake
+        // to make sure that we don't DoS the target by mistake
         if (!request.url().isLocal()) {
             final Long lastAccess = accessTime.get(host);
             long wait = 0;
@@ -214,7 +214,7 @@ public final class LoaderDispatcher {
             }
         }
 
-        // now it's for shure that we will access the target. Remember the access time
+        // now it's for sure that we will access the target. Remember the access time
         accessTime.put(host, System.currentTimeMillis());
 
         // load resource from the internet

diff --git a/source/de/anomic/document/parser/swfParser.java b/source/de/anomic/document/parser/swfParser.java
index a2a60cc88..b64cef0a1 100644
--- a/source/de/anomic/document/parser/swfParser.java
+++ b/source/de/anomic/document/parser/swfParser.java
@@ -78,6 +78,8 @@ public class swfParser extends AbstractParser implements Idiom {
         String contents = "";
         try {
             contents = swf2html.convertSWFToHTML(source);
+        } catch (NegativeArraySizeException e) {
+            // seen in log
         } catch (Exception e) {
             // we have seen a lot of OOM errors in the parser...
             e.printStackTrace();
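The swfParser hunk narrows the error handling: a NegativeArraySizeException thrown by the SWF-to-HTML conversion of corrupt Flash files ("seen in log") is now caught explicitly instead of falling through to the generic Exception handler. A self-contained sketch of the pattern (the Converter interface is a hypothetical stand-in for the swf2html object, not its real API):

// Sketch of catching a known RuntimeException from a converter separately,
// as done for swf2html.convertSWFToHTML above. The converter is a stand-in.
public final class SwfTextExtractor {

    interface Converter {
        String convertSWFToHTML(byte[] source);
    }

    static String extract(Converter swf2html, byte[] source) {
        String contents = "";
        try {
            contents = swf2html.convertSWFToHTML(source);
        } catch (NegativeArraySizeException e) {
            // known failure mode for corrupt SWF input: keep the empty result
        } catch (Exception e) {
            // any other failure is logged and otherwise ignored
            e.printStackTrace();
        }
        return contents;
    }
}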