diff --git a/htroot/CrawlStart_p.html b/htroot/CrawlStart_p.html
index 4af4ae456..61e218e53 100644
--- a/htroot/CrawlStart_p.html
+++ b/htroot/CrawlStart_p.html
@@ -200,8 +200,8 @@
: |
no cache
- if exist
if fresh
+ if exist
cache only
|
diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java
index 8303d559f..c73f81b4b 100644
--- a/htroot/ViewImage.java
+++ b/htroot/ViewImage.java
@@ -76,6 +76,8 @@ public class ViewImage {
urlString = (url == null) ? null : url.toNormalform(true, true);
}
+ if (urlString == null) return null;
+
int width = post.getInt("width", 0);
int height = post.getInt("height", 0);
int maxwidth = post.getInt("maxwidth", 0);
diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java
index d265da13c..2554cc288 100755
--- a/htroot/api/util/getpageinfo_p.java
+++ b/htroot/api/util/getpageinfo_p.java
@@ -40,10 +40,24 @@ public class getpageinfo_p {
url = "http://" + url;
}
if (actions.indexOf("title")>=0) {
+ yacyURL u = null;
try {
- final yacyURL u = new yacyURL(url, null);
- final ContentScraper scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH);
-
+ u = new yacyURL(url, null);
+ } catch (final MalformedURLException e) {
+ // fail, do nothing
+ }
+ ContentScraper scraper = null;
+ if (u != null) try {
+ scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH);
+ } catch (final IOException e) {
+ // try again, try harder
+ try {
+ scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFEXIST);
+ } catch (final IOException ee) {
// now that's a fail, do nothing
+ }
+ }
+ if (scraper != null) {
// put the document title
prop.putXML("title", scraper.getTitle());
@@ -54,11 +68,11 @@ public class getpageinfo_p {
final String list[]=scraper.getKeywords();
int count = 0;
for (int i = 0; i < list.length; i++) { /* ... put keyword list[i], count++ ... */ }
final Set<String> languages = scraper.getContentLanguages();
prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next());
-
- } catch (final MalformedURLException e) { /* ignore this */
- } catch (final IOException e) { /* ignore this */
}
}
if(actions.indexOf("robots")>=0){
diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java
index 4d976ee30..9db2b1828 100644
--- a/source/de/anomic/crawler/Balancer.java
+++ b/source/de/anomic/crawler/Balancer.java
@@ -34,6 +34,7 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import de.anomic.crawler.retrieval.Request;
+import de.anomic.http.client.Cache;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.index.ObjectIndex;
import de.anomic.kelondro.order.CloneableIterator;
@@ -107,7 +108,7 @@ public class Balancer {
}
}
- public synchronized Request get(final String urlhash) throws IOException {
+ public Request get(final String urlhash) throws IOException {
assert urlhash != null;
if (urlFileIndex == null) return null; // case occurs during shutdown
final Row.Entry entry = urlFileIndex.get(urlhash.getBytes());
@@ -189,7 +190,7 @@ public class Balancer {
return removedCounter;
}
- public synchronized boolean has(final String urlhash) {
+ public boolean has(final String urlhash) {
return urlFileIndex.has(urlhash.getBytes());
}
@@ -305,17 +306,29 @@ public class Balancer {
long sleeptime = 0;
Request crawlEntry = null;
synchronized (this) {
+ String failhash = null;
while (this.urlFileIndex.size() > 0) {
// first simply take one of the entries in the top list, that should be one without any delay
- String result = nextFromDelayed();
- if (result == null && this.top.size() > 0) result = top.remove();
+ String nexthash = nextFromDelayed();
+ //System.out.println("*** nextFromDelayed=" + nexthash);
+ if (nexthash == null && this.top.size() > 0) {
+ nexthash = top.remove();
+ //System.out.println("*** top.remove()=" + nexthash);
+ }
// check minimumDelta and if necessary force a sleep
//final int s = urlFileIndex.size();
- Row.Entry rowEntry = (result == null) ? null : urlFileIndex.remove(result.getBytes());
+ Row.Entry rowEntry = (nexthash == null) ? null : urlFileIndex.remove(nexthash.getBytes());
if (rowEntry == null) {
+ //System.out.println("*** rowEntry=null, nexthash=" + nexthash);
rowEntry = urlFileIndex.removeOne();
- result = (rowEntry == null) ? null : new String(rowEntry.getPrimaryKeyBytes());
+ if (rowEntry == null) {
+ nexthash = null;
+ } else {
+ nexthash = new String(rowEntry.getPrimaryKeyBytes());
+ //System.out.println("*** rowEntry.getPrimaryKeyBytes()=" + nexthash);
+ }
+
}
if (rowEntry == null) {
Log.logWarning("Balancer", "removeOne() failed - size = " + this.size());
@@ -334,18 +347,28 @@ public class Balancer {
return null;
}
// depending on the caching policy we need sleep time to avoid DoS-like situations
- sleeptime = (profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_CACHEONLY) ? 0 : Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
+ sleeptime = (
+ profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_CACHEONLY ||
+ (profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_IFEXIST && Cache.has(crawlEntry.url()))
+ ) ? 0 : Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
+
+ assert nexthash.equals(new String(rowEntry.getPrimaryKeyBytes())) : "result = " + nexthash + ", rowEntry.getPrimaryKeyBytes() = " + new String(rowEntry.getPrimaryKeyBytes());
+ assert nexthash.equals(crawlEntry.url().hash()) : "result = " + nexthash + ", crawlEntry.url().hash() = " + crawlEntry.url().hash();
- assert result.equals(new String(rowEntry.getPrimaryKeyBytes())) : "result = " + result + ", rowEntry.getPrimaryKeyBytes() = " + new String(rowEntry.getPrimaryKeyBytes());
- assert result.equals(crawlEntry.url().hash()) : "result = " + result + ", crawlEntry.url().hash() = " + crawlEntry.url().hash();
if (this.domainStacks.size() <= 1) break;
+ if (failhash != null && failhash.equals(nexthash)) break; // prevent endless loops
if (delay && sleeptime > 0) {
+ //System.out.println("*** putback: nexthash=" + nexthash + ", failhash="+failhash);
// put that thing back to omit a delay here
- this.delayed.put(new Long(System.currentTimeMillis() + sleeptime + 1), result);
+ if (!delayed.values().contains(nexthash)) {
+ //System.out.println("*** delayed +=" + nexthash);
+ this.delayed.put(new Long(System.currentTimeMillis() + sleeptime + 1), nexthash);
+ }
this.urlFileIndex.put(rowEntry);
- this.domainStacks.remove(result.substring(6));
- continue;
+ this.domainStacks.remove(nexthash.substring(6));
+ failhash = nexthash;
+ continue;
}
break;
}
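A small sketch of the delay decision the Balancer hunk above adds: when an entry will be served from the local cache anyway (CACHEONLY, or IFEXIST with a cache hit), the politeness delay can be skipped because the remote host is never contacted. CacheStrategy, cachedCopyExists and waitingRemaining below are simplified stand-ins for YaCy's CrawlProfile constants, Cache.has() and Latency.waitingRemaining().

    // Stand-in sketch of the cache-aware sleep-time computation; not the YaCy Balancer itself.
    public class DelayDecision {

        enum CacheStrategy { NOCACHE, IFFRESH, IFEXIST, CACHEONLY }

        // placeholder for Latency.waitingRemaining(): the per-host politeness timer
        static long waitingRemaining(String url, long minDelayMillis) {
            return minDelayMillis;
        }

        static long sleeptime(CacheStrategy strategy, boolean cachedCopyExists,
                              String url, long minDelayMillis) {
            boolean servedFromCache =
                    strategy == CacheStrategy.CACHEONLY ||
                    (strategy == CacheStrategy.IFEXIST && cachedCopyExists);
            // no remote access planned -> no DoS risk -> no delay needed
            return servedFromCache ? 0 : waitingRemaining(url, minDelayMillis);
        }

        public static void main(String[] args) {
            System.out.println(sleeptime(CacheStrategy.IFEXIST, true, "http://example.org/", 500));  // 0
            System.out.println(sleeptime(CacheStrategy.IFFRESH, true, "http://example.org/", 500));  // 500
        }
    }

The failhash bookkeeping in the same hunk serves a separate purpose: it remembers the last entry that was pushed back into the delayed queue, so the selection loop cannot pick the same entry forever.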
diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java
index eca4c0962..9e47ced54 100644
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -87,12 +87,12 @@ public class CrawlQueues {
// tests if hash occurrs in any database
// if it exists, the name of the database is returned,
// if it not exists, null is returned
- if (noticeURL.existsInStack(hash)) return "crawler";
if (delegatedURL.exists(hash)) return "delegated";
if (errorURL.exists(hash)) return "errors";
for (final crawlWorker worker: workers.values()) {
if (worker.request.url().hash().equals(hash)) return "worker";
}
+ if (noticeURL.existsInStack(hash)) return "crawler";
return null;
}
@@ -105,8 +105,6 @@ public class CrawlQueues {
public yacyURL getURL(final String urlhash) {
assert urlhash != null;
if (urlhash == null || urlhash.length() == 0) return null;
- final Request ne = noticeURL.get(urlhash);
- if (ne != null) return ne.url();
ZURL.Entry ee = delegatedURL.getEntry(urlhash);
if (ee != null) return ee.url();
ee = errorURL.getEntry(urlhash);
@@ -114,6 +112,8 @@ public class CrawlQueues {
for (final crawlWorker w: workers.values()) {
if (w.request.url().hash().equals(urlhash)) return w.request.url();
}
+ final Request ne = noticeURL.get(urlhash);
+ if (ne != null) return ne.url();
return null;
}
diff --git a/source/de/anomic/crawler/Latency.java b/source/de/anomic/crawler/Latency.java
index 798a22eb7..2d304a935 100644
--- a/source/de/anomic/crawler/Latency.java
+++ b/source/de/anomic/crawler/Latency.java
@@ -202,7 +202,7 @@ public class Latency {
}
public void update(long time) {
this.lastacc = System.currentTimeMillis();
- this.timeacc += time;
+ this.timeacc += Math.min(30000, time);
this.count++;
}
public void update() {
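The one-line Latency change above caps each measured access time before it is added to the running total, so a single stalled request cannot dominate the average used for crawl pacing. A sketch of the effect, with a simplified stand-in for the per-host record:

    // Simplified stand-in for Latency's per-host statistics; only the clamp is the point here.
    public class LatencyClamp {

        static final long CAP_MILLIS = 30000; // same cap as in the patch

        static class Host {
            long timeacc = 0;
            long count = 0;

            void update(long timeMillis) {
                this.timeacc += Math.min(CAP_MILLIS, timeMillis);
                this.count++;
            }

            long average() {
                return (count == 0) ? 0 : timeacc / count;
            }
        }

        public static void main(String[] args) {
            Host h = new Host();
            h.update(200);
            h.update(600000); // a hanging request counts as 30000, not 600000
            System.out.println(h.average()); // 15100 instead of 300100
        }
    }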
diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java
index ed07c6bd4..2c0d34cc6 100644
--- a/source/de/anomic/crawler/RobotsTxt.java
+++ b/source/de/anomic/crawler/RobotsTxt.java
@@ -202,7 +202,7 @@ public class RobotsTxt {
int sz = this.robotsTable.size();
addEntry(robotsTxt4Host);
if (this.robotsTable.size() <= sz) {
- Log.logSevere("RobotsTxt", "new entry in robots.txt table failed, reseing database");
+ Log.logSevere("RobotsTxt", "new entry in robots.txt table failed, resetting database");
this.resetDatabase();
addEntry(robotsTxt4Host);
}
diff --git a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java
index c8251600e..398d89ec9 100644
--- a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java
+++ b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java
@@ -200,7 +200,7 @@ public final class LoaderDispatcher {
// now forget about the cache, nothing there. Try to load the content from the internet
// check access time: this is a double-check (we checked possibly already in the balancer)
- // to make shure that we don't DoS the target by mistake
+ // to make sure that we don't DoS the target by mistake
if (!request.url().isLocal()) {
final Long lastAccess = accessTime.get(host);
long wait = 0;
@@ -214,7 +214,7 @@ public final class LoaderDispatcher {
}
}
- // now it's for shure that we will access the target. Remember the access time
+ // now it's for sure that we will access the target. Remember the access time
accessTime.put(host, System.currentTimeMillis());
// load resource from the internet
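A sketch of the per-host double-check the LoaderDispatcher comments above describe: before a non-local host is contacted, look up its last access time, sleep until a minimum gap has passed, then record the new access. The class and member names here (HostThrottle, awaitTurn, minimumGapMillis) are hypothetical, not the LoaderDispatcher API.

    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;

    // Hypothetical per-host politeness throttle, modelled on the accessTime map above.
    public class HostThrottle {

        private final Map<String, Long> accessTime = new ConcurrentHashMap<String, Long>();
        private final long minimumGapMillis;

        public HostThrottle(long minimumGapMillis) {
            this.minimumGapMillis = minimumGapMillis;
        }

        public void awaitTurn(String host) throws InterruptedException {
            final Long lastAccess = accessTime.get(host);
            if (lastAccess != null) {
                final long wait = lastAccess.longValue() + minimumGapMillis - System.currentTimeMillis();
                if (wait > 0) Thread.sleep(wait); // double-check: make sure we don't DoS the target by mistake
            }
            // now it's for sure that we will access the target: remember the access time
            accessTime.put(host, System.currentTimeMillis());
        }

        public static void main(String[] args) throws InterruptedException {
            HostThrottle throttle = new HostThrottle(250);
            long start = System.currentTimeMillis();
            throttle.awaitTurn("example.org");
            throttle.awaitTurn("example.org"); // the second call waits roughly 250 ms
            System.out.println("elapsed ms: " + (System.currentTimeMillis() - start));
        }
    }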
diff --git a/source/de/anomic/document/parser/swfParser.java b/source/de/anomic/document/parser/swfParser.java
index a2a60cc88..b64cef0a1 100644
--- a/source/de/anomic/document/parser/swfParser.java
+++ b/source/de/anomic/document/parser/swfParser.java
@@ -78,6 +78,8 @@ public class swfParser extends AbstractParser implements Idiom {
String contents = "";
try {
contents = swf2html.convertSWFToHTML(source);
+ } catch (NegativeArraySizeException e) {
+ // seen in log
} catch (Exception e) {
// we have seen a lot of OOM errors in the parser...
e.printStackTrace();
|