- fixes for some problems with the new crawling/caching strategies

- speed enhancements for the cache-only cache policy by using special no-delay rules in the balancer (see the sketch below, ahead of the diff)
- fixed some deadlock and 100% CPU problems in the balancer

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6243 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent 634a01a9a4
commit c0e17de2fb
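The commit message above refers to a new no-delay rule in the Balancer: requests under the cache-only policy, and if-exist requests whose content is already in the web cache, are handed out without the usual per-host politeness delay, while all other requests still wait for Latency.waitingRemaining(). The following standalone Java sketch is an illustration only, not YaCy code; CacheStrategy, cachedAlready and remainingDelay are stand-ins for CrawlProfile.CACHE_STRATEGY_*, Cache.has(url) and Latency.waitingRemaining(...) as they appear in the Balancer hunk further down.

public class NoDelayRuleSketch {

    // stand-in for the CrawlProfile.CACHE_STRATEGY_* constants used in the diff
    enum CacheStrategy { NOCACHE, IFFRESH, IFEXIST, CACHEONLY }

    // cachedAlready stands in for Cache.has(url); remainingDelay for
    // Latency.waitingRemaining(url, minimumLocalDelta, minimumGlobalDelta)
    static long sleepTime(CacheStrategy strategy, boolean cachedAlready, long remainingDelay) {
        if (strategy == CacheStrategy.CACHEONLY) return 0;                // never loads from the network
        if (strategy == CacheStrategy.IFEXIST && cachedAlready) return 0; // will be answered from the cache
        return remainingDelay;                                            // otherwise respect the crawl delay
    }

    public static void main(String[] args) {
        System.out.println(sleepTime(CacheStrategy.CACHEONLY, false, 500L)); // 0
        System.out.println(sleepTime(CacheStrategy.IFEXIST, true, 500L));    // 0
        System.out.println(sleepTime(CacheStrategy.IFFRESH, true, 500L));    // 500
    }
}

The deadlock fix mentioned above shows up in the same file: get() and has() in Balancer lose their synchronized modifier, and the selection loop now tracks a failhash to prevent endless loops when a delayed entry is put back.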

@@ -200,8 +200,8 @@
<td><label for="mustmatch">Policy for usage of Web Cache</label>:</td>
<td>
<input type="radio" name="cachePolicy" value="nocache" />no&nbsp;cache&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="ifexist" />if&nbsp;exist&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="iffresh" checked="checked" />if&nbsp;fresh&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="ifexist" />if&nbsp;exist&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="cacheonly" />cache&nbsp;only
</td>
<td>

@@ -76,6 +76,8 @@ public class ViewImage {
urlString = (url == null) ? null : url.toNormalform(true, true);
}
if (urlString == null) return null;
int width = post.getInt("width", 0);
int height = post.getInt("height", 0);
int maxwidth = post.getInt("maxwidth", 0);

@@ -40,10 +40,24 @@ public class getpageinfo_p {
url = "http://" + url;
}
if (actions.indexOf("title")>=0) {
yacyURL u = null;
try {
final yacyURL u = new yacyURL(url, null);
final ContentScraper scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH);
u = new yacyURL(url, null);
} catch (final MalformedURLException e) {
// fail, do nothing
}
ContentScraper scraper = null;
if (u != null) try {
scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH);
} catch (final IOException e) {
// try again, try harder
try {
scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFEXIST);
} catch (final IOException ee) {
// now thats a fail, do nothing
}
}
if (scraper != null) {
// put the document title
prop.putXML("title", scraper.getTitle());
@@ -54,11 +68,11 @@ public class getpageinfo_p {
final String list[]=scraper.getKeywords();
int count = 0;
for(int i=0;i<list.length;i++){
String tag = list[i];
if (!tag.equals("")) {
prop.putXML("tags_"+count+"_tag", tag);
count++;
}
String tag = list[i];
if (!tag.equals("")) {
prop.putXML("tags_"+count+"_tag", tag);
count++;
}
}
prop.put("tags", count);
// put description
@@ -66,9 +80,6 @@ public class getpageinfo_p {
// put language
Set<String> languages = scraper.getContentLanguages();
prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next());
} catch (final MalformedURLException e) { /* ignore this */
} catch (final IOException e) { /* ignore this */
}
}
if(actions.indexOf("robots")>=0){

@@ -34,6 +34,7 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import de.anomic.crawler.retrieval.Request;
import de.anomic.http.client.Cache;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.index.ObjectIndex;
import de.anomic.kelondro.order.CloneableIterator;
@@ -107,7 +108,7 @@ public class Balancer {
}
}
public synchronized Request get(final String urlhash) throws IOException {
public Request get(final String urlhash) throws IOException {
assert urlhash != null;
if (urlFileIndex == null) return null; // case occurs during shutdown
final Row.Entry entry = urlFileIndex.get(urlhash.getBytes());
@@ -189,7 +190,7 @@ public class Balancer {
return removedCounter;
}
public synchronized boolean has(final String urlhash) {
public boolean has(final String urlhash) {
return urlFileIndex.has(urlhash.getBytes());
}
@@ -305,17 +306,29 @@ public class Balancer {
long sleeptime = 0;
Request crawlEntry = null;
synchronized (this) {
String failhash = null;
while (this.urlFileIndex.size() > 0) {
// first simply take one of the entries in the top list, that should be one without any delay
String result = nextFromDelayed();
if (result == null && this.top.size() > 0) result = top.remove();
String nexthash = nextFromDelayed();
//System.out.println("*** nextFromDelayed=" + nexthash);
if (nexthash == null && this.top.size() > 0) {
nexthash = top.remove();
//System.out.println("*** top.remove()=" + nexthash);
}
// check minimumDelta and if necessary force a sleep
//final int s = urlFileIndex.size();
Row.Entry rowEntry = (result == null) ? null : urlFileIndex.remove(result.getBytes());
Row.Entry rowEntry = (nexthash == null) ? null : urlFileIndex.remove(nexthash.getBytes());
if (rowEntry == null) {
//System.out.println("*** rowEntry=null, nexthash=" + nexthash);
rowEntry = urlFileIndex.removeOne();
result = (rowEntry == null) ? null : new String(rowEntry.getPrimaryKeyBytes());
if (rowEntry == null) {
nexthash = null;
} else {
nexthash = new String(rowEntry.getPrimaryKeyBytes());
//System.out.println("*** rowEntry.getPrimaryKeyBytes()=" + nexthash);
}
}
if (rowEntry == null) {
Log.logWarning("Balancer", "removeOne() failed - size = " + this.size());
@@ -334,18 +347,28 @@ public class Balancer {
return null;
}
// depending on the caching policy we need sleep time to avoid DoS-like situations
sleeptime = (profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_CACHEONLY) ? 0 : Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
sleeptime = (
profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_CACHEONLY ||
(profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_IFEXIST && Cache.has(crawlEntry.url()))
) ? 0 : Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
assert nexthash.equals(new String(rowEntry.getPrimaryKeyBytes())) : "result = " + nexthash + ", rowEntry.getPrimaryKeyBytes() = " + new String(rowEntry.getPrimaryKeyBytes());
assert nexthash.equals(crawlEntry.url().hash()) : "result = " + nexthash + ", crawlEntry.url().hash() = " + crawlEntry.url().hash();
assert result.equals(new String(rowEntry.getPrimaryKeyBytes())) : "result = " + result + ", rowEntry.getPrimaryKeyBytes() = " + new String(rowEntry.getPrimaryKeyBytes());
assert result.equals(crawlEntry.url().hash()) : "result = " + result + ", crawlEntry.url().hash() = " + crawlEntry.url().hash();
if (this.domainStacks.size() <= 1) break;
if (failhash != null && failhash.equals(nexthash)) break; // prevent endless loops
if (delay && sleeptime > 0) {
//System.out.println("*** putback: nexthash=" + nexthash + ", failhash="+failhash);
// put that thing back to omit a delay here
this.delayed.put(new Long(System.currentTimeMillis() + sleeptime + 1), result);
if (!delayed.values().contains(nexthash)) {
//System.out.println("*** delayed +=" + nexthash);
this.delayed.put(new Long(System.currentTimeMillis() + sleeptime + 1), nexthash);
}
this.urlFileIndex.put(rowEntry);
this.domainStacks.remove(result.substring(6));
continue;
this.domainStacks.remove(nexthash.substring(6));
failhash = nexthash;
continue;
}
break;
}

@@ -87,12 +87,12 @@ public class CrawlQueues {
// tests if hash occurrs in any database
// if it exists, the name of the database is returned,
// if it not exists, null is returned
if (noticeURL.existsInStack(hash)) return "crawler";
if (delegatedURL.exists(hash)) return "delegated";
if (errorURL.exists(hash)) return "errors";
for (final crawlWorker worker: workers.values()) {
if (worker.request.url().hash().equals(hash)) return "worker";
}
if (noticeURL.existsInStack(hash)) return "crawler";
return null;
}
@@ -105,8 +105,6 @@ public class CrawlQueues {
public yacyURL getURL(final String urlhash) {
assert urlhash != null;
if (urlhash == null || urlhash.length() == 0) return null;
final Request ne = noticeURL.get(urlhash);
if (ne != null) return ne.url();
ZURL.Entry ee = delegatedURL.getEntry(urlhash);
if (ee != null) return ee.url();
ee = errorURL.getEntry(urlhash);
@@ -114,6 +112,8 @@ public class CrawlQueues {
for (final crawlWorker w: workers.values()) {
if (w.request.url().hash().equals(urlhash)) return w.request.url();
}
final Request ne = noticeURL.get(urlhash);
if (ne != null) return ne.url();
return null;
}

@@ -202,7 +202,7 @@ public class Latency {
}
public void update(long time) {
this.lastacc = System.currentTimeMillis();
this.timeacc += time;
this.timeacc += Math.min(30000, time);
this.count++;
}
public void update() {

@@ -202,7 +202,7 @@ public class RobotsTxt {
int sz = this.robotsTable.size();
addEntry(robotsTxt4Host);
if (this.robotsTable.size() <= sz) {
Log.logSevere("RobotsTxt", "new entry in robots.txt table failed, reseing database");
Log.logSevere("RobotsTxt", "new entry in robots.txt table failed, resetting database");
this.resetDatabase();
addEntry(robotsTxt4Host);
}

@@ -200,7 +200,7 @@ public final class LoaderDispatcher {
// now forget about the cache, nothing there. Try to load the content from the internet
// check access time: this is a double-check (we checked possibly already in the balancer)
// to make shure that we don't DoS the target by mistake
// to make sure that we don't DoS the target by mistake
if (!request.url().isLocal()) {
final Long lastAccess = accessTime.get(host);
long wait = 0;
@@ -214,7 +214,7 @@
}
}
// now it's for shure that we will access the target. Remember the access time
// now it's for sure that we will access the target. Remember the access time
accessTime.put(host, System.currentTimeMillis());
// load resource from the internet

@@ -78,6 +78,8 @@ public class swfParser extends AbstractParser implements Idiom {
String contents = "";
try {
contents = swf2html.convertSWFToHTML(source);
} catch (NegativeArraySizeException e) {
// seen in log
} catch (Exception e) {
// we have seen a lot of OOM errors in the parser...
e.printStackTrace();
