snippet retrieval loading processes may use a smaller minimum load time value than crawling processes. This speeds up the search result preparation dramatically.
pull/1/head
Michael Peter Christen 13 years ago
parent ef488a15f7
commit 24d9db1613
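
What changed, in outline: LoaderDispatcher keeps a per-host table of last access times and forces a sleep when the same host is contacted again too soon. Before this commit the minimum gap was a class-wide constant (minDelay = 250 ms); now every load call passes its own minimum delay, with crawler-driven callers using CrawlQueues.queuedMinLoadDelay (500 ms) and snippet/search-time callers using TextSnippet.snippetMinLoadDelay (10 ms). A minimal, self-contained sketch of that throttling idea (class and method names here are illustrative, not YaCy's):

import java.util.concurrent.ConcurrentHashMap;

public final class HostDelay {
    // last access time per host, in the spirit of LoaderDispatcher's accessTime map
    private static final ConcurrentHashMap<String, Long> accessTime =
            new ConcurrentHashMap<String, Long>();

    /** Block until at least minDelay milliseconds have passed since the last access to host. */
    public static void awaitSlot(final String host, final long minDelay) {
        final Long last = accessTime.get(host);
        if (last != null) {
            final long wait = minDelay - (System.currentTimeMillis() - last.longValue());
            if (wait > 0) {
                try { Thread.sleep(wait); } catch (final InterruptedException e) {}
            }
        }
        accessTime.put(host, Long.valueOf(System.currentTimeMillis()));
    }

    public static void main(final String[] args) {
        awaitSlot("example.org", 500); // crawler-style politeness: may sleep
        awaitSlot("example.org", 10);  // snippet-style budget: returns almost immediately
    }
}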

@@ -50,6 +50,7 @@ import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.logging.Log;
import net.yacy.peers.NewsPool;
import net.yacy.search.Switchboard;
+import net.yacy.search.snippet.TextSnippet;
import de.anomic.data.BookmarkHelper;
import de.anomic.data.BookmarksDB;
import de.anomic.data.BookmarksDB.Bookmark;
@@ -196,7 +197,7 @@ public class Bookmarks {
// try to get the bookmark from the LURL database
final URIMetadata urlentry = sb.index.urlMetadata().load(ASCII.getBytes(urlHash));
if (urlentry != null) try {
-final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null));
+final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay));
prop.put("mode_edit", "0"); // create mode
prop.put("mode_url", urlentry.url().toNormalform(false, true));
prop.putHTML("mode_title", urlentry.dc_title());

@@ -55,6 +55,7 @@ import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.Segment;
import de.anomic.crawler.CrawlProfile;
+import de.anomic.crawler.CrawlQueues;
import de.anomic.crawler.SitemapImporter;
import de.anomic.crawler.ZURL.FailCategory;
import de.anomic.crawler.retrieval.Request;
@@ -323,7 +324,7 @@ public class Crawler_p {
sb.crawlQueues.errorURL.remove(urlhash);
// get a scraper to get the title
-final Document scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER);
+final Document scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
final String title = scraper == null ? url.toNormalform(true, true) : scraper.dc_title();
final String description = scraper.dc_description();
@@ -551,7 +552,7 @@ public class Crawler_p {
try {
final DigestURI sitelistURL = new DigestURI(crawlingStart);
// download document
-Document scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER);
+Document scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
// String title = scraper.getTitle();
// String description = scraper.getDescription();

@@ -30,6 +30,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.search.Switchboard;
+import de.anomic.crawler.CrawlQueues;
import de.anomic.crawler.retrieval.Response;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -65,7 +66,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geon0Load")) {
// load from the net
try {
-final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
+final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEON0.file());
LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON0.file(), null, -1));
@@ -107,7 +108,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geon1Load")) {
// load from the net
try {
-final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
+final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEON1.file());
LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON1.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON1.file(), null, -1));
@@ -149,7 +150,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geon2Load")) {
// load from the net
try {
-final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
+final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEON2.file());
LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON2.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON2.file(), null, 100000));
@@ -191,7 +192,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geo1Load")) {
// load from the net
try {
-final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
+final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEODB1.file());
LibraryProvider.geoLoc.deactivateLocalization(LibraryProvider.Dictionary.GEODB1.nickname);
@@ -234,7 +235,7 @@ public class DictionaryLoader_p {
if (post.containsKey("drw0Load")) {
// load from the net
try {
-final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
+final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.DRW0.file());
LibraryProvider.activateDeReWo();
@@ -278,7 +279,7 @@ public class DictionaryLoader_p {
if (post.containsKey("pnd0Load")) {
// load from the net
try {
-final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
+final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
final byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.PND0.file());
LibraryProvider.activatePND();

@@ -43,6 +43,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
+import de.anomic.crawler.CrawlQueues;
import de.anomic.crawler.RSSLoader;
import de.anomic.crawler.retrieval.Response;
import de.anomic.data.WorkTables;
@@ -256,7 +257,7 @@ public class Load_RSS_p {
RSSReader rss = null;
if (url != null) try {
prop.put("url", url.toNormalform(true, false));
-final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER);
+final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
final byte[] resource = response == null ? null : response.getContent();
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
} catch (final IOException e) {

@@ -58,6 +58,7 @@ import net.yacy.search.index.Segment;
import com.hp.hpl.jena.rdf.model.Model;
import de.anomic.crawler.Cache;
+import de.anomic.crawler.CrawlQueues;
import de.anomic.crawler.retrieval.Response;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -164,7 +165,7 @@ public class ViewFile {
Response response = null;
try {
-response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, null);
+response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
} catch (final IOException e) {
prop.put("error", "4");
prop.put("error_errorText", "error loading resource: " + e.getMessage());

@@ -44,6 +44,7 @@ import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
+import de.anomic.crawler.CrawlQueues;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -96,7 +97,7 @@ public class ViewImage {
if (image == null) {
byte[] resourceb = null;
if (url != null) try {
-resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH);
+resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH, CrawlQueues.queuedMinLoadDelay);
} catch (final IOException e) {
Log.logFine("ViewImage", "cannot load: " + e.getMessage());
}

@@ -45,6 +45,7 @@ import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
+import de.anomic.crawler.CrawlQueues;
import de.anomic.crawler.RobotsTxtEntry;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -95,7 +96,7 @@ public class getpageinfo {
}
net.yacy.document.Document scraper = null;
if (u != null) try {
-scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER);
+scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
} catch (final IOException e) {
Log.logException(e);
// bad things are possible, i.e. that the Server responds with "403 Bad Behavior"

@@ -45,6 +45,7 @@ import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
+import de.anomic.crawler.CrawlQueues;
import de.anomic.crawler.RobotsTxtEntry;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -95,7 +96,7 @@ public class getpageinfo_p {
}
net.yacy.document.Document scraper = null;
if (u != null) try {
-scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER);
+scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
} catch (final IOException e) {
Log.logException(e);
// bad things are possible, i.e. that the Server responds with "403 Bad Behavior"

@@ -41,6 +41,7 @@ import net.yacy.kelondro.rwi.IndexCell;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.peers.graphics.WebStructureGraph;
import net.yacy.search.Switchboard;
+import de.anomic.crawler.CrawlQueues;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -97,7 +98,7 @@ public class webstructure {
prop.put("references", 1);
net.yacy.document.Document scraper = null;
if (url != null) try {
-scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST, null);
+scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST, null, CrawlQueues.queuedMinLoadDelay);
} catch (final IOException e) {
Log.logException(e);
}

@@ -80,6 +80,7 @@ import net.yacy.search.query.SearchEvent;
import net.yacy.search.query.SearchEventCache;
import net.yacy.search.query.SnippetProcess;
import net.yacy.search.ranking.RankingProfile;
+import net.yacy.search.snippet.TextSnippet;
import de.anomic.data.DidYouMean;
import de.anomic.data.UserDB;
import de.anomic.data.ymark.YMarkTables;
@@ -668,7 +669,7 @@ public class yacysearch {
sb.loader.loadDocuments(
sb.loader.request(urlentry.url(), true, false),
CacheStrategy.IFEXIST,
-Integer.MAX_VALUE, BlacklistType.SEARCH);
+Integer.MAX_VALUE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
} catch ( final IOException e ) {
} catch ( final Parser.Failure e ) {
}

@@ -183,7 +183,7 @@ public class yacysearchitem {
// END interaction
prop.putHTML("content_target", target);
-if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null);
+if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null, TextSnippet.snippetMinLoadDelay);
prop.putHTML("content_faviconCode", sb.licensedURLs.aquireLicense(faviconURL)); // acquire license for favicon url loading
prop.put("content_urlhash", resulthashString);
prop.put("content_ranking", result.ranking);
@@ -266,7 +266,7 @@ public class yacysearchitem {
final String target = sb.getConfig(resultUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self");
final String license = sb.licensedURLs.aquireLicense(ms.url());
-sb.loader.loadIfNotExistBackground(ms.url(), 1024 * 1024 * 10, null);
+sb.loader.loadIfNotExistBackground(ms.url(), 1024 * 1024 * 10, null, TextSnippet.snippetMinLoadDelay);
prop.putHTML("content_item_hrefCache", (auth) ? "/ViewImage.png?url=" + resultUrlstring : resultUrlstring);
prop.putHTML("content_item_href", resultUrlstring);
prop.putHTML("content_item_target", target);

@@ -60,6 +60,7 @@ import de.anomic.crawler.retrieval.Response;
public class CrawlQueues {
+public static final long queuedMinLoadDelay = 500;
private static final String ERROR_DB_FILENAME = "urlError4.db";
private static final String DELEGATED_DB_FILENAME = "urlDelegated4.db";
@@ -656,7 +657,7 @@ public class CrawlQueues {
try {
this.request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
final CrawlProfile e = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
-final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), BlacklistType.CRAWLER);
+final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), BlacklistType.CRAWLER, queuedMinLoadDelay);
if (response == null) {
this.request.setStatus("error", WorkflowJob.STATUS_FINISHED);
if (CrawlQueues.this.log.isFine()) {

@@ -63,7 +63,7 @@ public class RSSLoader extends Thread {
public void run() {
RSSReader rss = null;
try {
-final Response response = this.sb.loader.load(this.sb.loader.request(this.urlf, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER);
+final Response response = this.sb.loader.load(this.sb.loader.request(this.urlf, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
final byte[] resource = response == null ? null : response.getContent();
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
} catch (final MalformedURLException e) {

@@ -22,6 +22,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.repository.LoaderDispatcher;
+import net.yacy.search.snippet.TextSnippet;
import de.anomic.crawler.retrieval.Response;
public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandler {
@@ -68,7 +69,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
return null;
}
try {
-response = loader.load(loader.request(uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null);
+response = loader.load(loader.request(uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
} catch (final IOException e) {
Log.logWarning(YMarkTables.BOOKMARKS_LOG, "loadDocument failed due to IOException for url: "+url);
return null;

@@ -39,6 +39,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.index.Segment;
+import net.yacy.search.snippet.TextSnippet;
import de.anomic.crawler.retrieval.Response;
public class YMarkMetadata {
@@ -97,7 +98,7 @@ public class YMarkMetadata {
public Document loadDocument(final LoaderDispatcher loader) throws IOException, Failure {
if(this.document == null) {
Response response = null;
-response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null);
+response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
this.document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
}
return this.document;

@@ -45,6 +45,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.LoaderDispatcher;
+import net.yacy.search.snippet.TextSnippet;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
@@ -62,7 +63,7 @@ public class OAIListFriendsLoader implements Serializable {
listFriends.putAll(moreFriends);
if (loader != null) for (final Map.Entry<String, File> oaiFriend: listFriends.entrySet()) {
try {
-loader.loadIfNotExistBackground(new DigestURI(oaiFriend.getKey()), oaiFriend.getValue(), Integer.MAX_VALUE, null);
+loader.loadIfNotExistBackground(new DigestURI(oaiFriend.getKey()), oaiFriend.getValue(), Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
} catch (final MalformedURLException e) {
}
}
@@ -87,7 +88,7 @@ public class OAIListFriendsLoader implements Serializable {
Map<String, String> m;
for (final Map.Entry<String, File> oaiFriend: listFriends.entrySet()) try {
if (!oaiFriend.getValue().exists()) {
-final Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
+final Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
if (response != null) FileUtils.copy(response.getContent(), oaiFriend.getValue());
}

@@ -30,6 +30,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.LoaderDispatcher;
+import net.yacy.search.snippet.TextSnippet;
import de.anomic.crawler.retrieval.Response;
@@ -54,7 +55,7 @@ public class OAIPMHLoader {
for (int i = 0; i < 5; i++) {
// make some retries if first attempt fails
try {
-response = loader.load(loader.request(source, false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
+response = loader.load(loader.request(source, false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
break;
} catch (IOException e) {
Log.logWarning("OAIPMHLoader", "loading failed at attempt " + (i + 1) + ": " + source.toNormalform(true, false));

@@ -38,6 +38,7 @@ import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
+import net.yacy.search.snippet.TextSnippet;
import net.yacy.visualization.RasterPlotter;
import de.anomic.crawler.Cache;
import de.anomic.crawler.retrieval.Response;
@@ -112,7 +113,7 @@ public class OSMTile {
// download resource using the crawler and keep resource in memory if possible
Response entry = null;
try {
-entry = Switchboard.getSwitchboard().loader.load(Switchboard.getSwitchboard().loader.request(tileURL, false, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null);
+entry = Switchboard.getSwitchboard().loader.load(Switchboard.getSwitchboard().loader.request(tileURL, false, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
} catch (final IOException e) {
Log.logWarning("OSMTile", "cannot load: " + e.getMessage());
return null;

@@ -63,6 +63,7 @@ import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.OS;
import net.yacy.peers.Network;
import net.yacy.search.Switchboard;
+import de.anomic.crawler.CrawlQueues;
import de.anomic.server.serverCore;
import de.anomic.tools.CryptoLib;
import de.anomic.tools.SignatureOutputStream;
@@ -240,7 +241,7 @@ public final class yacyRelease extends yacyVersion {
try {
final DigestURI uri = location.getLocationURL();
Thread.currentThread().setName("allReleaseFrom - host " + uri.getHost()); // makes it more easy to see which release blocks process in thread dump
-scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE, null);
+scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE, null, CrawlQueues.queuedMinLoadDelay);
} catch (final IOException e) {
return null;
}

@@ -66,7 +66,6 @@ import de.anomic.crawler.retrieval.SMBLoader;
public final class LoaderDispatcher {
-private static final long minDelay = 250; // milliseconds; 4 accesses per second
private static final ConcurrentHashMap<String, Long> accessTime = new ConcurrentHashMap<String, Long>(); // to protect targets from DDoS
private final Switchboard sb;
@@ -133,9 +132,9 @@
0);
}
-public void load(final DigestURI url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile, BlacklistType blacklistType) throws IOException {
+public void load(final DigestURI url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile, BlacklistType blacklistType, final long minDelay) throws IOException {
-final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, blacklistType).getContent();
+final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, blacklistType, minDelay).getContent();
if (b == null) throw new IOException("load == null");
final File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
@@ -146,11 +145,11 @@
tmp.renameTo(targetFile);
}
-public Response load(final Request request, final CacheStrategy cacheStrategy, final BlacklistType blacklistType) throws IOException {
-return load(request, cacheStrategy, protocolMaxFileSize(request.url()), blacklistType);
+public Response load(final Request request, final CacheStrategy cacheStrategy, final BlacklistType blacklistType, final long minDelay) throws IOException {
+return load(request, cacheStrategy, protocolMaxFileSize(request.url()), blacklistType, minDelay);
}
-public Response load(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType) throws IOException {
+public Response load(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, final long minDelay) throws IOException {
Semaphore check = this.loaderSteering.get(request.url());
if (check != null) {
// a loading process may be going on for that url
@@ -161,7 +160,7 @@
this.loaderSteering.put(request.url(), new Semaphore(0));
try {
-final Response response = loadInternal(request, cacheStrategy, maxFileSize, blacklistType);
+final Response response = loadInternal(request, cacheStrategy, maxFileSize, blacklistType, minDelay);
check = this.loaderSteering.remove(request.url());
if (check != null) check.release(1000);
return response;
@@ -181,7 +180,7 @@
* @return the loaded entity in a Response object
* @throws IOException
*/
-private Response loadInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType) throws IOException {
+private Response loadInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, final long minDelay) throws IOException {
// get the protocol of the next URL
final DigestURI url = request.url();
if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
@@ -264,8 +263,11 @@
// force a sleep here. Instead just sleep we clean up the accessTime map
final long untilTime = System.currentTimeMillis() + wait;
cleanupAccessTimeTable(untilTime);
-if (System.currentTimeMillis() < untilTime)
-try {Thread.sleep(untilTime - System.currentTimeMillis());} catch (final InterruptedException ee) {}
+if (System.currentTimeMillis() < untilTime) {
+long frcdslp = untilTime - System.currentTimeMillis();
+this.log.logInfo("Forcing sleep of " + frcdslp + " ms for host " + host);
+try {Thread.sleep(frcdslp);} catch (final InterruptedException ee) {}
+}
}
}
@@ -330,19 +332,19 @@
* @return the content as {@link byte[]}
* @throws IOException
*/
-public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy, BlacklistType blacklistType) throws IOException {
+public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay) throws IOException {
// try to download the resource using the loader
-final Response entry = load(request, cacheStrategy, blacklistType);
+final Response entry = load(request, cacheStrategy, blacklistType, minDelay);
if (entry == null) return null; // not found in web
// read resource body (if it is there)
return entry.getContent();
}
-public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, BlacklistType blacklistType) throws IOException, Parser.Failure {
+public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, BlacklistType blacklistType, final long minDelay) throws IOException, Parser.Failure {
// load resource
-final Response response = load(request, cacheStrategy, maxFileSize, blacklistType);
+final Response response = load(request, cacheStrategy, maxFileSize, blacklistType, minDelay);
final DigestURI url = request.url();
if (response == null) throw new IOException("no Response for url " + url);
@@ -353,10 +355,10 @@
return response.parse();
}
-public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy, BlacklistType blacklistType) throws IOException {
+public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final long minDelay) throws IOException {
// load resource
Request request = request(location, true, false);
-final Response response = this.load(request, cachePolicy, blacklistType);
+final Response response = this.load(request, cachePolicy, blacklistType, minDelay);
final DigestURI url = request.url();
if (response == null) throw new IOException("no Response for url " + url);
@@ -379,8 +381,8 @@
* @return a map from URLs to the anchor texts of the urls
* @throws IOException
*/
-public final Map<MultiProtocolURI, String> loadLinks(final DigestURI url, final CacheStrategy cacheStrategy, BlacklistType blacklistType) throws IOException {
-final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType);
+public final Map<MultiProtocolURI, String> loadLinks(final DigestURI url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay) throws IOException {
+final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, minDelay);
if (response == null) throw new IOException("response == null");
final ResponseHeader responseHeader = response.getResponseHeader();
if (response.getContent() == null) throw new IOException("resource == null");
@@ -405,16 +407,16 @@
while (i.hasNext()) {
e = i.next();
if (System.currentTimeMillis() > timeout) break;
-if (System.currentTimeMillis() - e.getValue().longValue() > minDelay) i.remove();
+if (System.currentTimeMillis() - e.getValue().longValue() > 1000) i.remove();
}
}
-public void loadIfNotExistBackground(final DigestURI url, final File cache, final int maxFileSize, BlacklistType blacklistType) {
-new Loader(url, cache, maxFileSize, CacheStrategy.IFEXIST, blacklistType).start();
+public void loadIfNotExistBackground(final DigestURI url, final File cache, final int maxFileSize, BlacklistType blacklistType, final long minDelay) {
+new Loader(url, cache, maxFileSize, CacheStrategy.IFEXIST, blacklistType, minDelay).start();
}
-public void loadIfNotExistBackground(final DigestURI url, final int maxFileSize, BlacklistType blacklistType) {
-new Loader(url, null, maxFileSize, CacheStrategy.IFEXIST, blacklistType).start();
+public void loadIfNotExistBackground(final DigestURI url, final int maxFileSize, BlacklistType blacklistType, final long minDelay) {
+new Loader(url, null, maxFileSize, CacheStrategy.IFEXIST, blacklistType, minDelay).start();
}
private class Loader extends Thread {
@@ -424,13 +426,15 @@
private final int maxFileSize;
private final CacheStrategy cacheStrategy;
private final BlacklistType blacklistType;
+private final long minDelay;
-public Loader(final DigestURI url, final File cache, final int maxFileSize, final CacheStrategy cacheStrategy, BlacklistType blacklistType) {
+public Loader(final DigestURI url, final File cache, final int maxFileSize, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay) {
this.url = url;
this.cache = cache;
this.maxFileSize = maxFileSize;
this.cacheStrategy = cacheStrategy;
this.blacklistType = blacklistType;
+this.minDelay = minDelay;
}
@Override
@@ -438,7 +442,7 @@
if (this.cache != null && this.cache.exists()) return;
try {
// load from the net
-final Response response = load(request(this.url, false, true), this.cacheStrategy, this.maxFileSize, this.blacklistType);
+final Response response = load(request(this.url, false, true), this.cacheStrategy, this.maxFileSize, this.blacklistType, this.minDelay);
final byte[] b = response.getContent();
if (this.cache != null) FileUtils.copy(b, this.cache);
} catch (final MalformedURLException e) {} catch (final IOException e) {}
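
The net effect of the LoaderDispatcher changes above: the politeness budget is no longer baked into the dispatcher but chosen by each caller. Schematically, call sites now look like this (the signatures and constants are the ones introduced in this diff; the wrapper class and method names are illustrative assumptions, not YaCy code):

import java.io.IOException;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
import net.yacy.search.snippet.TextSnippet;
import de.anomic.crawler.CrawlQueues;

public final class DelayedFetchExamples {
    // crawler path: keep the polite 500 ms minimum gap per host
    public static Document crawlFetch(final DigestURI url) throws IOException {
        return Switchboard.getSwitchboard().loader.loadDocument(
                url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER,
                CrawlQueues.queuedMinLoadDelay);
    }

    // snippet path: 10 ms minimum gap, so result preparation is barely throttled
    public static Document snippetFetch(final DigestURI url) throws IOException {
        return Switchboard.getSwitchboard().loader.loadDocument(
                url, CacheStrategy.IFEXIST, BlacklistType.SEARCH,
                TextSnippet.snippetMinLoadDelay);
    }
}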

@@ -152,6 +152,7 @@ import net.yacy.search.query.SearchEvent;
import net.yacy.search.query.SearchEventCache;
import net.yacy.search.ranking.BlockRank;
import net.yacy.search.ranking.RankingProfile;
+import net.yacy.search.snippet.TextSnippet;
import com.google.common.io.Files;
@@ -2675,7 +2676,7 @@ public final class Switchboard extends serverSwitch
Thread.currentThread().setName("Switchboard.addToIndex:" + urls);
try {
final Response response =
-Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, BlacklistType.CRAWLER);
+Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
if ( response == null ) {
throw new IOException("response == null");
}
@@ -3076,7 +3077,7 @@ public final class Switchboard extends serverSwitch
final Map<MultiProtocolURI, String> links;
searchEvent.getRankingResult().oneFeederStarted();
try {
-links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH);
+links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
if ( links != null ) {
final Iterator<MultiProtocolURI> i = links.keySet().iterator();
while ( i.hasNext() ) {
@@ -3115,7 +3116,7 @@ public final class Switchboard extends serverSwitch
final Map<MultiProtocolURI, String> links;
DigestURI url;
try {
-links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH);
+links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
if (links != null) {
if (links.size() < 1000) { // limit to 1000 to skip large index pages
final Iterator<MultiProtocolURI> i = links.keySet().iterator();
@@ -3179,7 +3180,7 @@ public final class Switchboard extends serverSwitch
searchEvent.getRankingResult().oneFeederStarted();
try {
final Response response =
-sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH);
+sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
final byte[] resource = (response == null) ? null : response.getContent();
//System.out.println("BLEKKO: " + UTF8.String(resource));
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);

@@ -69,6 +69,7 @@ import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.Switchboard;
import net.yacy.search.query.RWIProcess;
import net.yacy.search.query.SearchEvent;
+import de.anomic.crawler.CrawlQueues;
import de.anomic.crawler.retrieval.Response;
public class Segment {
@@ -571,7 +572,7 @@ public class Segment {
try {
// parse the resource
-final Document document = Document.mergeDocuments(entry.url(), null, loader.loadDocuments(loader.request(entry.url(), true, false), cacheStrategy, Integer.MAX_VALUE, null));
+final Document document = Document.mergeDocuments(entry.url(), null, loader.loadDocuments(loader.request(entry.url(), true, false), cacheStrategy, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay));
if (document == null) {
// delete just the url entry
urlMetadata().remove(urlhash);

@@ -26,6 +26,7 @@
package net.yacy.search.query;
+import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
@@ -501,8 +502,13 @@
SolrDocument sd = null;
StringBuilder querystring = new StringBuilder(17);
querystring.append(SolrField.id.getSolrFieldName()).append(':').append('"').append(ASCII.String(page.hash())).append('"');
-final SolrDocumentList sdl = this.solr.query(querystring.toString(), 0, 1);
-if (!sdl.isEmpty()) {
+SolrDocumentList sdl = null;
+try {
+sdl = this.solr.query(querystring.toString(), 0, 1);
+} catch (IOException e) {
+Log.logException(e);
+}
+if (sdl != null && !sdl.isEmpty()) {
sd = sdl.get(0);
}
if (sd != null) {
@@ -537,9 +543,7 @@
Log.logWarning("SnippetProcess", "worker ended with timeout");
}
//System.out.println("FINISHED WORKER " + id + " FOR " + this.neededResults + " RESULTS, loops = " + loops);
-} catch (final Exception e) {
-Log.logException(e);
-}
+} catch (final Exception e) { Log.logException(e); }
//Log.logInfo("SEARCH", "resultWorker thread " + this.id + " terminated");
}

@@ -143,7 +143,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
Document document;
try {
-document = Document.mergeDocuments(url, null, Switchboard.getSwitchboard().loader.loadDocuments(Switchboard.getSwitchboard().loader.request(url, false, reindexing), cacheStrategy, Integer.MAX_VALUE, BlacklistType.SEARCH));
+document = Document.mergeDocuments(url, null, Switchboard.getSwitchboard().loader.loadDocuments(Switchboard.getSwitchboard().loader.request(url, false, reindexing), cacheStrategy, Integer.MAX_VALUE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay));
} catch (final IOException e) {
Log.logFine("snippet fetch", "load error: " + e.getMessage());
return new ArrayList<MediaSnippet>();

@@ -60,6 +60,7 @@ import de.anomic.crawler.retrieval.Response;
public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
+public static final long snippetMinLoadDelay = 10;
private static final int MAX_CACHE = 1000;
@@ -213,7 +214,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
final Request request = loader == null ? null : loader.request(url, true, reindexing);
Response response;
try {
-response = loader == null || request == null ? null : loader.load(request, CacheStrategy.CACHEONLY, BlacklistType.SEARCH);
+response = loader == null || request == null ? null : loader.load(request, CacheStrategy.CACHEONLY, BlacklistType.SEARCH, snippetMinLoadDelay);
} catch (IOException e1) {
response = null;
}
@@ -245,7 +246,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
// try to load the resource from the cache
Response response = null;
try {
-response = loader == null ? null : loader.load(loader.request(url, true, reindexing), (url.isFile() || url.isSMB() || cacheStrategy == null) ? CacheStrategy.NOCACHE : cacheStrategy, BlacklistType.SEARCH);
+response = loader == null ? null : loader.load(loader.request(url, true, reindexing), (url.isFile() || url.isSMB()) ? CacheStrategy.NOCACHE : (cacheStrategy == null ? CacheStrategy.CACHEONLY : cacheStrategy), BlacklistType.SEARCH, snippetMinLoadDelay);
} catch (IOException e) {
response = null;
}
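
One more behavioural detail hides in the last TextSnippet hunk: besides passing snippetMinLoadDelay, the fallback for a missing cache strategy changes from NOCACHE to CACHEONLY, so a snippet request without an explicit strategy is now answered from the local cache instead of triggering a network fetch. The new selection logic, unpacked into a standalone helper for readability (the helper class and method name are illustrative, not YaCy's):

import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.kelondro.data.meta.DigestURI;

final class SnippetCachePolicy {
    // Equivalent of the new ternary in TextSnippet: local file/SMB URLs bypass the
    // web cache entirely; a null strategy now means "cache only" (no network access)
    // where it previously meant "no cache" (always fetch).
    static CacheStrategy select(final DigestURI url, final CacheStrategy requested) {
        if (url.isFile() || url.isSMB()) return CacheStrategy.NOCACHE;
        return requested == null ? CacheStrategy.CACHEONLY : requested;
    }
}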
