* fix system update if URLs are in the blacklist (for example, for very general blacklists like *.de)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7375 6c8d7289-2bf4-0310-a012-ef5d649a1542
f1ori 14 years ago
parent 56264dcc17
commit 9d2159582f
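
The change threads a new checkBlacklist flag through the loader call chain: call sites that fetch YaCy's own resources (for example the dictionary downloads in DictionaryLoader_p and the targeted file download in LoaderDispatcher) now pass false, so a very general blacklist entry such as *.de can no longer block a system update, while the crawler queue (CrawlQueues) and servlet call sites such as ViewFile or Load_RSS_p keep passing true and still honour the blacklist. A minimal sketch of the idea, assuming simplified signatures and a hypothetical Blacklist helper (the real HTTPLoader.load takes a Request, a retry counter and a size limit as well):

    import java.io.IOException;

    // Sketch only: simplified signatures to illustrate the new checkBlacklist flag.
    public final class BlacklistAwareLoaderSketch {

        /** Hypothetical stand-in for Switchboard.urlBlacklist.isListed(BLACKLIST_CRAWLER, host, path). */
        public interface Blacklist {
            boolean isListed(String host, String path);
        }

        private final Blacklist blacklist;

        public BlacklistAwareLoaderSketch(final Blacklist blacklist) {
            this.blacklist = blacklist;
        }

        public byte[] load(final String host, final String path, final boolean checkBlacklist) throws IOException {
            // Only loads that ask for it are filtered: internal downloads
            // (system update, dictionary files) call this with checkBlacklist = false.
            if (checkBlacklist && this.blacklist.isListed(host.toLowerCase(), path)) {
                throw new IOException("rejecting URL '" + host + path + "': URL is in blacklist");
            }
            return fetch(host, path);
        }

        private byte[] fetch(final String host, final String path) {
            // placeholder for the actual HTTP download
            return new byte[0];
        }
    }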

@@ -63,7 +63,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geon0Load")) {
// load from the net
try {
- Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+ Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, false);
byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEON0.file());
LibraryProvider.geoLoc.addLocalization(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocalization(LibraryProvider.Dictionary.GEON0.file()));
@@ -103,7 +103,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geo1Load")) {
// load from the net
try {
- Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+ Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, false);
byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEODB1.file());
LibraryProvider.geoLoc.removeLocalization(LibraryProvider.Dictionary.GEODB0.nickname);

@@ -255,7 +255,7 @@ public class Load_RSS_p {
RSSReader rss = null;
if (url != null) try {
prop.put("url", url.toNormalform(true, false));
- Response response = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+ Response response = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, true);
byte[] resource = response == null ? null : response.getContent();
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
} catch (IOException e) {

@@ -169,7 +169,7 @@ public class ViewFile {
Response response = null;
try {
- response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CrawlProfile.CacheStrategy.IFEXIST : CrawlProfile.CacheStrategy.CACHEONLY, Long.MAX_VALUE);
+ response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CrawlProfile.CacheStrategy.IFEXIST : CrawlProfile.CacheStrategy.CACHEONLY, Long.MAX_VALUE, true);
} catch (IOException e) {
prop.put("error", "4");
prop.put("error_errorText", "error loading resource: " + e.getMessage());

@@ -186,7 +186,7 @@ public class get_treeview {
try {
final DigestURI u = new DigestURI(post.get(ROOT).substring(2));
Response response = null;
- response = sb.loader.load(sb.loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
+ response = sb.loader.load(sb.loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE, true);
final Document document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
if(document != null) {
if(isWordCount) {

@@ -92,7 +92,7 @@ public class import_ymark {
try {
if(!bmk.containsKey(YMarkTables.BOOKMARK.TAGS.key()) || bmk.get(YMarkTables.BOOKMARK.TAGS.key()).equals(YMarkTables.BOOKMARK.TAGS.deflt())) {
final DigestURI u = new DigestURI(bmk.get(YMarkTables.BOOKMARK.URL.key()));
- Response response = sb.loader.load(sb.loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
+ Response response = sb.loader.load(sb.loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE, true);
final Document document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
if(document != null) {
bmk.put(YMarkTables.BOOKMARK.TAGS.key(), sb.tables.bookmarks.autoTag(document, bmk_user, 3));

@@ -592,7 +592,7 @@ public class CrawlQueues {
final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
CrawlProfile e = mp == null ? null : new CrawlProfile(mp);
- Response response = sb.loader.load(request, e == null ? CrawlProfile.CacheStrategy.IFEXIST : e.cacheStrategy(), maxFileSize);
+ Response response = sb.loader.load(request, e == null ? CrawlProfile.CacheStrategy.IFEXIST : e.cacheStrategy(), maxFileSize, true);
if (response == null) {
request.setStatus("error", WorkflowJob.STATUS_FINISHED);
if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");

@@ -56,7 +56,7 @@ public class RSSLoader extends Thread {
public void run() {
RSSReader rss = null;
try {
- Response response = sb.loader.load(sb.loader.request(urlf, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+ Response response = sb.loader.load(sb.loader.request(urlf, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, true);
byte[] resource = response == null ? null : response.getContent();
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
} catch (MalformedURLException e) {

@@ -175,7 +175,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
public ZURL.Entry get(final byte[] urlhash) {
try {
if (urlIndex == null) return null;
- //System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " get " + urlhash);
+ // System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " get " + urlhash);
final Row.Entry entry = urlIndex.get(urlhash);
if (entry == null) return null;
return new Entry(entry);

@@ -68,14 +68,14 @@ public final class HTTPLoader {
this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000);
}
- public Response load(final Request entry, long maxFileSize) throws IOException {
+ public Response load(final Request entry, long maxFileSize, boolean checkBlacklist) throws IOException {
long start = System.currentTimeMillis();
- Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize);
+ Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, checkBlacklist);
Latency.update(entry.url(), System.currentTimeMillis() - start);
return doc;
}
- private Response load(final Request request, final int retryCount, final long maxFileSize) throws IOException {
+ private Response load(final Request request, final int retryCount, final long maxFileSize, final boolean checkBlacklist) throws IOException {
if (retryCount < 0) {
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded");
@@ -93,7 +93,7 @@ public final class HTTPLoader {
// check if url is in blacklist
final String hostlow = host.toLowerCase();
- if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
+ if (checkBlacklist && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "url in blacklist");
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
}
@@ -164,7 +164,7 @@ public final class HTTPLoader {
// retry crawling with new url
request.redirectURL(redirectionUrl);
- return load(request, retryCount - 1, maxFileSize);
+ return load(request, retryCount - 1, maxFileSize, checkBlacklist);
} else {
// no redirection url provided
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no redirection url provided");

@@ -458,7 +458,7 @@ public final class HTTPDFileHandler {
for (int i = 0; i < list.length; i++) {
f = new File(targetFile, list[i]);
if (f.isDirectory()) {
- aBuffer.append(" <li><a href=\"" + path + list[i] + "/\">" + list[i] + "/</a><br></li>\n");
+ aBuffer.append(" <li><a href=\"" + path + list[i] + "/\">" + list[i] + "/</a><br/></li>\n");
} else {
if (list[i].endsWith("html") || (list[i].endsWith("htm"))) {
scraper = ContentScraper.parseResource(f);
@@ -485,12 +485,12 @@ public final class HTTPDFileHandler {
size = (sz / 1024 / 1024) + " MB";
}
aBuffer.append(" <li>");
- if (headline != null && headline.length() > 0) aBuffer.append("<a href=\"" + list[i] + "\"><b>" + headline + "</b></a><br>");
- aBuffer.append("<a href=\"" + path + list[i] + "\">" + list[i] + "</a><br>");
- if (author != null && author.length() > 0) aBuffer.append("Author: " + author + "<br>");
- if (publisher != null && publisher.length() > 0) aBuffer.append("Publisher: " + publisher + "<br>");
- if (description != null && description.length() > 0) aBuffer.append("Description: " + description + "<br>");
- aBuffer.append(DateFormatter.formatShortDay(new Date(f.lastModified())) + ", " + size + ((images > 0) ? ", " + images + " images" : "") + ((links > 0) ? ", " + links + " links" : "") + "<br></li>\n");
+ if (headline != null && headline.length() > 0) aBuffer.append("<a href=\"" + list[i] + "\"><b>" + headline + "</b></a><br/>");
+ aBuffer.append("<a href=\"" + path + list[i] + "\">" + list[i] + "</a><br/>");
+ if (author != null && author.length() > 0) aBuffer.append("Author: " + author + "<br/>");
+ if (publisher != null && publisher.length() > 0) aBuffer.append("Publisher: " + publisher + "<br/>");
+ if (description != null && description.length() > 0) aBuffer.append("Description: " + description + "<br/>");
+ aBuffer.append(DateFormatter.formatShortDay(new Date(f.lastModified())) + ", " + size + ((images > 0) ? ", " + images + " images" : "") + ((links > 0) ? ", " + links + " links" : "") + "<br/></li>\n");
}
}
aBuffer.append(" </ul>\n</body>\n</html>\n");

@@ -2015,7 +2015,7 @@ public final class Switchboard extends serverSwitch {
@Override
public void run() {
try {
- final Response response = loader.load(request, CacheStrategy.IFFRESH, Long.MAX_VALUE);
+ final Response response = loader.load(request, CacheStrategy.IFFRESH, Long.MAX_VALUE, true);
if (response == null) throw new IOException("response == null");
if (response.getContent() == null) throw new IOException("content == null");
if (response.getResponseHeader() == null) throw new IOException("header == null");
@@ -2364,7 +2364,7 @@ public final class Switchboard extends serverSwitch {
// if we have an url then try to load the rss
RSSReader rss = null;
try {
- Response response = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+ Response response = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, true);
byte[] resource = response == null ? null : response.getContent();
//System.out.println("BLEKKO: " + new String(resource));
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);

@@ -176,7 +176,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
return;
} else {
// try to load the resource from the cache
- response = loader.load(loader.request(url, true, reindexing), noCacheUsage ? CrawlProfile.CacheStrategy.NOCACHE : cacheStrategy, Long.MAX_VALUE);
+ response = loader.load(loader.request(url, true, reindexing), noCacheUsage ? CrawlProfile.CacheStrategy.NOCACHE : cacheStrategy, Long.MAX_VALUE, true);
if (response == null) {
// in case that we did not get any result we can still return a success when we are not allowed to go online
if (cacheStrategy.mustBeOffline()) {

@@ -112,7 +112,7 @@ public class OSMTile {
// download resource using the crawler and keep resource in memory if possible
Response entry = null;
try {
- entry = Switchboard.getSwitchboard().loader.load(Switchboard.getSwitchboard().loader.request(tileURL, false, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
+ entry = Switchboard.getSwitchboard().loader.load(Switchboard.getSwitchboard().loader.request(tileURL, false, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE, true);
} catch (IOException e) {
Log.logWarning("OSMTile", "cannot load: " + e.getMessage());
return null;

@@ -81,7 +81,7 @@ public class OAIListFriendsLoader {
Map<String, String> m;
for (Map.Entry<String, File> oaiFriend: listFriends.entrySet()) try {
if (!oaiFriend.getValue().exists()) {
- Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+ Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, true);
if (response != null) FileUtils.copy(response.getContent(), oaiFriend.getValue());
}

@@ -48,7 +48,7 @@ public class OAIPMHLoader {
this.source = source;
// load the file from the net
- Response response = loader.load(loader.request(source, false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+ Response response = loader.load(loader.request(source, false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, true);
byte[] b = response.getContent();
this.resumptionToken = new ResumptionToken(source, b);
//System.out.println("*** ResumptionToken = " + this.resumptionToken.toString());

@@ -135,7 +135,7 @@ public final class LoaderDispatcher {
public void load(final DigestURI url, CrawlProfile.CacheStrategy cacheStratgy, long maxFileSize, File targetFile) throws IOException {
- byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize).getContent();
+ byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, false).getContent();
if (b == null) throw new IOException("load == null");
File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
@@ -146,7 +146,7 @@ public final class LoaderDispatcher {
tmp.renameTo(targetFile);
}
- public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
+ public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize, boolean checkBlacklist) throws IOException {
String url = request.url().toNormalform(true, false);
Semaphore check = this.loaderSteering.get(url);
if (check != null) {
@@ -158,7 +158,7 @@ public final class LoaderDispatcher {
try {
this.loaderSteering.put(url, new Semaphore(0));
- Response response = loadInternal(request, cacheStrategy, maxFileSize);
+ Response response = loadInternal(request, cacheStrategy, maxFileSize, checkBlacklist);
check = this.loaderSteering.remove(url);
if (check != null) check.release(1000);
return response;
@@ -177,7 +177,7 @@ public final class LoaderDispatcher {
* @return the loaded entity in a Response object
* @throws IOException
*/
- private Response loadInternal(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
+ private Response loadInternal(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize, boolean checkBlacklist) throws IOException {
// get the protocol of the next URL
final DigestURI url = request.url();
if (url.isFile() || url.isSMB()) cacheStrategy = CrawlProfile.CacheStrategy.NOCACHE; // load just from the file system
@@ -261,7 +261,7 @@ public final class LoaderDispatcher {
// load resource from the internet
Response response = null;
- if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, maxFileSize);
+ if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, maxFileSize, checkBlacklist);
if (protocol.equals("ftp")) response = ftpLoader.load(request, true);
if (protocol.equals("smb")) response = smbLoader.load(request, true);
if (protocol.equals("file")) response = fileLoader.load(request, true);
@@ -300,7 +300,7 @@ public final class LoaderDispatcher {
public byte[] loadContent(final Request request, CrawlProfile.CacheStrategy cacheStrategy) throws IOException {
// try to download the resource using the loader
final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
- final Response entry = load(request, cacheStrategy, maxFileSize);
+ final Response entry = load(request, cacheStrategy, maxFileSize, false);
if (entry == null) return null; // not found in web
// read resource body (if it is there)
@@ -310,7 +310,7 @@ public final class LoaderDispatcher {
public Document[] loadDocuments(final Request request, final CrawlProfile.CacheStrategy cacheStrategy, final int timeout, long maxFileSize) throws IOException, Parser.Failure {
// load resource
- final Response response = load(request, cacheStrategy, maxFileSize);
+ final Response response = load(request, cacheStrategy, maxFileSize, false);
final DigestURI url = request.url();
if (response == null) throw new IOException("no Response for url " + url);
@@ -324,7 +324,7 @@ public final class LoaderDispatcher {
public ContentScraper parseResource(final DigestURI location, CrawlProfile.CacheStrategy cachePolicy) throws IOException {
// load page
final long maxFileSize = this.sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
- Response r = this.load(request(location, true, false), cachePolicy, maxFileSize);
+ Response r = this.load(request(location, true, false), cachePolicy, maxFileSize, false);
byte[] page = (r == null) ? null : r.getContent();
if (page == null) throw new IOException("no response from url " + location.toString());
@@ -343,7 +343,7 @@ public final class LoaderDispatcher {
* @throws IOException
*/
public final Map<MultiProtocolURI, String> loadLinks(DigestURI url, CrawlProfile.CacheStrategy cacheStrategy) throws IOException {
- Response response = load(request(url, true, false), cacheStrategy, Long.MAX_VALUE);
+ Response response = load(request(url, true, false), cacheStrategy, Long.MAX_VALUE, false);
if (response == null) throw new IOException("response == null");
ResponseHeader responseHeader = response.getResponseHeader();
byte[] resource = response.getContent();
@@ -401,7 +401,7 @@ public final class LoaderDispatcher {
if (this.cache != null && this.cache.exists()) return;
try {
// load from the net
- Response response = load(request(new DigestURI(this.url), false, true), this.cacheStrategy, this.maxFileSize);
+ Response response = load(request(new DigestURI(this.url), false, true), this.cacheStrategy, this.maxFileSize, true);
byte[] b = response.getContent();
if (this.cache != null) FileUtils.copy(b, this.cache);
} catch (MalformedURLException e) {} catch (IOException e) {}
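
Seen from a call site, the choice of the flag is now explicit. Under the same assumptions (DigestURI, CrawlProfile.CacheStrategy and the argument order are taken from the diff above; updateUrl and feedUrl are made-up DigestURI variables):

    // internal download, e.g. an update or dictionary file: bypass the blacklist
    Response internal = sb.loader.load(sb.loader.request(updateUrl, false, true),
            CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, false);

    // crawler- or user-triggered fetch: keep honouring the blacklist
    Response crawled = sb.loader.load(sb.loader.request(feedUrl, true, false),
            CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE, true);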
