Merge commit '6d4e08ed06c5cd28c45981b2ebe31c7f7ec6fd83' into quix0r

Conflicts:
	source/de/anomic/crawler/CrawlQueues.java
pull/1/head
Michael Christen 13 years ago
commit 216a287a85

@ -706,5 +706,4 @@ public class CrawlQueues {
}
}
}

@ -157,7 +157,16 @@ public final class Records {
* @throws IOException
*/
/**
 * Computes the number of records currently stored in the backing file,
 * derived as file length divided by the fixed record size.
 *
 * @return the record count, or 0 if the file handle is already gone
 * @throws IOException if the file length cannot be read
 */
private final long filesize() throws IOException {
    // NOTE: a stale pre-image diff line ("return raf.length() / recordsize;")
    // preceded this body and made it unreachable; it has been removed.
    long records = 0;
    try {
        records = raf.length() / recordsize;
    } catch (NullPointerException e) {
        // This may happen on shutdown while still something is moving on;
        // treat a vanished file handle as "no records" instead of crashing.
        Log.logException(e);
    }
    return records;
}
/**

@ -29,7 +29,6 @@ import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
@ -38,6 +37,8 @@ import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
@ -91,22 +92,22 @@ public class Blacklist {
}));
public static final String BLACKLIST_TYPES_STRING = "proxy,crawler,dht,search,surftips,news";
// Root directory where blacklist files are stored.
private File blacklistRootPath = null;
// Concurrent maps so readers and writers from multiple threads are safe without external locking.
// (The stale pre-image HashMap-typed declarations from the diff have been removed —
// declaring the same final fields twice does not compile.)
private final ConcurrentMap<String, HandleSet> cachedUrlHashs;
private final ConcurrentMap<String, ConcurrentMap<String, List<String>>> hostpaths_matchable; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here
private final ConcurrentMap<String, ConcurrentMap<String, List<String>>> hostpaths_notmatchable; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here
/**
 * Creates a blacklist rooted at the given directory and initializes one
 * (empty) entry container per known blacklist type.
 *
 * @param rootPath directory holding the blacklist files
 */
public Blacklist(final File rootPath) {
    setRootPath(rootPath);
    // prepare the data structure
    // (the stale pre-image HashMap assignments from the diff have been removed —
    // a final field may only be assigned once; the value generics are fixed to
    // ConcurrentMap so they match the field declarations)
    this.hostpaths_matchable = new ConcurrentHashMap<String, ConcurrentMap<String, List<String>>>();
    this.hostpaths_notmatchable = new ConcurrentHashMap<String, ConcurrentMap<String, List<String>>>();
    this.cachedUrlHashs = new ConcurrentHashMap<String, HandleSet>();
    for (final String blacklistType : BLACKLIST_TYPES) {
        this.hostpaths_matchable.put(blacklistType, new ConcurrentHashMap<String, List<String>>());
        this.hostpaths_notmatchable.put(blacklistType, new ConcurrentHashMap<String, List<String>>());
        this.cachedUrlHashs.put(blacklistType, new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0));
    }
}

@ -64,6 +64,7 @@ import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.crawler.retrieval.SMBLoader;
import de.anomic.crawler.ZURL.FailCategory;
import de.anomic.http.client.Cache;
public final class LoaderDispatcher {
@ -137,7 +138,7 @@ public final class LoaderDispatcher {
public void load(final DigestURI url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile) throws IOException {
final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, false).getContent();
final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, true).getContent();
if (b == null) throw new IOException("load == null");
final File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
@ -190,6 +191,12 @@ public final class LoaderDispatcher {
final String protocol = url.getProtocol();
final String host = url.getHost();
// check if url is in blacklist
if (checkBlacklist && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, host.toLowerCase(), url.getFile())) {
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
}
// check if we have the page in the cache
final CrawlProfile crawlProfile = this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
if (crawlProfile != null && cacheStrategy != CacheStrategy.NOCACHE) {
@ -324,7 +331,7 @@ public final class LoaderDispatcher {
*/
public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy) throws IOException {
// try to download the resource using the loader
final Response entry = load(request, cacheStrategy, false);
final Response entry = load(request, cacheStrategy, true);
if (entry == null) return null; // not found in web
// read resource body (if it is there)
@ -334,7 +341,7 @@ public final class LoaderDispatcher {
public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int timeout, final int maxFileSize) throws IOException, Parser.Failure {
// load resource
final Response response = load(request, cacheStrategy, maxFileSize, false);
final Response response = load(request, cacheStrategy, maxFileSize, true);
final DigestURI url = request.url();
if (response == null) throw new IOException("no Response for url " + url);
@ -347,7 +354,7 @@ public final class LoaderDispatcher {
public ContentScraper parseResource(final DigestURI location, final CacheStrategy cachePolicy) throws IOException {
// load page
final Response r = this.load(request(location, true, false), cachePolicy, false);
final Response r = this.load(request(location, true, false), cachePolicy, true);
final byte[] page = (r == null) ? null : r.getContent();
if (page == null) throw new IOException("no response from url " + location.toString());
@ -366,7 +373,7 @@ public final class LoaderDispatcher {
* @throws IOException
*/
public final Map<MultiProtocolURI, String> loadLinks(final DigestURI url, final CacheStrategy cacheStrategy) throws IOException {
final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, false);
final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, true);
if (response == null) throw new IOException("response == null");
final ResponseHeader responseHeader = response.getResponseHeader();
if (response.getContent() == null) throw new IOException("resource == null");

@ -27,6 +27,7 @@ package net.yacy.search.snippet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@ -48,7 +49,10 @@ import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.util.ByteArray;
import net.yacy.repository.Blacklist;
import net.yacy.search.Switchboard;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.ZURL.FailCategory;
public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
@ -165,6 +169,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
entry = i.next();
url = new DigestURI(entry.getKey());
desc = entry.getValue();
if (isUrlBlacklisted(url, Blacklist.BLACKLIST_SEARCH)) continue;
final int ranking = removeAppearanceHashes(url.toNormalform(false, false), queryhashes).size() +
removeAppearanceHashes(desc, queryhashes).size();
if (ranking < 2 * queryhashes.size()) {
@ -189,6 +194,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
ientry = i.next();
url = new DigestURI(ientry.url());
final String u = url.toString();
if (isUrlBlacklisted(url, Blacklist.BLACKLIST_SEARCH)) continue;
if (u.indexOf(".ico",0) >= 0 || u.indexOf("favicon",0) >= 0) continue;
if (ientry.height() > 0 && ientry.height() < 32) continue;
if (ientry.width() > 0 && ientry.width() < 32) continue;
@ -230,4 +236,27 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
return remaininghashes;
}
/**
 * Checks whether the given URL appears in the blacklist of the given type.
 * A blacklisted URL is also pushed onto the error-URL queue and logged.
 *
 * @param url the URL to check
 * @param blacklistType type of blacklist (see class Blacklist, BLACKLIST_FOO)
 * @return true if the given URL is blacklisted, false otherwise
 */
private static boolean isUrlBlacklisted (DigestURI url, String blacklistType) {
    // not listed -> not blacklisted; nothing else to do
    if (!Switchboard.urlBlacklist.isListed(blacklistType, url.getHost().toLowerCase(), url.getFile())) {
        return false;
    }
    // record the rejection in the error queue and log it
    Switchboard.getSwitchboard().crawlQueues.errorURL.push(new Request(url, null), Switchboard.getSwitchboard().peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
    Log.logFine("snippet fetch", "MEDIA-SNIPPET Rejecting URL '" + url.toString() + "'. URL is in blacklist.");
    return true;
}
}

Loading…
Cancel
Save