Michael Peter Christen 4 years ago
commit 3078b74e1d

@@ -133,7 +133,7 @@ document.getElementById("apilink").setAttribute("href", "Network.xml?" + window.
 #{list}#
 <tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
 <td >#[hash]#</td>
-<td #(special)#::class="TableCellActive"#(/special)#>#[shortname]##(ssl)#::<a href="https://#[ip]#:#[portssl]#/Network.html?page=1" class="forceNoExternalIcon" target="_blank"><img src="env/grafics/lock.gif" width="11" height="15" title="https supported" alt="https supported" /></a>#(/ssl)#</td>
+<td #(special)#::class="TableCellActive"#(/special)#>#[shortname]##(ssl)#::<a href="https://#[ip]#:#[portssl]#/Network.html?page=1" class="forceNoExternalIcon" target="_blank"><img src="env/grafics/lockclose.png" width="11" height="15" title="https supported" alt="https supported" /></a>#(/ssl)#</td>
 <td nowrap>
 #(type)##(direct)#<img src="env/grafics/JuniorPassive.gif" width="11" height="11" title="Type: Junior | Contact: passive" alt="Junior passive" />::<img src="env/grafics/JuniorDirect.gif" width="11" height="11" title="Type: Junior | Contact: direct" alt="Junior direct" />::<img src="env/grafics/JuniorOffline.gif" width="11" height="11" title="Type: Junior | Contact: offline" alt="Junior offline" />#(/direct)#::#(direct)#<img src="env/grafics/SeniorPassive.gif" width="11" height="11" title="Type: Senior | Contact: passive" alt="senior passive" />::<img src="env/grafics/SeniorDirect.gif" width="11" height="11" title="Type: Senior | Contact: direct" alt="Senior direct" />::<img src="env/grafics/SeniorOffline.gif" width="11" height="11" title="Type: Senior | Contact: offline" alt="Senior offline" />#(/direct)#::<a href="#[url]#" class="forceNoExternalIcon">#(direct)#<img src="env/grafics/PrincipalPassive.gif" width="11" height="11" title="Type: Principal | Contact: passive | Seed download: possible" alt="Principal passive" />::<img src="env/grafics/PrincipalDirect.gif" width="11" height="11" title="Type: Principal | Contact: direct | Seed download: possible" alt="Principal active" />::<img src="env/grafics/PrincipalOffline.gif" width="11" height="11" title="Type: Principal | Contact: offline | Seed download: ?" alt="Principal offline" />#(/direct)#</a>#(/type)##(acceptcrawl)#<img src="env/grafics/CrawlNo.gif" width="11" height="11" title="Accept Crawl: no" alt="no crawl" />::<img src="env/grafics/CrawlYes.gif" width="11" height="11" title="Accept Crawl: yes" alt="crawl possible" />::<img src="env/grafics/CrawlYesOffline.gif" width="11" height="11" title="Accept Crawl: yes" alt="crawl possible" />#(/acceptcrawl)##(dhtreceive)#<img src="env/grafics/DHTReceiveNo.gif" width="11" height="11" title="DHT Receive: no; #[peertags]#" alt="no DHT receive" />::<img src="env/grafics/DHTReceiveYes.gif" width="11" height="11" title="DHT Receive: yes" alt="DHT receive enabled" />::<img src="env/grafics/DHTReceiveYesOffline.gif" width="11" height="11" title="DHT Receive: yes" alt="DHT receive enabled" />#(/dhtreceive)##{ips}#<a href="#(c)#http://#(ipv6)#::[#(/ipv6)##[ip]##(ipv6)#::]#(/ipv6)#:#[port]#/Network.html?page=1::Network.html?page=4&amp;peerHash=#[hash]#&amp;peerIP=#[ip]#&amp;peerPort=#[port]#&amp;addPeer=add+Peer#(/c)#"#(c)#:: target="_blank"#(/c)#class="forceNoExternalIcon"><img src="env/grafics/#(nodestate)#NodeDisqualified::NodeQualified#(/nodestate)##(c)##(ipv6)#IPv4::IPv6#(/ipv6)#::#(ipv6)#IPv4::IPv6#(/ipv6)##(/c)#.gif" width="11" height="11" title="#(c)##(ipv6)#IPv4::IPv6#(/ipv6)#::#(ipv6)#IPv4::IPv6#(/ipv6)##(/c)# #(nodestate)#Peer::Node Peer#(/nodestate)#" /></a>#{/ips}#</td>
 <td align="right">#[version]#</td>

@@ -249,7 +249,7 @@ document.getElementById("apilink").setAttribute("href", "Network.xml?" + window.
 <td>QPH<br/>(remote)</td>
 </tr>
 <tr class="TableCellLight">
-<td>#[my-name]##(my-ssl)#::<img src="env/grafics/lock.gif" width="11" height="15" title="https supported" alt="https supported" />#(/my-ssl)#</td>
+<td>#[my-name]##(my-ssl)#::<img src="env/grafics/lockclose.png" width="11" height="15" title="https supported" alt="https supported" />#(/my-ssl)#</td>
 <td nowrap>#(my-info)#<img src="env/grafics/Virgin.gif" width="11" height="11" title="Type: Virgin" alt="Virgin" />::<img src="env/grafics/JuniorDirect.gif" width="11" height="11" title="Type: Junior" alt="Junior" />::<img src="env/grafics/SeniorDirect.gif" width="11" height="11" title="Type: Senior" alt="Senior" />::<img src="env/grafics/PrincipalDirect.gif" width="11" height="11" title="Type: Principal" alt="Principal" />#(/my-info)##(my-acceptcrawl)#<img src="env/grafics/CrawlNo.gif" width="11" height="11" title="Accept Crawl: no" alt="no crawl" />::<img src="env/grafics/CrawlYes.gif" width="11" height="11" title="Accept Crawl: yes" alt="Crawl enabled" />#(/my-acceptcrawl)##(my-dhtreceive)#<img src="env/grafics/DHTReceiveNo.gif" width="11" height="11" title="DHT Receive: no" alt="no DHT receive" />::<img src="env/grafics/DHTReceiveYes.gif" width="11" height="11" title="DHT Receive: yes" alt="DHT Receive enabled" />#(/my-dhtreceive)##{ips}#<a href="#(c)#http://#(ipv6)#::[#(/ipv6)##[ip]##(ipv6)#::]#(/ipv6)#:#[port]#/Network.html?page=1::Network.html?page=4&amp;peerHash=#[hash]#&amp;peerIP=#[ip]#&amp;peerPort=#[port]#&amp;addPeer=add+Peer#(/c)#"#(c)#:: target="_blank"#(/c)#class="forceNoExternalIcon"><img src="env/grafics/#(nodestate)#NodeDisqualified::NodeQualified#(/nodestate)##(c)##(ipv6)#IPv4::IPv6#(/ipv6)#::#(ipv6)#IPv4::IPv6#(/ipv6)##(/c)#.gif" width="11" height="11" title="#(c)##(ipv6)#IPv4::IPv6#(/ipv6)#::#(ipv6)#IPv4::IPv6#(/ipv6)##(/c)# #(nodestate)#Peer::Node Peer#(/nodestate)#" /></a>#{/ips}#</td>
 <td align="right">#[my-version]#</td>
 <td align="right">#[my-utc]#</td>

@@ -134,7 +134,7 @@
 You can download a more recent version of YaCy. Click here to install this update and restart YaCy:
 <form action="Status.html" method="get" class="PeerControl" accept-charset="UTF-8">
 <button type="submit" name="aquirerelease" class="btn btn-primary" value="Update YaCy">
-<img src="env/grafics/lock.gif" alt="lock icon"/>
+<img src="env/grafics/lockclose.png" alt="lock icon"/>
 Install YaCy v#[latestVersion]#
 </button>
 </form>

@@ -57,12 +57,12 @@
 <form action="Surftips.html" method="get" class="PeerControl" accept-charset="UTF-8"><div>
 #(publicSurftips)#
 <button type="submit" name="publicPage" class="btn btn-primary" value="1">
-<img src="env/grafics/lock.gif" alt="authentication required" />
+<img src="env/grafics/lockclose.png" alt="authentication required" />
 Show surftips to everyone
 </button>
 ::
 <button type="submit" name="publicPage" class="btn btn-primary" value="0">
-<img src="env/grafics/lock.gif" alt="authentication required" />
+<img src="env/grafics/lockclose.png" alt="authentication required" />
 Hide surftips for users without autorization
 </button>
 #(/publicSurftips)#

@ -65,7 +65,7 @@ public class HostBalancer implements Balancer {
private final static ConcurrentLog log = new ConcurrentLog("HostBalancer"); private final static ConcurrentLog log = new ConcurrentLog("HostBalancer");
public final static HandleMap depthCache = new RowHandleMap(Word.commonHashLength, Word.commonHashOrder, 2, 8 * 1024 * 1024, "HostBalancer.DepthCache"); public final static HandleMap depthCache = new RowHandleMap(Word.commonHashLength, Word.commonHashOrder, 2, 8 * 1024 * 1024, "HostBalancer.DepthCache");
private final File hostsPath; private final File hostsPath;
private final boolean exceed134217727; private final boolean exceed134217727;
private final Map<String, HostQueue> queues; private final Map<String, HostQueue> queues;
@ -84,7 +84,7 @@ public class HostBalancer implements Balancer {
final boolean exceed134217727) { final boolean exceed134217727) {
this(hostsPath, onDemandLimit, exceed134217727, true); this(hostsPath, onDemandLimit, exceed134217727, true);
} }
/** /**
* Create a new instance and fills the queue by scanning the hostsPath directory. * Create a new instance and fills the queue by scanning the hostsPath directory.
* @param hostsPath * @param hostsPath
@ -100,7 +100,7 @@ public class HostBalancer implements Balancer {
this.hostsPath = hostsPath; this.hostsPath = hostsPath;
this.onDemandLimit = onDemandLimit; this.onDemandLimit = onDemandLimit;
this.exceed134217727 = exceed134217727; this.exceed134217727 = exceed134217727;
// create a stack for newly entered entries // create a stack for newly entered entries
if (!(hostsPath.exists())) hostsPath.mkdirs(); // make the path if (!(hostsPath.exists())) hostsPath.mkdirs(); // make the path
this.queues = new ConcurrentHashMap<String, HostQueue>(); this.queues = new ConcurrentHashMap<String, HostQueue>();
@ -114,7 +114,7 @@ public class HostBalancer implements Balancer {
* return immediately (as large unfinished crawls may take longer to load) * return immediately (as large unfinished crawls may take longer to load)
*/ */
private void init(final boolean async) { private void init(final boolean async) {
if(async) { if(async) {
Thread t = new Thread("HostBalancer.init") { Thread t = new Thread("HostBalancer.init") {
@Override @Override
public void run() { public void run() {
@ -122,10 +122,10 @@ public class HostBalancer implements Balancer {
} }
}; };
t.start(); t.start();
} else { } else {
runInit(); runInit();
} }
} }
/** /**
@ -185,7 +185,7 @@ public class HostBalancer implements Balancer {
} }
return c; return c;
} }
/** /**
* delete all urls which are stored for given host hashes * delete all urls which are stored for given host hashes
* @param hosthashes * @param hosthashes
@ -230,11 +230,11 @@ public class HostBalancer implements Balancer {
return c; return c;
} }
/** /**
* @return true when the URL is queued is this or any other HostBalancer * @return true when the URL is queued is this or any other HostBalancer
* instance (as {@link #depthCache} is shared between all HostBalancer * instance (as {@link #depthCache} is shared between all HostBalancer
* instances) * instances)
*/ */
@Override @Override
public boolean has(final byte[] urlhashb) { public boolean has(final byte[] urlhashb) {
if (depthCache.has(urlhashb)) return true; if (depthCache.has(urlhashb)) return true;
@ -313,7 +313,7 @@ public class HostBalancer implements Balancer {
tryagain: while (true) try { tryagain: while (true) try {
HostQueue rhq = null; HostQueue rhq = null;
String rhh = null; String rhh = null;
synchronized (this) { synchronized (this) {
if (this.roundRobinHostHashes.size() == 0) { if (this.roundRobinHostHashes.size() == 0) {
// refresh the round-robin cache // refresh the round-robin cache
@@ -331,14 +331,21 @@ public class HostBalancer implements Balancer {
                     if (size <= 10) {smallStacksExist = true; break smallsearch;}
                 }
             }
-            if (singletonStacksExist || smallStacksExist) {
+            Set<String> freshhosts = new HashSet<>();
             Iterator<String> i = this.roundRobinHostHashes.iterator();
             smallstacks: while (i.hasNext()) {
                 if (this.roundRobinHostHashes.size() <= 10) break smallstacks; // don't shrink the hosts until nothing is left
-                String s = i.next();
-                HostQueue hq = this.queues.get(s);
+                String hosthash = i.next();
+                HostQueue hq = this.queues.get(hosthash);
                 if (hq == null) {i.remove(); continue smallstacks;}
-                int delta = Latency.waitingRemainingGuessed(hq.getHost(), hq.getPort(), s, robots, ClientIdentification.yacyInternetCrawlerAgent);
+                int delta = Latency.waitingRemainingGuessed(hq.getHost(), hq.getPort(), hosthash, robots, ClientIdentification.yacyInternetCrawlerAgent);
+                if (delta == Integer.MIN_VALUE) {
+                    // never-crawled hosts; we do not want to have too many of them in here. Loading new hosts means: waiting for robots.txt to load
+                    freshhosts.add(hosthash);
+                    i.remove();
+                    continue smallstacks;
+                }
+                if (singletonStacksExist || smallStacksExist) {
                 if (delta < 0) continue; // keep all non-waiting stacks; they are useful to speed up things
                 // to protect all small stacks which have a fast throughput, remove all with long waiting time
                 if (delta >= 1000) {i.remove(); continue smallstacks;}
@@ -350,6 +357,10 @@ public class HostBalancer implements Balancer {
                     }
                 }
             }
+            // put at least one of the fresh hosts back
+            if (freshhosts.size() > 0) this.roundRobinHostHashes.add(freshhosts.iterator().next());
+
+            // result
             if (this.roundRobinHostHashes.size() == 1) {
                 if (log.isFine()) log.fine("(re-)initialized the round-robin queue with one host");
             } else {
@@ -357,13 +368,13 @@ public class HostBalancer implements Balancer {
             }
         }
         if (this.roundRobinHostHashes.size() == 0) return null;

         // if the queue size is 1, just take that
         if (this.roundRobinHostHashes.size() == 1) {
             rhh = this.roundRobinHostHashes.iterator().next();
             rhq = this.queues.get(rhh);
         }

         if (rhq == null) {
             // mixed minimum sleep time / largest queue strategy:
             // create a map of sleep time / queue relations with a fuzzy sleep time (ms / 500).

@@ -449,7 +460,7 @@ public class HostBalancer implements Balancer {
             }
             */
         }

         if (rhq == null) {
             this.roundRobinHostHashes.clear(); // force re-initialization
             continue tryagain;

@@ -458,7 +469,7 @@ public class HostBalancer implements Balancer {
         long timestamp = System.currentTimeMillis();
         Request request = rhq.pop(delay, cs, robots); // this pop is outside of synchronization to prevent blocking of pushes
         long actualwaiting = System.currentTimeMillis() - timestamp;

         if (actualwaiting > 1000) {
             synchronized (this) {
                 // to prevent that this occurs again, remove all stacks with positive delay times (which may be less after that waiting)

@@ -473,7 +484,7 @@ public class HostBalancer implements Balancer {
                 }
             }
         }

         if (rhq.isEmpty()) {
             synchronized (this) {
                 this.queues.remove(rhh);

@@ -545,7 +556,7 @@ public class HostBalancer implements Balancer {
     @Override
     public List<Request> getDomainStackReferences(String host, int maxcount, long maxtime) {
         if (host == null) {
             return Collections.emptyList();
         }

         try {
             HostQueue hq = this.queues.get(DigestURL.hosthash(host, host.startsWith("ftp.") ? 21 : 80));
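The HostBalancer change above alters how the round-robin host cache is refilled: a return value of Integer.MIN_VALUE from Latency.waitingRemainingGuessed() marks a host that has never been crawled, and selecting such a host first costs a robots.txt download. The new code therefore parks those hosts in a freshhosts set and re-admits only one of them per refill, so a flood of unknown hosts cannot stall the balancer on robots.txt fetches. The following standalone sketch illustrates that policy under simplified assumptions; RoundRobinRefillSketch, refill() and the knownHosts lookup are hypothetical stand-ins, not YaCy code, which decides freshness via the latency estimate instead.

import java.util.HashSet;
import java.util.Set;

public class RoundRobinRefillSketch {

    /** Keep known hosts in the rotation, defer never-crawled ones, re-admit a single fresh host. */
    static Set<String> refill(Set<String> candidates, Set<String> knownHosts) {
        Set<String> rotation = new HashSet<>();
        Set<String> freshhosts = new HashSet<>();
        for (String host : candidates) {
            if (knownHosts.contains(host)) {
                rotation.add(host);      // crawl history exists: no robots.txt wait expected
            } else {
                freshhosts.add(host);    // never crawled: selecting it would first block on robots.txt
            }
        }
        // put at least one of the fresh hosts back so new sites still enter the rotation
        if (!freshhosts.isEmpty()) rotation.add(freshhosts.iterator().next());
        return rotation;
    }

    public static void main(String[] args) {
        Set<String> candidates = Set.of("a.example", "b.example", "c.example");
        System.out.println(refill(candidates, Set.of("a.example"))); // a.example plus exactly one fresh host
    }
}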

@@ -85,8 +85,8 @@ public final class HTTPLoader {
         Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start);
         return doc;
     }

     /**
      * Open an input stream on a requested HTTP resource. When the resource content size is small
      * (lower than {@link Response#CRAWLER_MAX_SIZE_TO_CACHE}, fully load it and use a ByteArrayInputStream instance.
      * @param request
@@ -98,228 +98,231 @@ public final class HTTPLoader {
      * @return a response with full meta data and embedding on open input stream on content. Don't forget to close the stream.
      * @throws IOException when an error occurred
      */
     public StreamResponse openInputStream(
             final Request request, CrawlProfile profile, final int retryCount,
             final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent
     ) throws IOException {
         if (retryCount < 0) {
             this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
             throw new IOException( "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
         }
         DigestURL url = request.url();

         final String host = url.getHost();
         if (host == null || host.length() < 2) {
             throw new IOException("host is not well-formed: '" + host + "'");
         }
         final String path = url.getFile();
         int port = url.getPort();
         final boolean ssl = url.getProtocol().equals("https");
         if (port < 0)
             port = (ssl) ? 443 : 80;

         // check if url is in blacklist
         final String hostlow = host.toLowerCase(Locale.ROOT);
         if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
             this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
             throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
         }

         // resolve yacy and yacyh domains
         final AlternativeDomainNames yacyResolver = this.sb.peers;
         if (yacyResolver != null) {
             final String yAddress = yacyResolver.resolve(host);
             if (yAddress != null) {
                 url = new DigestURL(url.getProtocol() + "://" + yAddress + path);
             }
         }

         // create a request header
         final RequestHeader requestHeader = createRequestheader(request, agent);

         // HTTP-Client
         final HTTPClient client = new HTTPClient(agent);
         client.setRedirecting(false); // we want to handle redirection
                                       // ourselves, so we don't index pages
                                       // twice
         client.setTimout(this.socketTimeout);
         client.setHeader(requestHeader.entrySet());

         // send request
         client.GET(url, false);
         final StatusLine statusline = client.getHttpResponse().getStatusLine();
         final int statusCode = statusline.getStatusCode();
         final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders());
         String requestURLString = request.url().toNormalform(true);

         // check redirection
         if (statusCode > 299 && statusCode < 310) {
             client.finish();

             final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, statusline, responseHeader, requestURLString);

             if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
                 // we have two use cases here: loading from a crawl or just
                 // loading the url. Check this:
                 if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
                     // put redirect url on the crawler queue to repeat a
                     // double-check
                     /* We have to clone the request instance and not to modify directly its URL,
                      * otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */
                     Request redirectedRequest = new Request(request.initiator(),
                             redirectionUrl,
                             request.referrerhash(),
                             request.name(),
                             request.appdate(),
                             request.profileHandle(),
                             request.depth(),
                             request.timezoneOffset());
                     String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest);
                     if(rejectReason != null) {
                         throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
                     }
                     // in the end we must throw an exception (even if this is
                     // not an error, just to abort the current process
                     throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to "
                             + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
                 }
                 // if we are already doing a shutdown we don't need to retry
                 // crawling
                 if (Thread.currentThread().isInterrupted()) {
                     this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
                             FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
                     throw new IOException(
                             "CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$");
                 }

+                // check if the redirected URL is the same as the requested URL
+                // this shortcuts a time-out using retryCount
+                if (redirectionUrl.equals(url)) {
+                    this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirect to same url", -1);
+                    throw new IOException( "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
+                }

                 // retry crawling with new url
                 request.redirectURL(redirectionUrl);
                 return openInputStream(request, profile, retryCount - 1, maxFileSize, blacklistType, agent);
             }
             // we don't want to follow redirects
             this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
             throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline + "' for URL '" + requestURLString + "'$");
         } else if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION) {
             // the transfer is ok

             /*
              * When content is not large (less than Response.CRAWLER_MAX_SIZE_TO_CACHE), we have better cache it if cache is enabled and url is not local
              */
             long contentLength = client.getHttpResponse().getEntity().getContentLength();
             InputStream contentStream;
             if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (Response.CRAWLER_MAX_SIZE_TO_CACHE) && !url.isLocal()) {
                 byte[] content = null;
                 try {
                     content = HTTPClient.getByteArray(client.getHttpResponse().getEntity(), maxFileSize);
                     Cache.store(url, responseHeader, content);
                 } catch (final IOException e) {
                     this.log.warn("cannot write " + url + " to Cache (3): " + e.getMessage(), e);
                 } finally {
                     client.finish();
                 }

                 contentStream = new ByteArrayInputStream(content);
             } else {
                 /*
                  * Content length may already be known now : check it before opening a stream
                  */
                 if (maxFileSize >= 0 && contentLength > maxFileSize) {
                     throw new IOException("Content to download exceed maximum value of " + maxFileSize + " bytes");
                 }
                 /*
                  * Create a HTTPInputStream delegating to
                  * client.getContentstream(). Close method will ensure client is
                  * properly closed.
                  */
                 contentStream = new HTTPInputStream(client);
                 /* Anticipated content length may not be already known or incorrect : let's apply now the same eventual content size restriction as when loading in a byte array */
                 if(maxFileSize >= 0) {
                     contentStream = new StrictLimitInputStream(contentStream, maxFileSize,
                             "Content to download exceed maximum value of " + Formatter.bytesToString(maxFileSize));
                 }
             }

             return new StreamResponse(new Response(request, requestHeader, responseHeader, profile, false, null), contentStream);
         } else {
             client.finish();
             // if the response has not the right response type then reject file
             this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
                     FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
             throw new IOException("REJECTED WRONG STATUS TYPE '" + statusline
                     + "' for URL '" + requestURLString + "'$");
         }
     }

     /**
      * Extract redirect URL from response header. Status code is supposed to be between 299 and 310. Parameters must not be null.
      * @return redirect URL
      * @throws IOException when an error occured
      */
     private DigestURL extractRedirectURL(final Request request, CrawlProfile profile, DigestURL url,
             final StatusLine statusline, final ResponseHeader responseHeader, String requestURLString)
             throws IOException {
         // read redirection URL
         String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION);
         redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim();

         if (redirectionUrlString.isEmpty()) {
             this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
                     FailCategory.TEMPORARY_NETWORK_FAILURE,
                     "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusline.getStatusCode());
             throw new IOException("REJECTED EMTPY REDIRECTION '" + statusline
                     + "' for URL '" + requestURLString + "'$");
         }

         // normalize URL
         final DigestURL redirectionUrl = DigestURL.newURL(request.url(), redirectionUrlString);

         // restart crawling with new url
         this.log.info("CRAWLER Redirection detected ('" + statusline + "') for URL "
                 + requestURLString);
         this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false));

         this.sb.webStructure.generateCitationReference(url, redirectionUrl);

         if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) {
             this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
                     FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusline.getStatusCode());
         }
         return redirectionUrl;
     }

     /**
      * Create request header for loading content.
      * @param request search request
      * @param agent agent identification information
      * @return a request header
      * @throws IOException when an error occured
      */
     private RequestHeader createRequestheader(final Request request, final ClientIdentification.Agent agent)
             throws IOException {
         final RequestHeader requestHeader = new RequestHeader();
         requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent);
         if (request.referrerhash() != null) {
             DigestURL refererURL = this.sb.getURL(request.referrerhash());
             if (refererURL != null) {
                 requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true));
             }
         }
         requestHeader.put(HeaderFramework.ACCEPT, this.sb.getConfig("crawler.http.accept", DEFAULT_ACCEPT));
         requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE,
                 this.sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE));
         requestHeader.put(HeaderFramework.ACCEPT_CHARSET,
                 this.sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET));
         requestHeader.put(HeaderFramework.ACCEPT_ENCODING,
                 this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));
         return requestHeader;
     }

     private Response load(final Request request, CrawlProfile profile, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
@@ -347,10 +350,10 @@ public final class HTTPLoader {
         // resolve yacy and yacyh domains
         final AlternativeDomainNames yacyResolver = this.sb.peers;
         if(yacyResolver != null) {
             final String yAddress = yacyResolver.resolve(host);
             if(yAddress != null) {
                 url = new DigestURL(url.getProtocol() + "://" + yAddress + path);
             }
         }

         // take a file from the net

@@ -366,41 +369,39 @@ public final class HTTPLoader {
         client.setHeader(requestHeader.entrySet());

         // send request
         final byte[] responseBody = client.GETbytes(url, sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"), sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, ""), maxFileSize, false);
         final int statusCode = client.getHttpResponse().getStatusLine().getStatusCode();
         final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders());
         String requestURLString = request.url().toNormalform(true);

         // check redirection
         if (statusCode > 299 && statusCode < 310) {
             final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, client.getHttpResponse().getStatusLine(),
                     responseHeader, requestURLString);

             if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
                 // we have two use cases here: loading from a crawl or just loading the url. Check this:
                 if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
                     // put redirect url on the crawler queue to repeat a double-check
                     /* We have to clone the request instance and not to modify directly its URL,
                      * otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */
                     Request redirectedRequest = new Request(request.initiator(),
                             redirectionUrl,
                             request.referrerhash(),
                             request.name(),
                             request.appdate(),
                             request.profileHandle(),
                             request.depth(),
                             request.timezoneOffset());
                     String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest);
                     // in the end we must throw an exception (even if this is not an error, just to abort the current process
                     if(rejectReason != null) {
                         throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
                     }
                     throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
                 }

                 // if we are already doing a shutdown we don't need to retry crawling
                 if (Thread.currentThread().isInterrupted()) {
                     this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
@@ -410,15 +411,15 @@ public final class HTTPLoader {
                 // retry crawling with new url
                 request.redirectURL(redirectionUrl);
                 return load(request, profile, retryCount - 1, maxFileSize, blacklistType, agent);
             }

             // we don't want to follow redirects
             this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
             throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
         } else if (responseBody == null) {
             // no response, reject file
             this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
             throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
         } else if (statusCode == 200 || statusCode == 203) {
             // the transfer is ok

             // we write the new cache entry to file system directly

@@ -427,8 +428,8 @@ public final class HTTPLoader {
             // check length again in case it was not possible to get the length before loading
             if (maxFileSize >= 0 && contentLength > maxFileSize) {
                 this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
                 throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)$");
             }

             // create a new cache entry

@@ -442,9 +443,9 @@ public final class HTTPLoader {
             );
             return response;
         } else {
             // if the response has not the right response type then reject file
             this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
             throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
         }
     }

@@ -485,17 +486,17 @@ public final class HTTPLoader {
         final HTTPClient client = new HTTPClient(agent);
         client.setTimout(20000);
         client.setHeader(requestHeader.entrySet());

         final byte[] responseBody = client.GETbytes(request.url(), null, null, false);
         final int code = client.getHttpResponse().getStatusLine().getStatusCode();
         final ResponseHeader header = new ResponseHeader(code, client.getHttpResponse().getAllHeaders());

         // FIXME: 30*-handling (bottom) is never reached
         // we always get the final content because httpClient.followRedirects = true
         if (responseBody != null && (code == 200 || code == 203)) {
             // the transfer is ok

             //statistics:
             ByteCount.addAccountCount(ByteCount.CRAWLER, responseBody.length);

             // we write the new cache entry to file system directly

@@ -513,7 +514,7 @@ public final class HTTPLoader {
         } else if (code > 299 && code < 310) {
             if (header.containsKey(HeaderFramework.LOCATION)) {
                 // getting redirection URL
                 String redirectionUrlString = header.get(HeaderFramework.LOCATION);
                 redirectionUrlString = redirectionUrlString.trim();

                 if (redirectionUrlString.isEmpty()) {

@@ -535,7 +536,7 @@ public final class HTTPLoader {
             }
         } else {
             // if the response has not the right response type then reject file
             throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
         }
         return response;
     }
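Besides the reformatting, the substantive change to HTTPLoader.openInputStream() is the self-redirect guard: when a 3xx Location resolves back to the requested URL, the loader now fails fast with "redirect to same url" instead of recursing with retryCount - 1 until the counter is exhausted. A minimal standalone sketch of that guard follows; RedirectGuardSketch, fetch() and resolveRedirect() are hypothetical stand-ins for openInputStream() and the HTTP client, not YaCy code.

import java.io.IOException;
import java.net.URI;

public class RedirectGuardSketch {

    static String fetch(URI url, int retryCount) throws IOException {
        if (retryCount < 0) throw new IOException("retry counter exceeded for URL " + url);
        URI redirection = resolveRedirect(url); // null when the response is not a 3xx
        if (redirection == null) return "body of " + url;
        // check if the redirected URL is the same as the requested URL:
        // fail fast instead of burning the whole retry budget on a loop
        if (redirection.equals(url)) throw new IOException("redirect to same url: " + url);
        return fetch(redirection, retryCount - 1); // retry crawling with the new url
    }

    static URI resolveRedirect(URI url) {
        // toy behavior: /loop redirects to itself, everything else is final
        return url.getPath().equals("/loop") ? url : null;
    }

    public static void main(String[] args) throws IOException {
        System.out.println(fetch(URI.create("http://host.example/page"), 2));
        try {
            fetch(URI.create("http://host.example/loop"), 2);
        } catch (IOException e) {
            System.out.println("aborted early: " + e.getMessage());
        }
    }
}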

@@ -399,7 +399,7 @@ public final class LoaderDispatcher {
         // load resource from the internet
         StreamResponse response;
         if (protocol.equals("http") || protocol.equals("https")) {
-            response = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent);
+            response = this.httpLoader.openInputStream(request, crawlProfile, 2, maxFileSize, blacklistType, agent);
         } else if (protocol.equals("ftp")) {
             response = this.ftpLoader.openInputStream(request, true);
         } else if (protocol.equals("smb")) {
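LoaderDispatcher now calls openInputStream() with an initial retryCount of 2 instead of 1. Since each followed redirect recurses with retryCount - 1 and the method aborts once the counter is negative, a budget of 2 permits up to three fetch attempts, i.e. two redirect hops, before "retry counter exceeded" is thrown. A tiny illustration of that arithmetic (not YaCy code):

public class RetryBudgetSketch {
    static int attempts(int retryCount) {
        int n = 0;
        while (retryCount >= 0) { n++; retryCount--; } // one attempt per recursion level
        return n;
    }
    public static void main(String[] args) {
        System.out.println(attempts(1)); // 2 attempts with the old budget
        System.out.println(attempts(2)); // 3 attempts after this change
    }
}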
