*) Adding better https support for crawler

   - solving problems with unknown certificates by implementing a dummy trust manager
   - adding https support to robots-parser 
   - Seed File can now be downloaded from https resources
   - adapting plasmaHTCache.java to support https URLs properly

*) URL Normalization
   - sub URLs are now normalized properly during indexing
   - making plasmaParser.urlNormalform delegate to htmlFilterContentScraper.urlNormalform
   - normalizing URLs which were received by a crawlOrder request

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1024 6c8d7289-2bf4-0310-a012-ef5d649a1542
theli 20 years ago
parent d2507c6081
commit b8ceb1ffde

@ -48,7 +48,9 @@ import java.util.Date;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.tools.crypt;
@ -166,7 +168,19 @@ public final class crawlOrder {
int count = Math.min(urlv.size(), refv.size());
if (count == 1) {
// old method: only one url
stackresult = stack(switchboard, (String) urlv.get(0), (String) refv.get(0), iam, youare);
// normalizing URL
String newURL = plasmaParser.urlNormalform((String)urlv.get(0));
if (!newURL.equals(urlv.get(0))) {
env.getLog().logWarning("crawlOrder: Received not normalized URL " + urlv.get(0));
String refURL = plasmaParser.urlNormalform((String) refv.get(0));
if ((refURL != null) && (!refURL.equals(refv.get(0)))) {
env.getLog().logWarning("crawlOrder: Received not normalized Referer URL " + refv.get(0) + " of URL " + urlv.get(0));
// adding URL to noticeURL Queue
stackresult = stack(switchboard, newURL, refURL, iam, youare);
response = (String) stackresult[0];
reason = (String) stackresult[1];
lurl = (String) stackresult[2];

@ -141,28 +141,21 @@ public final class robotsParser{
return deny;
public static boolean containsRobotsData(URL nexturl) {
// generating the hostname:port string needed to do a DB lookup
String urlHostPort = nexturl.getHost() + ":" + ((nexturl.getPort()==-1)?80:nexturl.getPort());
urlHostPort = urlHostPort.toLowerCase();
// doing a DB lookup to determine if the robots data is already available
plasmaCrawlRobotsTxt.Entry robotsTxt4Host = plasmaSwitchboard.robots.getEntry(urlHostPort);
// if we have not found any data or the data is older than 7 days, we need to load it from the remote server
if ((robotsTxt4Host == null) || (robotsTxt4Host.getLoadedDate() == null) ||
(System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() > 7*24*60*60*1000)) {
return false;
return true;
public static boolean isDisallowed(URL nexturl) {
if (nexturl == null) throw new IllegalArgumentException();
// generating the hostname:port string needed to do a DB lookup
String urlHostPort = nexturl.getHost() + ":" + ((nexturl.getPort()==-1)?80:nexturl.getPort());
String urlHostPort = null;
int port = nexturl.getPort();
if (port == -1) {
if (nexturl.getProtocol().equalsIgnoreCase("http")) {
port = 80;
} else if (nexturl.getProtocol().equalsIgnoreCase("https")) {
port = 443;
urlHostPort = nexturl.getHost() + ":" + port;
urlHostPort = urlHostPort.toLowerCase().intern();
plasmaCrawlRobotsTxt.Entry robotsTxt4Host = null;
@ -179,7 +172,7 @@ public final class robotsParser{
URL robotsURL = null;
// generating the proper url to download the robots txt
try {
robotsURL = new URL(nexturl.getProtocol(),nexturl.getHost(),(nexturl.getPort()==-1)?80:nexturl.getPort(),"/robots.txt");
robotsURL = new URL(nexturl.getProtocol(),nexturl.getHost(),port,"/robots.txt");
} catch (MalformedURLException e) {
serverLog.logSevere("ROBOTS","Unable to generate robots.txt URL for URL '" + nexturl.toString() + "'.");
return false;
@ -249,9 +242,9 @@ public final class robotsParser{
plasmaSwitchboard sb = plasmaSwitchboard.getSwitchboard();
//TODO: adding Traffic statistic for robots download?
if ((sb.remoteProxyConfig == null) || (!sb.remoteProxyConfig.useProxy())) {
con = httpc.getInstance(robotsURL.getHost(), robotsURL.getPort(), 10000, false);
con = httpc.getInstance(robotsURL.getHost(), robotsURL.getPort(), 10000, robotsURL.getProtocol().equalsIgnoreCase("https"));
} else {
con = httpc.getInstance(robotsURL.getHost(), robotsURL.getPort(), 10000, false, sb.remoteProxyConfig);
con = httpc.getInstance(robotsURL.getHost(), robotsURL.getPort(), 10000, robotsURL.getProtocol().equalsIgnoreCase("https"), sb.remoteProxyConfig);
// if we previously have downloaded this robots.txt then we can set the if-modified-since header

@ -110,10 +110,25 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
public static String urlNormalform(String us) {
if (us == null) return null;
if (us.length() == 0) return null;
/* TODO: what about
* - case insensitive domain names
* - chars that should be escaped in URLs
int p;
// cutting off everything after #
if ((p = us.indexOf("#")) >= 0) us = us.substring(0, p);
if (us.endsWith(":80")) us = us.substring(0, us.length() - 3);
if ((p = us.indexOf(":80/")) >= 0) us = us.substring(0,p).concat(us.substring(p + 3));
if (us.startsWith("https")) {
if (us.endsWith(":443")) us = us.substring(0, us.length() - 4);
p = us.indexOf(":443/");
if (p >= 0) us = us.substring(0,p).concat(us.substring(p + 4));
} else if (us.startsWith("http")) {
if (us.endsWith(":80")) us = us.substring(0, us.length() - 3);
p = us.indexOf(":80/");
if (p >= 0) us = us.substring(0,p).concat(us.substring(p + 3));
if (((us.endsWith("/")) && (us.lastIndexOf('/', us.length() - 2) < 8))) us = us.substring(0, us.length() - 1);
return us;

@ -65,7 +65,12 @@ import java.util.TimeZone;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocketFactory;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverCodings;
@ -176,6 +181,43 @@ public final class httpc {
theHttpcPool = new httpcPool(new httpcFactory(),config);
// initializing a dummy trustManager to enable https connections
static SSLSocketFactory theSSLSockFactory = null;
static {
// Create a trust manager that does not validate certificate chains
TrustManager[] trustAllCerts = new TrustManager[] { new X509TrustManager() {
public java.security.cert.X509Certificate[] getAcceptedIssuers() {
return null;
public void checkClientTrusted(
java.security.cert.X509Certificate[] certs, String authType) {
public void checkServerTrusted(
java.security.cert.X509Certificate[] certs, String authType) {
} };
// Install the all-trusting trust manager
try {
SSLContext sc = SSLContext.getInstance("SSL");
// Create empty HostnameVerifier
HostnameVerifier hv = new HostnameVerifier() {
public boolean verify(String urlHostName, javax.net.ssl.SSLSession session) {
// logger.info("Warning: URL Host: "+urlHostName+"
// vs."+session.getPeerHost());
return true;
sc.init(null, trustAllCerts, new java.security.SecureRandom());
HttpsURLConnection.setDefaultSSLSocketFactory(theSSLSockFactory = sc.getSocketFactory());
} catch (Exception e) {
* A reusable readline buffer
* @see serverByteBuffer
@ -493,8 +535,9 @@ public final class httpc {
// creating a socket
this.socket = (ssl) ? SSLSocketFactory.getDefault().createSocket()
: new Socket();
this.socket = (ssl)
? theSSLSockFactory.createSocket()
: new Socket();
// creating a socket address
InetSocketAddress address = new InetSocketAddress(hostip, port);
@ -700,7 +743,7 @@ public final class httpc {
// send request
if ((this.remoteProxyUse) && (!(method.equals(httpHeader.METHOD_CONNECT))))
path = "http://" + this.savedRemoteHost + path;
path = (this.savedRemoteHost.endsWith("443")?"https://":"http://") + this.savedRemoteHost + path;
serverCore.send(this.clientOutput, method + " " + path + " HTTP/1.0"); // if set to HTTP/1.1, servers give time-outs?
// send header

@ -296,7 +296,6 @@ public final class httpdFileHandler extends httpdAbstractHandler implements http
String path = conProp.getProperty(httpHeader.CONNECTION_PROP_PATH);
String argsString = conProp.getProperty(httpHeader.CONNECTION_PROP_ARGS); // is null if no args were given
String httpVersion= conProp.getProperty(httpHeader.CONNECTION_PROP_HTTP_VER);
String url = "http://" + requestHeader.get(httpHeader.HOST,"localhost") + path;
// check hack attacks in path
if (path.indexOf("..") >= 0) {

@ -254,15 +254,17 @@ public final class plasmaCrawlLURL extends plasmaURL {
return null;
public void removeStack(int stack, int pos) {
public boolean removeStack(int stack, int pos) {
Object prevElement = null;
switch (stack) {
case 1: externResultStack.remove(pos); break;
case 2: searchResultStack.remove(pos); break;
case 3: transfResultStack.remove(pos); break;
case 4: proxyResultStack.remove(pos); break;
case 5: lcrawlResultStack.remove(pos); break;
case 6: gcrawlResultStack.remove(pos); break;
case 1: prevElement = externResultStack.remove(pos); break;
case 2: prevElement = searchResultStack.remove(pos); break;
case 3: prevElement = transfResultStack.remove(pos); break;
case 4: prevElement = proxyResultStack.remove(pos); break;
case 5: prevElement = lcrawlResultStack.remove(pos); break;
case 6: prevElement = gcrawlResultStack.remove(pos); break;
return prevElement != null;
public void clearStack(int stack) {
@ -276,16 +278,18 @@ public final class plasmaCrawlLURL extends plasmaURL {
public void remove(String urlHash) {
public boolean remove(String urlHash) {
boolean exists1 = super.remove(urlHash);
for (int stack = 1; stack <= 6; stack++) {
for (int i = getStackSize(stack) - 1; i >= 0; i--) {
if (getUrlHash(stack,i).equals(urlHash)) {
boolean exits2 = removeStack(stack,i);
exists1 = exists1 || exits2;
return exists1;
return exists1;
private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);

@ -352,10 +352,12 @@ public class plasmaCrawlNURL extends plasmaURL {
return new Entry(hash);
public synchronized void remove(String hash) {
public synchronized boolean remove(String hash) {
try {
} catch (IOException e) {}
return (urlHashCache.remove(hash.getBytes())!=null);
} catch (IOException e) {
return false;
public class Entry {

@ -436,8 +436,13 @@ public final class plasmaHTCache {
remotePath = remotePath + "ndx";
remotePath = remotePath.replaceAll("[?&:]", "_"); // yes this is not reversible, but that is not needed
final int port = url.getPort();
if (port < 0 || port == 80) {
int port = url.getPort();
if (port < 0) {
if (url.getProtocol().equalsIgnoreCase("http")) port = 80;
else if (url.getProtocol().equalsIgnoreCase("https")) port = 443;
else if (url.getProtocol().equalsIgnoreCase("ftp")) port = 21;
if (port == 80) {
return new File(this.cachePath, url.getHost() + remotePath);
} else {
return new File(this.cachePath, url.getHost() + "!" + port + remotePath);
@ -453,6 +458,8 @@ public final class plasmaHTCache {
// this.log.logFinest("plasmaHTCache: getURL: IN: File=[" + f + "]");
String s = f.toString().replace('\\', '/');
final String c = cachePath.toString().replace('\\', '/');
String protocol = "http";
int pos = s.lastIndexOf(c);
if (pos >= 0) {
s = s.substring(pos + c.length());
@ -466,12 +473,19 @@ public final class plasmaHTCache {
pos = s.indexOf("!");
if (pos >= 0) {
String temp = s.substring(pos + 1);
if (temp.startsWith("443/")) {
protocol = "https";
} else if (temp.startsWith("21/")) {
protocol = "ftp";
s = s.substring(0, pos) + ":" + s.substring(pos + 1);
if (s.endsWith("ndx")) { s = s.substring(0, s.length() - 3); }
// this.log.logFinest("plasmaHTCache: getURL: OUT=" + s);
try {
return new URL("http://" + s);
return new URL(protocol + "://" + s);
} catch (Exception e) {
return null;

@ -649,15 +649,7 @@ public final class plasmaParser {
public static String urlNormalform(String us) {
if (us == null) return null;
if (us.length() == 0) return null;
int p;
if ((p = us.indexOf("#")) >= 0) us = us.substring(0, p);
if (us.endsWith(":80")) us = us.substring(0, us.length() - 3);
p = us.indexOf(":80/");
if (p >= 0) us = us.substring(0,p).concat(us.substring(p + 3));
if (((us.endsWith("/")) && (us.lastIndexOf('/', us.length() - 2) < 8))) us = us.substring(0, us.length() - 1);
return us;
return htmlFilterContentScraper.urlNormalform(us);
static Map allReflinks(Map links) {

@ -1155,6 +1155,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
while (i.hasNext()) {
e = (Map.Entry) i.next();
nexturlstring = (String) e.getKey();
nexturlstring = plasmaParser.urlNormalform(nexturlstring);
sbStackCrawlThread.enqueue(nexturlstring, entry.url().toString(), initiatorHash, (String) e.getValue(), loadDate, entry.depth() + 1, entry.profile());
@ -1217,11 +1218,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String urlHash = newEntry.hash();
//log.logDebug("Remove NURL for '" + entry.normalizedURLString() + "'");
urlPool.noticeURL.remove(urlHash); // worked-off
if (((processCase == 4) || (processCase == 5) || (processCase == 6)) &&
(entry.profile().localIndexing())) {
if (((processCase == 4) || (processCase == 5) || (processCase == 6)) && (entry.profile().localIndexing())) {
// remove stopwords
log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + entry.url());
indexingEndTime = System.currentTimeMillis();
@ -1287,6 +1285,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (log.isLoggable(Level.INFO)) {
log.logInfo("*Indexed " + words + " words in URL " + entry.url() +
" [" + entry.urlHash() + "]" +
"\n\tDescription: " + descr +
"\n\tMimeType: " + document.getMimeType() + " | " +
"Size: " + document.text.length + " bytes | " +
@ -1328,10 +1327,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} catch (IOException e) {
log.logSevere("ERROR in plasmaSwitchboard.process(): " + e.toString());
} finally {
// removing current entry from in process list
synchronized (this.indexingTasksInProcess) {
// removing current entry from notice URL queue
boolean removed = urlPool.noticeURL.remove(entry.urlHash()); // worked-off
if (!removed) {
log.logFinest("Unable to remove indexed URL " + entry.url() + " from Crawler Queue. This could be because of an URL redirect.");
// explicit delete/free resources
if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) {

@ -452,11 +452,14 @@ public class plasmaURL {
public void remove(String urlHash) {
try {
} catch (IOException e) {}
public boolean remove(String urlHash) {
try {
boolean existsInIndex = this.existsIndex.remove(urlHash);
boolean existsInCache = (this.urlHashCache.remove(urlHash.getBytes())!= null);
return existsInIndex || existsInCache;
} catch (IOException e) {
return false;
public static final int flagTypeID(String hash) {
@ -495,7 +498,15 @@ public class plasmaURL {
dom = dom.substring(p + 1);
int port = url.getPort();
if (port <= 0) port = (isHTTP) ? 80 : 21;
if (port <= 0) {
if (isHTTP) {
port = 80;
} else if (url.getProtocol().equalsIgnoreCase("https")) {
port = 443;
} else {
port = 21;
String path = url.getPath();
if (path.startsWith("/")) path = path.substring(1);
if (path.endsWith("/")) path = path.substring(0, path.length() - 1);

@ -676,7 +676,12 @@ public class yacyCore {
final String seedURLStr = sb.getConfig("seedURL", "");
if (seedURLStr.length() == 0) { throw new MalformedURLException("The seed-file url must not be empty."); }
if (!seedURLStr.toLowerCase().startsWith("http://")) { throw new MalformedURLException("Unsupported protocol."); }
if (!(
seedURLStr.toLowerCase().startsWith("http://") ||
throw new MalformedURLException("Unsupported protocol.");
seedURL = new URL(seedURLStr);
} catch(MalformedURLException e) {
final String errorMsg = "Malformed seed file URL '" + sb.getConfig("seedURL", "") + "'. " + e.getMessage();

@ -157,7 +157,10 @@ public class yacyPeerActions {
for (int i = 0; i < superseed.size(); i++) {
if (Thread.currentThread().isInterrupted()) break;
seedListFileURL = (String) superseed.any();
if (seedListFileURL.startsWith("http://")) {
if (
seedListFileURL.startsWith("http://") ||
) {
// load the seed list
try {
httpHeader reqHeader = new httpHeader();
