From b8ceb1ffde51dac99008904b0f616cae43708fbb Mon Sep 17 00:00:00 2001
From: theli
Date: Thu, 3 Nov 2005 15:28:37 +0000
Subject: [PATCH] *) Adding better https support for the crawler
   - solving problems with unknown certificates by implementing a dummy
     TrustManager
   - adding https support to the robots parser
   - the seed file can now be downloaded from https resources
   - adapting plasmaHTCache.java to support https URLs properly
 *) URL normalization
   - sub-URLs are now normalized properly during indexing
   - pointing the urlNormalform function of plasmaParser to the
     htmlFilterContentScraper implementation
   - normalizing URLs that were received by a crawlOrder request

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1024 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/yacy/crawlOrder.java                   | 16 +++++-
 source/de/anomic/data/robotsParser.java       | 37 ++++++--------
 .../htmlFilter/htmlFilterContentScraper.java  | 19 ++++++-
 source/de/anomic/http/httpc.java              | 49 +++++++++++++++++--
 source/de/anomic/http/httpdFileHandler.java   |  1 -
 source/de/anomic/plasma/plasmaCrawlLURL.java  | 26 +++++-----
 source/de/anomic/plasma/plasmaCrawlNURL.java  |  8 +--
 .../de/anomic/plasma/plasmaCrawlWorker.java   |  2 +-
 source/de/anomic/plasma/plasmaHTCache.java    | 20 ++++++--
 source/de/anomic/plasma/plasmaParser.java     | 10 +---
 .../de/anomic/plasma/plasmaSwitchboard.java   | 14 ++++--
 source/de/anomic/plasma/plasmaURL.java        | 23 ++++++---
 source/de/anomic/yacy/yacyCore.java           |  7 ++-
 source/de/anomic/yacy/yacyPeerActions.java    |  5 +-
 14 files changed, 169 insertions(+), 68 deletions(-)

diff --git a/htroot/yacy/crawlOrder.java b/htroot/yacy/crawlOrder.java
index 07b663caf..7fa7f7486 100644
--- a/htroot/yacy/crawlOrder.java
+++ b/htroot/yacy/crawlOrder.java
@@ -48,7 +48,9 @@ import java.util.Date;
 
 import de.anomic.http.httpHeader;
 import de.anomic.plasma.plasmaCrawlLURL;
+import de.anomic.plasma.plasmaParser;
 import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.plasma.plasmaURL;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
 import de.anomic.tools.crypt;
@@ -166,7 +168,19 @@ public final class crawlOrder {
         int count = Math.min(urlv.size(), refv.size());
         if (count == 1) {
             // old method: only one url
-            stackresult = stack(switchboard, (String) urlv.get(0), (String) refv.get(0), iam, youare);
+
+            // normalizing the URL
+            String newURL = plasmaParser.urlNormalform((String) urlv.get(0));
+            if (!newURL.equals(urlv.get(0))) {
+                env.getLog().logWarning("crawlOrder: received non-normalized URL " + urlv.get(0));
+            }
+            String refURL = plasmaParser.urlNormalform((String) refv.get(0));
+            if ((refURL != null) && (!refURL.equals(refv.get(0)))) {
+                env.getLog().logWarning("crawlOrder: received non-normalized referer URL " + refv.get(0) + " of URL " + urlv.get(0));
+            }
+
+            // adding the URL to the noticeURL queue
+            stackresult = stack(switchboard, newURL, refURL, iam, youare);
             response = (String) stackresult[0];
             reason   = (String) stackresult[1];
             lurl     = (String) stackresult[2];
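The change to crawlOrder.java establishes a receive-side contract: every URL arriving from a remote peer is normalized before it is stacked, and a warning is logged when the sender did not normalize it. Isolated as a minimal sketch (the helper name acceptUrl is hypothetical; plasmaParser.urlNormalform and serverLog are the types used above):

    // Normalize-then-warn pattern from the crawlOrder handler (hypothetical helper).
    static String acceptUrl(String received, serverLog log) {
        String normalized = plasmaParser.urlNormalform(received);
        if ((normalized != null) && (!normalized.equals(received))) {
            log.logWarning("crawlOrder: received non-normalized URL " + received);
        }
        return normalized; // only the normalized form goes onto the noticeURL stack
    }
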
diff --git a/source/de/anomic/data/robotsParser.java b/source/de/anomic/data/robotsParser.java
index c638df0dc..17b9c93c1 100644
--- a/source/de/anomic/data/robotsParser.java
+++ b/source/de/anomic/data/robotsParser.java
@@ -139,30 +139,23 @@ public final class robotsParser{
         }
         
         return deny;
-    }
-    
-    public static boolean containsRobotsData(URL nexturl) {
-        // generating the hostname:poart string needed to do a DB lookup
-        String urlHostPort = nexturl.getHost() + ":" + ((nexturl.getPort()==-1)?80:nexturl.getPort());
-        urlHostPort = urlHostPort.toLowerCase();
-        
-        // doing a DB lookup to determine if the robots data is already available
-        plasmaCrawlRobotsTxt.Entry robotsTxt4Host = plasmaSwitchboard.robots.getEntry(urlHostPort);
-        
-        // if we have not found any data or the data is older than 7 days, we need to load it from the remote server
-        if ((robotsTxt4Host == null) || (robotsTxt4Host.getLoadedDate() == null) ||
-            (System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() > 7*24*60*60*1000)) {
-            return false;
-        }
-        return true;
-    }
-    
+    }
     
     public static boolean isDisallowed(URL nexturl) {
         if (nexturl == null) throw new IllegalArgumentException();
         
         // generating the hostname:poart string needed to do a DB lookup
-        String urlHostPort = nexturl.getHost() + ":" + ((nexturl.getPort()==-1)?80:nexturl.getPort());
+        String urlHostPort = null;
+        int port = nexturl.getPort();
+        if (port == -1) {
+            if (nexturl.getProtocol().equalsIgnoreCase("http")) {
+                port = 80;
+            } else if (nexturl.getProtocol().equalsIgnoreCase("https")) {
+                port = 443;
+            }
+        }
+        urlHostPort = nexturl.getHost() + ":" + port;
         urlHostPort = urlHostPort.toLowerCase().intern();
         
         plasmaCrawlRobotsTxt.Entry robotsTxt4Host = null;
@@ -179,7 +172,7 @@ public final class robotsParser{
             URL robotsURL = null;
             // generating the proper url to download the robots txt
             try {
-                robotsURL = new URL(nexturl.getProtocol(),nexturl.getHost(),(nexturl.getPort()==-1)?80:nexturl.getPort(),"/robots.txt");
+                robotsURL = new URL(nexturl.getProtocol(),nexturl.getHost(),port,"/robots.txt");
             } catch (MalformedURLException e) {
                 serverLog.logSevere("ROBOTS","Unable to generate robots.txt URL for URL '" + nexturl.toString() + "'.");
                 return false;
@@ -249,9 +242,9 @@ public final class robotsParser{
         plasmaSwitchboard sb = plasmaSwitchboard.getSwitchboard();
         
         //TODO: adding Traffic statistic for robots download?
         if ((sb.remoteProxyConfig == null) || (!sb.remoteProxyConfig.useProxy())) {
-            con = httpc.getInstance(robotsURL.getHost(), robotsURL.getPort(), 10000, false);
+            con = httpc.getInstance(robotsURL.getHost(), robotsURL.getPort(), 10000, robotsURL.getProtocol().equalsIgnoreCase("https"));
         } else {
-            con = httpc.getInstance(robotsURL.getHost(), robotsURL.getPort(), 10000, false, sb.remoteProxyConfig);
+            con = httpc.getInstance(robotsURL.getHost(), robotsURL.getPort(), 10000, robotsURL.getProtocol().equalsIgnoreCase("https"), sb.remoteProxyConfig);
         }
         
         // if we previously have downloaded this robots.txt then we can set the if-modified-since header
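A detail that recurs through the rest of the patch: java.net.URL.getPort() returns -1 when the URL names no port, so the default has to be chosen by scheme before a hostname:port DB key or a /robots.txt URL can be built. Condensed into a sketch (the helper name is hypothetical; the robots parser only ever sees http and https here):

    // Scheme-dependent port defaulting, as applied in isDisallowed() above.
    static int effectivePort(URL url) {
        int port = url.getPort(); // -1 when the URL carries no explicit port
        if (port != -1) return port;
        return url.getProtocol().equalsIgnoreCase("https") ? 443 : 80;
    }

With this, http://host/ and https://host/ produce the distinct DB keys host:80 and host:443, and the generated robots.txt URL keeps the original scheme, so the ssl flag handed to httpc.getInstance can be derived from the protocol alone.
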
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index 883ef253a..c6f7db521 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -110,10 +110,25 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
     public static String urlNormalform(String us) {
         if (us == null) return null;
         if (us.length() == 0) return null;
+
+        /* TODO: what about
+         * - case-insensitive domain names
+         * - chars that should be escaped in URLs
+         */
         int p;
+
+        // cutting off everything behind #
         if ((p = us.indexOf("#")) >= 0) us = us.substring(0, p);
-        if (us.endsWith(":80")) us = us.substring(0, us.length() - 3);
-        if ((p = us.indexOf(":80/")) >= 0) us = us.substring(0,p).concat(us.substring(p + 3));
+
+        if (us.startsWith("https")) {
+            if (us.endsWith(":443")) us = us.substring(0, us.length() - 4);
+            p = us.indexOf(":443/");
+            if (p >= 0) us = us.substring(0,p).concat(us.substring(p + 4));
+        } else if (us.startsWith("http")) {
+            if (us.endsWith(":80")) us = us.substring(0, us.length() - 3);
+            p = us.indexOf(":80/");
+            if (p >= 0) us = us.substring(0,p).concat(us.substring(p + 3));
+        }
         if (((us.endsWith("/")) && (us.lastIndexOf('/', us.length() - 2) < 8))) us = us.substring(0, us.length() - 1);
         return us;
     }
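Inferred from the code above (expected values read off the branches, not taken from test output), the normal form now behaves like this:

    http://www.example.com:80/index.html#top  ->  http://www.example.com/index.html
    https://www.example.com:443/              ->  https://www.example.com
    https://www.example.com:8443/a/           ->  https://www.example.com:8443/a/  (non-default port kept)

The closing condition, us.lastIndexOf('/', us.length() - 2) < 8, strips a trailing slash only when the preceding slash still belongs to the scheme separator, i.e. only for host-root URLs; the TODO about case-insensitive domain names and unescaped characters is left open.
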
diff --git a/source/de/anomic/http/httpc.java b/source/de/anomic/http/httpc.java
index 0303f09f0..4f3d11a3e 100644
--- a/source/de/anomic/http/httpc.java
+++ b/source/de/anomic/http/httpc.java
@@ -65,7 +65,12 @@ import java.util.TimeZone;
 import java.util.zip.GZIPInputStream;
 import java.util.zip.GZIPOutputStream;
 
+import javax.net.ssl.HostnameVerifier;
+import javax.net.ssl.HttpsURLConnection;
+import javax.net.ssl.SSLContext;
 import javax.net.ssl.SSLSocketFactory;
+import javax.net.ssl.TrustManager;
+import javax.net.ssl.X509TrustManager;
 
 import de.anomic.server.serverByteBuffer;
 import de.anomic.server.serverCodings;
@@ -175,6 +180,43 @@ public final class httpc {
         theHttpcPool = new httpcPool(new httpcFactory(),config);
     }
 
+    // initializing a dummy TrustManager to enable https connections
+    static SSLSocketFactory theSSLSockFactory = null;
+    static {
+        // create a trust manager that does not validate certificate chains
+        TrustManager[] trustAllCerts = new TrustManager[] { new X509TrustManager() {
+            public java.security.cert.X509Certificate[] getAcceptedIssuers() {
+                return null;
+            }
+
+            public void checkClientTrusted(
+                    java.security.cert.X509Certificate[] certs, String authType) {
+            }
+
+            public void checkServerTrusted(
+                    java.security.cert.X509Certificate[] certs, String authType) {
+            }
+        } };
+
+        // install the all-trusting trust manager and an accept-all HostnameVerifier
+        try {
+            SSLContext sc = SSLContext.getInstance("SSL");
+            HostnameVerifier hv = new HostnameVerifier() {
+                public boolean verify(String urlHostName, javax.net.ssl.SSLSession session) {
+                    // logger.info("Warning: URL Host: " + urlHostName + " vs. " + session.getPeerHost());
+                    return true;
+                }
+            };
+
+            sc.init(null, trustAllCerts, new java.security.SecureRandom());
+            HttpsURLConnection.setDefaultSSLSocketFactory(theSSLSockFactory = sc.getSocketFactory());
+            HttpsURLConnection.setDefaultHostnameVerifier(hv);
+        } catch (Exception e) {
+        }
+    }
 
     /**
     * A reusable readline buffer
@@ -493,8 +535,9 @@ public final class httpc {
             }
 
             // creating a socket
-            this.socket = (ssl) ? SSLSocketFactory.getDefault().createSocket()
-                                : new Socket();
+            this.socket = (ssl)
+                        ? theSSLSockFactory.createSocket()
+                        : new Socket();
 
             // creating a socket address
             InetSocketAddress address = new InetSocketAddress(hostip, port);
@@ -700,7 +743,7 @@ public final class httpc {
 
         // send request
         if ((this.remoteProxyUse) && (!(method.equals(httpHeader.METHOD_CONNECT))))
-            path = "http://" + this.savedRemoteHost + path;
+            path = (this.savedRemoteHost.endsWith("443") ? "https://" : "http://") + this.savedRemoteHost + path;
         serverCore.send(this.clientOutput, method + " " + path + " HTTP/1.0"); // if set to HTTP/1.1, servers give time-outs?
 
         // send header
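The static initializer installs an X509TrustManager that accepts every certificate chain and a HostnameVerifier that accepts every host name. TLS connections are still encrypted but no longer authenticate the peer; for a crawler that fetches public content from hosts with self-signed or otherwise unknown certificates, that is the intended trade-off. A call site then only needs to pass ssl = true (signature as used by the robots parser above; host and port are placeholders):

    // Sketch: open an HTTPS connection through the all-trusting factory.
    httpc con = httpc.getInstance("www.example.com", 443, 10000, true);
    // internally: this.socket = theSSLSockFactory.createSocket();

For proxied requests, the scheme is recovered from the saved host string: a host:port ending in 443 is rewritten with an https:// prefix, everything else stays http://.
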
diff --git a/source/de/anomic/http/httpdFileHandler.java b/source/de/anomic/http/httpdFileHandler.java
index 1129faa8c..d3dfd6999 100644
--- a/source/de/anomic/http/httpdFileHandler.java
+++ b/source/de/anomic/http/httpdFileHandler.java
@@ -296,7 +296,6 @@ public final class httpdFileHandler extends httpdAbstractHandler implements http
         String path       = conProp.getProperty(httpHeader.CONNECTION_PROP_PATH);
         String argsString = conProp.getProperty(httpHeader.CONNECTION_PROP_ARGS); // is null if no args were given
         String httpVersion= conProp.getProperty(httpHeader.CONNECTION_PROP_HTTP_VER);
-        String url = "http://" + requestHeader.get(httpHeader.HOST,"localhost") + path;
 
         // check hack attacks in path
         if (path.indexOf("..") >= 0) {
diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java
index 4fea7781f..ef0f8340e 100644
--- a/source/de/anomic/plasma/plasmaCrawlLURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlLURL.java
@@ -254,15 +254,17 @@ public final class plasmaCrawlLURL extends plasmaURL {
         return null;
     }
 
-    public void removeStack(int stack, int pos) {
+    public boolean removeStack(int stack, int pos) {
+        Object prevElement = null;
         switch (stack) {
-            case 1: externResultStack.remove(pos); break;
-            case 2: searchResultStack.remove(pos); break;
-            case 3: transfResultStack.remove(pos); break;
-            case 4: proxyResultStack.remove(pos); break;
-            case 5: lcrawlResultStack.remove(pos); break;
-            case 6: gcrawlResultStack.remove(pos); break;
+            case 1: prevElement = externResultStack.remove(pos); break;
+            case 2: prevElement = searchResultStack.remove(pos); break;
+            case 3: prevElement = transfResultStack.remove(pos); break;
+            case 4: prevElement = proxyResultStack.remove(pos); break;
+            case 5: prevElement = lcrawlResultStack.remove(pos); break;
+            case 6: prevElement = gcrawlResultStack.remove(pos); break;
         }
+        return prevElement != null;
     }
 
     public void clearStack(int stack) {
@@ -276,16 +278,18 @@ public final class plasmaCrawlLURL extends plasmaURL {
         }
     }
 
-    public void remove(String urlHash) {
-        super.remove(urlHash);
+    public boolean remove(String urlHash) {
+        boolean exists1 = super.remove(urlHash);
         for (int stack = 1; stack <= 6; stack++) {
             for (int i = getStackSize(stack) - 1; i >= 0; i--) {
                 if (getUrlHash(stack,i).equals(urlHash)) {
-                    removeStack(stack,i);
-                    return;
+                    boolean exists2 = removeStack(stack,i);
+                    exists1 = exists1 || exists2;
+                    return exists1;
                 }
             }
         }
+        return exists1;
     }
 
     private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java
index bdd354858..4f5a837fa 100644
--- a/source/de/anomic/plasma/plasmaCrawlNURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlNURL.java
@@ -352,10 +352,12 @@ public class plasmaCrawlNURL extends plasmaURL {
         return new Entry(hash);
     }
 
-    public synchronized void remove(String hash) {
+    public synchronized boolean remove(String hash) {
         try {
-            urlHashCache.remove(hash.getBytes());
-        } catch (IOException e) {}
+            return (urlHashCache.remove(hash.getBytes()) != null);
+        } catch (IOException e) {
+            return false;
+        }
     }
 
     public class Entry {
diff --git a/source/de/anomic/plasma/plasmaCrawlWorker.java b/source/de/anomic/plasma/plasmaCrawlWorker.java
index bc5488d0b..fc128aa93 100644
--- a/source/de/anomic/plasma/plasmaCrawlWorker.java
+++ b/source/de/anomic/plasma/plasmaCrawlWorker.java
@@ -432,7 +432,7 @@ public final class plasmaCrawlWorker extends Thread {
 
                     // generating url hash
                     String urlhash = plasmaURL.urlHash(redirectionUrl);
-                    
+
                     // removing url from loader queue
                     plasmaCrawlLoader.switchboard.urlPool.noticeURL.remove(urlhash);
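The remove() methods in plasmaURL, plasmaCrawlLURL and plasmaCrawlNURL now return a boolean instead of void, so callers can tell whether the given hash was actually present. The intended usage pattern, sketched:

    // removal now reports whether anything was stored under this hash
    boolean removed = urlPool.noticeURL.remove(urlhash);
    if (!removed) {
        // after a redirect the queue entry may live under the hash of the
        // original URL rather than the final one, so a miss is not fatal
        log.logFinest("URL hash " + urlhash + " was not on the notice queue");
    }

plasmaSwitchboard (below) uses exactly this to log indexed URLs that could not be removed from the crawler queue.
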
diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java
index 3964c6d95..6d5e95af5 100644
--- a/source/de/anomic/plasma/plasmaHTCache.java
+++ b/source/de/anomic/plasma/plasmaHTCache.java
@@ -436,8 +436,13 @@ public final class plasmaHTCache {
             remotePath = remotePath + "ndx";
         }
         remotePath = remotePath.replaceAll("[?&:]", "_"); // yes this is not reversible, but that is not needed
-        final int port = url.getPort();
-        if (port < 0 || port == 80) {
+        int port = url.getPort();
+        if (port < 0) {
+            if (url.getProtocol().equalsIgnoreCase("http")) port = 80;
+            else if (url.getProtocol().equalsIgnoreCase("https")) port = 443;
+            else if (url.getProtocol().equalsIgnoreCase("ftp")) port = 21;
+        }
+        if (port == 80) {
             return new File(this.cachePath, url.getHost() + remotePath);
         } else {
             return new File(this.cachePath, url.getHost() + "!" + port + remotePath);
@@ -453,6 +458,8 @@ public final class plasmaHTCache {
 //      this.log.logFinest("plasmaHTCache: getURL: IN: File=[" + f + "]");
         String s = f.toString().replace('\\', '/');
         final String c = cachePath.toString().replace('\\', '/');
+
+        String protocol = "http";
         int pos = s.lastIndexOf(c);
         if (pos >= 0) {
             s = s.substring(pos + c.length());
@@ -466,12 +473,19 @@ public final class plasmaHTCache {
 
             pos = s.indexOf("!");
             if (pos >= 0) {
+                String temp = s.substring(pos + 1);
+                if (temp.startsWith("443/")) {
+                    protocol = "https";
+                } else if (temp.startsWith("21/")) {
+                    protocol = "ftp";
+                }
+
                 s = s.substring(0, pos) + ":" + s.substring(pos + 1);
             }
             if (s.endsWith("ndx")) { s = s.substring(0, s.length() - 3); }
 //          this.log.logFinest("plasmaHTCache: getURL: OUT=" + s);
             try {
-                return new URL("http://" + s);
+                return new URL(protocol + "://" + s);
             } catch (Exception e) {
                 return null;
             }
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index b0a602fc5..4d35d85ae 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -649,15 +649,7 @@ public final class plasmaParser {
     }
 
     public static String urlNormalform(String us) {
-        if (us == null) return null;
-        if (us.length() == 0) return null;
-        int p;
-        if ((p = us.indexOf("#")) >= 0) us = us.substring(0, p);
-        if (us.endsWith(":80")) us = us.substring(0, us.length() - 3);
-        p = us.indexOf(":80/");
-        if (p >= 0) us = us.substring(0,p).concat(us.substring(p + 3));
-        if (((us.endsWith("/")) && (us.lastIndexOf('/', us.length() - 2) < 8))) us = us.substring(0, us.length() - 1);
-        return us;
+        return htmlFilterContentScraper.urlNormalform(us);
     }
 
     static Map allReflinks(Map links) {
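The plasmaHTCache change makes the cache layout reversible per protocol: a port other than 80 is encoded into the path after a '!' separator, and getURL() reads it back to pick the scheme. The expected mapping, inferred from the code (CACHEDIR stands for the configured cache root):

    http://www.example.com/foo.html   <->  CACHEDIR/www.example.com/foo.html
    https://www.example.com/foo.html  <->  CACHEDIR/www.example.com!443/foo.html
    ftp://ftp.example.com/foo.txt     <->  CACHEDIR/ftp.example.com!21/foo.txt

Pointing plasmaParser.urlNormalform at htmlFilterContentScraper removes the second, https-unaware copy of the normalization code, so every caller now goes through the one implementation shown earlier.
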
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 03c7861b7..3ff64f403 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -1155,6 +1155,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                 while (i.hasNext()) {
                     e = (Map.Entry) i.next();
                     nexturlstring = (String) e.getKey();
+                    nexturlstring = plasmaParser.urlNormalform(nexturlstring);
                     
                     sbStackCrawlThread.enqueue(nexturlstring, entry.url().toString(), initiatorHash, (String) e.getValue(), loadDate, entry.depth() + 1, entry.profile());
 
@@ -1217,11 +1218,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
             );
             String urlHash = newEntry.hash();
             
-            //log.logDebug("Remove NURL for '" + entry.normalizedURLString() + "'");
-            urlPool.noticeURL.remove(urlHash); // worked-off
-            
-            if (((processCase == 4) || (processCase == 5) || (processCase == 6)) &&
-                (entry.profile().localIndexing())) {
+            if (((processCase == 4) || (processCase == 5) || (processCase == 6)) && (entry.profile().localIndexing())) {
                 // remove stopwords
                 log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + entry.url());
                 indexingEndTime = System.currentTimeMillis();
@@ -1287,6 +1285,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                 if (log.isLoggable(Level.INFO)) {
                     log.logInfo("*Indexed " + words + " words in URL " + entry.url() +
+                                " [" + entry.urlHash() + "]" +
                                 "\n\tDescription: " + descr +
                                 "\n\tMimeType: " + document.getMimeType() + " | " +
                                 "Size: " + document.text.length + " bytes | " +
@@ -1328,10 +1327,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         } catch (IOException e) {
             log.logSevere("ERROR in plasmaSwitchboard.process(): " + e.toString());
         } finally {
+            // removing the current entry from the in-process list
             synchronized (this.indexingTasksInProcess) {
                 this.indexingTasksInProcess.remove(entry.urlHash());
            }
             
+            // removing the current entry from the notice URL queue
+            boolean removed = urlPool.noticeURL.remove(entry.urlHash()); // worked-off
+            if (!removed) {
+                log.logFinest("Unable to remove indexed URL " + entry.url() + " from the crawler queue. This could be because of a URL redirect.");
+            }
+
             // explicit delete/free resources
             if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) {
                 plasmaHTCache.filesInUse.remove(entry.cacheFile());
diff --git a/source/de/anomic/plasma/plasmaURL.java b/source/de/anomic/plasma/plasmaURL.java
index 39c615be1..ba8d5d37e 100644
--- a/source/de/anomic/plasma/plasmaURL.java
+++ b/source/de/anomic/plasma/plasmaURL.java
@@ -452,11 +452,14 @@ public class plasmaURL {
         }
     }
 
-    public void remove(String urlHash) {
-        try {
-            existsIndex.remove(urlHash);
-            urlHashCache.remove(urlHash.getBytes());
-        } catch (IOException e) {}
+    public boolean remove(String urlHash) {
+        try {
+            boolean existsInIndex = this.existsIndex.remove(urlHash);
+            boolean existsInCache = (this.urlHashCache.remove(urlHash.getBytes()) != null);
+            return existsInIndex || existsInCache;
+        } catch (IOException e) {
+            return false;
+        }
     }
 
     public static final int flagTypeID(String hash) {
@@ -495,7 +498,15 @@ public class plasmaURL {
             dom = dom.substring(p + 1);
         }
         int port = url.getPort();
-        if (port <= 0) port = (isHTTP) ? 80 : 21;
+        if (port <= 0) {
+            if (isHTTP) {
+                port = 80;
+            } else if (url.getProtocol().equalsIgnoreCase("https")) {
+                port = 443;
+            } else {
+                port = 21;
+            }
+        }
         String path = url.getPath();
         if (path.startsWith("/")) path = path.substring(1);
         if (path.endsWith("/")) path = path.substring(0, path.length() - 1);
diff --git a/source/de/anomic/yacy/yacyCore.java b/source/de/anomic/yacy/yacyCore.java
index 4c0f3f0d1..b8ded14ca 100644
--- a/source/de/anomic/yacy/yacyCore.java
+++ b/source/de/anomic/yacy/yacyCore.java
@@ -676,7 +676,12 @@ public class yacyCore {
             try{
                 final String seedURLStr = sb.getConfig("seedURL", "");
                 if (seedURLStr.length() == 0) { throw new MalformedURLException("The seed-file url must not be empty."); }
-                if (!seedURLStr.toLowerCase().startsWith("http://")) { throw new MalformedURLException("Unsupported protocol."); }
+                if (!(
+                        seedURLStr.toLowerCase().startsWith("http://") ||
+                        seedURLStr.toLowerCase().startsWith("https://")
+                )) {
+                    throw new MalformedURLException("Unsupported protocol.");
+                }
                 seedURL = new URL(seedURLStr);
             } catch(MalformedURLException e) {
                 final String errorMsg = "Malformed seed file URL '" + sb.getConfig("seedURL", "") + "'. " + e.getMessage();
diff --git a/source/de/anomic/yacy/yacyPeerActions.java b/source/de/anomic/yacy/yacyPeerActions.java
index af3419888..ae99fc974 100644
--- a/source/de/anomic/yacy/yacyPeerActions.java
+++ b/source/de/anomic/yacy/yacyPeerActions.java
@@ -157,7 +157,10 @@ public class yacyPeerActions {
             for (int i = 0; i < superseed.size(); i++) {
                 if (Thread.currentThread().isInterrupted()) break;
                 seedListFileURL = (String) superseed.any();
-                if (seedListFileURL.startsWith("http://")) {
+                if (
+                        seedListFileURL.startsWith("http://") ||
+                        seedListFileURL.startsWith("https://")
+                ) {
                     // load the seed list
                     try {
                         httpHeader reqHeader = new httpHeader();
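A consequence of the plasmaURL change that is easy to miss: the port now enters the domain part of the URL hash with a scheme-aware default, so an https URL with and without its explicit default port hashes identically:

    https://www.example.com/path
    https://www.example.com:443/path   -> same urlHash (port defaults to 443)

Together with the normalization applied at indexing time and in crawlOrder, this should keep the same resource from entering the index twice under two spellings of its URL.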