From 5847492537803f4e4bea6bce1eb0ebd3afc1e73e Mon Sep 17 00:00:00 2001 From: theli Date: Mon, 4 Sep 2006 13:17:11 +0000 Subject: [PATCH] *) next step of restructuring for new crawlers - IndexCreate_p.java: correcting problems with ftp urls - URL.java does not cut out the userinfo anymore (needed to transport authentication info in ftp urls, e.g. ftp://username:pwd@ftp.irgendwas.de) - plasmaCrawlLoader.java: -- hack to re-enable https urls -- adding function getSupportedProtocols git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2482 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexCreate_p.java | 3 ++- source/de/anomic/net/URL.java | 4 ++-- .../de/anomic/plasma/plasmaCrawlLoader.java | 22 ++++++++++++++----- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/htroot/IndexCreate_p.java b/htroot/IndexCreate_p.java index 486b43b74..8387e554a 100644 --- a/htroot/IndexCreate_p.java +++ b/htroot/IndexCreate_p.java @@ -140,7 +140,8 @@ public class IndexCreate_p { crawlingStart = crawlingStart.trim(); // adding the prefix http:// if necessary - if (!(crawlingStart.startsWith("http"))) crawlingStart = "http://" + crawlingStart; + int pos = crawlingStart.indexOf("://"); + if (pos == -1) crawlingStart = "http://" + crawlingStart; // normalizing URL try {crawlingStart = new URL(crawlingStart).toNormalform();} catch (MalformedURLException e1) {} diff --git a/source/de/anomic/net/URL.java b/source/de/anomic/net/URL.java index d5148c201..4654d2b89 100644 --- a/source/de/anomic/net/URL.java +++ b/source/de/anomic/net/URL.java @@ -278,8 +278,8 @@ public class URL { matcher.reset(path); } - if (defaultPort) { return this.protocol + "://" + this.getHost().toLowerCase() + path; } - return this.protocol + "://" + this.getHost().toLowerCase() + ((defaultPort) ? 
"" : (":" + this.port)) + path; + if (defaultPort) { return this.protocol + "://" + (this.userInfo!=null?this.userInfo+"@":"") + this.getHost().toLowerCase() + path; } + return this.protocol + "://" + (this.userInfo!=null?this.userInfo+"@":"")+ this.getHost().toLowerCase() + ((defaultPort) ? "" : (":" + this.port)) + path; } public boolean equals(URL other) { diff --git a/source/de/anomic/plasma/plasmaCrawlLoader.java b/source/de/anomic/plasma/plasmaCrawlLoader.java index df4887a96..8601cb646 100644 --- a/source/de/anomic/plasma/plasmaCrawlLoader.java +++ b/source/de/anomic/plasma/plasmaCrawlLoader.java @@ -44,6 +44,9 @@ package de.anomic.plasma; +import java.util.Arrays; +import java.util.HashSet; + import org.apache.commons.pool.impl.GenericKeyedObjectPool; import org.apache.commons.pool.impl.GenericObjectPool; @@ -61,6 +64,8 @@ public final class plasmaCrawlLoader extends Thread { private final plasmaHTCache cacheManager; private final serverLog log; + private HashSet supportedProtocols; + private final plasmaCrawlerMsgQueue theQueue; private final plasmaCrawlerPool crawlwerPool; private GenericKeyedObjectPool.Config crawlerPoolConfig = null; @@ -76,6 +81,10 @@ public final class plasmaCrawlLoader extends Thread { this.cacheManager = theCacheManager; this.log = theLog; + // supported protocols + // TODO: change this, e.g. 
by loading settings from file + this.supportedProtocols = new HashSet(Arrays.asList(new String[]{"http","https","ftp"})); + // configuring the crawler messagequeue this.theQueue = new plasmaCrawlerMsgQueue(); @@ -124,11 +133,11 @@ public final class plasmaCrawlLoader extends Thread { public boolean isSupportedProtocol(String protocol) { if ((protocol == null) || (protocol.length() == 0)) return false; - - // TODO: read the supported protocols out from a config file - protocol = protocol.trim().toLowerCase(); - return protocol.equals("http") || - protocol.equals("https"); + return this.supportedProtocols.contains(protocol.trim().toLowerCase()); + } + + public HashSet getSupportedProtocols() { + return (HashSet) this.supportedProtocols.clone(); } public void close() { @@ -155,6 +164,9 @@ public final class plasmaCrawlLoader extends Thread { // getting the protocol of the next URL String protocol = theMsg.url.getProtocol(); + // TODO: remove this + if (protocol.equals("https")) protocol = "http"; + // getting a new crawler from the crawler pool plasmaCrawlWorker theWorker = (plasmaCrawlWorker) this.crawlwerPool.borrowObject(protocol); if (theWorker == null) {