*) next step of restructuring for new crawlers

- IndexCreate_p.java: correcting problems with ftp urls
   - URL.java does not cut out the userinfo anymore 
    (needed to transport authentication info in ftp urls, e.g. ftp://username:pwd@ftp.irgendwas.de)
   - plasmaCrawlLoader.java: 
   -- hack to re-enable HTTPS URLs
   -- adding function getSupportedProtocols

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2482 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent 6cce47e217
commit 5847492537

@ -140,7 +140,8 @@ public class IndexCreate_p {
crawlingStart = crawlingStart.trim();
// adding the prefix http:// if necessary
if (!(crawlingStart.startsWith("http"))) crawlingStart = "http://" + crawlingStart;
int pos = crawlingStart.indexOf("://");
if (pos == -1) crawlingStart = "http://" + crawlingStart;
// normalizing URL
try {crawlingStart = new URL(crawlingStart).toNormalform();} catch (MalformedURLException e1) {}

@ -278,8 +278,8 @@ public class URL {
matcher.reset(path);
}
if (defaultPort) { return this.protocol + "://" + this.getHost().toLowerCase() + path; }
return this.protocol + "://" + this.getHost().toLowerCase() + ((defaultPort) ? "" : (":" + this.port)) + path;
if (defaultPort) { return this.protocol + "://" + (this.userInfo!=null?this.userInfo+"@":"") + this.getHost().toLowerCase() + path; }
return this.protocol + "://" + (this.userInfo!=null?this.userInfo+"@":"")+ this.getHost().toLowerCase() + ((defaultPort) ? "" : (":" + this.port)) + path;
}
public boolean equals(URL other) {

@ -44,6 +44,9 @@
package de.anomic.plasma;
import java.util.Arrays;
import java.util.HashSet;
import org.apache.commons.pool.impl.GenericKeyedObjectPool;
import org.apache.commons.pool.impl.GenericObjectPool;
@ -61,6 +64,8 @@ public final class plasmaCrawlLoader extends Thread {
private final plasmaHTCache cacheManager;
private final serverLog log;
private HashSet supportedProtocols;
private final plasmaCrawlerMsgQueue theQueue;
private final plasmaCrawlerPool crawlwerPool;
private GenericKeyedObjectPool.Config crawlerPoolConfig = null;
@ -76,6 +81,10 @@ public final class plasmaCrawlLoader extends Thread {
this.cacheManager = theCacheManager;
this.log = theLog;
// supported protocols
// TODO: change this, e.g. by loading settings from file
this.supportedProtocols = new HashSet(Arrays.asList(new String[]{"http","https","ftp"}));
// configuring the crawler messagequeue
this.theQueue = new plasmaCrawlerMsgQueue();
@ -124,11 +133,11 @@ public final class plasmaCrawlLoader extends Thread {
/**
 * Checks whether this crawl loader can handle URLs of the given protocol.
 * The set of supported protocols is initialized in the constructor
 * (currently http, https, ftp — see the TODO there about loading it from config).
 *
 * @param protocol the URL scheme to test, e.g. "http"; may be null
 * @return true if the (trimmed, lower-cased) protocol is in the supported set
 */
public boolean isSupportedProtocol(String protocol) {
    if ((protocol == null) || (protocol.length() == 0)) return false;
    // NOTE(review): the diff rendering had merged the old hard-coded
    // http/https check with this line, leaving unreachable code; the
    // set-based lookup is the intended post-commit behavior.
    return this.supportedProtocols.contains(protocol.trim().toLowerCase());
}
/**
 * Returns a snapshot of the protocols this loader supports.
 * A defensive shallow copy is handed out so callers cannot
 * mutate the loader's internal protocol set.
 *
 * @return a new HashSet containing the supported protocol names
 */
public HashSet getSupportedProtocols() {
    // copy constructor yields the same shallow copy clone() would
    return new HashSet(this.supportedProtocols);
}
public void close() {
@ -155,6 +164,9 @@ public final class plasmaCrawlLoader extends Thread {
// getting the protocol of the next URL
String protocol = theMsg.url.getProtocol();
// TODO: remove this
if (protocol.equals("https")) protocol = "http";
// getting a new crawler from the crawler pool
plasmaCrawlWorker theWorker = (plasmaCrawlWorker) this.crawlwerPool.borrowObject(protocol);
if (theWorker == null) {

Loading…
Cancel
Save