diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 6df5cc1eb..033dd2cf3 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -227,7 +227,7 @@ public final class plasmaCrawlStacker { */ URL nexturl = null; if ((initiatorHash == null) || (initiatorHash.length() == 0)) initiatorHash = plasmaURL.dummyHash; - String referrerHash = plasmaURL.urlHash(referrerString); + String referrerHash = (referrerString==null)?null:plasmaURL.urlHash(referrerString); try { nexturl = new URL(nexturlString); } catch (MalformedURLException e) { diff --git a/source/de/anomic/server/serverCore.java b/source/de/anomic/server/serverCore.java index fbfef0e67..a4a8c0c53 100644 --- a/source/de/anomic/server/serverCore.java +++ b/source/de/anomic/server/serverCore.java @@ -79,6 +79,7 @@ import org.apache.commons.pool.impl.GenericObjectPool.Config; import de.anomic.http.httpc; import de.anomic.icap.icapd; import de.anomic.server.logging.serverLog; +import de.anomic.urlRedirector.urlRedirectord; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacySeed; import de.anomic.yacy.yacySeedDB; @@ -1042,12 +1043,15 @@ public final class serverCore extends serverAbstractThread implements serverThre // now we need to initialize the session if (this.commandCounter == 0) { // first we need to determine the proper protocol handler - if (this.request.indexOf("ICAP") >= 0) reqProtocol = "ICAP"; - else reqProtocol = "HTTP"; + if (this.request.indexOf("ICAP") >= 0) reqProtocol = "ICAP"; + else if (this.request.startsWith("REDIRECTOR")) reqProtocol = "REDIRECTOR"; + else reqProtocol = "HTTP"; // next we need to get the proper protocol handler if (reqProtocol.equals("ICAP")) { this.commandObj = new icapd(); + } else if (reqProtocol.equals("REDIRECTOR")) { + this.commandObj = new urlRedirectord(); } else { // if ((this.commandObj != null) && // 
#!/usr/bin/perl -w
#
# URL redirector script for Squid that bundles YaCy and Squid together
# via Squid's redirector support.
# See: http://www.squid-cache.org/Doc/FAQ/FAQ-15.html
#
# The script forwards the URLs it receives from Squid to YaCy, where the
# URLs are used to download and index their content.
#
# Protocol towards Squid: for every request line read from STDIN exactly
# one line is written to STDOUT. This script never rewrites a URL, so that
# line is always blank (either YaCy's reply or, for URLs that are filtered
# out locally, an empty line). Diagnostics go to STDERR only, because Squid
# interprets any non-blank STDOUT line as a rewritten URL.

use strict;
use Socket qw(:DEFAULT :crlf);
use IO::Handle;
use Digest::MD5;

# administrator username + password, YaCy hostname + port
my $user = "admin";
my $pwd  = "";
my $host = "localhost";
my $port = "8080";

# set of lower-cased file extensions (without dot) that must not be sent to YaCy
my %mediaExt;
my @requestData;
$|=1;

# Returns true if the URL looks like a CGI call or carries a session id.
sub isCGI {
    my $url = lc shift;
    return ((rindex $url, ".cgi") != -1) ||
           ((rindex $url, ".exe") != -1) ||
           ((rindex $url, ";jsessionid=") != -1) ||
           ((rindex $url, "sessionid/") != -1) ||
           ((rindex $url, "phpsessid=") != -1);
}

# Returns true if the URL carries a query string ("?" or "&").
sub isPOST {
    my $url = lc shift;
    return ((rindex $url, "?") != -1) ||
           ((rindex $url, "&") != -1);
}

# Returns true if the text after the last "." of the URL is a known media extension.
sub isMediaExt {
    my $url = lc shift;
    my $pos = rindex $url, ".";
    if ($pos != -1) {
        my $ext = substr($url, $pos+1);
        return exists($mediaExt{$ext});
    }
    return 0;
}

# Answers a request that is not forwarded to YaCy: the reason goes to STDERR,
# Squid gets the blank "leave the URL unchanged" line on STDOUT.
sub skipRequest {
    my ($reason, $line) = @_;
    print STDERR $reason.": ".$line."\n";
    print STDOUT CRLF;
}

my ($bytes_out,$bytes_in) = (0,0);
my ($msg_in,$msg_out);

my $protocol = getprotobyname('tcp');
# keep $host intact so the error message can still name it
my $addr = inet_aton($host) or die "$host: unknown host";

socket(SOCK, AF_INET, SOCK_STREAM, $protocol) or die "socket() failed: $!";
my $dest_addr = sockaddr_in($port,$addr);
connect(SOCK,$dest_addr) or die "connect() failed: $!";

# enabling autoflush
SOCK->autoflush(1);

# sending the REDIRECTOR command to YaCy to enable the proper
# command handler
print SOCK "REDIRECTOR".CRLF;

# Doing authentication
my $ctx = Digest::MD5->new;
$ctx->add($user.":".$pwd);
my $md5Pwd = $ctx->hexdigest;

print SOCK "USER ".$user.CRLF;
print SOCK "PWD ".$md5Pwd.CRLF;

# Getting the list of file extensions that should be ignored
print SOCK "MEDIAEXT".CRLF;
$msg_in = <SOCK>;
defined($msg_in) or die "connection closed by YaCy while reading the media extension list";
$msg_in = lc $msg_in;
# strip the line terminator, otherwise it would stick to the last extension
$msg_in =~ s/\s+\z//;
# build a set: every extension is a key (a plain list assignment would turn
# every second extension into a value instead)
%mediaExt = map { $_ => 1 } grep { length } split(/,\s*/, $msg_in);

# 1) Read URLs from STDIN
# 2) Send them to YaCy
# 3) Receive the response from YaCy
# 4) Print the response to STDOUT
while (defined($msg_out = <>)) {
    chomp $msg_out;

    # splitting the request into its various parts
    #
    # One squid redirector request line typically looks like this:
    # http://www.pageresource.com/styles/tuts.css 192.168.0.5/- - GET
    @requestData = split(/\s+/, $msg_out);

    # an empty request line carries no URL at all
    if (!defined($requestData[0]) || $requestData[0] eq "") {
        skipRequest("Empty request", $msg_out);
        next;
    }

    # testing if the URL is CGI
    if (isCGI($requestData[0])) {
        skipRequest("URL is cgi", $msg_out);
        next;
    }

    # testing if the URL is a POST request
    if (isPOST($requestData[0])) {
        skipRequest("URL is post", $msg_out);
        next;
    }

    # testing if the requested content is a media content
    if (isMediaExt($requestData[0])) {
        skipRequest("URL has media extension", $msg_out);
        next;
    }

    # sending the whole request line to YaCy
    $msg_out .= CRLF;
    print SOCK $msg_out;

    # reading the response
    if (defined($msg_in = <SOCK>)) {
        print STDOUT $msg_in;
    } else {
        # YaCy closed the connection
        close SOCK;
        exit(1);
    }

    $bytes_out += length($msg_out);
    $bytes_in  += length($msg_in);
}
print SOCK "EXIT".CRLF;

close SOCK;
print STDERR "bytes_sent = $bytes_out, bytes_received = $bytes_in\n";
java.net.URL; +import java.util.Date; + +import de.anomic.data.userDB; +import de.anomic.plasma.plasmaCrawlProfile; +import de.anomic.plasma.plasmaParser; +import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.server.serverCore; +import de.anomic.server.serverHandler; +import de.anomic.server.logging.serverLog; +import de.anomic.server.serverCore.Session; +import de.anomic.yacy.yacyCore; + +public class urlRedirectord implements serverHandler { + + private serverCore.Session session; + private static plasmaSwitchboard switchboard = null; + private serverLog theLogger = new serverLog("URL-REDIRECTOR"); + private static plasmaCrawlProfile.entry profile = null; + + public urlRedirectord() { + if (switchboard == null) { + switchboard = plasmaSwitchboard.getSwitchboard(); + } + + if (profile == null) { + try { + profile = switchboard.profiles.newEntry( + // name + "URL Redirector", + // start URL + "", + // crawling filter + ".*", + ".*", + // depth + 0, + 0, + // crawlDynamic + false, + // storeHTCache + false, + // storeTxCache + true, + //localIndexing + true, + // remoteIndexing + false, + // xsstopw + true, + // xdstopw + true, + // xpstopw + true + ); + } catch (IOException e) { + this.theLogger.logSevere("Unable to create a crawling profile for the URL-Redirector",e); + } + } + } + + public void initSession(Session theSession){ + // getting current session + this.session = theSession; + } + + public String greeting() { + return null; + } + + public String error(Throwable e) { + return null; + } + + public Object clone() { + return null; + } + + public void reset() { + this.session = null; + } + + public Boolean EMPTY(String arg) throws IOException { + return null; + } + + public Boolean UNKNOWN(String requestLine) throws IOException { + return null; + } + + public Boolean REDIRECTOR(String requestLine) throws IOException { + try { + + boolean authenticated = false; + String userName = null; + String md5Pwd = null; + + // setting timeout + 
this.session.controlSocket.setSoTimeout(0); + + String line = null; + BufferedReader inputReader = new BufferedReader(new InputStreamReader(this.session.in)); + PrintWriter outputWriter = new PrintWriter(this.session.out); + + while ((line = inputReader.readLine()) != null) { + if (line.equals("EXIT")) { + break; + } else if (line.startsWith("#")) { + continue; + } else if (line.startsWith("USER")) { + userName = line.substring(line.indexOf(" ")).trim(); + } else if (line.startsWith("PWD")) { + if (userName != null) { + userDB.Entry userEntry = switchboard.userDB.getEntry(userName); + if (userEntry != null) { + md5Pwd = line.substring(line.indexOf(" ")).trim(); + if (userEntry.getMD5EncodedUserPwd().equals(md5Pwd)) { + authenticated = true; + } + } + } + } else if (line.startsWith("MEDIAEXT")) { + String transferIgnoreList = plasmaParser.getMediaExtList(); + transferIgnoreList = transferIgnoreList.substring(1,transferIgnoreList.length()-1); + + outputWriter.print(transferIgnoreList); + outputWriter.print("\r\n"); + outputWriter.flush(); + } else { + if (!authenticated) { + return Boolean.FALSE; + } + + int pos = line.indexOf(" "); + String nextURL = (pos != -1) ? line.substring(0,pos):line; + + this.theLogger.logFine("Receiving request " + line); + outputWriter.print("\r\n"); + outputWriter.flush(); + + String reasonString = null; + try { + if (plasmaParser.supportedFileExt(new URL(nextURL))) { + // enqueuing URL for crawling + reasonString = switchboard.sbStackCrawlThread.stackCrawl( + nextURL, + null, + yacyCore.seedDB.mySeed.hash, + "URL Redirector", + new Date(), + 0, + profile + ); + } else { + reasonString = "Unsupporte file extension"; + } + } catch (MalformedURLException badUrlEx) { + reasonString = "Malformed URL"; + } + + if (reasonString != null) { + this.theLogger.logFine("URL " + nextURL + " rejected. 
Reason: " + reasonString); + } + } + } + + this.theLogger.logFine("Connection terminated"); + + // Terminating connection + return serverCore.TERMINATE_CONNECTION; + } catch (Exception e) { + this.theLogger.logSevere("Unexpected Error: " + e.getMessage(),e); + return serverCore.TERMINATE_CONNECTION; + } + } + + + +}