- with this script it's possible to pass URLs from squid to yacy via the squid redirector interface - this URLs are then used by YaCy to feed the crawler git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1141 6c8d7289-2bf4-0310-a012-ef5d649a1542pull/1/head
parent
bdf30117c1
commit
b35c5a48bf
@ -0,0 +1,131 @@
|
||||
#!/usr/bin/perl -w
|
||||
#
|
||||
# This is a URL Redirector Script for squid that can be
|
||||
# used to bundle YaCy and Squid together via the squid
|
||||
# redirector support.
|
||||
# See: http://www.squid-cache.org/Doc/FAQ/FAQ-15.html
|
||||
#
|
||||
# This script forwards URLs from squid to YaCy where the
|
||||
# URLs are used to download and index the content of the URLs.
|
||||
|
||||
use strict;
|
||||
use Socket qw(:DEFAULT :crlf);
|
||||
use IO::Handle;
|
||||
use Digest::MD5;
|
||||
|
||||
# Account used to authenticate against YaCy (user name + password) and
# the address (host name + port) the script connects to.
my ($user, $pwd)  = ("admin", "");
my ($host, $port) = ("localhost", "8080");

# %mediaExt    - file extensions looked up by isMediaExt()
# @requestData - whitespace separated fields of the current request line
my (%mediaExt, @requestData);

# Turn off output buffering on the currently selected handle
# (STDOUT unless another handle has been select()ed).
$| = 1;
|
||||
|
||||
# Returns a true value if the given URL (compared case-insensitively)
# contains one of the markers this script treats as a sign of a CGI or
# session bound request, and a false value otherwise.
sub isCGI {
    my $address = lc shift;

    for my $marker (".cgi", ".exe", ";jsessionid=", "sessionid/", "phpsessid=") {
        return 1 if index($address, $marker) != -1;
    }

    # same "false" value a failed comparison yields
    return !1;
}
|
||||
|
||||
# Returns a true value if the given URL contains a "?" or a "&"
# character, and a false value otherwise.
sub isPOST {
    my $address = lc shift;

    # tr/// without a replacement list only counts the listed characters
    return ($address =~ tr/?&//) > 0;
}
|
||||
|
||||
# Returns a true value if the text following the last "." of the given
# URL (lower-cased) is a key of %mediaExt. Returns 0 if the URL contains
# no "." at all.
sub isMediaExt {
    my $address = lc shift;

    my $dot_at = rindex($address, ".");
    return 0 if $dot_at == -1;

    # everything behind the last dot
    my $suffix = substr($address, $dot_at + 1);
    return exists $mediaExt{$suffix};
}
|
||||
|
||||
# Main part of the redirector:
#   * connects to YaCy and switches the connection to the REDIRECTOR
#     command handler,
#   * authenticates with the configured user name / password,
#   * fetches the list of media file extensions that are not forwarded,
#   * then answers one line on STDOUT for every request line read from
#     squid on STDIN.

# Bytes exchanged with YaCy for forwarded requests; reported on STDERR
# when the input ends.
my ($bytes_out, $bytes_in) = (0, 0);
my ($msg_in, $msg_out);

my $protocol = getprotobyname('tcp');

# The packed address is kept in its own variable so that the error
# message can still show the host name when the lookup fails.
my $packed_addr = inet_aton($host) or die "$host: unknown host";

socket(my $sock, AF_INET, SOCK_STREAM, $protocol) or die "socket() failed: $!";
my $dest_addr = sockaddr_in($port, $packed_addr);
connect($sock, $dest_addr) or die "connect() failed: $!";

# enabling autoflush
$sock->autoflush(1);

# sending the REDIRECTOR command to yacy to enable the proper
# command handler
print {$sock} "REDIRECTOR" . CRLF;

# Doing authentication: the password is transferred as the hex encoded
# MD5 digest of "user:password"
my $ctx = Digest::MD5->new;
$ctx->add($user . ":" . $pwd);
my $md5Pwd = $ctx->hexdigest;

print {$sock} "USER " . $user . CRLF;
print {$sock} "PWD " . $md5Pwd . CRLF;

# Getting a list of file extensions that should be ignored
print {$sock} "MEDIAEXT" . CRLF;
$msg_in = <$sock>;
defined $msg_in
    or die "connection closed by YaCy before the media extension list was received";

# The reply is a single comma separated line. The line terminator has to
# be removed before splitting, and every extension has to become a hash
# KEY: assigning the bare list to the hash would pair the elements up as
# key/value and lose every second extension.
$msg_in = lc $msg_in;
$msg_in =~ s/\A\s+//;
$msg_in =~ s/\s+\z//;
%mediaExt = map { $_ => 1 } grep { length } split(/\s*,\s*/, $msg_in);

# 1) Reading URLs from stdIn
# 2) Send it to Yacy
# 3) Receive response from YaCy
# 4) Print response to StdOut
while (defined($msg_out = <>)) {
    chomp $msg_out;

    # splitting request into its various parts
    #
    # One squid redirector request line typically looks like this:
    # http://www.pageresource.com/styles/tuts.css 192.168.0.5/- - GET
    @requestData = split(/\s+/, $msg_out);

    # URLs that are CGI / session bound, carry parameters or point to
    # media content are not forwarded to YaCy
    my $skip_reason =
          isCGI($requestData[0])      ? "URL is cgi"
        : isPOST($requestData[0])     ? "URL is post"
        : isMediaExt($requestData[0]) ? "URL has media extension"
        :                               undef;

    if (defined $skip_reason) {
        # squid interprets whatever is written to STDOUT as the rewritten
        # URL, so the diagnostic goes to STDERR and squid receives an
        # empty line (= leave the URL unchanged)
        print STDERR $skip_reason . ": " . $msg_out . "\n";
        print STDOUT $CRLF;
        next;
    }

    # sending the whole request line to YaCy
    $msg_out .= CRLF;
    print {$sock} $msg_out;

    # reading the response
    $msg_in = <$sock>;
    if (!defined $msg_in) {
        print STDERR "connection to YaCy lost while waiting for a response\n";
        close $sock;
        exit(1);
    }
    print STDOUT $msg_in;

    $bytes_out += length($msg_out);
    $bytes_in  += length($msg_in);
}

# telling YaCy that no more requests will follow
print {$sock} "EXIT" . CRLF;

close $sock;
print STDERR "bytes_sent = $bytes_out, bytes_received = $bytes_in\n";
|
||||
|
@ -0,0 +1,186 @@
|
||||
package de.anomic.urlRedirector;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.PrintWriter;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Date;
|
||||
|
||||
import de.anomic.data.userDB;
|
||||
import de.anomic.plasma.plasmaCrawlProfile;
|
||||
import de.anomic.plasma.plasmaParser;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.server.serverCore;
|
||||
import de.anomic.server.serverHandler;
|
||||
import de.anomic.server.logging.serverLog;
|
||||
import de.anomic.server.serverCore.Session;
|
||||
import de.anomic.yacy.yacyCore;
|
||||
|
||||
public class urlRedirectord implements serverHandler {
|
||||
|
||||
private serverCore.Session session;
|
||||
private static plasmaSwitchboard switchboard = null;
|
||||
private serverLog theLogger = new serverLog("URL-REDIRECTOR");
|
||||
private static plasmaCrawlProfile.entry profile = null;
|
||||
|
||||
public urlRedirectord() {
|
||||
if (switchboard == null) {
|
||||
switchboard = plasmaSwitchboard.getSwitchboard();
|
||||
}
|
||||
|
||||
if (profile == null) {
|
||||
try {
|
||||
profile = switchboard.profiles.newEntry(
|
||||
// name
|
||||
"URL Redirector",
|
||||
// start URL
|
||||
"",
|
||||
// crawling filter
|
||||
".*",
|
||||
".*",
|
||||
// depth
|
||||
0,
|
||||
0,
|
||||
// crawlDynamic
|
||||
false,
|
||||
// storeHTCache
|
||||
false,
|
||||
// storeTxCache
|
||||
true,
|
||||
//localIndexing
|
||||
true,
|
||||
// remoteIndexing
|
||||
false,
|
||||
// xsstopw
|
||||
true,
|
||||
// xdstopw
|
||||
true,
|
||||
// xpstopw
|
||||
true
|
||||
);
|
||||
} catch (IOException e) {
|
||||
this.theLogger.logSevere("Unable to create a crawling profile for the URL-Redirector",e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void initSession(Session theSession){
|
||||
// getting current session
|
||||
this.session = theSession;
|
||||
}
|
||||
|
||||
public String greeting() {
|
||||
return null;
|
||||
}
|
||||
|
||||
public String error(Throwable e) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public Object clone() {
|
||||
return null;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
this.session = null;
|
||||
}
|
||||
|
||||
public Boolean EMPTY(String arg) throws IOException {
|
||||
return null;
|
||||
}
|
||||
|
||||
public Boolean UNKNOWN(String requestLine) throws IOException {
|
||||
return null;
|
||||
}
|
||||
|
||||
public Boolean REDIRECTOR(String requestLine) throws IOException {
|
||||
try {
|
||||
|
||||
boolean authenticated = false;
|
||||
String userName = null;
|
||||
String md5Pwd = null;
|
||||
|
||||
// setting timeout
|
||||
this.session.controlSocket.setSoTimeout(0);
|
||||
|
||||
String line = null;
|
||||
BufferedReader inputReader = new BufferedReader(new InputStreamReader(this.session.in));
|
||||
PrintWriter outputWriter = new PrintWriter(this.session.out);
|
||||
|
||||
while ((line = inputReader.readLine()) != null) {
|
||||
if (line.equals("EXIT")) {
|
||||
break;
|
||||
} else if (line.startsWith("#")) {
|
||||
continue;
|
||||
} else if (line.startsWith("USER")) {
|
||||
userName = line.substring(line.indexOf(" ")).trim();
|
||||
} else if (line.startsWith("PWD")) {
|
||||
if (userName != null) {
|
||||
userDB.Entry userEntry = switchboard.userDB.getEntry(userName);
|
||||
if (userEntry != null) {
|
||||
md5Pwd = line.substring(line.indexOf(" ")).trim();
|
||||
if (userEntry.getMD5EncodedUserPwd().equals(md5Pwd)) {
|
||||
authenticated = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (line.startsWith("MEDIAEXT")) {
|
||||
String transferIgnoreList = plasmaParser.getMediaExtList();
|
||||
transferIgnoreList = transferIgnoreList.substring(1,transferIgnoreList.length()-1);
|
||||
|
||||
outputWriter.print(transferIgnoreList);
|
||||
outputWriter.print("\r\n");
|
||||
outputWriter.flush();
|
||||
} else {
|
||||
if (!authenticated) {
|
||||
return Boolean.FALSE;
|
||||
}
|
||||
|
||||
int pos = line.indexOf(" ");
|
||||
String nextURL = (pos != -1) ? line.substring(0,pos):line;
|
||||
|
||||
this.theLogger.logFine("Receiving request " + line);
|
||||
outputWriter.print("\r\n");
|
||||
outputWriter.flush();
|
||||
|
||||
String reasonString = null;
|
||||
try {
|
||||
if (plasmaParser.supportedFileExt(new URL(nextURL))) {
|
||||
// enqueuing URL for crawling
|
||||
reasonString = switchboard.sbStackCrawlThread.stackCrawl(
|
||||
nextURL,
|
||||
null,
|
||||
yacyCore.seedDB.mySeed.hash,
|
||||
"URL Redirector",
|
||||
new Date(),
|
||||
0,
|
||||
profile
|
||||
);
|
||||
} else {
|
||||
reasonString = "Unsupporte file extension";
|
||||
}
|
||||
} catch (MalformedURLException badUrlEx) {
|
||||
reasonString = "Malformed URL";
|
||||
}
|
||||
|
||||
if (reasonString != null) {
|
||||
this.theLogger.logFine("URL " + nextURL + " rejected. Reason: " + reasonString);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
this.theLogger.logFine("Connection terminated");
|
||||
|
||||
// Terminating connection
|
||||
return serverCore.TERMINATE_CONNECTION;
|
||||
} catch (Exception e) {
|
||||
this.theLogger.logSevere("Unexpected Error: " + e.getMessage(),e);
|
||||
return serverCore.TERMINATE_CONNECTION;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
Loading…
Reference in new issue