*) Robots.txt parser/ppt

cutting of comments at the line end
*) Adding Threadpool for stackCrawl Thread to speedup robots.txt download
   and double url checks

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@882 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 20 years ago
parent f6024792b1
commit 959eefbc4f

@ -65,15 +65,8 @@ import de.anomic.server.logging.serverLog;
/* /*
* A class for Parsing robots.txt files. * A class for Parsing robots.txt files.
* It only parses the Deny Part, yet. * It only parses the Deny Part, yet.
* TODO: Allow, Do not deny if User-Agent: != yacy * *
*
* http://www.robotstxt.org/wc/norobots-rfc.html * http://www.robotstxt.org/wc/norobots-rfc.html
*
* Use:
* robotsParser rp=new Robotsparser(robotsfile);
* if(rp.isAllowedRobots("/test")){
* System.out.println("/test is allowed");
* }
*/ */
public final class robotsParser{ public final class robotsParser{
@ -115,6 +108,13 @@ public final class robotsParser{
} else if (line.startsWith("#")) { } else if (line.startsWith("#")) {
// we can ignore this. Just a comment line // we can ignore this. Just a comment line
} else if ((!rule4Yacy) && (line.startsWith("User-agent:"))) { } else if ((!rule4Yacy) && (line.startsWith("User-agent:"))) {
// cutting off comments at the line end
pos = line.indexOf("#");
if (pos != -1) {
line = line.substring(0,pos);
}
// getting out the robots name
pos = line.indexOf(" "); pos = line.indexOf(" ");
if (pos != -1) { if (pos != -1) {
String userAgent = line.substring(pos).trim(); String userAgent = line.substring(pos).trim();

@ -109,7 +109,7 @@ public final class httpc {
private static final SimpleDateFormat EMLFormatter = new SimpleDateFormat("dd MMM yyyy HH:mm:ss", Locale.US); private static final SimpleDateFormat EMLFormatter = new SimpleDateFormat("dd MMM yyyy HH:mm:ss", Locale.US);
private static final SimpleDateFormat ShortFormatter = new SimpleDateFormat("yyyyMMddHHmmss"); private static final SimpleDateFormat ShortFormatter = new SimpleDateFormat("yyyyMMddHHmmss");
//Mo 06 Sep 2004 23:32 //Mo 06 Sep 2004 23:32
private static final HashMap reverseMappingCache = new HashMap(); static final HashMap reverseMappingCache = new HashMap();
// the dns cache // the dns cache
private static final HashMap nameCacheHit = new HashMap(); private static final HashMap nameCacheHit = new HashMap();
@ -124,7 +124,7 @@ public final class httpc {
// class variables // class variables
private Socket socket = null; // client socket for commands private Socket socket = null; // client socket for commands
private Thread socketOwner = null; private Thread socketOwner = null;
private String host = null; String host = null;
private long timeout; private long timeout;
private long handle; private long handle;
@ -134,7 +134,7 @@ public final class httpc {
private boolean remoteProxyUse = false; private boolean remoteProxyUse = false;
private String savedRemoteHost = null; private String savedRemoteHost = null;
private String requestPath = null; String requestPath = null;
private boolean allowContentEncoding = true; private boolean allowContentEncoding = true;
static boolean useYacyReferer = true; static boolean useYacyReferer = true;
public static boolean yacyDebugMode = false; public static boolean yacyDebugMode = false;
@ -294,7 +294,7 @@ public final class httpc {
*/ */
public boolean isClosed() { public boolean isClosed() {
if (this.socket == null) return true; if (this.socket == null) return true;
else return (!this.socket.isConnected()) || (this.socket.isClosed()); return (!this.socket.isConnected()) || (this.socket.isClosed());
} }
/** /**
@ -312,9 +312,8 @@ public final class httpc {
if ((ip != null) && (!(ip.equals("127.0.0.1"))) && (!(ip.equals("localhost")))) { if ((ip != null) && (!(ip.equals("127.0.0.1"))) && (!(ip.equals("localhost")))) {
nameCacheHit.put(host, ip); nameCacheHit.put(host, ip);
return ip; return ip;
} else {
return null;
} }
return null;
} catch (UnknownHostException e) { } catch (UnknownHostException e) {
//nameCacheMiss.add(host); //nameCacheMiss.add(host);
} }
@ -335,9 +334,8 @@ public final class httpc {
if ((ip != null) && (!(ip.equals("127.0.0.1"))) && (!(ip.equals("localhost")))) { if ((ip != null) && (!(ip.equals("127.0.0.1"))) && (!(ip.equals("localhost")))) {
nameCacheHit.put(host, ip); nameCacheHit.put(host, ip);
return true; return true;
} else {
return false;
} }
return false;
} catch (UnknownHostException e) { } catch (UnknownHostException e) {
//nameCacheMiss.add(host); //nameCacheMiss.add(host);
return false; return false;
@ -411,7 +409,7 @@ public final class httpc {
* @throws IOException * @throws IOException
*/ */
void init(String server, int port, int timeout, boolean ssl) throws IOException { void init(String server, int port, int timeout, boolean ssl) throws IOException {
handle = System.currentTimeMillis(); this.handle = System.currentTimeMillis();
//serverLog.logDebug("HTTPC", handle + " initialized"); //serverLog.logDebug("HTTPC", handle + " initialized");
this.remoteProxyUse = false; this.remoteProxyUse = false;
this.timeout = timeout; this.timeout = timeout;
@ -483,7 +481,7 @@ public final class httpc {
} }
if (this.socket != null) { if (this.socket != null) {
try {this.socket.close();} catch (Exception e) {} try {this.socket.close();} catch (Exception e) {}
this.unregisterOpenSocket(this.socket,this.socketOwner); httpc.unregisterOpenSocket(this.socket,this.socketOwner);
this.socket = null; this.socket = null;
this.socketOwner = null; this.socketOwner = null;
} }
@ -540,7 +538,7 @@ public final class httpc {
if ((path == null) || (path.length() == 0)) path = "/"; if ((path == null) || (path.length() == 0)) path = "/";
// for debuggug: // for debuggug:
requestPath = path; this.requestPath = path;
// prepare header // prepare header
if (header == null) header = new httpHeader(); if (header == null) header = new httpHeader();
@ -564,7 +562,7 @@ public final class httpc {
// the host is mandatory, if we use HTTP/1.1 // the host is mandatory, if we use HTTP/1.1
if (!(header.containsKey(httpHeader.HOST))) { if (!(header.containsKey(httpHeader.HOST))) {
if (this.remoteProxyUse) if (this.remoteProxyUse)
header.put(httpHeader.HOST, savedRemoteHost); header.put(httpHeader.HOST, this.savedRemoteHost);
else else
header.put(httpHeader.HOST, this.host); header.put(httpHeader.HOST, this.host);
} }
@ -608,7 +606,7 @@ public final class httpc {
// send request // send request
if ((this.remoteProxyUse) && (!(method.equals(httpHeader.METHOD_CONNECT)))) if ((this.remoteProxyUse) && (!(method.equals(httpHeader.METHOD_CONNECT))))
path = "http://" + this.savedRemoteHost + path; path = "http://" + this.savedRemoteHost + path;
serverCore.send(clientOutput, method + " " + path + " HTTP/1.0"); // if set to HTTP/1.1, servers give time-outs? serverCore.send(this.clientOutput, method + " " + path + " HTTP/1.0"); // if set to HTTP/1.1, servers give time-outs?
// send header // send header
//System.out.println("***HEADER for path " + path + ": PROXY TO SERVER = " + header.toString()); // DEBUG //System.out.println("***HEADER for path " + path + ": PROXY TO SERVER = " + header.toString()); // DEBUG
@ -695,18 +693,18 @@ public final class httpc {
len = Integer.parseInt(cl); len = Integer.parseInt(cl);
// transfer len bytes from ins to the server // transfer len bytes from ins to the server
while ((len > 0) && ((c = ins.read(buffer)) >= 0)) { while ((len > 0) && ((c = ins.read(buffer)) >= 0)) {
clientOutput.write(buffer, 0, c); this.clientOutput.write(buffer, 0, c);
len -= c; len -= c;
} }
} else { } else {
len = 0; len = 0;
while ((c = ins.read(buffer)) >= 0) { while ((c = ins.read(buffer)) >= 0) {
clientOutput.write(buffer, 0, c); this.clientOutput.write(buffer, 0, c);
len += c; len += c;
} }
requestHeader.put(httpHeader.CONTENT_LENGTH, Integer.toString(len)); requestHeader.put(httpHeader.CONTENT_LENGTH, Integer.toString(len));
} }
clientOutput.flush(); this.clientOutput.flush();
return new response(false); return new response(false);
} catch (SocketException e) { } catch (SocketException e) {
throw new IOException(e.getMessage()); throw new IOException(e.getMessage());
@ -827,7 +825,7 @@ public final class httpc {
send(httpHeader.METHOD_POST, path, requestHeader, false); send(httpHeader.METHOD_POST, path, requestHeader, false);
// send the body // send the body
//System.out.println("body=" + buf.toString()); //System.out.println("body=" + buf.toString());
serverCore.send(clientOutput, body); serverCore.send(this.clientOutput, body);
return new response(false); return new response(false);
} }
@ -907,9 +905,8 @@ do upload
httpc.response res = con.GET(path, null); httpc.response res = con.GET(path, null);
if (res.status.startsWith("2")) { if (res.status.startsWith("2")) {
return res.writeContent(); return res.writeContent();
} else {
return res.status.getBytes();
} }
return res.status.getBytes();
} catch (Exception e) { } catch (Exception e) {
throw new IOException(e.getMessage()); throw new IOException(e.getMessage());
} finally { } finally {
@ -961,9 +958,8 @@ do upload
//System.out.println("response=" + res.toString()); //System.out.println("response=" + res.toString());
if (res.status.startsWith("2")) { if (res.status.startsWith("2")) {
return res.writeContent(); return res.writeContent();
} else {
return res.status.getBytes();
} }
return res.status.getBytes();
} catch (Exception e) { } catch (Exception e) {
throw new IOException(e.getMessage()); throw new IOException(e.getMessage());
} finally { } finally {
@ -1035,10 +1031,9 @@ do upload
if (res.status.startsWith("2")) { if (res.status.startsWith("2")) {
// success // success
return res.responseHeader; return res.responseHeader;
} else {
// fail
return res.responseHeader;
} }
// fail
return res.responseHeader;
} catch (Exception e) { } catch (Exception e) {
throw new IOException(e.getMessage()); throw new IOException(e.getMessage());
} finally { } finally {
@ -1251,28 +1246,28 @@ do upload
public response(boolean zipped) throws IOException { public response(boolean zipped) throws IOException {
// lets start with worst-case attributes as set-up // lets start with worst-case attributes as set-up
responseHeader = new httpHeader(reverseMappingCache); this.responseHeader = new httpHeader(reverseMappingCache);
statusCode = 503; this.statusCode = 503;
statusText = "internal httpc error"; this.statusText = "internal httpc error";
status = Integer.toString(statusCode) + " " + statusText; this.status = Integer.toString(this.statusCode) + " " + this.statusText;
gzip = false; this.gzip = false;
// check connection status // check connection status
if (clientInput == null) { if (httpc.this.clientInput == null) {
// the server has meanwhile disconnected // the server has meanwhile disconnected
statusCode = 503; this.statusCode = 503;
statusText = "lost connection to server"; this.statusText = "lost connection to server";
status = Integer.toString(statusCode) + " " + statusText; this.status = Integer.toString(this.statusCode) + " " + this.statusText;
return; // in bad mood return; // in bad mood
} }
// reads in the http header, right now, right here // reads in the http header, right now, right here
byte[] b = serverCore.receive(clientInput, readLineBuffer, terminalMaxLength, false); byte[] b = serverCore.receive(httpc.this.clientInput, httpc.this.readLineBuffer, terminalMaxLength, false);
if (b == null) { if (b == null) {
// the server has meanwhile disconnected // the server has meanwhile disconnected
statusCode = 503; this.statusCode = 503;
statusText = "server has closed connection"; this.statusText = "server has closed connection";
status = Integer.toString(statusCode) + " " + statusText; this.status = Integer.toString(this.statusCode) + " " + this.statusText;
return; // in bad mood return; // in bad mood
} }
@ -1286,7 +1281,7 @@ do upload
if ((this.statusCode==500)&&(this.statusText.equals("status line parse error"))) { if ((this.statusCode==500)&&(this.statusText.equals("status line parse error"))) {
// flush in anything that comes without parsing // flush in anything that comes without parsing
while ((b != null) && (b.length != 0)) b = serverCore.receive(clientInput, readLineBuffer, terminalMaxLength, false); while ((b != null) && (b.length != 0)) b = serverCore.receive(httpc.this.clientInput, httpc.this.readLineBuffer, terminalMaxLength, false);
return; // in bad mood return; // in bad mood
} }
@ -1294,13 +1289,13 @@ do upload
if (this.statusCode == 400) { if (this.statusCode == 400) {
// bad request // bad request
// flush in anything that comes without parsing // flush in anything that comes without parsing
while ((b = serverCore.receive(clientInput, readLineBuffer, terminalMaxLength, false)).length != 0) {} while ((b = serverCore.receive(httpc.this.clientInput, httpc.this.readLineBuffer, terminalMaxLength, false)).length != 0) {}
return; // in bad mood return; // in bad mood
} }
// at this point we should have a valid response. read in the header properties // at this point we should have a valid response. read in the header properties
String key = ""; String key = "";
while ((b = serverCore.receive(clientInput, readLineBuffer, terminalMaxLength, false)) != null) { while ((b = serverCore.receive(httpc.this.clientInput, httpc.this.readLineBuffer, terminalMaxLength, false)) != null) {
if (b.length == 0) break; if (b.length == 0) break;
buffer = new String(b); buffer = new String(b);
//System.out.println("#H#" + buffer); // debug //System.out.println("#H#" + buffer); // debug
@ -1308,15 +1303,15 @@ do upload
// use old entry // use old entry
if (key.length() == 0) throw new IOException("header corrupted - input error"); if (key.length() == 0) throw new IOException("header corrupted - input error");
// attach new line // attach new line
if (!(responseHeader.containsKey(key))) throw new IOException("header corrupted - internal error"); if (!(this.responseHeader.containsKey(key))) throw new IOException("header corrupted - internal error");
responseHeader.put(key, (String) responseHeader.get(key) + " " + buffer.trim()); this.responseHeader.put(key, (String) this.responseHeader.get(key) + " " + buffer.trim());
} else { } else {
// create new entry // create new entry
int p = buffer.indexOf(":"); int p = buffer.indexOf(":");
if (p > 0) { if (p > 0) {
responseHeader.add(buffer.substring(0, p).trim(), buffer.substring(p + 1).trim()); this.responseHeader.add(buffer.substring(0, p).trim(), buffer.substring(p + 1).trim());
} else { } else {
serverLog.logSevere("HTTPC", "RESPONSE PARSE ERROR: HOST='" + host + "', PATH='" + requestPath + "', STATUS='" + status + "'"); serverLog.logSevere("HTTPC", "RESPONSE PARSE ERROR: HOST='" + httpc.this.host + "', PATH='" + httpc.this.requestPath + "', STATUS='" + this.status + "'");
serverLog.logSevere("HTTPC", "..............BUFFER: " + buffer); serverLog.logSevere("HTTPC", "..............BUFFER: " + buffer);
} }
} }
@ -1325,11 +1320,11 @@ do upload
// we will now manipulate the header if the content is gzip encoded, because // we will now manipulate the header if the content is gzip encoded, because
// reading the content with "writeContent" will gunzip on-the-fly // reading the content with "writeContent" will gunzip on-the-fly
gzip = ((zipped) && (responseHeader.gzip())); this.gzip = ((zipped) && (this.responseHeader.gzip()));
if (gzip) { if (this.gzip) {
responseHeader.remove("CONTENT-ENCODING"); // we fake that we don't have encoding, since what comes out does not have gzip and we also don't know what was encoded this.responseHeader.remove(httpHeader.CONTENT_ENCODING); // we fake that we don't have encoding, since what comes out does not have gzip and we also don't know what was encoded
responseHeader.remove("CONTENT-LENGTH"); // we cannot use the length during gunzippig yet; still we can hope that it works this.responseHeader.remove(httpHeader.CONTENT_LENGTH); // we cannot use the length during gunzippig yet; still we can hope that it works
} }
} }
@ -1353,7 +1348,7 @@ do upload
* @return True, if the request was successfull. * @return True, if the request was successfull.
*/ */
public boolean success() { public boolean success() {
return ((status.charAt(0) == '2') || (status.charAt(0) == '3')); return ((this.status.charAt(0) == '2') || (this.status.charAt(0) == '3'));
} }
/** /**
@ -1366,7 +1361,7 @@ do upload
public byte[] writeContent() throws IOException { public byte[] writeContent() throws IOException {
int contentLength = (int) this.responseHeader.contentLength(); int contentLength = (int) this.responseHeader.contentLength();
serverByteBuffer sbb = new serverByteBuffer((contentLength==-1)?8192:contentLength); serverByteBuffer sbb = new serverByteBuffer((contentLength==-1)?8192:contentLength);
writeContentX(null, sbb, clientInput); writeContentX(null, sbb, httpc.this.clientInput);
return sbb.getBytes(); return sbb.getBytes();
} }
@ -1381,7 +1376,7 @@ do upload
public byte[] writeContent(OutputStream procOS) throws IOException { public byte[] writeContent(OutputStream procOS) throws IOException {
int contentLength = (int) this.responseHeader.contentLength(); int contentLength = (int) this.responseHeader.contentLength();
serverByteBuffer sbb = new serverByteBuffer((contentLength==-1)?8192:contentLength); serverByteBuffer sbb = new serverByteBuffer((contentLength==-1)?8192:contentLength);
writeContentX(procOS, sbb, clientInput); writeContentX(procOS, sbb, httpc.this.clientInput);
return sbb.getBytes(); return sbb.getBytes();
} }
@ -1399,7 +1394,7 @@ do upload
FileOutputStream bufferOS = null; FileOutputStream bufferOS = null;
try { try {
if (file != null) bufferOS = new FileOutputStream(file); if (file != null) bufferOS = new FileOutputStream(file);
writeContentX(procOS, bufferOS, clientInput); writeContentX(procOS, bufferOS, httpc.this.clientInput);
} finally { } finally {
if (bufferOS != null) { if (bufferOS != null) {
bufferOS.close(); bufferOS.close();
@ -1476,7 +1471,7 @@ do upload
* status of this instance. * status of this instance.
*/ */
public void print() { public void print() {
serverLog.logInfo("HTTPC", "RESPONSE: status=" + status + ", header=" + responseHeader.toString()); serverLog.logInfo("HTTPC", "RESPONSE: status=" + this.status + ", header=" + this.responseHeader.toString());
} }
} }

@ -46,11 +46,10 @@ import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.io.RandomAccessFile; import java.io.RandomAccessFile;
import java.io.FileDescriptor; import java.io.FileDescriptor;
import java.io.SyncFailedException;
import java.util.Map; import java.util.Map;
import java.util.Properties; import java.util.Properties;
public class kelondroFileRA extends kelondroAbstractRA implements kelondroRA { public final class kelondroFileRA extends kelondroAbstractRA implements kelondroRA {
protected RandomAccessFile RAFile; protected RandomAccessFile RAFile;
protected FileDescriptor RADescriptor; protected FileDescriptor RADescriptor;

@ -57,12 +57,11 @@ import java.util.Map;
import de.anomic.kelondro.kelondroDyn; import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroMap; import de.anomic.kelondro.kelondroMap;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroRecords;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
public class plasmaCrawlRobotsTxt { public class plasmaCrawlRobotsTxt {
private kelondroMap robotsTable; private kelondroMap robotsTable;
private File robotsTableFile; private final File robotsTableFile;
private int bufferkb; private int bufferkb;
public plasmaCrawlRobotsTxt(File robotsTableFile, int bufferkb) throws IOException { public plasmaCrawlRobotsTxt(File robotsTableFile, int bufferkb) throws IOException {

@ -112,7 +112,7 @@ public final class plasmaSearch {
wordHash = plasmaWordIndexEntry.word2hash(word); wordHash = plasmaWordIndexEntry.word2hash(word);
entry = new plasmaWordIndexEntry(urlHash, count, p++, 0, 0, entry = new plasmaWordIndexEntry(urlHash, count, p++, 0, 0,
age, quality, language, doctype, true); age, quality, language, doctype, true);
wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry)); this.wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry));
} }
//System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries"); //System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries");
return condenser.getWords().size(); return condenser.getWords().size();

@ -6,12 +6,14 @@ import java.net.InetAddress;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.net.UnknownHostException; import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.Iterator; import java.util.Iterator;
import java.util.LinkedList; import java.util.LinkedList;
import org.apache.commons.pool.impl.GenericObjectPool;
import de.anomic.data.robotsParser; import de.anomic.data.robotsParser;
import de.anomic.http.httpc;
import de.anomic.kelondro.kelondroTree; import de.anomic.kelondro.kelondroTree;
import de.anomic.kelondro.kelondroRecords.Node; import de.anomic.kelondro.kelondroRecords.Node;
import de.anomic.server.serverCodings; import de.anomic.server.serverCodings;
@ -22,8 +24,10 @@ import de.anomic.yacy.yacyCore;
public final class plasmaStackCrawlThread extends Thread { public final class plasmaStackCrawlThread extends Thread {
private final serverLog log = new serverLog("STACKCRAWL"); final WorkerPool theWorkerPool;
private final plasmaSwitchboard sb; final ThreadGroup theWorkerThreadGroup = new ThreadGroup("stackCrawlThreadGroup");
final serverLog log = new serverLog("STACKCRAWL");
final plasmaSwitchboard sb;
private boolean stopped = false; private boolean stopped = false;
private stackCrawlQueue queue; private stackCrawlQueue queue;
@ -34,6 +38,9 @@ public final class plasmaStackCrawlThread extends Thread {
this.queue = new stackCrawlQueue(dbPath,dbCacheSize); this.queue = new stackCrawlQueue(dbPath,dbCacheSize);
this.log.logInfo(this.queue.size() + " entries in the stackCrawl queue."); this.log.logInfo(this.queue.size() + " entries in the stackCrawl queue.");
this.log.logInfo("STACKCRAWL thread initialized."); this.log.logInfo("STACKCRAWL thread initialized.");
this.theWorkerPool = new WorkerPool(new WorkterFactory(this.theWorkerThreadGroup));
} }
public void stopIt() { public void stopIt() {
@ -58,22 +65,11 @@ public final class plasmaStackCrawlThread extends Thread {
// getting a new message from the crawler queue // getting a new message from the crawler queue
stackCrawlMessage theMsg = this.queue.waitForMessage(); stackCrawlMessage theMsg = this.queue.waitForMessage();
// process message // getting a free session thread from the pool
String rejectReason = stackCrawlDequeue(theMsg); Worker worker = (Worker) this.theWorkerPool.borrowObject();
if (rejectReason != null) {
this.sb.urlPool.errorURL.newEntry(
new URL(theMsg.url()),
theMsg.referrerHash(),
theMsg.initiatorHash(),
yacyCore.seedDB.mySeed.hash,
theMsg.name,
rejectReason,
new bitfield(plasmaURL.urlFlagLength),
false
);
}
// processing the new request
worker.execute(theMsg);
} catch (InterruptedException e) { } catch (InterruptedException e) {
Thread.interrupted(); Thread.interrupted();
this.stopped = true; this.stopped = true;
@ -83,6 +79,14 @@ public final class plasmaStackCrawlThread extends Thread {
} }
} }
try {
this.log.logFine("Shutdown. Terminationg worker threads.");
this.theWorkerPool.close();
} catch (Exception e1) {
this.log.logSevere("Unable to shutdown all remaining stackCrawl threads", e1);
}
try { try {
this.log.logFine("Shutdown. Closing stackCrawl queue."); this.log.logFine("Shutdown. Closing stackCrawl queue.");
this.queue.close(); this.queue.close();
@ -141,6 +145,7 @@ public final class plasmaStackCrawlThread extends Thread {
// stacks a crawl item. The position can also be remote // stacks a crawl item. The position can also be remote
// returns null if successful, a reason string if not successful // returns null if successful, a reason string if not successful
long startTime = System.currentTimeMillis();
String reason = null; // failure reason String reason = null; // failure reason
// strange errors // strange errors
@ -163,7 +168,8 @@ public final class plasmaStackCrawlThread extends Thread {
nexturl = new URL(nexturlString); nexturl = new URL(nexturlString);
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
reason = "denied_(url_'" + nexturlString + "'_wrong)"; reason = "denied_(url_'" + nexturlString + "'_wrong)";
this.log.logSevere("Wrong URL in stackCrawl: " + nexturlString); this.log.logSevere("Wrong URL in stackCrawl: " + nexturlString +
". Stack processing time: " + (System.currentTimeMillis()-startTime));
return reason; return reason;
} }
@ -172,16 +178,19 @@ public final class plasmaStackCrawlThread extends Thread {
InetAddress hostAddress = InetAddress.getByName(nexturl.getHost()); InetAddress hostAddress = InetAddress.getByName(nexturl.getHost());
if (hostAddress.isSiteLocalAddress()) { if (hostAddress.isSiteLocalAddress()) {
reason = "denied_(private_ip_address)"; reason = "denied_(private_ip_address)";
this.log.logFine("Host in URL '" + nexturlString + "' has private ip address."); this.log.logFine("Host in URL '" + nexturlString + "' has private ip address." +
"Stack processing time: " + (System.currentTimeMillis()-startTime));
return reason; return reason;
} else if (hostAddress.isLoopbackAddress()) { } else if (hostAddress.isLoopbackAddress()) {
reason = "denied_(loopback_ip_address)"; reason = "denied_(loopback_ip_address)";
this.log.logFine("Host in URL '" + nexturlString + "' has loopback ip address."); this.log.logFine("Host in URL '" + nexturlString + "' has loopback ip address." +
"Stack processing time: " + (System.currentTimeMillis()-startTime));
return reason; return reason;
} }
} catch (UnknownHostException e) { } catch (UnknownHostException e) {
reason = "denied_(unknown_host)"; reason = "denied_(unknown_host)";
this.log.logFine("Unknown host in URL '" + nexturlString + "'."); this.log.logFine("Unknown host in URL '" + nexturlString + "'." +
"Stack processing time: " + (System.currentTimeMillis()-startTime));
return reason; return reason;
} }
@ -189,7 +198,8 @@ public final class plasmaStackCrawlThread extends Thread {
String hostlow = nexturl.getHost().toLowerCase(); String hostlow = nexturl.getHost().toLowerCase();
if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, nexturl.getPath())) { if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, nexturl.getPath())) {
reason = "denied_(url_in_blacklist)"; reason = "denied_(url_in_blacklist)";
this.log.logFine("URL '" + nexturlString + "' is in blacklist."); this.log.logFine("URL '" + nexturlString + "' is in blacklist." +
"Stack processing time: " + (System.currentTimeMillis()-startTime));
return reason; return reason;
} }
@ -199,7 +209,8 @@ public final class plasmaStackCrawlThread extends Thread {
/* /*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/ name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
this.log.logFine("URL '" + nexturlString + "' does not match crawling filter '" + profile.generalFilter() + "'."); this.log.logFine("URL '" + nexturlString + "' does not match crawling filter '" + profile.generalFilter() + "'." +
"Stack processing time: " + (System.currentTimeMillis()-startTime));
return reason; return reason;
} }
@ -209,7 +220,8 @@ public final class plasmaStackCrawlThread extends Thread {
/* /*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/ name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
this.log.logFine("URL '" + nexturlString + "' is cgi URL."); this.log.logFine("URL '" + nexturlString + "' is cgi URL." +
"Stack processing time: " + (System.currentTimeMillis()-startTime));
return reason; return reason;
} }
@ -219,7 +231,8 @@ public final class plasmaStackCrawlThread extends Thread {
/* /*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/ name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
this.log.logFine("URL '" + nexturlString + "' is post URL."); this.log.logFine("URL '" + nexturlString + "' is post URL." +
"Stack processing time: " + (System.currentTimeMillis()-startTime));
return reason; return reason;
} }
@ -231,7 +244,8 @@ public final class plasmaStackCrawlThread extends Thread {
/* /*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/ name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
this.log.logFine("URL '" + nexturlString + "' is double registered in '" + dbocc + "'."); this.log.logFine("URL '" + nexturlString + "' is double registered in '" + dbocc + "'." +
"Stack processing time: " + (System.currentTimeMillis()-startTime));
return reason; return reason;
} }
@ -241,7 +255,8 @@ public final class plasmaStackCrawlThread extends Thread {
/* /*
urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash, urlPool.errorURL.newEntry(nexturl, referrerHash, initiatorHash, yacyCore.seedDB.mySeed.hash,
name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/ name, reason, new bitfield(plasmaURL.urlFlagLength), false);*/
this.log.logFine("Crawling of URL '" + nexturlString + "' disallowed by robots.txt."); this.log.logFine("Crawling of URL '" + nexturlString + "' disallowed by robots.txt." +
"Stack processing time: " + (System.currentTimeMillis()-startTime));
return reason; return reason;
} }
@ -256,7 +271,7 @@ public final class plasmaStackCrawlThread extends Thread {
(yacyCore.seedDB.mySeed.isPrincipal())) /* qualified */; (yacyCore.seedDB.mySeed.isPrincipal())) /* qualified */;
if ((!local)&&(!global)) { if ((!local)&&(!global)) {
this.log.logFine("URL '" + nexturlString + "' can neither be crawled local nor global."); this.log.logSevere("URL '" + nexturlString + "' can neither be crawled local nor global.");
} }
this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */ this.sb.urlPool.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
@ -507,7 +522,7 @@ public final class plasmaStackCrawlThread extends Thread {
byte[][] entryBytes = null; byte[][] entryBytes = null;
stackCrawlMessage newMessage = null; stackCrawlMessage newMessage = null;
synchronized(this.urlEntryHashCache) { synchronized(this.urlEntryHashCache) {
urlHash = (String) this.urlEntryHashCache.removeLast(); urlHash = (String) this.urlEntryHashCache.removeFirst();
entryBytes = this.urlEntryCache.remove(urlHash.getBytes()); entryBytes = this.urlEntryCache.remove(urlHash.getBytes());
} }
@ -518,4 +533,262 @@ public final class plasmaStackCrawlThread extends Thread {
} }
} }
public final class WorkterFactory implements org.apache.commons.pool.PoolableObjectFactory {
final ThreadGroup workerThreadGroup;
public WorkterFactory(ThreadGroup theWorkerThreadGroup) {
super();
if (theWorkerThreadGroup == null)
throw new IllegalArgumentException("The threadgroup object must not be null.");
this.workerThreadGroup = theWorkerThreadGroup;
}
public Object makeObject() {
Worker newWorker = new Worker(this.workerThreadGroup);
newWorker.setPriority(Thread.MAX_PRIORITY);
return newWorker;
}
/**
* @see org.apache.commons.pool.PoolableObjectFactory#destroyObject(java.lang.Object)
*/
public void destroyObject(Object obj) {
if (obj instanceof Worker) {
Worker theWorker = (Worker) obj;
theWorker.setStopped(true);
}
}
/**
* @see org.apache.commons.pool.PoolableObjectFactory#validateObject(java.lang.Object)
*/
public boolean validateObject(Object obj) {
if (obj instanceof Worker)
{
Worker theWorker = (Worker) obj;
if (!theWorker.isAlive() || theWorker.isInterrupted()) return false;
if (theWorker.isRunning()) return true;
return false;
}
return true;
}
/**
* @param obj
*
*/
public void activateObject(Object obj) {
//log.debug(" activateObject...");
}
/**
* @param obj
*
*/
public void passivateObject(Object obj) {
//log.debug(" passivateObject..." + obj);
// if (obj instanceof Session) {
// Session theSession = (Session) obj;
// }
}
}
public final class WorkerPool extends GenericObjectPool {
public boolean isClosed = false;
/**
* First constructor.
* @param objFactory
*/
public WorkerPool(WorkterFactory objFactory) {
super(objFactory);
this.setMaxIdle(10); // Maximum idle threads.
this.setMaxActive(50); // Maximum active threads.
this.setMinEvictableIdleTimeMillis(30000); //Evictor runs every 30 secs.
//this.setMaxWait(1000); // Wait 1 second till a thread is available
}
public WorkerPool(plasmaStackCrawlThread.WorkterFactory objFactory,
GenericObjectPool.Config config) {
super(objFactory, config);
}
public Object borrowObject() throws Exception {
return super.borrowObject();
}
public void returnObject(Object obj) throws Exception {
super.returnObject(obj);
}
public synchronized void close() throws Exception {
/*
* shutdown all still running session threads ...
*/
this.isClosed = true;
/* waiting for all threads to finish */
int threadCount = plasmaStackCrawlThread.this.theWorkerThreadGroup.activeCount();
Thread[] threadList = new Thread[threadCount];
threadCount = plasmaStackCrawlThread.this.theWorkerThreadGroup.enumerate(threadList);
try {
// trying to gracefull stop all still running sessions ...
plasmaStackCrawlThread.this.log.logInfo("Signaling shutdown to " + threadCount + " remaining stackCrawl threads ...");
for ( int currentThreadIdx = 0; currentThreadIdx < threadCount; currentThreadIdx++ ) {
Thread currentThread = threadList[currentThreadIdx];
if (currentThread.isAlive()) {
((Worker)currentThread).setStopped(true);
}
}
// waiting a frew ms for the session objects to continue processing
try { Thread.sleep(500); } catch (InterruptedException ex) {}
// interrupting all still running or pooled threads ...
plasmaStackCrawlThread.this.log.logInfo("Sending interruption signal to " + plasmaStackCrawlThread.this.theWorkerThreadGroup.activeCount() + " remaining stackCrawl threads ...");
plasmaStackCrawlThread.this.theWorkerThreadGroup.interrupt();
// if there are some sessions that are blocking in IO, we simply close the socket
plasmaStackCrawlThread.this.log.logFine("Trying to abort " + plasmaStackCrawlThread.this.theWorkerThreadGroup.activeCount() + " remaining stackCrawl threads ...");
for ( int currentThreadIdx = 0; currentThreadIdx < threadCount; currentThreadIdx++ ) {
Thread currentThread = threadList[currentThreadIdx];
if (currentThread.isAlive()) {
plasmaStackCrawlThread.this.log.logInfo("Trying to shutdown stackCrawl thread '" + currentThread.getName() + "' [" + currentThreadIdx + "].");
((Worker)currentThread).close();
}
}
// we need to use a timeout here because of missing interruptable session threads ...
plasmaStackCrawlThread.this.log.logFine("Waiting for " + plasmaStackCrawlThread.this.theWorkerThreadGroup.activeCount() + " remaining stackCrawl threads to finish shutdown ...");
for ( int currentThreadIdx = 0; currentThreadIdx < threadCount; currentThreadIdx++ ) {
Thread currentThread = threadList[currentThreadIdx];
if (currentThread.isAlive()) {
plasmaStackCrawlThread.this.log.logFine("Waiting for stackCrawl thread '" + currentThread.getName() + "' [" + currentThreadIdx + "] to finish shutdown.");
try { currentThread.join(500); } catch (InterruptedException ex) {}
}
}
plasmaStackCrawlThread.this.log.logInfo("Shutdown of remaining stackCrawl threads finish.");
} catch (Exception e) {
plasmaStackCrawlThread.this.log.logSevere("Unexpected error while trying to shutdown all remaining stackCrawl threads.",e);
}
super.close();
}
}
public final class Worker extends Thread {
private boolean running = false;
private boolean stopped = false;
private boolean done = false;
private stackCrawlMessage theMsg;
public Worker(ThreadGroup theThreadGroup) {
super(theThreadGroup,"stackCrawlThread");
}
public void setStopped(boolean stopped) {
this.stopped = stopped;
}
public void close() {
if (this.isAlive()) {
try {
// trying to close all still open httpc-Sockets first
int closedSockets = httpc.closeOpenSockets(this);
if (closedSockets > 0) {
plasmaStackCrawlThread.this.log.logInfo(closedSockets + " HTTP-client sockets of thread '" + this.getName() + "' closed.");
}
} catch (Exception e) {}
}
}
public synchronized void execute(stackCrawlMessage newMsg) {
this.theMsg = newMsg;
this.done = false;
if (!this.running) {
// this.setDaemon(true);
this.start();
} else {
this.notifyAll();
}
}
public void reset() {
this.done = true;
this.theMsg = null;
}
public boolean isRunning() {
return this.running;
}
public void run() {
this.running = true;
// The thread keeps running.
while (!this.stopped && !Thread.interrupted()) {
if (this.done) {
// We are waiting for a task now.
synchronized (this) {
try {
this.wait(); //Wait until we get a request to process.
} catch (InterruptedException e) {
this.stopped = true;
// log.error("", e);
}
}
} else {
//There is a task....let us execute it.
try {
execute();
} catch (Exception e) {
// log.error("", e);
} finally {
reset();
if (!this.stopped && !this.isInterrupted() && !plasmaStackCrawlThread.this.theWorkerPool.isClosed) {
try {
this.setName("stackCrawlThread_inPool");
plasmaStackCrawlThread.this.theWorkerPool.returnObject(this);
} catch (Exception e1) {
// e1.printStackTrace();
this.stopped = true;
}
}
}
}
}
}
private void execute() throws InterruptedException {
try {
String rejectReason = stackCrawlDequeue(this.theMsg);
if (rejectReason != null) {
plasmaStackCrawlThread.this.sb.urlPool.errorURL.newEntry(
new URL(this.theMsg.url()),
this.theMsg.referrerHash(),
this.theMsg.initiatorHash(),
yacyCore.seedDB.mySeed.hash,
this.theMsg.name,
rejectReason,
new bitfield(plasmaURL.urlFlagLength),
false
);
}
} catch (Exception e) {
e.printStackTrace();
} finally {
this.done = true;
}
}
}
} }

@ -60,8 +60,8 @@ import de.anomic.yacy.yacySeedDB;
public final class plasmaWordIndex { public final class plasmaWordIndex {
final File databaseRoot; private final File databaseRoot;
final plasmaWordIndexCache ramCache; private final plasmaWordIndexCache ramCache;
public plasmaWordIndex(File databaseRoot, int bufferkb, serverLog log) throws IOException { public plasmaWordIndex(File databaseRoot, int bufferkb, serverLog log) throws IOException {
this.databaseRoot = databaseRoot; this.databaseRoot = databaseRoot;

@ -466,7 +466,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
public synchronized int addEntries(plasmaWordIndexEntryContainer container, long updateTime) { public synchronized int addEntries(plasmaWordIndexEntryContainer container, long updateTime) {
// this puts the entries into the cache, not into the assortment directly // this puts the entries into the cache, not into the assortment directly
// check cache space // check cache space
if (cache.size() > 0) try { if (cache.size() > 0) try {
// pause to get space in the cache (while it is flushed) // pause to get space in the cache (while it is flushed)
@ -480,14 +480,14 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
// stop flushing now for one moment // stop flushing now for one moment
flushThread.pause(); flushThread.pause();
//serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem: cache.size=" + cache.size() + "; hashScore.size=" + hashScore.size()); //serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem: cache.size=" + cache.size() + "; hashScore.size=" + hashScore.size());
// put new words into cache // put new words into cache
int added = 0; int added = 0;
synchronized (cache) { String wordHash = container.wordHash();
String wordHash = container.wordHash(); synchronized (cache) {
plasmaWordIndexEntryContainer entries = (plasmaWordIndexEntryContainer) cache.get(wordHash); // null pointer exception? wordhash != null! must be cache==null plasmaWordIndexEntryContainer entries = (plasmaWordIndexEntryContainer) cache.get(wordHash); // null pointer exception? wordhash != null! must be cache==null
if (entries == null) entries = new plasmaWordIndexEntryContainer(wordHash); if (entries == null) entries = new plasmaWordIndexEntryContainer(wordHash);
added = entries.add(container); added = entries.add(container);
if (added > 0) { if (added > 0) {
cache.put(wordHash, entries); cache.put(wordHash, entries);
@ -495,7 +495,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
hashDate.setScore(wordHash, intTime(updateTime)); hashDate.setScore(wordHash, intTime(updateTime));
} }
entries = null; entries = null;
} }
//System.out.println("DEBUG: cache = " + cache.toString()); //System.out.println("DEBUG: cache = " + cache.toString());
flushThread.proceed(); flushThread.proceed();
return added; return added;

@ -792,7 +792,7 @@ public final class plasmaWordIndexDistribution {
} else if (selectionTime < transferTime){ } else if (selectionTime < transferTime){
this.chunkSize +=100; this.chunkSize +=100;
//chunkSize+=50; //chunkSize+=50;
} else if (transferTime >= selectionTime){ } else if (selectionTime >= selectionTime){
if (chunkSize>200) chunkSize-=100; if (chunkSize>200) chunkSize-=100;
} }

@ -475,7 +475,7 @@ ramCacheRobots = 2097152
ramCacheProfiles = 8192 ramCacheProfiles = 8192
# ram cache for stack crawl thread db # ram cache for stack crawl thread db
ramCachePreNURL = 8192 ramCachePreNURL = 4194304
# default memory settings for startup of yacy # default memory settings for startup of yacy
# is only valid in unix/shell environments and # is only valid in unix/shell environments and

Loading…
Cancel
Save