From 93cadb47b97bc8eb9ca0513cfdc8090e185a0e14 Mon Sep 17 00:00:00 2001
From: theli
Date: Tue, 8 Nov 2005 07:41:25 +0000
Subject: [PATCH] *) More tolerant robots parser for robots.txt files that are
 missing empty lines between rule blocks

See: http://www.yacy-forum.de/viewtopic.php?p=12471

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1048 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 source/de/anomic/data/robotsParser.java | 104 +++++++++++++++++-------
 1 file changed, 73 insertions(+), 31 deletions(-)

diff --git a/source/de/anomic/data/robotsParser.java b/source/de/anomic/data/robotsParser.java
index 42e6be1b2..aeaa2ab53 100644
--- a/source/de/anomic/data/robotsParser.java
+++ b/source/de/anomic/data/robotsParser.java
@@ -99,48 +99,62 @@ public final class robotsParser{
         int pos;
         String line = null, lineUpper = null;
-        boolean rule4Yacy = false;
+        boolean rule4Yacy = false, inBlock = false;
+
         while ((line = reader.readLine()) != null) {
             line = line.trim();
             lineUpper = line.toUpperCase();
+
             if (line.length() == 0) {
                 // we have reached the end of the rule block
-                rule4Yacy = false;
+                rule4Yacy = false; inBlock = false;
             } else if (line.startsWith("#")) {
                 // we can ignore this. Just a comment line
-            } else if ((!rule4Yacy) && (lineUpper.startsWith("User-agent:".toUpperCase()))) {
-                // cutting off comments at the line end
-                pos = line.indexOf("#");
-                if (pos != -1) {
-                    line = line.substring(0,pos);
-                }
-
-                // getting out the robots name
-                pos = line.indexOf(" ");
-                if (pos != -1) {
-                    String userAgent = line.substring(pos).trim();
-                    rule4Yacy = (userAgent.equals("*") || (userAgent.toLowerCase().indexOf("yacy") >=0));
-                }
-            } else if (lineUpper.startsWith("Disallow:".toUpperCase()) && rule4Yacy) {
-                // cutting off comments at the line end
-                pos = line.indexOf("#");
-                if (pos != -1) {
-                    line = line.substring(0,pos);
-                }
-
-                pos = line.indexOf(" ");
-                if (pos != -1) {
-                    // getting the path
-                    String path = line.substring(pos).trim();
-
-                    // unencoding all special charsx
-                    path = URLDecoder.decode(path,"UTF-8");
-
-                    // escaping all occurences of ; because this char is used as special char in the Robots DB
-                    path = path.replaceAll(";","%3B");
-
-                    // adding it to the pathlist
-                    deny.add(path);
+            } else if (lineUpper.startsWith("User-agent:".toUpperCase())) {
+
+                if (inBlock) {
+                    // a new rule block starts even though the empty line is missing
+                    inBlock = false;
+                    rule4Yacy = false;
+                }
+
+                if (!rule4Yacy) {
+                    // cutting off comments at the line end
+                    pos = line.indexOf("#");
+                    if (pos != -1) {
+                        line = line.substring(0,pos);
+                    }
+
+                    // getting out the robots name
+                    pos = line.indexOf(" ");
+                    if (pos != -1) {
+                        String userAgent = line.substring(pos).trim();
+                        rule4Yacy = (userAgent.equals("*") || (userAgent.toLowerCase().indexOf("yacy") >=0));
+                    }
+                }
+            } else if (lineUpper.startsWith("Disallow:".toUpperCase())) {
+                inBlock = true;
+
+                if (rule4Yacy) {
+                    // cutting off comments at the line end
+                    pos = line.indexOf("#");
+                    if (pos != -1) {
+                        line = line.substring(0,pos);
+                    }
+
+                    pos = line.indexOf(" ");
+                    if (pos != -1) {
+                        // getting the path
+                        String path = line.substring(pos).trim();
+
+                        // decoding all special chars
+                        path = URLDecoder.decode(path,"UTF-8");
+
+                        // escaping all occurrences of ; because this char is used as special char in the Robots DB
+                        path = path.replaceAll(";","%3B");
+
+                        // adding it to the pathlist
+                        deny.add(path);
+                    }
                 }
             }
         }
@@ -248,7 +262,11 @@ public final class robotsParser{
         downloadStart = System.currentTimeMillis();
         plasmaSwitchboard sb = plasmaSwitchboard.getSwitchboard();
         //TODO: adding Traffic statistic for robots download?
-        if ((sb.remoteProxyConfig == null) || (!sb.remoteProxyConfig.useProxy())) {
+        if (
+            (sb == null) ||
+            (sb.remoteProxyConfig == null) ||
+            (!sb.remoteProxyConfig.useProxy())
+        ) {
             con = httpc.getInstance(robotsURL.getHost(), robotsURL.getPort(), 10000, robotsURL.getProtocol().equalsIgnoreCase("https"));
         } else {
             con = httpc.getInstance(robotsURL.getHost(), robotsURL.getPort(), 10000, robotsURL.getProtocol().equalsIgnoreCase("https"), sb.remoteProxyConfig);
@@ -320,4 +338,28 @@ public final class robotsParser{
         }
         return new Object[]{new Boolean(accessCompletelyRestricted),robotsTxt,eTag,lastMod};
     }
+
+    public static void main(String[] args) {
+        try {
+            robotsParser parser = new robotsParser();
+
+            URL robotsURL = new URL("http://www.bigfoot2002.de.vu/robots.txt");
+            Object[] result = parser.downloadRobotsTxt(robotsURL,5,null);
+
+            if (result != null) {
+                boolean accessCompletelyRestricted = ((Boolean)result[0]).booleanValue();
+                byte[] robotsTxt = (byte[])result[1];
+                String eTag = (String) result[2];
+                Date modDate = (Date) result[3];
+
+                if (!accessCompletelyRestricted) {
+                    ArrayList denyPath = robotsParser.parse(robotsTxt);
+                }
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
 }
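A note on the new (sb == null) guard in downloadRobotsTxt: plasmaSwitchboard.getSwitchboard() presumably returns null when robotsParser runs outside a complete YaCy peer, as in the main() test added above, so the guard lets the method fall back to a direct, proxyless connection instead of failing with a NullPointerException.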
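To illustrate the parser behaviour this patch targets, here is a minimal, self-contained sketch. The class name TolerantRobotsSketch and its simplified parse() are illustrative only and not part of the YaCy code; comment stripping, URL decoding, and the ';' escaping for the Robots DB are omitted for brevity:

// TolerantRobotsSketch.java -- illustrative only, not the YaCy class.
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;

public class TolerantRobotsSketch {

    // Simplified version of the rule-block handling added by this patch:
    // collects the Disallow paths of blocks addressed to "yacy" or "*".
    static ArrayList<String> parse(String robotsTxt) throws IOException {
        ArrayList<String> deny = new ArrayList<String>();
        BufferedReader reader = new BufferedReader(new StringReader(robotsTxt));
        boolean rule4Yacy = false, inBlock = false;
        String line;
        while ((line = reader.readLine()) != null) {
            line = line.trim();
            String lineUpper = line.toUpperCase();
            if (line.length() == 0) {
                // an empty line always ends the current rule block
                rule4Yacy = false; inBlock = false;
            } else if (line.startsWith("#")) {
                // comment line, ignore
            } else if (lineUpper.startsWith("USER-AGENT:")) {
                if (inBlock) {
                    // tolerance fix: a User-agent line directly after Disallow
                    // lines starts a new block although the empty line is missing
                    inBlock = false;
                    rule4Yacy = false;
                }
                if (!rule4Yacy) {
                    String userAgent = line.substring("User-agent:".length()).trim();
                    rule4Yacy = userAgent.equals("*")
                             || userAgent.toLowerCase().indexOf("yacy") >= 0;
                }
            } else if (lineUpper.startsWith("DISALLOW:")) {
                inBlock = true;
                if (rule4Yacy) {
                    String path = line.substring("Disallow:".length()).trim();
                    if (path.length() > 0) deny.add(path);
                }
            }
        }
        return deny;
    }

    public static void main(String[] args) throws IOException {
        // two rule blocks without the separating empty line between them
        String robotsTxt =
            "User-agent: yacy\n" +
            "Disallow: /yacy-only/\n" +     // no empty line before the next block
            "User-agent: googlebot\n" +
            "Disallow: /google-only/\n";
        // prints [/yacy-only/]; the pre-patch logic would also have
        // collected /google-only/ because rule4Yacy was never reset
        System.out.println(parse(robotsTxt));
    }
}

The original code only reset rule4Yacy on an empty line, so in the example above the googlebot rules would also have been applied to YaCy; resetting on a User-agent line that follows Disallow lines avoids that over-blocking.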