From 2399ed817cf0f915fb76ee93efa5b056475721c0 Mon Sep 17 00:00:00 2001 From: theli Date: Thu, 26 Apr 2007 15:42:38 +0000 Subject: [PATCH] *) robots.txt parser now extracts the sitemap-URL (will be used later) *) some javadoc added *) junit testclass for robots.txt parser added git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3602 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Surftips.java | 3 +- source/de/anomic/data/robotsParser.java | 68 ++++++++----------- source/de/anomic/index/indexURLEntry.java | 23 ++++++- .../anomic/plasma/plasmaCrawlRobotsTxt.java | 15 ++-- test/de/anomic/data/robotsParserTest.java | 20 ++++++ 5 files changed, 84 insertions(+), 45 deletions(-) create mode 100644 test/de/anomic/data/robotsParserTest.java diff --git a/htroot/Surftips.java b/htroot/Surftips.java index 474beb576..550ce4739 100644 --- a/htroot/Surftips.java +++ b/htroot/Surftips.java @@ -133,7 +133,8 @@ public class Surftips { refid = row.getColString(3, null); voted = false; try { - voted = (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_VOTE_ADD, "refid", refid) != null) || (yacyCore.newsPool.getSpecific(yacyNewsPool.PUBLISHED_DB, "stippavt", "refid", refid) != null); + voted = (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_VOTE_ADD, "refid", refid) != null) || + (yacyCore.newsPool.getSpecific(yacyNewsPool.PUBLISHED_DB, yacyNewsPool.CATEGORY_SURFTIPP_VOTE_ADD, "refid", refid) != null); } catch (IOException e) { e.printStackTrace(); } diff --git a/source/de/anomic/data/robotsParser.java b/source/de/anomic/data/robotsParser.java index a39f5d907..c41b80fb1 100644 --- a/source/de/anomic/data/robotsParser.java +++ b/source/de/anomic/data/robotsParser.java @@ -80,11 +80,16 @@ import de.anomic.server.logging.serverLog; * See: http://www.kollar.com/robots.html */ public final class robotsParser{ - + public static final int DOWNLOAD_ACCESS_RESTRICTED = 0; + public static final int 
DOWNLOAD_ROBOTS_TXT = 1; + public static final int DOWNLOAD_ETAG = 2; + public static final int DOWNLOAD_MODDATE = 3; + public static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase(); public static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase(); public static final String ROBOTS_ALLOW = "Allow:".toUpperCase(); public static final String ROBOTS_COMMENT = "#"; + public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase(); /*public robotsParser(URL robotsUrl){ }*/ @@ -93,29 +98,29 @@ public final class robotsParser{ * at the Moment it only creates a list of Deny Paths */ - public static ArrayList parse(File robotsFile) throws IOException { + public static Object[] parse(File robotsFile) throws IOException { BufferedReader reader = null; try { reader = new BufferedReader(new FileReader(robotsFile)); return parse(reader); } finally { - if (reader != null) try{reader.close();}catch(Exception e){} + if (reader != null) try{reader.close();}catch(Exception e){/* ignore this */} } } - public static ArrayList parse(byte[] robotsTxt) throws IOException { - if ((robotsTxt == null)||(robotsTxt.length == 0)) return new ArrayList(0); + public static Object[] parse(byte[] robotsTxt) throws IOException { + if ((robotsTxt == null)||(robotsTxt.length == 0)) return new Object[]{new ArrayList(0),null}; ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt); BufferedReader reader = new BufferedReader(new InputStreamReader(bin)); return parse(reader); } - public static ArrayList parse(BufferedReader reader) throws IOException{ + public static Object[] parse(BufferedReader reader) throws IOException{ ArrayList deny4AllAgents = new ArrayList(); ArrayList deny4YaCyAgent = new ArrayList(); int pos; - String line = null, lineUpper = null; + String line = null, lineUpper = null, sitemap = null; boolean isRuleBlock4AllAgents = false, isRuleBlock4YaCyAgent = false, rule4YaCyFound = false, @@ -132,6 +137,11 @@ public final class robotsParser{ // NEW: just 
ignore it } else if (line.startsWith(ROBOTS_COMMENT)) { // we can ignore this. Just a comment line + } else if (lineUpper.startsWith(ROBOTS_SITEMAP)) { /* ROBOTS_SITEMAP is upper-cased, so match against lineUpper; the URL itself is taken from the original-case line */ + pos = line.indexOf(" "); + if (pos != -1) { + sitemap = line.substring(pos).trim(); + } } else if (lineUpper.startsWith(ROBOTS_USER_AGENT)) { if (inBlock) { @@ -200,7 +210,8 @@ public final class robotsParser{ } } - return (rule4YaCyFound)?deny4YaCyAgent:deny4AllAgents; + ArrayList denyList = (rule4YaCyFound)?deny4YaCyAgent:deny4AllAgents; + return new Object[]{denyList,sitemap}; } public static boolean isDisallowed(URL nexturl) { @@ -250,10 +261,10 @@ public final class robotsParser{ result = downloadRobotsTxt(robotsURL,5,robotsTxt4Host); if (result != null) { - accessCompletelyRestricted = ((Boolean)result[0]).booleanValue(); - robotsTxt = (byte[])result[1]; - eTag = (String) result[2]; - modDate = (Date) result[3]; + accessCompletelyRestricted = ((Boolean)result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue(); + robotsTxt = (byte[])result[DOWNLOAD_ROBOTS_TXT]; + eTag = (String) result[DOWNLOAD_ETAG]; + modDate = (Date) result[DOWNLOAD_MODDATE]; } else if (robotsTxt4Host != null) { robotsTxt4Host.setLoadedDate(new Date()); plasmaSwitchboard.robots.addEntry(robotsTxt4Host); @@ -264,20 +275,23 @@ public final class robotsParser{ if ((robotsTxt4Host==null)||((robotsTxt4Host!=null)&&(result!=null))) { ArrayList denyPath = null; + String sitemap = null; if (accessCompletelyRestricted) { denyPath = new ArrayList(); denyPath.add("/"); } else { // parsing the robots.txt Data and converting it into an arraylist try { - denyPath = robotsParser.parse(robotsTxt); + Object[] parserResult = robotsParser.parse(robotsTxt); + denyPath = (ArrayList) parserResult[0]; + sitemap = (String) parserResult[1]; } catch (IOException e) { serverLog.logSevere("ROBOTS","Unable to parse the robots.txt file from URL '" + robotsURL + "'."); } } // storing the data into the robots DB - robotsTxt4Host =
plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag); + robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag,sitemap); } } } @@ -288,7 +302,7 @@ public final class robotsParser{ return false; } - private static Object[] downloadRobotsTxt(URL robotsURL, int redirectionCount, plasmaCrawlRobotsTxt.Entry entry) throws Exception { + static Object[] downloadRobotsTxt(URL robotsURL, int redirectionCount, plasmaCrawlRobotsTxt.Entry entry) throws Exception { if (redirectionCount < 0) return new Object[]{Boolean.FALSE,null,null}; redirectionCount--; @@ -395,28 +409,4 @@ public final class robotsParser{ } return new Object[]{new Boolean(accessCompletelyRestricted),robotsTxt,eTag,lastMod}; } - - public static void main(String[] args) { - try { - //robotsParser parser = new robotsParser(); - - URL robotsURL = new URL("http://www.bigfoot2002.de.vu/robots.txt"); - Object[] result = downloadRobotsTxt(robotsURL,5,null); - - if (result != null) { - boolean accessCompletelyRestricted = ((Boolean)result[0]).booleanValue(); - byte[] robotsTxt = (byte[])result[1]; - //String eTag = (String) result[2]; - //Date modDate = (Date) result[3]; - - if (!accessCompletelyRestricted) { - /*ArrayList denyPath =*/ robotsParser.parse(robotsTxt); - } - - } - }catch(Exception e) { - e.printStackTrace(); - } - } - } diff --git a/source/de/anomic/index/indexURLEntry.java b/source/de/anomic/index/indexURLEntry.java index f579dad1f..505c0e3f0 100644 --- a/source/de/anomic/index/indexURLEntry.java +++ b/source/de/anomic/index/indexURLEntry.java @@ -75,23 +75,44 @@ public class indexURLEntry { kelondroBase64Order.enhancedCoder, 0); + /* =========================================================================== + * Constants to access the various columns of an URL entry + * =========================================================================== */ + /** the url's hash */ private static final int col_hash = 0; + /** components: the 
url, description, author and tags. As 5th element, an ETag is possible */ private static final int col_comp = 1; + /** modification date of the url (presumably the last-modified date reported by the server) */ private static final int col_mod = 2; + /** time when the url was loaded */ private static final int col_load = 3; + /** time until this url is fresh */ private static final int col_fresh = 4; + /** referrer of the url (presumably stored as a url hash) */ private static final int col_referrer = 5; + /** the md5 of the url content (to identify changes) */ private static final int col_md5 = 6; + /** size of file in bytes */ private static final int col_size = 7; + /** size of file by number of words; for video and audio: seconds */ private static final int col_wc = 8; + /** doctype, taken from extension or any other heuristic */ private static final int col_dt = 9; + /** flags; any stuff (see Word-Entity definition) */ private static final int col_flags = 10; + /** language */ private static final int col_lang = 11; + /** number of outlinks to same domain; for video and image: width */ private static final int col_llocal = 12; + /** number of outlinks to outside domain; for video and image: height */ private static final int col_lother = 13; + /** number of embedded image links */ private static final int col_limage = 14; + /** number of embedded audio links; for audio: track number; for video: number of audio tracks */ private static final int col_laudio = 15; + /** number of embedded video links */ private static final int col_lvideo = 16; + /** number of embedded links to applications */ private static final int col_lapp = 17; private kelondroRow.Entry entry; @@ -405,7 +426,7 @@ public class indexURLEntry { } /** - * Returns this object as String.
+ * @return the object as String.
* This e.g. looks like this: *
{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}
*/ diff --git a/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java b/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java index 422a05863..f17fc7fad 100644 --- a/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java +++ b/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java @@ -111,8 +111,8 @@ public class plasmaCrawlRobotsTxt { } } - public Entry addEntry(String hostName, ArrayList disallowPathList, Date loadedDate, Date modDate, String eTag) { - Entry entry = new Entry(hostName,disallowPathList,loadedDate,modDate,eTag); + public Entry addEntry(String hostName, ArrayList disallowPathList, Date loadedDate, Date modDate, String eTag, String sitemap) { + Entry entry = new Entry(hostName,disallowPathList,loadedDate,modDate,eTag,sitemap); addEntry(entry); return entry; } @@ -132,6 +132,7 @@ public class plasmaCrawlRobotsTxt { public static final String LOADED_DATE = "date"; public static final String MOD_DATE = "modDate"; public static final String ETAG = "etag"; + public static final String SITEMAP = "sitemap"; // this is a simple record structure that hold all properties of a single crawl start private Map mem; @@ -161,16 +162,18 @@ public class plasmaCrawlRobotsTxt { ArrayList disallowPathList, Date loadedDate, Date modDate, - String eTag) { + String eTag, + String sitemap) { if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException(); this.hostName = hostName.trim().toLowerCase(); this.disallowPathList = new LinkedList(); - this.mem = new HashMap(); + this.mem = new HashMap(5); if (loadedDate != null) this.mem.put(LOADED_DATE,Long.toString(loadedDate.getTime())); if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime())); if (eTag != null) this.mem.put(ETAG,eTag); + if (sitemap != null) this.mem.put(SITEMAP,sitemap); if ((disallowPathList != null)&&(disallowPathList.size()>0)) { this.disallowPathList.addAll(disallowPathList); @@ -196,6 +199,10 @@ public class plasmaCrawlRobotsTxt { return str.toString(); } + public String 
getSitemap() { + return this.mem.containsKey(SITEMAP)? (String)this.mem.get(SITEMAP): null; /* return the stored sitemap URL (null if none was recorded), not the LOADED_DATE value */ + } + public Date getLoadedDate() { if (this.mem.containsKey(LOADED_DATE)) { return new Date(Long.valueOf((String) this.mem.get(LOADED_DATE)).longValue()); diff --git a/test/de/anomic/data/robotsParserTest.java b/test/de/anomic/data/robotsParserTest.java new file mode 100644 index 000000000..daf2d8c8c --- /dev/null +++ b/test/de/anomic/data/robotsParserTest.java @@ -0,0 +1,20 @@ +package de.anomic.data; + +import junit.framework.TestCase; +import de.anomic.net.URL; + +public class robotsParserTest extends TestCase { + public void testDownload() throws Exception { + URL robotsURL = new URL("http://www.bigfoot2002.de.vu/robots.txt"); + Object[] result = robotsParser.downloadRobotsTxt(robotsURL,5,null); + + if (result != null) { + System.out.println("Access restricted: " + result[robotsParser.DOWNLOAD_ACCESS_RESTRICTED]); + System.out.println("ETag: " + result[robotsParser.DOWNLOAD_ETAG]); + System.out.println("Mod-Date: " + result[robotsParser.DOWNLOAD_MODDATE]); + System.out.println("-------------------------------- Robots.txt START: -------------------------------"); + System.out.println(new String((byte[])result[robotsParser.DOWNLOAD_ROBOTS_TXT])); + System.out.println("-------------------------------- Robots.txt END: ---------------------------------"); + } + } +}