*) robots.txt parser now extracts the sitemap-URL (will be used later)

*) some javadoc added
*) JUnit test class for the robots.txt parser added

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3602 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 18 years ago
parent fa012789b2
commit 2399ed817c

@ -133,7 +133,8 @@ public class Surftips {
refid = row.getColString(3, null);
voted = false;
try {
voted = (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_VOTE_ADD, "refid", refid) != null) || (yacyCore.newsPool.getSpecific(yacyNewsPool.PUBLISHED_DB, "stippavt", "refid", refid) != null);
voted = (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_VOTE_ADD, "refid", refid) != null) ||
(yacyCore.newsPool.getSpecific(yacyNewsPool.PUBLISHED_DB, yacyNewsPool.CATEGORY_SURFTIPP_VOTE_ADD, "refid", refid) != null);
} catch (IOException e) {
e.printStackTrace();
}

@ -80,11 +80,16 @@ import de.anomic.server.logging.serverLog;
* See: http://www.kollar.com/robots.html
*/
public final class robotsParser{
public static final int DOWNLOAD_ACCESS_RESTRICTED = 0;
public static final int DOWNLOAD_ROBOTS_TXT = 1;
public static final int DOWNLOAD_ETAG = 2;
public static final int DOWNLOAD_MODDATE = 3;
public static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
public static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
public static final String ROBOTS_COMMENT = "#";
public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
/*public robotsParser(URL robotsUrl){
}*/
@ -93,29 +98,29 @@ public final class robotsParser{
* at the Moment it only creates a list of Deny Paths
*/
public static ArrayList parse(File robotsFile) throws IOException {
public static Object[] parse(File robotsFile) throws IOException {
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(robotsFile));
return parse(reader);
} finally {
if (reader != null) try{reader.close();}catch(Exception e){}
if (reader != null) try{reader.close();}catch(Exception e){/* ignore this */}
}
}
public static ArrayList parse(byte[] robotsTxt) throws IOException {
if ((robotsTxt == null)||(robotsTxt.length == 0)) return new ArrayList(0);
public static Object[] parse(byte[] robotsTxt) throws IOException {
if ((robotsTxt == null)||(robotsTxt.length == 0)) return new Object[]{new ArrayList(0),null};
ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt);
BufferedReader reader = new BufferedReader(new InputStreamReader(bin));
return parse(reader);
}
public static ArrayList parse(BufferedReader reader) throws IOException{
public static Object[] parse(BufferedReader reader) throws IOException{
ArrayList deny4AllAgents = new ArrayList();
ArrayList deny4YaCyAgent = new ArrayList();
int pos;
String line = null, lineUpper = null;
String line = null, lineUpper = null, sitemap = null;
boolean isRuleBlock4AllAgents = false,
isRuleBlock4YaCyAgent = false,
rule4YaCyFound = false,
@ -132,6 +137,11 @@ public final class robotsParser{
// NEW: just ignore it
} else if (line.startsWith(ROBOTS_COMMENT)) {
// we can ignore this. Just a comment line
} else if (line.startsWith(ROBOTS_SITEMAP)) {
pos = line.indexOf(" ");
if (pos != -1) {
sitemap = line.substring(pos).trim();
}
} else if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
if (inBlock) {
@ -200,7 +210,8 @@ public final class robotsParser{
}
}
return (rule4YaCyFound)?deny4YaCyAgent:deny4AllAgents;
ArrayList denyList = (rule4YaCyFound)?deny4YaCyAgent:deny4AllAgents;
return new Object[]{denyList,sitemap};
}
public static boolean isDisallowed(URL nexturl) {
@ -250,10 +261,10 @@ public final class robotsParser{
result = downloadRobotsTxt(robotsURL,5,robotsTxt4Host);
if (result != null) {
accessCompletelyRestricted = ((Boolean)result[0]).booleanValue();
robotsTxt = (byte[])result[1];
eTag = (String) result[2];
modDate = (Date) result[3];
accessCompletelyRestricted = ((Boolean)result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue();
robotsTxt = (byte[])result[DOWNLOAD_ROBOTS_TXT];
eTag = (String) result[DOWNLOAD_ETAG];
modDate = (Date) result[DOWNLOAD_MODDATE];
} else if (robotsTxt4Host != null) {
robotsTxt4Host.setLoadedDate(new Date());
plasmaSwitchboard.robots.addEntry(robotsTxt4Host);
@ -264,20 +275,23 @@ public final class robotsParser{
if ((robotsTxt4Host==null)||((robotsTxt4Host!=null)&&(result!=null))) {
ArrayList denyPath = null;
String sitemap = null;
if (accessCompletelyRestricted) {
denyPath = new ArrayList();
denyPath.add("/");
} else {
// parsing the robots.txt Data and converting it into an arraylist
try {
denyPath = robotsParser.parse(robotsTxt);
Object[] parserResult = robotsParser.parse(robotsTxt);
denyPath = (ArrayList) parserResult[0];
sitemap = (String) parserResult[1];
} catch (IOException e) {
serverLog.logSevere("ROBOTS","Unable to parse the robots.txt file from URL '" + robotsURL + "'.");
}
}
// storing the data into the robots DB
robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag);
robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag,sitemap);
}
}
}
@ -288,7 +302,7 @@ public final class robotsParser{
return false;
}
private static Object[] downloadRobotsTxt(URL robotsURL, int redirectionCount, plasmaCrawlRobotsTxt.Entry entry) throws Exception {
static Object[] downloadRobotsTxt(URL robotsURL, int redirectionCount, plasmaCrawlRobotsTxt.Entry entry) throws Exception {
if (redirectionCount < 0) return new Object[]{Boolean.FALSE,null,null};
redirectionCount--;
@ -395,28 +409,4 @@ public final class robotsParser{
}
return new Object[]{new Boolean(accessCompletelyRestricted),robotsTxt,eTag,lastMod};
}
/**
 * Ad-hoc manual test entry point: downloads and parses a live robots.txt.
 * NOTE(review): performs network I/O against a hard-coded external URL;
 * superseded by the robotsParserTest JUnit class added in this commit.
 *
 * @param args unused
 */
public static void main(String[] args) {
try {
//robotsParser parser = new robotsParser();
URL robotsURL = new URL("http://www.bigfoot2002.de.vu/robots.txt");
// result layout matches the DOWNLOAD_* index constants of this class
Object[] result = downloadRobotsTxt(robotsURL,5,null);
if (result != null) {
boolean accessCompletelyRestricted = ((Boolean)result[0]).booleanValue();
byte[] robotsTxt = (byte[])result[1];
//String eTag = (String) result[2];
//Date modDate = (Date) result[3];
if (!accessCompletelyRestricted) {
// parse for side effects only; deny list is discarded here
/*ArrayList denyPath =*/ robotsParser.parse(robotsTxt);
}
}
}catch(Exception e) {
// manual test tool: just dump the failure to stderr
e.printStackTrace();
}
}
}

@ -75,23 +75,44 @@ public class indexURLEntry {
kelondroBase64Order.enhancedCoder,
0);
/* ===========================================================================
* Constants to access the various columns of an URL entry
* =========================================================================== */
/** the url's hash */
private static final int col_hash = 0;
/** components: the url, description, author and tags. As 5th element, an ETag is possible */
private static final int col_comp = 1;
/** date of last modification of the resource (comment was copy-pasted from col_comp — fixed) */
private static final int col_mod = 2;
/** time when the url was loaded */
private static final int col_load = 3;
/** time until this url is fresh */
private static final int col_fresh = 4;
/** hash of the referrer url (comment was copy-pasted from col_load — fixed) */
private static final int col_referrer = 5;
/** the md5 of the url content (to identify changes) */
private static final int col_md5 = 6;
/** size of file in bytes */
private static final int col_size = 7;
/** size of file by number of words; for video and audio: seconds */
private static final int col_wc = 8;
/** doctype, taken from extension or any other heuristic */
private static final int col_dt = 9;
/** flags; any stuff (see Word-Entity definition) */
private static final int col_flags = 10;
/** language */
private static final int col_lang = 11;
/** of outlinks to same domain; for video and image: width */
private static final int col_llocal = 12;
/** of outlinks to outside domain; for video and image: height */
private static final int col_lother = 13;
/** of embedded image links */
private static final int col_limage = 14;
/** of embedded audio links; for audio: track number; for video: number of audio tracks */
private static final int col_laudio = 15;
/** of embedded video links */
private static final int col_lvideo = 16;
/** of embedded links to applications */
private static final int col_lapp = 17;
private kelondroRow.Entry entry;
@ -405,7 +426,7 @@ public class indexURLEntry {
}
/**
* Returns this object as String.<br>
* @return the object as String.<br>
* This e.g. looks like this:
* <pre>{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}</pre>
*/

@ -111,8 +111,8 @@ public class plasmaCrawlRobotsTxt {
}
}
public Entry addEntry(String hostName, ArrayList disallowPathList, Date loadedDate, Date modDate, String eTag) {
Entry entry = new Entry(hostName,disallowPathList,loadedDate,modDate,eTag);
/**
 * Creates a robots.txt entry for the given host and stores it in the robots DB.
 *
 * @param hostName host (and port) the robots.txt belongs to
 * @param disallowPathList paths denied for crawling; may be null or empty
 * @param loadedDate time the robots.txt was fetched; may be null
 * @param modDate last-modified date reported by the server; may be null
 * @param eTag ETag reported by the server; may be null
 * @param sitemap sitemap URL extracted from the robots.txt; may be null
 * @return the stored Entry
 */
public Entry addEntry(String hostName, ArrayList disallowPathList, Date loadedDate, Date modDate, String eTag, String sitemap) {
Entry entry = new Entry(hostName,disallowPathList,loadedDate,modDate,eTag,sitemap);
addEntry(entry);
return entry;
}
@ -132,6 +132,7 @@ public class plasmaCrawlRobotsTxt {
public static final String LOADED_DATE = "date";
public static final String MOD_DATE = "modDate";
public static final String ETAG = "etag";
public static final String SITEMAP = "sitemap";
// this is a simple record structure that hold all properties of a single crawl start
private Map mem;
@ -161,16 +162,18 @@ public class plasmaCrawlRobotsTxt {
ArrayList disallowPathList,
Date loadedDate,
Date modDate,
String eTag) {
String eTag,
String sitemap) {
if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException();
this.hostName = hostName.trim().toLowerCase();
this.disallowPathList = new LinkedList();
this.mem = new HashMap();
this.mem = new HashMap(5);
if (loadedDate != null) this.mem.put(LOADED_DATE,Long.toString(loadedDate.getTime()));
if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime()));
if (eTag != null) this.mem.put(ETAG,eTag);
if (sitemap != null) this.mem.put(SITEMAP,sitemap);
if ((disallowPathList != null)&&(disallowPathList.size()>0)) {
this.disallowPathList.addAll(disallowPathList);
@ -196,6 +199,10 @@ public class plasmaCrawlRobotsTxt {
return str.toString();
}
/**
 * Returns the sitemap URL announced in the robots.txt of this host, if any.
 *
 * @return the sitemap URL as a String, or null if no Sitemap directive
 *         was present in the robots.txt
 */
public String getSitemap() {
    // Bug fix: this previously returned this.mem.get(LOADED_DATE) — the load
    // timestamp string — instead of the stored sitemap URL.
    return this.mem.containsKey(SITEMAP) ? (String) this.mem.get(SITEMAP) : null;
}
public Date getLoadedDate() {
if (this.mem.containsKey(LOADED_DATE)) {
return new Date(Long.valueOf((String) this.mem.get(LOADED_DATE)).longValue());

@ -0,0 +1,20 @@
package de.anomic.data;
import junit.framework.TestCase;
import de.anomic.net.URL;
/**
 * JUnit (3.x style) test for the robots.txt parser.
 * NOTE(review): this test downloads a live robots.txt from a hard-coded
 * external URL, so it depends on network availability and on that site's
 * current content — it prints results rather than asserting them.
 */
public class robotsParserTest extends TestCase {
public void testDownload() throws Exception {
URL robotsURL = new URL("http://www.bigfoot2002.de.vu/robots.txt");
// result layout is defined by the robotsParser.DOWNLOAD_* index constants
Object[] result = robotsParser.downloadRobotsTxt(robotsURL,5,null);
if (result != null) {
System.out.println("Access restricted: " + result[robotsParser.DOWNLOAD_ACCESS_RESTRICTED]);
System.out.println("ETag: " + result[robotsParser.DOWNLOAD_ETAG]);
System.out.println("Mod-Date: " + result[robotsParser.DOWNLOAD_MODDATE]);
System.out.println("-------------------------------- Robots.txt START: -------------------------------");
System.out.println(new String((byte[])result[robotsParser.DOWNLOAD_ROBOTS_TXT]));
System.out.println("-------------------------------- Robots.txt END: ---------------------------------");
}
}
}
Loading…
Cancel
Save