*) robots.txt parser now extracts the sitemap-URL (will be used later)

*) some javadoc added
*) JUnit test class for the robots.txt parser added

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3602 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 18 years ago
parent fa012789b2
commit 2399ed817c

@ -133,7 +133,8 @@ public class Surftips {
refid = row.getColString(3, null);
voted = false;
try {
voted = (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_VOTE_ADD, "refid", refid) != null) || (yacyCore.newsPool.getSpecific(yacyNewsPool.PUBLISHED_DB, "stippavt", "refid", refid) != null);
voted = (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_VOTE_ADD, "refid", refid) != null) ||
(yacyCore.newsPool.getSpecific(yacyNewsPool.PUBLISHED_DB, yacyNewsPool.CATEGORY_SURFTIPP_VOTE_ADD, "refid", refid) != null);
} catch (IOException e) {
e.printStackTrace();
}

@ -80,11 +80,16 @@ import de.anomic.server.logging.serverLog;
* See: http://www.kollar.com/robots.html
*/
public final class robotsParser{
public static final int DOWNLOAD_ACCESS_RESTRICTED = 0;
public static final int DOWNLOAD_ROBOTS_TXT = 1;
public static final int DOWNLOAD_ETAG = 2;
public static final int DOWNLOAD_MODDATE = 3;
public static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
public static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
public static final String ROBOTS_COMMENT = "#";
public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
/*public robotsParser(URL robotsUrl){
}*/
@ -93,29 +98,29 @@ public final class robotsParser{
* at the Moment it only creates a list of Deny Paths
*/
public static ArrayList parse(File robotsFile) throws IOException {
public static Object[] parse(File robotsFile) throws IOException {
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(robotsFile));
return parse(reader);
} finally {
if (reader != null) try{reader.close();}catch(Exception e){}
if (reader != null) try{reader.close();}catch(Exception e){/* ignore this */}
}
}
public static ArrayList parse(byte[] robotsTxt) throws IOException {
if ((robotsTxt == null)||(robotsTxt.length == 0)) return new ArrayList(0);
public static Object[] parse(byte[] robotsTxt) throws IOException {
if ((robotsTxt == null)||(robotsTxt.length == 0)) return new Object[]{new ArrayList(0),null};
ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt);
BufferedReader reader = new BufferedReader(new InputStreamReader(bin));
return parse(reader);
}
public static ArrayList parse(BufferedReader reader) throws IOException{
public static Object[] parse(BufferedReader reader) throws IOException{
ArrayList deny4AllAgents = new ArrayList();
ArrayList deny4YaCyAgent = new ArrayList();
int pos;
String line = null, lineUpper = null;
String line = null, lineUpper = null, sitemap = null;
boolean isRuleBlock4AllAgents = false,
isRuleBlock4YaCyAgent = false,
rule4YaCyFound = false,
@ -132,6 +137,11 @@ public final class robotsParser{
// NEW: just ignore it
} else if (line.startsWith(ROBOTS_COMMENT)) {
// we can ignore this. Just a comment line
} else if (line.startsWith(ROBOTS_SITEMAP)) {
pos = line.indexOf(" ");
if (pos != -1) {
sitemap = line.substring(pos).trim();
}
} else if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
if (inBlock) {
@ -200,7 +210,8 @@ public final class robotsParser{
}
}
return (rule4YaCyFound)?deny4YaCyAgent:deny4AllAgents;
ArrayList denyList = (rule4YaCyFound)?deny4YaCyAgent:deny4AllAgents;
return new Object[]{denyList,sitemap};
}
public static boolean isDisallowed(URL nexturl) {
@ -250,10 +261,10 @@ public final class robotsParser{
result = downloadRobotsTxt(robotsURL,5,robotsTxt4Host);
if (result != null) {
accessCompletelyRestricted = ((Boolean)result[0]).booleanValue();
robotsTxt = (byte[])result[1];
eTag = (String) result[2];
modDate = (Date) result[3];
accessCompletelyRestricted = ((Boolean)result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue();
robotsTxt = (byte[])result[DOWNLOAD_ROBOTS_TXT];
eTag = (String) result[DOWNLOAD_ETAG];
modDate = (Date) result[DOWNLOAD_MODDATE];
} else if (robotsTxt4Host != null) {
robotsTxt4Host.setLoadedDate(new Date());
plasmaSwitchboard.robots.addEntry(robotsTxt4Host);
@ -264,20 +275,23 @@ public final class robotsParser{
if ((robotsTxt4Host==null)||((robotsTxt4Host!=null)&&(result!=null))) {
ArrayList denyPath = null;
String sitemap = null;
if (accessCompletelyRestricted) {
denyPath = new ArrayList();
denyPath.add("/");
} else {
// parsing the robots.txt Data and converting it into an arraylist
try {
denyPath = robotsParser.parse(robotsTxt);
Object[] parserResult = robotsParser.parse(robotsTxt);
denyPath = (ArrayList) parserResult[0];
sitemap = (String) parserResult[1];
} catch (IOException e) {
serverLog.logSevere("ROBOTS","Unable to parse the robots.txt file from URL '" + robotsURL + "'.");
}
}
// storing the data into the robots DB
robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag);
robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag,sitemap);
}
}
}
@ -288,7 +302,7 @@ public final class robotsParser{
return false;
}
private static Object[] downloadRobotsTxt(URL robotsURL, int redirectionCount, plasmaCrawlRobotsTxt.Entry entry) throws Exception {
static Object[] downloadRobotsTxt(URL robotsURL, int redirectionCount, plasmaCrawlRobotsTxt.Entry entry) throws Exception {
if (redirectionCount < 0) return new Object[]{Boolean.FALSE,null,null};
redirectionCount--;
@ -395,28 +409,4 @@ public final class robotsParser{
}
return new Object[]{new Boolean(accessCompletelyRestricted),robotsTxt,eTag,lastMod};
}
/**
 * Ad-hoc manual test entry point: downloads and parses a live robots.txt.
 * NOTE(review): performs network I/O against a hard-coded external URL;
 * superseded by the robotsParserTest JUnit class added in this commit.
 *
 * @param args unused
 */
public static void main(String[] args) {
try {
//robotsParser parser = new robotsParser();
URL robotsURL = new URL("http://www.bigfoot2002.de.vu/robots.txt");
// result layout matches the DOWNLOAD_* index constants of this class
Object[] result = downloadRobotsTxt(robotsURL,5,null);
if (result != null) {
boolean accessCompletelyRestricted = ((Boolean)result[0]).booleanValue();
byte[] robotsTxt = (byte[])result[1];
//String eTag = (String) result[2];
//Date modDate = (Date) result[3];
if (!accessCompletelyRestricted) {
// parse for side effects only; deny list is discarded here
/*ArrayList denyPath =*/ robotsParser.parse(robotsTxt);
}
}
}catch(Exception e) {
// manual test tool: just dump the failure to stderr
e.printStackTrace();
}
}
}

@ -75,23 +75,44 @@ public class indexURLEntry {
kelondroBase64Order.enhancedCoder,
0);
/* ===========================================================================
* Constants to access the various columns of an URL entry
* =========================================================================== */
/** the url's hash */
private static final int col_hash = 0;
/** components: the url, description, author and tags. As 5th element, an ETag is possible */
private static final int col_comp = 1;
/** date of last modification of the resource (comment was copy-pasted from col_comp — fixed) */
private static final int col_mod = 2;
/** time when the url was loaded */
private static final int col_load = 3;
/** time until this url is fresh */
private static final int col_fresh = 4;
/** hash of the referrer url (comment was copy-pasted from col_load — fixed) */
private static final int col_referrer = 5;
/** the md5 of the url content (to identify changes) */
private static final int col_md5 = 6;
/** size of file in bytes */
private static final int col_size = 7;
/** size of file by number of words; for video and audio: seconds */
private static final int col_wc = 8;
/** doctype, taken from extension or any other heuristic */
private static final int col_dt = 9;
/** flags; any stuff (see Word-Entity definition) */
private static final int col_flags = 10;
/** language */
private static final int col_lang = 11;
/** of outlinks to same domain; for video and image: width */
private static final int col_llocal = 12;
/** of outlinks to outside domain; for video and image: height */
private static final int col_lother = 13;
/** of embedded image links */
private static final int col_limage = 14;
/** of embedded audio links; for audio: track number; for video: number of audio tracks */
private static final int col_laudio = 15;
/** of embedded video links */
private static final int col_lvideo = 16;
/** of embedded links to applications */
private static final int col_lapp = 17;
private kelondroRow.Entry entry;
@ -405,7 +426,7 @@ public class indexURLEntry {
}
/**
* Returns this object as String.<br>
* @return the object as String.<br>
* This e.g. looks like this:
* <pre>{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}</pre>
*/

@ -111,8 +111,8 @@ public class plasmaCrawlRobotsTxt {
}
}
public Entry addEntry(String hostName, ArrayList disallowPathList, Date loadedDate, Date modDate, String eTag) {
Entry entry = new Entry(hostName,disallowPathList,loadedDate,modDate,eTag);
/**
 * Creates a robots.txt entry for the given host and stores it in the robots DB.
 *
 * @param hostName host (and port) the robots.txt belongs to
 * @param disallowPathList paths denied for crawling; may be null or empty
 * @param loadedDate time the robots.txt was fetched; may be null
 * @param modDate last-modified date reported by the server; may be null
 * @param eTag ETag reported by the server; may be null
 * @param sitemap sitemap URL extracted from the robots.txt; may be null
 * @return the stored Entry
 */
public Entry addEntry(String hostName, ArrayList disallowPathList, Date loadedDate, Date modDate, String eTag, String sitemap) {
Entry entry = new Entry(hostName,disallowPathList,loadedDate,modDate,eTag,sitemap);
addEntry(entry);
return entry;
}
@ -132,6 +132,7 @@ public class plasmaCrawlRobotsTxt {
public static final String LOADED_DATE = "date";
public static final String MOD_DATE = "modDate";
public static final String ETAG = "etag";
public static final String SITEMAP = "sitemap";
// this is a simple record structure that hold all properties of a single crawl start
private Map mem;
@ -161,16 +162,18 @@ public class plasmaCrawlRobotsTxt {
ArrayList disallowPathList,
Date loadedDate,
Date modDate,
String eTag) {
String eTag,
String sitemap) {
if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException();
this.hostName = hostName.trim().toLowerCase();
this.disallowPathList = new LinkedList();
this.mem = new HashMap();
this.mem = new HashMap(5);
if (loadedDate != null) this.mem.put(LOADED_DATE,Long.toString(loadedDate.getTime()));
if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime()));
if (eTag != null) this.mem.put(ETAG,eTag);
if (sitemap != null) this.mem.put(SITEMAP,sitemap);
if ((disallowPathList != null)&&(disallowPathList.size()>0)) {
this.disallowPathList.addAll(disallowPathList);
@ -196,6 +199,10 @@ public class plasmaCrawlRobotsTxt {
return str.toString();
}
/**
 * Returns the sitemap URL announced in the robots.txt of this host, if any.
 *
 * @return the sitemap URL as a String, or null if no Sitemap directive
 *         was present in the robots.txt
 */
public String getSitemap() {
    // Bug fix: this previously returned this.mem.get(LOADED_DATE) — the load
    // timestamp string — instead of the stored sitemap URL.
    return this.mem.containsKey(SITEMAP) ? (String) this.mem.get(SITEMAP) : null;
}
public Date getLoadedDate() {
if (this.mem.containsKey(LOADED_DATE)) {
return new Date(Long.valueOf((String) this.mem.get(LOADED_DATE)).longValue());

@ -0,0 +1,20 @@
package de.anomic.data;
import junit.framework.TestCase;
import de.anomic.net.URL;
/**
 * JUnit (3.x style) test for the robots.txt parser.
 * NOTE(review): this test downloads a live robots.txt from a hard-coded
 * external URL, so it depends on network availability and on that site's
 * current content — it prints results rather than asserting them.
 */
public class robotsParserTest extends TestCase {
public void testDownload() throws Exception {
URL robotsURL = new URL("http://www.bigfoot2002.de.vu/robots.txt");
// result layout is defined by the robotsParser.DOWNLOAD_* index constants
Object[] result = robotsParser.downloadRobotsTxt(robotsURL,5,null);
if (result != null) {
System.out.println("Access restricted: " + result[robotsParser.DOWNLOAD_ACCESS_RESTRICTED]);
System.out.println("ETag: " + result[robotsParser.DOWNLOAD_ETAG]);
System.out.println("Mod-Date: " + result[robotsParser.DOWNLOAD_MODDATE]);
System.out.println("-------------------------------- Robots.txt START: -------------------------------");
System.out.println(new String((byte[])result[robotsParser.DOWNLOAD_ROBOTS_TXT]));
System.out.println("-------------------------------- Robots.txt END: ---------------------------------");
}
}
}
Loading…
Cancel
Save