diff --git a/htroot/Surftips.java b/htroot/Surftips.java
index 474beb576..550ce4739 100644
--- a/htroot/Surftips.java
+++ b/htroot/Surftips.java
@@ -133,7 +133,8 @@ public class Surftips {
refid = row.getColString(3, null);
voted = false;
try {
- voted = (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_VOTE_ADD, "refid", refid) != null) || (yacyCore.newsPool.getSpecific(yacyNewsPool.PUBLISHED_DB, "stippavt", "refid", refid) != null);
+ voted = (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_VOTE_ADD, "refid", refid) != null) ||
+ (yacyCore.newsPool.getSpecific(yacyNewsPool.PUBLISHED_DB, yacyNewsPool.CATEGORY_SURFTIPP_VOTE_ADD, "refid", refid) != null);
} catch (IOException e) {
e.printStackTrace();
}
diff --git a/source/de/anomic/data/robotsParser.java b/source/de/anomic/data/robotsParser.java
index a39f5d907..c41b80fb1 100644
--- a/source/de/anomic/data/robotsParser.java
+++ b/source/de/anomic/data/robotsParser.java
@@ -80,11 +80,16 @@ import de.anomic.server.logging.serverLog;
* See: http://www.kollar.com/robots.html
*/
public final class robotsParser{
-
+ public static final int DOWNLOAD_ACCESS_RESTRICTED = 0;
+ public static final int DOWNLOAD_ROBOTS_TXT = 1;
+ public static final int DOWNLOAD_ETAG = 2;
+ public static final int DOWNLOAD_MODDATE = 3;
+
public static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
public static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
public static final String ROBOTS_COMMENT = "#";
+ public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
/*public robotsParser(URL robotsUrl){
}*/
@@ -93,29 +98,29 @@ public final class robotsParser{
* at the Moment it only creates a list of Deny Paths
*/
- public static ArrayList parse(File robotsFile) throws IOException {
+ public static Object[] parse(File robotsFile) throws IOException {
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(robotsFile));
return parse(reader);
} finally {
- if (reader != null) try{reader.close();}catch(Exception e){}
+ if (reader != null) try{reader.close();}catch(Exception e){/* ignore this */}
}
}
- public static ArrayList parse(byte[] robotsTxt) throws IOException {
- if ((robotsTxt == null)||(robotsTxt.length == 0)) return new ArrayList(0);
+ public static Object[] parse(byte[] robotsTxt) throws IOException {
+ if ((robotsTxt == null)||(robotsTxt.length == 0)) return new Object[]{new ArrayList(0),null};
ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt);
BufferedReader reader = new BufferedReader(new InputStreamReader(bin));
return parse(reader);
}
- public static ArrayList parse(BufferedReader reader) throws IOException{
+ public static Object[] parse(BufferedReader reader) throws IOException{
ArrayList deny4AllAgents = new ArrayList();
ArrayList deny4YaCyAgent = new ArrayList();
int pos;
- String line = null, lineUpper = null;
+ String line = null, lineUpper = null, sitemap = null;
boolean isRuleBlock4AllAgents = false,
isRuleBlock4YaCyAgent = false,
rule4YaCyFound = false,
@@ -132,6 +137,11 @@ public final class robotsParser{
// NEW: just ignore it
} else if (line.startsWith(ROBOTS_COMMENT)) {
// we can ignore this. Just a comment line
+ } else if (lineUpper.startsWith(ROBOTS_SITEMAP)) {
+ pos = line.indexOf(" ");
+ if (pos != -1) {
+ sitemap = line.substring(pos).trim();
+ }
} else if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
if (inBlock) {
@@ -200,7 +210,8 @@ public final class robotsParser{
}
}
- return (rule4YaCyFound)?deny4YaCyAgent:deny4AllAgents;
+ ArrayList denyList = (rule4YaCyFound)?deny4YaCyAgent:deny4AllAgents;
+ return new Object[]{denyList,sitemap};
}
public static boolean isDisallowed(URL nexturl) {
@@ -250,10 +261,10 @@ public final class robotsParser{
result = downloadRobotsTxt(robotsURL,5,robotsTxt4Host);
if (result != null) {
- accessCompletelyRestricted = ((Boolean)result[0]).booleanValue();
- robotsTxt = (byte[])result[1];
- eTag = (String) result[2];
- modDate = (Date) result[3];
+ accessCompletelyRestricted = ((Boolean)result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue();
+ robotsTxt = (byte[])result[DOWNLOAD_ROBOTS_TXT];
+ eTag = (String) result[DOWNLOAD_ETAG];
+ modDate = (Date) result[DOWNLOAD_MODDATE];
} else if (robotsTxt4Host != null) {
robotsTxt4Host.setLoadedDate(new Date());
plasmaSwitchboard.robots.addEntry(robotsTxt4Host);
@@ -264,20 +275,23 @@ public final class robotsParser{
if ((robotsTxt4Host==null)||((robotsTxt4Host!=null)&&(result!=null))) {
ArrayList denyPath = null;
+ String sitemap = null;
if (accessCompletelyRestricted) {
denyPath = new ArrayList();
denyPath.add("/");
} else {
// parsing the robots.txt Data and converting it into an arraylist
try {
- denyPath = robotsParser.parse(robotsTxt);
+ Object[] parserResult = robotsParser.parse(robotsTxt);
+ denyPath = (ArrayList) parserResult[0];
+ sitemap = (String) parserResult[1];
} catch (IOException e) {
serverLog.logSevere("ROBOTS","Unable to parse the robots.txt file from URL '" + robotsURL + "'.");
}
}
// storing the data into the robots DB
- robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag);
+ robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag,sitemap);
}
}
}
@@ -288,7 +302,7 @@ public final class robotsParser{
return false;
}
- private static Object[] downloadRobotsTxt(URL robotsURL, int redirectionCount, plasmaCrawlRobotsTxt.Entry entry) throws Exception {
+ static Object[] downloadRobotsTxt(URL robotsURL, int redirectionCount, plasmaCrawlRobotsTxt.Entry entry) throws Exception {
if (redirectionCount < 0) return new Object[]{Boolean.FALSE,null,null};
redirectionCount--;
@@ -395,28 +409,4 @@ public final class robotsParser{
}
return new Object[]{new Boolean(accessCompletelyRestricted),robotsTxt,eTag,lastMod};
}
-
- public static void main(String[] args) {
- try {
- //robotsParser parser = new robotsParser();
-
- URL robotsURL = new URL("http://www.bigfoot2002.de.vu/robots.txt");
- Object[] result = downloadRobotsTxt(robotsURL,5,null);
-
- if (result != null) {
- boolean accessCompletelyRestricted = ((Boolean)result[0]).booleanValue();
- byte[] robotsTxt = (byte[])result[1];
- //String eTag = (String) result[2];
- //Date modDate = (Date) result[3];
-
- if (!accessCompletelyRestricted) {
- /*ArrayList denyPath =*/ robotsParser.parse(robotsTxt);
- }
-
- }
- }catch(Exception e) {
- e.printStackTrace();
- }
- }
-
}
diff --git a/source/de/anomic/index/indexURLEntry.java b/source/de/anomic/index/indexURLEntry.java
index f579dad1f..505c0e3f0 100644
--- a/source/de/anomic/index/indexURLEntry.java
+++ b/source/de/anomic/index/indexURLEntry.java
@@ -75,23 +75,44 @@ public class indexURLEntry {
kelondroBase64Order.enhancedCoder,
0);
+ /* ===========================================================================
+ * Constants to access the various columns of an URL entry
+ * =========================================================================== */
+ /** the url's hash */
private static final int col_hash = 0;
+ /** components: the url, description, author and tags. As 5th element, an ETag is possible */
private static final int col_comp = 1;
+ /** time when the url was last modified */
private static final int col_mod = 2;
+ /** time when the url was loaded */
private static final int col_load = 3;
+ /** time until this url is fresh */
private static final int col_fresh = 4;
+ /** hash of the referrer url */
private static final int col_referrer = 5;
+ /** the md5 of the url content (to identify changes) */
private static final int col_md5 = 6;
+ /** size of file in bytes */
private static final int col_size = 7;
+ /** size of file by number of words; for video and audio: seconds */
private static final int col_wc = 8;
+ /** doctype, taken from extension or any other heuristic */
private static final int col_dt = 9;
+ /** flags; any stuff (see Word-Entity definition) */
private static final int col_flags = 10;
+ /** language */
private static final int col_lang = 11;
+ /** of outlinks to same domain; for video and image: width */
private static final int col_llocal = 12;
+ /** of outlinks to outside domain; for video and image: height */
private static final int col_lother = 13;
+ /** of embedded image links */
private static final int col_limage = 14;
+ /** of embedded audio links; for audio: track number; for video: number of audio tracks */
private static final int col_laudio = 15;
+ /** of embedded video links */
private static final int col_lvideo = 16;
+ /** of embedded links to applications */
private static final int col_lapp = 17;
private kelondroRow.Entry entry;
@@ -405,7 +426,7 @@ public class indexURLEntry {
}
/**
- * Returns this object as String.
+ * @return the object as String.
* This e.g. looks like this:
*
{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}*/
diff --git a/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java b/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java
index 422a05863..f17fc7fad 100644
--- a/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java
+++ b/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java
@@ -111,8 +111,8 @@ public class plasmaCrawlRobotsTxt {
         }
     }
 
-    public Entry addEntry(String hostName, ArrayList disallowPathList, Date loadedDate, Date modDate, String eTag) {
-        Entry entry = new Entry(hostName,disallowPathList,loadedDate,modDate,eTag);
+    public Entry addEntry(String hostName, ArrayList disallowPathList, Date loadedDate, Date modDate, String eTag, String sitemap) {
+        Entry entry = new Entry(hostName,disallowPathList,loadedDate,modDate,eTag,sitemap);
         addEntry(entry);
         return entry;
     }
@@ -132,6 +132,7 @@ public class plasmaCrawlRobotsTxt {
         public static final String LOADED_DATE = "date";
         public static final String MOD_DATE = "modDate";
         public static final String ETAG = "etag";
+        public static final String SITEMAP = "sitemap";
 
         // this is a simple record structure that hold all properties of a single crawl start
         private Map mem;
@@ -161,16 +162,18 @@ public class plasmaCrawlRobotsTxt {
                 ArrayList disallowPathList,
                 Date loadedDate,
                 Date modDate,
-                String eTag) {
+                String eTag,
+                String sitemap) {
 
             if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException();
             this.hostName = hostName.trim().toLowerCase();
             this.disallowPathList = new LinkedList();
 
-            this.mem = new HashMap();
+            this.mem = new HashMap(5);
             if (loadedDate != null) this.mem.put(LOADED_DATE,Long.toString(loadedDate.getTime()));
             if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime()));
             if (eTag != null) this.mem.put(ETAG,eTag);
+            if (sitemap != null) this.mem.put(SITEMAP,sitemap);
 
             if ((disallowPathList != null)&&(disallowPathList.size()>0)) {
                 this.disallowPathList.addAll(disallowPathList);
@@ -196,6 +199,10 @@ public class plasmaCrawlRobotsTxt {
             return str.toString();
         }
 
+        public String getSitemap() {
+            return this.mem.containsKey(SITEMAP)? (String)this.mem.get(SITEMAP): null;
+        }
+
         public Date getLoadedDate() {
             if (this.mem.containsKey(LOADED_DATE)) {
                 return new Date(Long.valueOf((String) this.mem.get(LOADED_DATE)).longValue());
diff --git a/test/de/anomic/data/robotsParserTest.java b/test/de/anomic/data/robotsParserTest.java
new file mode 100644
index 000000000..daf2d8c8c
--- /dev/null
+++ b/test/de/anomic/data/robotsParserTest.java
@@ -0,0 +1,20 @@
+package de.anomic.data;
+
+import junit.framework.TestCase;
+import de.anomic.net.URL;
+
+public class robotsParserTest extends TestCase {
+    public void testDownload() throws Exception {
+        URL robotsURL = new URL("http://www.bigfoot2002.de.vu/robots.txt");
+        Object[] result = robotsParser.downloadRobotsTxt(robotsURL,5,null);
+
+        if (result != null) {
+            System.out.println("Access restricted: " + result[robotsParser.DOWNLOAD_ACCESS_RESTRICTED]);
+            System.out.println("ETag: " + result[robotsParser.DOWNLOAD_ETAG]);
+            System.out.println("Mod-Date: " + result[robotsParser.DOWNLOAD_MODDATE]);
+            System.out.println("-------------------------------- Robots.txt START: -------------------------------");
+            System.out.println(new String((byte[])result[robotsParser.DOWNLOAD_ROBOTS_TXT]));
+            System.out.println("-------------------------------- Robots.txt END: ---------------------------------");
+        }
+    }
+}