@ -11,16 +11,16 @@
//Revision: $LastChangedRevision$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
@ -51,15 +51,15 @@ public class RobotsTxt {
// Class-wide logger, created for RobotsTxt.class.
private static Logger log = Logger . getLogger ( RobotsTxt . class ) ;
// Separator string ";" and a Pattern precompiled from it, so the regex is compiled only once.
// NOTE(review): presumably used to join/split path lists stored in the robots table - confirm against the entry class.
// The declarations appear twice (public and protected variants) because this text is a diff fragment.
p ublic static final String ROBOTS_DB_PATH_SEPARATOR = ";" ;
p ublic static final Pattern ROBOTS_DB_PATH_SEPARATOR_MATCHER = Pattern . compile ( ROBOTS_DB_PATH_SEPARATOR ) ;
p rotected static final String ROBOTS_DB_PATH_SEPARATOR = ";" ;
p rotected static final Pattern ROBOTS_DB_PATH_SEPARATOR_MATCHER = Pattern . compile ( ROBOTS_DB_PATH_SEPARATOR ) ;
// Backing table of robots records; read via get(encodedKey(hostPort)) and written via insert(...) elsewhere in this class.
BEncodedHeap robotsTable ;
// Map from a String key to a DomSync instance.
// NOTE(review): presumably one synchronization object per host - the usage is not visible here, confirm.
private final ConcurrentHashMap < String , DomSync > syncObjects ;
//private static final HashSet<String> loadedRobots = new HashSet<String>(); // only for debugging
// Empty nested class with no fields or methods; instances are the values of the syncObjects map.
// NOTE(review): presumably serves purely as a per-key lock/monitor object - confirm against the code that uses syncObjects.
private static class DomSync {
// No-op constructor; shown twice (public and private variants) because this text is a diff fragment.
p ublic DomSync ( ) { }
p rivate DomSync ( ) { }
}
public RobotsTxt ( final BEncodedHeap robotsTable ) {
@ -78,16 +78,16 @@ public class RobotsTxt {
return this . robotsTable . size ( ) ;
}
/**
 * Returns the robots entry for the given URL by delegating to the
 * three-argument overload with fetchOnlineIfNotAvailableOrNotFresh set to true.
 * The signature appears twice (old and new return type) because this text is a diff fragment.
 *
 * @param theURL the URL to look up; must not be null
 * @param thisAgents agent names, passed through unchanged to the overload
 * @return null if the URL's protocol does not start with "http"; otherwise the overload's result
 * @throws IllegalArgumentException if theURL is null
 * @throws IOException declared here; presumably raised by the overload - confirm
 */
public Robots Entry getEntry ( final MultiProtocolURI theURL , final Set < String > thisAgents ) throws IOException {
public Robots Txt Entry getEntry ( final MultiProtocolURI theURL , final Set < String > thisAgents ) throws IOException {
// reject a missing URL up front
if ( theURL = = null ) throw new IllegalArgumentException ( ) ;
// prefix test: lets through every protocol name that begins with "http" (so "https" as well)
if ( ! theURL . getProtocol ( ) . startsWith ( "http" ) ) return null ;
// true = the overload may fetch online when no entry is available or it is not fresh
return getEntry ( theURL , thisAgents , true ) ;
}
private Robots Entry getEntry ( final MultiProtocolURI theURL , final Set < String > thisAgents , final boolean fetchOnlineIfNotAvailableOrNotFresh ) throws IOException {
private Robots Txt Entry getEntry ( final MultiProtocolURI theURL , final Set < String > thisAgents , final boolean fetchOnlineIfNotAvailableOrNotFresh ) throws IOException {
// this method will always return a non-null value
String urlHostPort = getHostPort ( theURL ) ;
Robots Entry robotsTxt4Host = null ;
Robots Txt Entry robotsTxt4Host = null ;
Map < String , byte [ ] > record ;
try {
record = this . robotsTable . get ( this . robotsTable . encodedKey ( urlHostPort ) ) ;
@ -95,7 +95,7 @@ public class RobotsTxt {
log . warn ( "memory exhausted" , e ) ;
record = null ;
}
if ( record ! = null ) robotsTxt4Host = new Robots Entry( urlHostPort , record ) ;
if ( record ! = null ) robotsTxt4Host = new Robots Txt Entry( urlHostPort , record ) ;
if ( fetchOnlineIfNotAvailableOrNotFresh & & (
robotsTxt4Host = = null | |
@ -123,7 +123,7 @@ public class RobotsTxt {
log . warn ( "memory exhausted" , e ) ;
record = null ;
}
if ( record ! = null ) robotsTxt4Host = new Robots Entry( urlHostPort , record ) ;
if ( record ! = null ) robotsTxt4Host = new Robots Txt Entry( urlHostPort , record ) ;
if ( robotsTxt4Host ! = null & &
robotsTxt4Host . getLoadedDate ( ) ! = null & &
System . currentTimeMillis ( ) - robotsTxt4Host . getLoadedDate ( ) . getTime ( ) < = 1 * 24 * 60 * 60 * 1000 ) {
@ -160,7 +160,7 @@ public class RobotsTxt {
// no robots.txt available, make an entry to prevent that the robots loading is done twice
if ( robotsTxt4Host = = null ) {
// generate artificial entry
robotsTxt4Host = new Robots Entry(
robotsTxt4Host = new Robots Txt Entry(
robotsURL ,
new ArrayList < String > ( ) ,
new ArrayList < String > ( ) ,
@ -183,7 +183,7 @@ public class RobotsTxt {
addEntry ( robotsTxt4Host ) ;
}
} else {
final robotsParser parserResult = new robots Parser( ( byte [ ] ) result [ DOWNLOAD_ROBOTS_TXT ] , thisAgents ) ;
final RobotsTxtParser parserResult = new RobotsTxt Parser( ( byte [ ] ) result [ DOWNLOAD_ROBOTS_TXT ] , thisAgents ) ;
ArrayList < String > denyPath = parserResult . denyList ( ) ;
if ( ( ( Boolean ) result [ DOWNLOAD_ACCESS_RESTRICTED ] ) . booleanValue ( ) ) {
denyPath = new ArrayList < String > ( ) ;
@ -208,7 +208,7 @@ public class RobotsTxt {
return robotsTxt4Host ;
}
private Robots Entry addEntry (
private Robots Txt Entry addEntry (
final MultiProtocolURI theURL ,
final ArrayList < String > allowPathList ,
final ArrayList < String > denyPathList ,
@ -219,7 +219,7 @@ public class RobotsTxt {
final long crawlDelayMillis ,
final String agentName
) {
final Robots Entry entry = new Robots Entry(
final Robots Txt Entry entry = new Robots Txt Entry(
theURL , allowPathList , denyPathList ,
loadedDate , modDate ,
eTag , sitemap , crawlDelayMillis , agentName ) ;
@ -227,7 +227,7 @@ public class RobotsTxt {
return entry ;
}
private String addEntry ( final Robots Entry entry ) {
private String addEntry ( final Robots Txt Entry entry ) {
// writes a new page and returns key
try {
this . robotsTable . insert ( this . robotsTable . encodedKey ( entry . getHostName ( ) ) , entry . getMem ( ) ) ;
@ -240,10 +240,10 @@ public class RobotsTxt {
// methods that had been in robotsParser.java:
// Positional indices into an Object[] called "result" elsewhere in this class:
// result[DOWNLOAD_ACCESS_RESTRICTED] is cast to Boolean and result[DOWNLOAD_ROBOTS_TXT] to byte[].
// NOTE(review): presumably that array is the return value of downloadRobotsTxt, and the ETAG/MODDATE
// slots hold the entity tag and modification date - their use is not visible here, confirm.
// The constants appear twice (public and private variants) because this text is a diff fragment.
p ublic static final int DOWNLOAD_ACCESS_RESTRICTED = 0 ;
p ublic static final int DOWNLOAD_ROBOTS_TXT = 1 ;
p ublic static final int DOWNLOAD_ETAG = 2 ;
p ublic static final int DOWNLOAD_MODDATE = 3 ;
p rivate static final int DOWNLOAD_ACCESS_RESTRICTED = 0 ;
p rivate static final int DOWNLOAD_ROBOTS_TXT = 1 ;
p rivate static final int DOWNLOAD_ETAG = 2 ;
p rivate static final int DOWNLOAD_MODDATE = 3 ;
static final String getHostPort ( final MultiProtocolURI theURL ) {
String urlHostPort = null ;
@ -267,7 +267,7 @@ public class RobotsTxt {
return port ;
}
private static Object [ ] downloadRobotsTxt ( final MultiProtocolURI robotsURL , int redirectionCount , final Robots Entry entry ) throws Exception {
private static Object [ ] downloadRobotsTxt ( final MultiProtocolURI robotsURL , int redirectionCount , final Robots Txt Entry entry ) throws Exception {
if ( robotsURL = = null | | ! robotsURL . getProtocol ( ) . startsWith ( "http" ) ) return null ;
if ( redirectionCount < 0 ) return new Object [ ] { Boolean . FALSE , null , null } ;