diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java
index ba1db4cdf..c763d41a8 100755
--- a/htroot/api/getpageinfo_p.java
+++ b/htroot/api/getpageinfo_p.java
@@ -3,6 +3,10 @@
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.Set;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
@@ -10,19 +14,16 @@
 import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.search.Switchboard;
-import de.anomic.crawler.RobotsTxtEntry;
-import de.anomic.server.serverObjects;
-import de.anomic.server.serverSwitch;
-
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.parsers.ParserConfigurationException;
 import org.w3c.dom.Document;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 import org.xml.sax.SAXException;
+
+import de.anomic.crawler.RobotsTxtEntry;
+import de.anomic.server.serverObjects;
+import de.anomic.server.serverSwitch;
 public class getpageinfo_p {
 
     public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
 
@@ -49,7 +50,7 @@ public class getpageinfo_p {
         actions=post.get("actions");
         String url=post.get("url");
         if (url.toLowerCase().startsWith("ftp://")) {
-            prop.put("robots-allowed", "1");
+            prop.put("robots-allowed", "1"); // ok to crawl
             prop.put("robotsInfo", "ftp does not follow robots.txt");
             prop.putXML("title", "FTP: " + url);
             return prop;
@@ -72,6 +73,8 @@ public class getpageinfo_p {
                 scraper = sb.loader.parseResource(u, CacheStrategy.IFEXIST);
             } catch (final IOException e) {
                 Log.logException(e);
+                // bad things may happen here, e.g. the server may respond with "403 Bad Behavior";
+                // that should not affect the robots.txt validity
             }
             if (scraper != null) {
                 // put the document title
@@ -140,7 +143,7 @@ public class getpageinfo_p {
 
             final DigestURI theURL = new DigestURI(url + "?verb=Identify");
 
-            String oairesult = checkOAI(theURL.toString());
+            final String oairesult = checkOAI(theURL.toString());
 
             prop.put("oai", oairesult == "" ? 0 : 1);
 
@@ -156,7 +159,7 @@ public class getpageinfo_p {
         // return rewrite properties
         return prop;
     }
-    
+
     private static String checkOAI(final String url) {
         final DocumentBuilderFactory factory = DocumentBuilderFactory
                 .newInstance();
@@ -173,7 +176,7 @@ public class getpageinfo_p {
 
         return "";
     }
-    
+
     private static String parseXML(final Document doc) {
 
         String repositoryName = null;
@@ -205,6 +208,6 @@ public class getpageinfo_p {
         }
         return repositoryName;
     }
-    
+
 
 }
diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java
index fed0e3474..91e7444f9 100644
--- a/source/de/anomic/crawler/RobotsTxt.java
+++ b/source/de/anomic/crawler/RobotsTxt.java
@@ -36,6 +36,7 @@ import java.util.concurrent.ConcurrentHashMap;
 import java.util.regex.Pattern;
 
 import net.yacy.cora.document.MultiProtocolURI;
+import net.yacy.cora.document.UTF8;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
@@ -44,9 +45,12 @@ import net.yacy.cora.protocol.http.HTTPClient;
 import net.yacy.kelondro.blob.BEncodedHeap;
 import net.yacy.kelondro.index.RowSpaceExceededException;
 import net.yacy.kelondro.io.ByteCount;
+import net.yacy.kelondro.logging.Log;
 
 import org.apache.log4j.Logger;
 
+import de.anomic.data.WorkTables;
+
 public class RobotsTxt {
 
     private static Logger log = Logger.getLogger(RobotsTxt.class);
@@ -54,28 +58,35 @@ public class RobotsTxt {
     protected static final String ROBOTS_DB_PATH_SEPARATOR = ";";
     protected static final Pattern ROBOTS_DB_PATH_SEPARATOR_MATCHER = Pattern.compile(ROBOTS_DB_PATH_SEPARATOR);
 
-    private final BEncodedHeap robotsTable;
     private final ConcurrentHashMap<String, DomSync> syncObjects;
     //private static final HashSet<String> loadedRobots = new HashSet<String>(); // only for debugging
+    private final WorkTables tables;
 
     private static class DomSync {
         private DomSync() {}
     }
 
-    public RobotsTxt(final BEncodedHeap robotsTable) {
-        this.robotsTable = robotsTable;
+    public RobotsTxt(final WorkTables worktables) {
         this.syncObjects = new ConcurrentHashMap<String, DomSync>();
-        log.info("initiated robots table: " + robotsTable.getFile());
+        this.tables = worktables;
+        try {
+            log.info("initiated robots table: " + this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME).getFile());
+        } catch (final IOException e) {
+            try {
+                this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME).clear();
+            } catch (final IOException e1) {
+            }
+        }
    }

-    public void clear() {
+    public void clear() throws IOException {
         log.info("clearing robots table");
-        this.robotsTable.clear();
+        this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME).clear();
         this.syncObjects.clear();
     }

-    public int size() {
-        return this.robotsTable.size();
+    public int size() throws IOException {
+        return this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME).size();
     }

     public RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents) throws IOException {
@@ -89,8 +100,9 @@ public class RobotsTxt {
         final String urlHostPort = getHostPort(theURL);
         RobotsTxtEntry robotsTxt4Host = null;
         Map<String, byte[]> record;
+        final BEncodedHeap robotsTable = this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
         try {
-            record = this.robotsTable.get(this.robotsTable.encodedKey(urlHostPort));
+            record = robotsTable.get(robotsTable.encodedKey(urlHostPort));
         } catch (final RowSpaceExceededException e) {
             log.warn("memory exhausted", e);
             record = null;
@@ -118,7 +130,7 @@ public class RobotsTxt {
             // check the robots table again for all threads that come here because they waited for another one
             // to complete a download
             try {
-                record = this.robotsTable.get(this.robotsTable.encodedKey(urlHostPort));
+                record = robotsTable.get(robotsTable.encodedKey(urlHostPort));
             } catch (final RowSpaceExceededException e) {
                 log.warn("memory exhausted", e);
                 record = null;
@@ -175,15 +187,17 @@ public class RobotsTxt {
                 }
 
                 // store the data into the robots DB
-                final int sz = this.robotsTable.size();
+                final int sz = robotsTable.size();
                 addEntry(robotsTxt4Host);
-                if (this.robotsTable.size() <= sz) {
+                if (robotsTable.size() <= sz) {
                     log.fatal("new entry in robots.txt table failed, resetting database");
                     clear();
                     addEntry(robotsTxt4Host);
                 }
             } else {
-                final RobotsTxtParser parserResult = new RobotsTxtParser((byte[]) result[DOWNLOAD_ROBOTS_TXT], thisAgents);
+                final byte[] robotsTxt = (byte[]) result[DOWNLOAD_ROBOTS_TXT];
+                Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + UTF8.String(robotsTxt)); // debug TODO remove
+                final RobotsTxtParser parserResult = new RobotsTxtParser(robotsTxt, thisAgents);
                 ArrayList<String> denyPath = parserResult.denyList();
                 if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
                     denyPath = new ArrayList<String>();
@@ -230,7 +244,8 @@ public class RobotsTxt {
     private String addEntry(final RobotsTxtEntry entry) {
         // writes a new page and returns key
        try {
-            this.robotsTable.insert(this.robotsTable.encodedKey(entry.getHostName()), entry.getMem());
+            final BEncodedHeap robotsTable = this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
+            robotsTable.insert(robotsTable.encodedKey(entry.getHostName()), entry.getMem());
             return entry.getHostName();
         } catch (final Exception e) {
             log.warn("cannot write robots.txt entry", e);
diff --git a/source/de/anomic/crawler/RobotsTxtEntry.java b/source/de/anomic/crawler/RobotsTxtEntry.java
index 394f87802..314110214 100644
--- a/source/de/anomic/crawler/RobotsTxtEntry.java
+++ b/source/de/anomic/crawler/RobotsTxtEntry.java
@@ -239,7 +239,7 @@ public class RobotsTxtEntry {
 
                 // disallow rule
                 if (path.startsWith(element)) {
-                    this.info = "path '" + path + "' starts with '" + element + "' from deny path list";
+                    this.info = "path '" + path + "' starts with '" + element + "' from deny path list = " + this.denyPathList.toString();
                     return true;
                 }
             }
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 7170abaf8..96f586521 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -105,7 +105,6 @@ import net.yacy.document.content.SurrogateReader;
 import net.yacy.document.importer.OAIListFriendsLoader;
 import net.yacy.document.parser.html.Evaluation;
 import net.yacy.gui.Tray;
-import net.yacy.kelondro.blob.BEncodedHeap;
 import net.yacy.kelondro.blob.Tables;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
@@ -513,14 +512,7 @@ public final class Switchboard extends serverSwitch {
 
         // load the robots.txt db
         this.log.logConfig("Initializing robots.txt DB");
-        try {
-            final BEncodedHeap robotsHeap = this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
-            this.robots = new RobotsTxt(robotsHeap);
-        } catch (final IOException e) {
-            this.tables.clear(WorkTables.TABLE_ROBOTS_NAME);
-            final BEncodedHeap robotsHeap = this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
-            this.robots = new RobotsTxt(robotsHeap);
-        }
+        this.robots = new RobotsTxt(this.tables);
         this.log.logConfig("Loaded robots.txt DB: " + this.robots.size() + " entries");
 
         // start a cache manager