- enhanced logging in the robots.txt parser for remote debugging

- robots.txt handling is now more robust against failing database operations

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8043 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 13 years ago
parent 5a7cec59f3
commit 017a01714d
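
The robustness part of this commit follows one pattern throughout the diff below: RobotsTxt no longer caches a BEncodedHeap handle but asks WorkTables for the heap on every operation, and a failing database is handled by clearing the table instead of propagating the error to startup. A minimal sketch of that pattern, using hypothetical stand-in types (Heap and Tables are not YaCy's API):

    import java.io.IOException;

    // hypothetical stand-ins, just enough to show the recovery pattern
    interface Heap { int size(); void clear() throws IOException; }
    interface Tables { Heap getHeap(String name) throws IOException; }

    class RobustRobots {
        private final Tables tables; // no cached Heap field any more

        RobustRobots(final Tables tables) {
            this.tables = tables;
            try {
                this.tables.getHeap("robots"); // probe: a damaged DB surfaces as IOException
            } catch (final IOException e) {
                try {
                    this.tables.getHeap("robots").clear(); // recover by resetting the table
                } catch (final IOException e1) {
                    // swallowed, as in the commit
                }
            }
        }

        // the heap is resolved per call, so callers now see the IOException
        public int size() throws IOException {
            return this.tables.getHeap("robots").size();
        }
    }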

@@ -3,6 +3,10 @@ import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Set;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
@@ -10,19 +14,16 @@ import net.yacy.document.parser.html.ContentScraper;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
-import de.anomic.crawler.RobotsTxtEntry;
-import de.anomic.server.serverObjects;
-import de.anomic.server.serverSwitch;
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
+import de.anomic.crawler.RobotsTxtEntry;
+import de.anomic.server.serverObjects;
+import de.anomic.server.serverSwitch;
public class getpageinfo_p {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
@@ -49,7 +50,7 @@ public class getpageinfo_p {
actions=post.get("actions");
String url=post.get("url");
if (url.toLowerCase().startsWith("ftp://")) {
prop.put("robots-allowed", "1");
prop.put("robots-allowed", "1"); // ok to crawl
prop.put("robotsInfo", "ftp does not follow robots.txt");
prop.putXML("title", "FTP: " + url);
return prop;
@@ -72,6 +73,8 @@ public class getpageinfo_p {
scraper = sb.loader.parseResource(u, CacheStrategy.IFEXIST);
} catch (final IOException e) {
Log.logException(e);
+// bad things can happen, e.g. the server may respond with "403 Bad Behavior";
+// that should not affect the robots.txt validity
}
if (scraper != null) {
// put the document title
@@ -140,7 +143,7 @@ public class getpageinfo_p {
final DigestURI theURL = new DigestURI(url
+ "?verb=Identify");
-String oairesult = checkOAI(theURL.toString());
+final String oairesult = checkOAI(theURL.toString());
prop.put("oai", oairesult == "" ? 0 : 1);
@@ -156,7 +159,7 @@ public class getpageinfo_p {
// return rewrite properties
return prop;
}
private static String checkOAI(final String url) {
final DocumentBuilderFactory factory = DocumentBuilderFactory
.newInstance();
@@ -173,7 +176,7 @@ public class getpageinfo_p {
return "";
}
private static String parseXML(final Document doc) {
String repositoryName = null;
@@ -205,6 +208,6 @@ public class getpageinfo_p {
}
return repositoryName;
}
}
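
An aside on a context line above: prop.put("oai", oairesult == "" ? 0 : 1) compares strings with ==, which tests reference identity in Java and only evaluates to true here because checkOAI happens to return the interned "" literal. A value comparison would not depend on that coincidence:

    prop.put("oai", oairesult.isEmpty() ? 0 : 1); // value check instead of reference identity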

@@ -36,6 +36,7 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import net.yacy.cora.document.MultiProtocolURI;
+import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
@@ -44,9 +45,12 @@ import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.kelondro.blob.BEncodedHeap;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.io.ByteCount;
+import net.yacy.kelondro.logging.Log;
import org.apache.log4j.Logger;
+import de.anomic.data.WorkTables;
public class RobotsTxt {
private static Logger log = Logger.getLogger(RobotsTxt.class);
@@ -54,28 +58,35 @@ public class RobotsTxt {
protected static final String ROBOTS_DB_PATH_SEPARATOR = ";";
protected static final Pattern ROBOTS_DB_PATH_SEPARATOR_MATCHER = Pattern.compile(ROBOTS_DB_PATH_SEPARATOR);
-private final BEncodedHeap robotsTable;
private final ConcurrentHashMap<String, DomSync> syncObjects;
//private static final HashSet<String> loadedRobots = new HashSet<String>(); // only for debugging
+private final WorkTables tables;
private static class DomSync {
private DomSync() {}
}
-public RobotsTxt(final BEncodedHeap robotsTable) {
-this.robotsTable = robotsTable;
+public RobotsTxt(final WorkTables worktables) {
this.syncObjects = new ConcurrentHashMap<String, DomSync>();
-log.info("initiated robots table: " + robotsTable.getFile());
+this.tables = worktables;
+try {
+log.info("initiated robots table: " + this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME).getFile());
+} catch (final IOException e) {
+try {
+this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME).clear();
+} catch (final IOException e1) {
+}
+}
}
-public void clear() {
+public void clear() throws IOException {
log.info("clearing robots table");
-this.robotsTable.clear();
+this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME).clear();
this.syncObjects.clear();
}
-public int size() {
-return this.robotsTable.size();
+public int size() throws IOException {
+return this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME).size();
}
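
Because clear() and size() now declare IOException, callers can no longer treat the table as infallible. A hypothetical caller sketch (robots being a RobotsTxt instance):

    int entries;
    try {
        entries = robots.size();
    } catch (final IOException e) {
        entries = 0; // heap could not be opened; report an empty table
    }
    log.logConfig("Loaded robots.txt DB: " + entries + " entries");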
public RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents) throws IOException {
@@ -89,8 +100,9 @@ public class RobotsTxt {
final String urlHostPort = getHostPort(theURL);
RobotsTxtEntry robotsTxt4Host = null;
Map<String, byte[]> record;
+final BEncodedHeap robotsTable = this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
try {
-record = this.robotsTable.get(this.robotsTable.encodedKey(urlHostPort));
+record = robotsTable.get(robotsTable.encodedKey(urlHostPort));
} catch (final RowSpaceExceededException e) {
log.warn("memory exhausted", e);
record = null;
@@ -118,7 +130,7 @@ public class RobotsTxt {
// check the robots table again for all threads that come here because they waited for another one
// to complete a download
try {
-record = this.robotsTable.get(this.robotsTable.encodedKey(urlHostPort));
+record = robotsTable.get(robotsTable.encodedKey(urlHostPort));
} catch (final RowSpaceExceededException e) {
log.warn("memory exhausted", e);
record = null;
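
The re-check here is the second half of a per-host double-checked pattern: the DomSync objects act as one monitor per host, so a single thread downloads robots.txt while the others block on the monitor and then look the entry up again before triggering a redundant download. A self-contained sketch of that gate, with illustrative names (not YaCy's):

    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.ConcurrentMap;

    class PerHostGate {
        private final ConcurrentMap<String, Object> locks = new ConcurrentHashMap<String, Object>();
        private final ConcurrentMap<String, String> cache = new ConcurrentHashMap<String, String>();

        String fetch(final String host) {
            String entry = this.cache.get(host);
            if (entry != null) return entry; // fast path: already stored
            Object lock = this.locks.get(host);
            if (lock == null) {
                final Object fresh = new Object();
                lock = this.locks.putIfAbsent(host, fresh);
                if (lock == null) lock = fresh;
            }
            synchronized (lock) {
                // re-check: a thread that waited here may find the download already done
                entry = this.cache.get(host);
                if (entry != null) return entry;
                entry = download(host); // only one thread per host gets this far
                this.cache.put(host, entry);
                return entry;
            }
        }

        private String download(final String host) {
            return "robots.txt of " + host; // placeholder for the real HTTP fetch
        }
    }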
@@ -175,15 +187,17 @@ public class RobotsTxt {
}
// store the data into the robots DB
-final int sz = this.robotsTable.size();
+final int sz = robotsTable.size();
addEntry(robotsTxt4Host);
-if (this.robotsTable.size() <= sz) {
+if (robotsTable.size() <= sz) {
log.fatal("new entry in robots.txt table failed, resetting database");
clear();
addEntry(robotsTxt4Host);
}
} else {
-final RobotsTxtParser parserResult = new RobotsTxtParser((byte[]) result[DOWNLOAD_ROBOTS_TXT], thisAgents);
+final byte[] robotsTxt = (byte[]) result[DOWNLOAD_ROBOTS_TXT];
+Log.logInfo("RobotsTxt", "robots of " + robotsURL.toNormalform(true, true) + ":\n" + UTF8.String(robotsTxt)); // debug TODO remove
+final RobotsTxtParser parserResult = new RobotsTxtParser(robotsTxt, thisAgents);
ArrayList<String> denyPath = parserResult.denyList();
if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
denyPath = new ArrayList<String>();
@@ -230,7 +244,8 @@ public class RobotsTxt {
private String addEntry(final RobotsTxtEntry entry) {
// writes a new page and returns key
try {
-this.robotsTable.insert(this.robotsTable.encodedKey(entry.getHostName()), entry.getMem());
+final BEncodedHeap robotsTable = this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
+robotsTable.insert(robotsTable.encodedKey(entry.getHostName()), entry.getMem());
return entry.getHostName();
} catch (final Exception e) {
log.warn("cannot write robots.txt entry", e);

@@ -239,7 +239,7 @@ public class RobotsTxtEntry {
// disallow rule
if (path.startsWith(element)) {
this.info = "path '" + path + "' starts with '" + element + "' from deny path list";
this.info = "path '" + path + "' starts with '" + element + "' from deny path list = " + this.denyPathList.toString();
return true;
}
}
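
The disallow check shown above is a plain prefix match over the deny list, which is why appending the whole list to the info string pinpoints which rule fired. A standalone sketch of the check (illustrative names, not the full RobotsTxtEntry):

    import java.util.Arrays;
    import java.util.List;

    class DenyCheck {
        static boolean isDisallowed(final String path, final List<String> denyPathList) {
            for (final String element : denyPathList) {
                if (path.startsWith(element)) {
                    // same diagnostic the commit adds: name the rule and the whole list
                    System.out.println("path '" + path + "' starts with '" + element
                            + "' from deny path list = " + denyPathList);
                    return true;
                }
            }
            return false;
        }

        public static void main(final String[] args) {
            // prints the diagnostic and "true"
            System.out.println(isDisallowed("/private/data", Arrays.asList("/tmp/", "/private/")));
        }
    }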

@@ -105,7 +105,6 @@ import net.yacy.document.content.SurrogateReader;
import net.yacy.document.importer.OAIListFriendsLoader;
import net.yacy.document.parser.html.Evaluation;
import net.yacy.gui.Tray;
-import net.yacy.kelondro.blob.BEncodedHeap;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
@@ -513,14 +512,7 @@ public final class Switchboard extends serverSwitch {
// load the robots.txt db
this.log.logConfig("Initializing robots.txt DB");
-try {
-final BEncodedHeap robotsHeap = this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
-this.robots = new RobotsTxt(robotsHeap);
-} catch (final IOException e) {
-this.tables.clear(WorkTables.TABLE_ROBOTS_NAME);
-final BEncodedHeap robotsHeap = this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME);
-this.robots = new RobotsTxt(robotsHeap);
-}
+this.robots = new RobotsTxt(this.tables);
this.log.logConfig("Loaded robots.txt DB: " + this.robots.size() + " entries");
// start a cache manager
