From 88245e44d86e5612be13376cfaacbc2a9e907cf6 Mon Sep 17 00:00:00 2001 From: karlchenofhell Date: Fri, 2 Mar 2007 01:19:38 +0000 Subject: [PATCH] - improved version of robots.txt (delete your old htroot/robots.txt before updating): - robots.txt is a servlet now - no need to rewrite the whole file each time a section is added or removed - user-defined disallows, added manually, won't be overwritten anymore - new config-setting: httpd.robots.txt, holding names of the disallowed sections git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3423 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/ConfigRobotsTxt_p.html | 9 +- htroot/ConfigRobotsTxt_p.java | 157 +++---------- htroot/robots.java | 71 ++++++ htroot/robots.txt | 54 +++++ source/de/anomic/http/httpdFileHandler.java | 3 +- .../de/anomic/http/httpdRobotsTxtConfig.java | 220 ++++++++++++++++++ .../de/anomic/plasma/plasmaSwitchboard.java | 9 + yacy.init | 16 ++ 8 files changed, 406 insertions(+), 133 deletions(-) create mode 100644 htroot/robots.java create mode 100644 htroot/robots.txt create mode 100644 source/de/anomic/http/httpdRobotsTxtConfig.java diff --git a/htroot/ConfigRobotsTxt_p.html b/htroot/ConfigRobotsTxt_p.html index e723d0462..472e85d1a 100644 --- a/htroot/ConfigRobotsTxt_p.html +++ b/htroot/ConfigRobotsTxt_p.html @@ -9,14 +9,17 @@ #%env/templates/submenuConfig.template%#

Exclude Web-Spiders

+ Here you can set robots.txt settings for all webcrawlers that try to access your peer. + robots.txt is a voluntary agreement most search engines (including YaCy) follow. + It disallows crawlers from accessing webpages or even entire domains.

#(error)# ::

Unable to access the local file: #[msg]#

::

Deletion of htroot/robots.txt failed

#(/error)#
- - -
Restrict access to +
Restrict access for + +
diff --git a/htroot/ConfigRobotsTxt_p.java b/htroot/ConfigRobotsTxt_p.java index c37e45ae7..0f0062322 100644 --- a/htroot/ConfigRobotsTxt_p.java +++ b/htroot/ConfigRobotsTxt_p.java @@ -48,157 +48,56 @@ // javac -classpath .:../classes ConfigRobotsTxt_p.java // if the shell's current path is HTROOT -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileReader; -import java.io.FileWriter; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; -import java.util.regex.Matcher; import java.util.regex.Pattern; import de.anomic.http.httpHeader; +import de.anomic.http.httpdRobotsTxtConfig; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.server.servletProperties; -import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacyCore; public class ConfigRobotsTxt_p { public static final Pattern entryBeginPattern = Pattern.compile("# (\\w*) \\((\\d*) entries\\)"); - private static HashMap disallowMap = null; - - private static Map getDisallowMap(String htrootPath) { - if (disallowMap == null) { - final File htroot = new File(htrootPath); - if (!htroot.exists()) return null; - disallowMap = new /* */ HashMap(); - final ArrayList htrootFiles = new ArrayList(); - final ArrayList htrootDirs = new ArrayList(); - final String[] htroots = htroot.list(); - File file; - for (int i=0, dot; i