From e81be7d4f2f7679e64a8c90e4c8295db4bb1ccd7 Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 4 Jul 2008 11:03:03 +0000 Subject: [PATCH] added many missing user-agent declarations for yacy http client connections. the most important fix was the addition of the yacybot user-agent for robots.txt loading, because web masters look for that access to see if the crawler behaves correctly. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4968 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/ConfigLanguage_p.java | 5 +- htroot/ConfigSkins_p.java | 5 +- htroot/{CookieTest.html => CookieTest_p.html} | 2 +- htroot/{CookieTest.java => CookieTest_p.java} | 4 +- htroot/Network.java | 4 +- htroot/sharedBlacklist_p.java | 10 ++- htroot/xml/util/getpageinfo_p.java | 6 +- source/de/anomic/crawler/HTTPLoader.java | 3 +- source/de/anomic/data/SitemapParser.java | 6 +- source/de/anomic/data/robotsParser.java | 4 ++ .../htmlFilter/htmlFilterContentScraper.java | 6 +- source/de/anomic/http/HttpClient.java | 72 +++---------------- .../anomic/http/JakartaCommonsHttpClient.java | 2 +- source/de/anomic/http/httpdProxyHandler.java | 4 +- source/de/anomic/net/natLib.java | 8 +-- .../anomic/plasma/parser/odt/odtParser.java | 6 +- .../anomic/plasma/parser/rpm/rpmParser.java | 6 +- .../anomic/plasma/parser/vcf/vcfParser.java | 6 +- .../de/anomic/plasma/plasmaSwitchboard.java | 15 ++-- source/de/anomic/tools/loaderThreads.java | 11 +-- source/de/anomic/yacy/yacyClient.java | 15 ++-- source/de/anomic/yacy/yacySeedDB.java | 4 +- source/de/anomic/yacy/yacyVersion.java | 6 +- source/yacy.java | 2 +- 24 files changed, 103 insertions(+), 109 deletions(-) rename htroot/{CookieTest.html => CookieTest_p.html} (94%) rename htroot/{CookieTest.java => CookieTest_p.java} (95%) diff --git a/htroot/ConfigLanguage_p.java b/htroot/ConfigLanguage_p.java index 8fd16465f..918446cc8 100644 --- a/htroot/ConfigLanguage_p.java +++ b/htroot/ConfigLanguage_p.java @@ -54,6 +54,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; +import de.anomic.crawler.HTTPLoader; import de.anomic.data.listManager; import de.anomic.data.translator; import de.anomic.http.HttpClient; @@ -96,7 +97,9 @@ public class ConfigLanguage_p { ArrayList langVector; try{ yacyURL u = new yacyURL(url, null); - langVector = nxTools.strings(HttpClient.wget(u.toString()), "UTF-8"); + httpHeader reqHeader = new httpHeader(); + reqHeader.put(httpHeader.USER_AGENT, HTTPLoader.yacyUserAgent); + langVector = nxTools.strings(HttpClient.wget(u.toString(), reqHeader, 10000), "UTF-8"); }catch(IOException e){ prop.put("status", "1");//unable to get url prop.put("status_url", url); diff --git a/htroot/ConfigSkins_p.java b/htroot/ConfigSkins_p.java index cd9d1f73d..9ba2c35c0 100644 --- a/htroot/ConfigSkins_p.java +++ b/htroot/ConfigSkins_p.java @@ -33,6 +33,7 @@ import java.io.PrintWriter; import java.util.ArrayList; import java.util.Iterator; +import de.anomic.crawler.HTTPLoader; import de.anomic.data.listManager; import de.anomic.http.HttpClient; import de.anomic.http.httpHeader; @@ -88,7 +89,9 @@ public class ConfigSkins_p { ArrayList skinVector; try { yacyURL u = new yacyURL(url, null); - skinVector = nxTools.strings(HttpClient.wget(u.toString()), "UTF-8"); + httpHeader reqHeader = new httpHeader(); + reqHeader.put(httpHeader.USER_AGENT, HTTPLoader.yacyUserAgent); + skinVector = nxTools.strings(HttpClient.wget(u.toString(), reqHeader, 10000), "UTF-8"); } catch (IOException e) { prop.put("status", "1");// unable to get URL prop.put("status_url", url); diff --git a/htroot/CookieTest.html b/htroot/CookieTest_p.html similarity index 94% rename from htroot/CookieTest.html rename to htroot/CookieTest_p.html index 707689670..06545d0d6 100644 --- a/htroot/CookieTest.html +++ b/htroot/CookieTest_p.html @@ -24,7 +24,7 @@
- +
diff --git a/htroot/CookieTest.java b/htroot/CookieTest_p.java similarity index 95% rename from htroot/CookieTest.java rename to htroot/CookieTest_p.java index e621440a4..d3a41451f 100644 --- a/htroot/CookieTest.java +++ b/htroot/CookieTest_p.java @@ -54,7 +54,7 @@ import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.server.servletProperties; -public class CookieTest { +public class CookieTest_p { public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { @@ -69,7 +69,7 @@ public class CookieTest { final servletProperties prop = new servletProperties(); if(post.containsKey("act")&&post.get("act").equals("clear_cookie")) { - httpHeader outgoingHeader = new httpHeader(); + httpHeader outgoingHeader = new httpHeader(); Iterator> it = header.entrySet().iterator(); Map.Entry e; while (it.hasNext()) { diff --git a/htroot/Network.java b/htroot/Network.java index 1dba77de7..9db82acb0 100644 --- a/htroot/Network.java +++ b/htroot/Network.java @@ -54,6 +54,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; +import de.anomic.crawler.HTTPLoader; import de.anomic.http.HttpClient; import de.anomic.http.JakartaCommonsHttpClient; import de.anomic.http.httpHeader; @@ -375,8 +376,7 @@ public class Network { prop.putHTML(STR_TABLE_LIST + conCount + "_fullname", seed.get(yacySeed.NAME, "deadlink")); userAgent = null; if (seed.hash.equals(sb.webIndex.seedDB.mySeed().hash)) { - final JakartaCommonsHttpClient httpClient = new JakartaCommonsHttpClient(10000, null, null); - userAgent = httpClient.getUserAgent(); + userAgent = HTTPLoader.yacyUserAgent; location = HttpClient.generateLocation(); } else { userAgent = sb.webIndex.peerActions.getUserAgent(seed.getIP()); diff --git a/htroot/sharedBlacklist_p.java b/htroot/sharedBlacklist_p.java index f26579f2b..bdcf1b3a9 100644 --- a/htroot/sharedBlacklist_p.java +++ b/htroot/sharedBlacklist_p.java @@ -55,6 +55,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; +import de.anomic.crawler.HTTPLoader; import de.anomic.data.listManager; import de.anomic.http.HttpClient; import de.anomic.http.httpHeader; @@ -128,10 +129,11 @@ public class sharedBlacklist_p { httpHeader reqHeader = new httpHeader(); reqHeader.put(httpHeader.PRAGMA,"no-cache"); reqHeader.put(httpHeader.CACHE_CONTROL,"no-cache"); - + reqHeader.put(httpHeader.USER_AGENT, HTTPLoader.yacyUserAgent); + // get List yacyURL u = new yacyURL(downloadURL, null); - otherBlacklist = nxTools.strings(HttpClient.wget(u.toString(), reqHeader), "UTF-8"); + otherBlacklist = nxTools.strings(HttpClient.wget(u.toString(), reqHeader, 1000), "UTF-8"); } catch (Exception e) { prop.put("status", STATUS_PEER_UNKNOWN); prop.put("page", "1"); @@ -147,7 +149,9 @@ public class sharedBlacklist_p { try { yacyURL u = new yacyURL(downloadURL, null); - otherBlacklist = nxTools.strings(HttpClient.wget(u.toString()), "UTF-8"); //get List + httpHeader reqHeader = new httpHeader(); + reqHeader.put(httpHeader.USER_AGENT, HTTPLoader.yacyUserAgent); + otherBlacklist = nxTools.strings(HttpClient.wget(u.toString(), reqHeader, 10000), "UTF-8"); //get List } catch (Exception e) { prop.put("status", STATUS_URL_PROBLEM); prop.putHTML("status_address",downloadURL); diff --git a/htroot/xml/util/getpageinfo_p.java b/htroot/xml/util/getpageinfo_p.java index 8242258f5..da20ae5bf 100644 --- a/htroot/xml/util/getpageinfo_p.java +++ b/htroot/xml/util/getpageinfo_p.java @@ -49,6 +49,7 @@ import java.io.IOException; import java.io.Writer; import java.net.MalformedURLException; +import de.anomic.crawler.HTTPLoader; import de.anomic.data.robotsParser; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterWriter; @@ -60,6 +61,7 @@ import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyURL; public class getpageinfo_p { + public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { serverObjects prop = new serverObjects(); prop.put("sitemap", ""); @@ -81,7 +83,9 @@ public class getpageinfo_p { if (actions.indexOf("title")>=0) { try { yacyURL u = new yacyURL(url, null); - byte[] r = HttpClient.wget(u.toString()); + httpHeader reqHeader = new httpHeader(); + reqHeader.put(httpHeader.USER_AGENT, HTTPLoader.crawlerUserAgent); + byte[] r = HttpClient.wget(u.toString(), reqHeader, 5000); if (r == null) return prop; String contentString=new String(r); diff --git a/source/de/anomic/crawler/HTTPLoader.java b/source/de/anomic/crawler/HTTPLoader.java index 3762c0036..01bbac639 100644 --- a/source/de/anomic/crawler/HTTPLoader.java +++ b/source/de/anomic/crawler/HTTPLoader.java @@ -78,7 +78,8 @@ public final class HTTPLoader { private static final String DEFAULT_CHARSET = "ISO-8859-1,utf-8;q=0.7,*;q=0.7"; private static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10; public static final int DEFAULT_CRAWLING_RETRY_COUNT = 5; - private static final String crawlerUserAgent = "yacybot (" + HttpClient.getSystemOST() +") http://yacy.net/bot.html"; + public static final String crawlerUserAgent = "yacybot (" + HttpClient.getSystemOST() +") http://yacy.net/bot.html"; + public static final String yacyUserAgent = "yacy (" + HttpClient.getSystemOST() +") yacy.net"; /** * The socket timeout that should be used diff --git a/source/de/anomic/data/SitemapParser.java b/source/de/anomic/data/SitemapParser.java index eb1e1cba3..0ce58232c 100644 --- a/source/de/anomic/data/SitemapParser.java +++ b/source/de/anomic/data/SitemapParser.java @@ -59,9 +59,11 @@ import org.xml.sax.helpers.DefaultHandler; import de.anomic.crawler.CrawlEntry; import de.anomic.crawler.CrawlProfile; +import de.anomic.crawler.HTTPLoader; import de.anomic.crawler.ZURL; import de.anomic.http.JakartaCommonsHttpClient; import de.anomic.http.JakartaCommonsHttpResponse; +import de.anomic.http.httpHeader; import de.anomic.http.httpdByteCountInputStream; import de.anomic.index.indexURLReference; import de.anomic.plasma.plasmaSwitchboard; @@ -169,7 +171,9 @@ public class SitemapParser extends DefaultHandler { */ public void parse() { // download document - JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(5000, null, null); + httpHeader header = new httpHeader(); + header.put(httpHeader.USER_AGENT, HTTPLoader.crawlerUserAgent); + JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(5000, header, null); JakartaCommonsHttpResponse res = null; try { res = client.GET(siteMapURL.toString()); diff --git a/source/de/anomic/data/robotsParser.java b/source/de/anomic/data/robotsParser.java index 4bff91cc9..6546b7401 100644 --- a/source/de/anomic/data/robotsParser.java +++ b/source/de/anomic/data/robotsParser.java @@ -56,6 +56,7 @@ import java.net.URLDecoder; import java.util.ArrayList; import java.util.Date; +import de.anomic.crawler.HTTPLoader; import de.anomic.crawler.RobotsTxt; import de.anomic.http.HttpClient; import de.anomic.http.JakartaCommonsHttpClient; @@ -393,6 +394,9 @@ public final class robotsParser{ // if we previously have downloaded this robots.txt then we can set the if-modified-since header httpHeader reqHeaders = new httpHeader(); + // add yacybot user agent + reqHeaders.put(httpHeader.USER_AGENT, HTTPLoader.crawlerUserAgent); + // adding referer reqHeaders.put(httpHeader.REFERER, (yacyURL.newURL(robotsURL,"/")).toNormalform(true, true)); diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 69eaaecf1..b863e4aa9 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -61,8 +61,10 @@ import java.util.Properties; import javax.swing.event.EventListenerList; +import de.anomic.crawler.HTTPLoader; import de.anomic.data.htmlTools; import de.anomic.http.HttpClient; +import de.anomic.http.httpHeader; import de.anomic.server.serverCharBuffer; import de.anomic.server.serverFileUtils; import de.anomic.yacy.yacyURL; @@ -507,7 +509,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen public static htmlFilterContentScraper parseResource(yacyURL location) throws IOException { // load page - byte[] page = HttpClient.wget(location.toString()); + httpHeader reqHeader = new httpHeader(); + reqHeader.put(httpHeader.USER_AGENT, HTTPLoader.crawlerUserAgent); + byte[] page = HttpClient.wget(location.toString(), reqHeader, 10000); if (page == null) throw new IOException("no response from url " + location.toString()); // scrape content diff --git a/source/de/anomic/http/HttpClient.java b/source/de/anomic/http/HttpClient.java index 18a68fdc3..a9db3a94a 100644 --- a/source/de/anomic/http/HttpClient.java +++ b/source/de/anomic/http/HttpClient.java @@ -78,62 +78,7 @@ public abstract class HttpClient { return JakartaCommonsHttpClient.date2String(date); } - /** - * Gets a page (as raw bytes) - * - * @param uri - * @return - */ - public static byte[] wget(final String uri) { - return wget(uri, null, null); - } - - /** - * Gets a page (as raw bytes) addressing vhost at host in uri - * - * @param uri - * @param vhost used if host in uri cannot be resolved (yacy tld) - * @return - */ - public static byte[] wget(final String uri, final String vhost) { - return wget(uri, null, vhost); - } - - /** - * Gets a page (as raw bytes) aborting after timeout - * - * @param uri - * @param timeout in milliseconds - * @return - */ - public static byte[] wget(final String uri, final int timeout) { - return wget(uri, null, null, timeout); - } - - /** - * Gets a page (as raw bytes) with specified header - * - * @param uri - * @param header - * @return - */ - public static byte[] wget(final String uri, final httpHeader header) { - return wget(uri, header, null); - } - - /** - * Gets a page (as raw bytes) addressing vhost at host in uri with specified header - * - * @param uri - * @param header - * @param vhost - * @return - * @require uri != null - */ - public static byte[] wget(final String uri, final httpHeader header, final String vhost) { - return wget(uri, header, vhost, 10000); - } - + /** * Gets a page (as raw bytes) addressing vhost at host in uri with specified header and timeout * @@ -143,13 +88,13 @@ public abstract class HttpClient { * @param timeout in milliseconds * @return */ - public static byte[] wget(final String uri, httpHeader header, final String vhost, final int timeout) { + public static byte[] wget(final String uri, final httpHeader header, int timeout) { + return wget(uri, header, timeout, null); + } + public static byte[] wget(final String uri, httpHeader header, final int timeout, final String vhost) { assert uri != null : "precondition violated: uri != null"; - final JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(timeout, null, null); - - // set header - header = addHostHeader(header, vhost); - client.setHeader(header); + addHostHeader(header, vhost); + final JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(timeout, header, null); // do the request try { @@ -168,7 +113,7 @@ public abstract class HttpClient { * @param vhost * @return */ - private static httpHeader addHostHeader(httpHeader header, final String vhost) { + private static void addHostHeader(httpHeader header, final String vhost) { if (vhost != null) { if (header != null) { header = new httpHeader(); @@ -176,7 +121,6 @@ public abstract class HttpClient { // set host-header header.add(httpHeader.HOST, vhost); } - return header; } /** diff --git a/source/de/anomic/http/JakartaCommonsHttpClient.java b/source/de/anomic/http/JakartaCommonsHttpClient.java index 7e6e59ea3..cce5789a6 100644 --- a/source/de/anomic/http/JakartaCommonsHttpClient.java +++ b/source/de/anomic/http/JakartaCommonsHttpClient.java @@ -546,7 +546,7 @@ public class JakartaCommonsHttpClient { // wget System.out.println("wget " + url); System.out.println("--------------------------------------"); - System.out.println(new String(de.anomic.http.HttpClient.wget(url))); + System.out.println(new String(de.anomic.http.HttpClient.wget(url, null, 10000))); } } catch (final IOException e) { e.printStackTrace(); diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index 91047bbe6..12d2d97b9 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -76,6 +76,7 @@ import java.util.logging.LogManager; import java.util.logging.Logger; import java.util.zip.GZIPOutputStream; +import de.anomic.crawler.HTTPLoader; import de.anomic.htmlFilter.htmlFilterContentTransformer; import de.anomic.htmlFilter.htmlFilterTransformer; import de.anomic.htmlFilter.htmlFilterWriter; @@ -113,7 +114,6 @@ public final class httpdProxyHandler { * *The* remote Proxy configuration */ private static httpRemoteProxyConfig remoteProxyConfig = null; - private static final String proxyUserAgent = "yacy (" + HttpClient.getSystemOST() +") yacy.net"; private static File htRootPath = null; //private Properties connectionProperties = null; @@ -1618,7 +1618,7 @@ public final class httpdProxyHandler { private static synchronized String generateUserAgent(httpHeader requestHeaders) { userAgentStr.setLength(0); - String browserUserAgent = (String) requestHeaders.get(httpHeader.USER_AGENT, proxyUserAgent); + String browserUserAgent = (String) requestHeaders.get(httpHeader.USER_AGENT, HTTPLoader.yacyUserAgent); int pos = browserUserAgent.lastIndexOf(')'); if (pos >= 0) { userAgentStr diff --git a/source/de/anomic/net/natLib.java b/source/de/anomic/net/natLib.java index a85526f0d..fd869c6b2 100644 --- a/source/de/anomic/net/natLib.java +++ b/source/de/anomic/net/natLib.java @@ -64,7 +64,7 @@ public class natLib { rm status.htm */ try { - ArrayList x = nxTools.strings(HttpClient.wget("http://admin:"+password+"@192.168.0.1:80/status.htm")); + ArrayList x = nxTools.strings(HttpClient.wget("http://admin:"+password+"@192.168.0.1:80/status.htm", null, 10000)); x = nxTools.grep(x, 1, "IP Address"); if ((x == null) || (x.size() == 0)) return null; String line = nxTools.tail1(x); @@ -76,7 +76,7 @@ public class natLib { private static String getWhatIsMyIP() { try { - ArrayList x = nxTools.strings(HttpClient.wget("http://www.whatismyip.com/")); + ArrayList x = nxTools.strings(HttpClient.wget("http://www.whatismyip.com/", null, 10000)); x = nxTools.grep(x, 0, "Your IP is"); String line = nxTools.tail1(x); return nxTools.awk(line, " ", 4); @@ -87,7 +87,7 @@ public class natLib { private static String getStanford() { try { - ArrayList x = nxTools.strings(HttpClient.wget("http://www.slac.stanford.edu/cgi-bin/nph-traceroute.pl")); + ArrayList x = nxTools.strings(HttpClient.wget("http://www.slac.stanford.edu/cgi-bin/nph-traceroute.pl", null, 10000)); x = nxTools.grep(x, 0, "firewall protecting your browser"); String line = nxTools.tail1(x); return nxTools.awk(line, " ", 7); @@ -98,7 +98,7 @@ public class natLib { private static String getIPID() { try { - ArrayList x = nxTools.strings(HttpClient.wget("http://ipid.shat.net/"), "UTF-8"); + ArrayList x = nxTools.strings(HttpClient.wget("http://ipid.shat.net/", null, 10000), "UTF-8"); x = nxTools.grep(x, 2, "Your IP address"); String line = nxTools.tail1(x); return nxTools.awk(nxTools.awk(nxTools.awk(line, " ", 5), ">", 2), "<", 1); diff --git a/source/de/anomic/plasma/parser/odt/odtParser.java b/source/de/anomic/plasma/parser/odt/odtParser.java index a9895ee09..686642f59 100644 --- a/source/de/anomic/plasma/parser/odt/odtParser.java +++ b/source/de/anomic/plasma/parser/odt/odtParser.java @@ -58,7 +58,9 @@ import com.catcode.odf.ODFMetaFileAnalyzer; import com.catcode.odf.OpenDocumentMetadata; import com.catcode.odf.OpenDocumentTextInputStream; +import de.anomic.crawler.HTTPLoader; import de.anomic.http.HttpClient; +import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.parser.AbstractParser; import de.anomic.plasma.parser.Parser; @@ -250,7 +252,9 @@ public class odtParser extends AbstractParser implements Parser { testParser.setLogger(new serverLog("PARSER.ODT")); // downloading the document content - byte[] content = HttpClient.wget(contentUrl.toString()); + httpHeader reqHeader = new httpHeader(); + reqHeader.put(httpHeader.USER_AGENT, HTTPLoader.crawlerUserAgent); + byte[] content = HttpClient.wget(contentUrl.toString(), reqHeader, 10000); ByteArrayInputStream input = new ByteArrayInputStream(content); // parsing the document diff --git a/source/de/anomic/plasma/parser/rpm/rpmParser.java b/source/de/anomic/plasma/parser/rpm/rpmParser.java index 0be0c0bb9..d420d3aec 100644 --- a/source/de/anomic/plasma/parser/rpm/rpmParser.java +++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java @@ -52,7 +52,9 @@ import java.util.Hashtable; import com.jguild.jrpm.io.RPMFile; import com.jguild.jrpm.io.datatype.DataTypeIf; +import de.anomic.crawler.HTTPLoader; import de.anomic.http.HttpClient; +import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.parser.AbstractParser; import de.anomic.plasma.parser.Parser; @@ -180,7 +182,9 @@ public class rpmParser extends AbstractParser implements Parser { yacyURL contentUrl = new yacyURL(args[0], null); rpmParser testParser = new rpmParser(); - byte[] content = HttpClient.wget(contentUrl.toString()); + httpHeader reqHeader = new httpHeader(); + reqHeader.put(httpHeader.USER_AGENT, HTTPLoader.crawlerUserAgent); + byte[] content = HttpClient.wget(contentUrl.toString(), reqHeader, 10000); ByteArrayInputStream input = new ByteArrayInputStream(content); testParser.parse(contentUrl, "application/x-rpm", null, input); } catch (Exception e) { diff --git a/source/de/anomic/plasma/parser/vcf/vcfParser.java b/source/de/anomic/plasma/parser/vcf/vcfParser.java index 8c0a22a88..c9658283e 100644 --- a/source/de/anomic/plasma/parser/vcf/vcfParser.java +++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java @@ -53,7 +53,9 @@ import java.util.Hashtable; import java.util.Iterator; import java.util.LinkedList; +import de.anomic.crawler.HTTPLoader; import de.anomic.http.HttpClient; +import de.anomic.http.httpHeader; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.parser.AbstractParser; @@ -291,7 +293,9 @@ public class vcfParser extends AbstractParser implements Parser { yacyURL contentUrl = new yacyURL(args[0], null); vcfParser testParser = new vcfParser(); - byte[] content = HttpClient.wget(contentUrl.toString()); + httpHeader reqHeader = new httpHeader(); + reqHeader.put(httpHeader.USER_AGENT, HTTPLoader.crawlerUserAgent); + byte[] content = HttpClient.wget(contentUrl.toString(), reqHeader, 10000); ByteArrayInputStream input = new ByteArrayInputStream(content); testParser.parse(contentUrl, "text/x-vcard", "UTF-8",input); } catch (Exception e) { diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 989ff674d..ec59285f4 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -112,6 +112,7 @@ import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.CrawlQueues; import de.anomic.crawler.CrawlStacker; import de.anomic.crawler.ErrorURL; +import de.anomic.crawler.HTTPLoader; import de.anomic.crawler.ImporterManager; import de.anomic.crawler.IndexingStack; import de.anomic.crawler.NoticedURL; @@ -2703,8 +2704,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch loadHashMap(yacyURL url) { try { // sending request - final HashMap result = nxTools.table( - HttpClient.wget(url.toString()) - , "UTF-8"); - + httpHeader reqHeader = new httpHeader(); + reqHeader.put(httpHeader.USER_AGENT, HTTPLoader.yacyUserAgent); + final HashMap result = nxTools.table(HttpClient.wget(url.toString(), reqHeader, 10000), "UTF-8"); if (result == null) return new HashMap(); return result; } catch (Exception e) { diff --git a/source/de/anomic/tools/loaderThreads.java b/source/de/anomic/tools/loaderThreads.java index b96c6fda6..076489d4b 100644 --- a/source/de/anomic/tools/loaderThreads.java +++ b/source/de/anomic/tools/loaderThreads.java @@ -40,13 +40,13 @@ package de.anomic.tools; -import java.net.MalformedURLException; import java.util.ArrayList; import java.util.Hashtable; +import de.anomic.crawler.HTTPLoader; import de.anomic.http.HttpClient; +import de.anomic.http.httpHeader; import de.anomic.http.httpRemoteProxyConfig; -import de.anomic.http.httpdProxyHandler; import de.anomic.yacy.yacyURL; public class loaderThreads { @@ -141,7 +141,9 @@ public class loaderThreads { public void run() { try { - page = HttpClient.wget(url.toString(), timeout); + httpHeader reqHeader = new httpHeader(); + reqHeader.put(httpHeader.USER_AGENT, HTTPLoader.crawlerUserAgent); + page = HttpClient.wget(url.toString(), reqHeader, timeout); loaded = true; process.feed(page); if (process.status() == loaderCore.STATUS_FAILED) { @@ -223,6 +225,7 @@ public class loaderThreads { } } + /* public static void main(String[] args) { httpdProxyHandler.setRemoteProxyConfig(httpRemoteProxyConfig.init("192.168.1.122", 3128)); loaderThreads loader = new loaderThreads(); @@ -232,5 +235,5 @@ public class loaderThreads { } } - + */ } diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index ed0de1927..4789f422d 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -60,6 +60,7 @@ import org.apache.commons.httpclient.methods.multipart.FilePart; import org.apache.commons.httpclient.methods.multipart.Part; import org.apache.commons.httpclient.methods.multipart.StringPart; +import de.anomic.crawler.HTTPLoader; import de.anomic.crawler.ResultURLs; import de.anomic.http.HttpClient; import de.anomic.http.JakartaCommonsHttpClient; @@ -263,13 +264,11 @@ public final class yacyClient { * @throws IOException */ private static byte[] wput(final String url, String vhost, final List post, final int timeout, boolean gzipBody) throws IOException { - JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(timeout, null, null); - client.setProxy(proxyConfig()); - - // address vhost httpHeader header = new httpHeader(); - header.add(httpHeader.HOST, vhost); - client.setHeader(header); + header.put(httpHeader.USER_AGENT, HTTPLoader.yacyUserAgent); + header.put(httpHeader.HOST, vhost); + JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(timeout, header, null); + client.setProxy(proxyConfig()); JakartaCommonsHttpResponse res = null; byte[] content = null; @@ -1068,6 +1067,8 @@ public final class yacyClient { final String wordhashe = indexWord.word2hash("test"); //System.out.println("permission=" + permissionMessage(args[1])); + httpHeader reqHeader = new httpHeader(); + reqHeader.put(httpHeader.USER_AGENT, HTTPLoader.crawlerUserAgent); final byte[] content = HttpClient.wget( "http://" + target.getPublicAddress() + "/yacy/search.html" + "?myseed=" + sb.webIndex.seedDB.mySeed().genSeedStr(null) + @@ -1077,7 +1078,7 @@ public final class yacyClient { "&resource=global" + "&query=" + wordhashe + "&network.unit.name=" + plasmaSwitchboard.getSwitchboard().getConfig("network.unit.name", yacySeed.DFLT_NETWORK_UNIT), - target.getHexHash() + ".yacyh"); + reqHeader, 10000, target.getHexHash() + ".yacyh"); final HashMap result = nxTools.table(content, "UTF-8"); System.out.println("Result=" + result.toString()); } catch (Exception e) { diff --git a/source/de/anomic/yacy/yacySeedDB.java b/source/de/anomic/yacy/yacySeedDB.java index 87976f5c1..c9156073e 100644 --- a/source/de/anomic/yacy/yacySeedDB.java +++ b/source/de/anomic/yacy/yacySeedDB.java @@ -60,6 +60,7 @@ import java.util.Iterator; import java.util.Map; import java.util.TreeMap; +import de.anomic.crawler.HTTPLoader; import de.anomic.http.JakartaCommonsHttpClient; import de.anomic.http.JakartaCommonsHttpResponse; import de.anomic.http.httpHeader; @@ -849,7 +850,8 @@ public final class yacySeedDB implements httpdAlternativeDomainNames { // Configure http headers httpHeader reqHeader = new httpHeader(); reqHeader.put(httpHeader.PRAGMA, "no-cache"); - reqHeader.put(httpHeader.CACHE_CONTROL, "no-cache"); // httpc uses HTTP/1.0 is this necessary? + reqHeader.put(httpHeader.CACHE_CONTROL, "no-cache"); // httpc uses HTTP/1.0 is this necessary? + reqHeader.put(httpHeader.USER_AGENT, HTTPLoader.yacyUserAgent); // init http-client JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(10000, reqHeader, null); diff --git a/source/de/anomic/yacy/yacyVersion.java b/source/de/anomic/yacy/yacyVersion.java index 90ec5ba5d..90d89cba9 100644 --- a/source/de/anomic/yacy/yacyVersion.java +++ b/source/de/anomic/yacy/yacyVersion.java @@ -42,9 +42,11 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; +import de.anomic.crawler.HTTPLoader; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.JakartaCommonsHttpClient; import de.anomic.http.JakartaCommonsHttpResponse; +import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverCore; import de.anomic.server.serverFileUtils; @@ -336,7 +338,9 @@ public final class yacyVersion implements Comparator, Comparable