From f810915717579d490259d70610dc4118b7c6e6e9 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 11 May 2015 16:30:41 +0200 Subject: [PATCH] added crawl start from a clone with very, very large url: they are now encoded as post submit form inside a javascript creation function. --- htroot/CrawlStartExpert.java | 2 +- htroot/Table_API_p.html | 24 +++++++++++++- htroot/Table_API_p.java | 21 +++++++++++-- .../cora/document/id/MultiProtocolURL.java | 31 ++++++++++++------- source/net/yacy/data/WorkTables.java | 12 +++---- 5 files changed, 67 insertions(+), 23 deletions(-) diff --git a/htroot/CrawlStartExpert.java b/htroot/CrawlStartExpert.java index 79939bd54..037ea228d 100644 --- a/htroot/CrawlStartExpert.java +++ b/htroot/CrawlStartExpert.java @@ -55,7 +55,7 @@ public class CrawlStartExpert { // ---------- Start point // crawl start URL if (post != null && post.containsKey("crawlingURL")) { - final String crawlingURL = post.get("crawlingURL", ""); + final String crawlingURL = post.get("crawlingURL", "").replaceAll("%0D%0A", "\n").replaceAll("%0A", "\n").replaceAll("%0D", "\n"); prop.put("starturl", crawlingURL); // simple check for content since it may be empty if (!crawlingURL.trim().isEmpty()) { diff --git a/htroot/Table_API_p.html b/htroot/Table_API_p.html index 233533f1b..aa3cf30bc 100644 --- a/htroot/Table_API_p.html +++ b/htroot/Table_API_p.html @@ -83,7 +83,29 @@ To see a list of all APIs, please visit the - #[type]##(isCrawlerStart)#::

#(/isCrawlerStart)# + #[type]# + #(isCrawlerStart)#::

+ ::

+ + + + + #(/isCrawlerStart)# #[comment]# #[callcount]# #[dateRecording]# diff --git a/htroot/Table_API_p.java b/htroot/Table_API_p.java index b86e477b8..513bb46ae 100644 --- a/htroot/Table_API_p.java +++ b/htroot/Table_API_p.java @@ -31,6 +31,7 @@ import java.util.regex.Pattern; import net.yacy.cora.date.AbstractFormatter; import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; @@ -290,9 +291,25 @@ public class Table_API_p { // check type & action to link crawl start URLs back to CrawlStartExpert.html if (prop.get("showtable_list_" + count + "_type", "").equals(WorkTables.TABLE_API_TYPE_CRAWLER) && prop.get("showtable_list_" + count + "_comment", "").startsWith("crawl start for")) { - prop.put("showtable_list_" + count + "_isCrawlerStart", 1); final String editUrl = UTF8.String(row.get(WorkTables.TABLE_API_COL_URL)).replace("Crawler_p", "CrawlStartExpert"); - prop.put("showtable_list_" + count + "_isCrawlerStart_url", editUrl); + if (editUrl.length() > 1000) { + final MultiProtocolURL u = new MultiProtocolURL("http://localhost:8090" + editUrl); + prop.put("showtable_list_" + count + "_isCrawlerStart", 2); + prop.put("showtable_list_" + count + "_isCrawlerStart_pk", UTF8.String(row.getPK())); + prop.put("showtable_list_" + count + "_isCrawlerStart_servlet", "/CrawlStartExpert.html"); + Map attr = u.getAttributes(); + int ac = 0; + for (Map.Entry entry: attr.entrySet()) { + prop.put("showtable_list_" + count + "_isCrawlerStart_attr_" + ac + "_key", entry.getKey()); + prop.put("showtable_list_" + count + "_isCrawlerStart_attr_" + ac + "_value", entry.getValue()); + ac++; + } + prop.put("showtable_list_" + count + "_isCrawlerStart_attr", ac); + } else { + // short calls + prop.put("showtable_list_" + count + "_isCrawlerStart", 1); + prop.put("showtable_list_" + count + "_isCrawlerStart_url", editUrl); + } 
} else { prop.put("showtable_list_" + count + "_isCrawlerStart", 0); } diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index dd9a252f2..ee5a8415b 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -516,18 +516,12 @@ public class MultiProtocolURL implements Serializable, Comparable element: getAttributes().entrySet()) { + qtmp.append('&'); + qtmp.append(element.getKey()); + qtmp.append('='); + qtmp.append(escape(element.getValue())); } this.searchpart = qtmp.substring((qtmp.length() > 0) ? 1 : 0); } @@ -1013,6 +1007,21 @@ public class MultiProtocolURL implements Serializable, Comparable getAttributes() { + Map map = new LinkedHashMap<>(); + if (this.searchpart == null) return map; + final String[] questp = CommonPattern.AMP.split(this.searchpart, -1); + for (final String element : questp) { + int p = element.indexOf('='); + if (p != -1) { + map.put(element.substring(0, p), element.substring(p + 1)); + } else { + map.put(element, ""); /* no '=' in this element: the whole element is the key, with an empty value (substring(0, -1) would throw) */ + } + } + return map; + } + private static CharType charType(final char c) { if (c >= 'a' && c <= 'z') return CharType.low; if (c >= '0' && c <= '9') return CharType.number; diff --git a/source/net/yacy/data/WorkTables.java b/source/net/yacy/data/WorkTables.java index 90ac309f8..21f156e30 100644 --- a/source/net/yacy/data/WorkTables.java +++ b/source/net/yacy/data/WorkTables.java @@ -240,18 +240,15 @@ public class WorkTables extends Tables { if (row == null) continue; String theapicall = UTF8.String(row.get(WorkTables.TABLE_API_COL_URL)) + "&" + WorkTables.TABLE_API_COL_APICALL_PK + "=" + UTF8.String(row.getPK()); try { + MultiProtocolURL url = new MultiProtocolURL("http", host, port, theapicall); // use 4 param MultiProtocolURL to allow api_row_url with searchpart (like url?p=a&p2=b ) in client.GETbytes() if (theapicall.length() > 1000) { // use a POST to execute 
the call - int ai = theapicall.indexOf('?'); - String[] tacs = theapicall.substring(ai + 1).split("&"); Map post = new HashMap<>(); - for (String a: tacs) { - int f = a.indexOf('='); - if (f < 0) continue; - post.put(a.substring(0, f), UTF8.StringBody(a.substring(f + 1))); + for (Map.Entry a: url.getAttributes().entrySet()) { + post.put(a.getKey(), UTF8.StringBody(a.getValue())); } - MultiProtocolURL url = new MultiProtocolURL("http", host, port, theapicall.substring(0, ai)); + url = new MultiProtocolURL("http", host, port, url.getFileName()); try { client.POSTbytes(url, "localhost", post, false, false); } catch (final IOException e) { @@ -260,7 +257,6 @@ public class WorkTables extends Tables { } } else { // use a GET to execute the call - MultiProtocolURL url = new MultiProtocolURL("http", host, port, theapicall); ConcurrentLog.info("WorkTables", "executing url: " + url.toNormalform(true)); try { client.GETbytes(url, username, pass, false); // use GETbytes(MultiProtocolURL,..) form to allow url in parameter (&url=path%