Added crawl start from a clone with very, very large URLs: they are now

encoded as a POST submit form inside a JavaScript form-creation function.
pull/8/head
Michael Peter Christen 10 years ago
parent 51de86c992
commit f810915717

@ -55,7 +55,7 @@ public class CrawlStartExpert {
// ---------- Start point
// crawl start URL
if (post != null && post.containsKey("crawlingURL")) {
final String crawlingURL = post.get("crawlingURL", "");
final String crawlingURL = post.get("crawlingURL", "").replaceAll("%0D%0A", "\n").replaceAll("%0A", "\n").replaceAll("%0D", "\n");
prop.put("starturl", crawlingURL);
// simple check for content since it may be empty
if (!crawlingURL.trim().isEmpty()) {

@ -83,7 +83,29 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
#{list}#
<tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#" id="#[pk]#">
<td valign="top" align="left"><input type="checkbox" name="item_#[count]#" value="mark_#[pk]#" /></td>
<td valign="top">#[type]##(isCrawlerStart)#::<br/><br/><a href="#[url]#" title="clone"><img src="env/grafics/doc.gif"><img src="env/grafics/right.gif"><img src="env/grafics/doc.gif"></a>#(/isCrawlerStart)#</td>
<td valign="top">#[type]#
#(isCrawlerStart)#::<br/><br/>
<a href="#[url]#" title="clone"><img src="env/grafics/doc.gif"><img src="env/grafics/right.gif"><img src="env/grafics/doc.gif"></a>::<br/><br/>
<script>
var f = document.createElement("form");
f.setAttribute("method", "post");
f.setAttribute("enctype", "multipart/form-data");
f.setAttribute("accept-charset", "UTF-8");
f.setAttribute("action", "#[servlet]#");
f.setAttribute("id", "#[pk]#");
#{attr}#
var e = document.createElement("input");
e.setAttribute("type", "hidden");
e.setAttribute("name", "#[key]#");
e.setAttribute("value", "#[value]#");
f.appendChild(e);
#{/attr}#
document.body.appendChild(f);
</script>
<a href="#" title="clone" onclick="document.forms['#[pk]#'].submit(); return false;"><img src="env/grafics/doc.gif"><img src="env/grafics/right.gif"><img src="env/grafics/doc.gif"></a>
#(/isCrawlerStart)#</td>
<td valign="top">#[comment]#</td>
<td valign="top">#[callcount]#</td>
<td valign="top">#[dateRecording]#</td>

@ -31,6 +31,7 @@ import java.util.regex.Pattern;
import net.yacy.cora.date.AbstractFormatter;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
@ -290,9 +291,25 @@ public class Table_API_p {
// check type & action to link crawl start URLs back to CrawlStartExpert.html
if (prop.get("showtable_list_" + count + "_type", "").equals(WorkTables.TABLE_API_TYPE_CRAWLER)
&& prop.get("showtable_list_" + count + "_comment", "").startsWith("crawl start for")) {
prop.put("showtable_list_" + count + "_isCrawlerStart", 1);
final String editUrl = UTF8.String(row.get(WorkTables.TABLE_API_COL_URL)).replace("Crawler_p", "CrawlStartExpert");
prop.put("showtable_list_" + count + "_isCrawlerStart_url", editUrl);
if (editUrl.length() > 1000) {
final MultiProtocolURL u = new MultiProtocolURL("http://localhost:8090" + editUrl);
prop.put("showtable_list_" + count + "_isCrawlerStart", 2);
prop.put("showtable_list_" + count + "_isCrawlerStart_pk", UTF8.String(row.getPK()));
prop.put("showtable_list_" + count + "_isCrawlerStart_servlet", "/CrawlStartExpert.html");
Map<String, String> attr = u.getAttributes();
int ac = 0;
for (Map.Entry<String, String> entry: attr.entrySet()) {
prop.put("showtable_list_" + count + "_isCrawlerStart_attr_" + ac + "_key", entry.getKey());
prop.put("showtable_list_" + count + "_isCrawlerStart_attr_" + ac + "_value", entry.getValue());
ac++;
}
prop.put("showtable_list_" + count + "_isCrawlerStart_attr", ac);
} else {
// short calls
prop.put("showtable_list_" + count + "_isCrawlerStart", 1);
prop.put("showtable_list_" + count + "_isCrawlerStart_url", editUrl);
}
} else {
prop.put("showtable_list_" + count + "_isCrawlerStart", 0);
}

@ -516,18 +516,12 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
}
private void escapeSearchpart() {
final String[] questp = CommonPattern.AMP.split(this.searchpart, -1);
final StringBuilder qtmp = new StringBuilder(this.searchpart.length() + 10);
for (final String element : questp) {
if (element.indexOf('=') != -1) {
qtmp.append('&');
qtmp.append(escape(element.substring(0, element.indexOf('='))));
qtmp.append('=');
qtmp.append(escape(element.substring(element.indexOf('=') + 1)));
} else {
qtmp.append('&');
qtmp.append(escape(element));
}
for (final Map.Entry<String, String> element: getAttributes().entrySet()) {
qtmp.append('&');
qtmp.append(element.getKey());
qtmp.append('=');
qtmp.append(escape(element.getValue()));
}
this.searchpart = qtmp.substring((qtmp.length() > 0) ? 1 : 0);
}
@ -1013,6 +1007,21 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
return token;
}
/**
 * Splits the search part (query string) of this URL into its attributes.
 * <p>
 * The search part is split at every ampersand. Each segment is split at its
 * first '=' into key and value; a segment without '=' is stored as a key with
 * an empty value. Keys and values are returned exactly as they appear in the
 * search part; no unescaping is done here. If a key occurs more than once,
 * the last value wins while the position of the first occurrence is kept.
 *
 * @return a new, modifiable map of the attributes in the order they appear in
 *         the search part; empty (never null) if this URL has no search part
 */
public Map<String, String> getAttributes() {
    final Map<String, String> map = new LinkedHashMap<>();
    if (this.searchpart == null) return map;
    final String[] questp = CommonPattern.AMP.split(this.searchpart, -1);
    for (final String element : questp) {
        final int p = element.indexOf('=');
        if (p != -1) {
            map.put(element.substring(0, p), element.substring(p + 1));
        } else {
            // no '=' in this segment: the whole segment is the key.
            // (substring(0, p) with p == -1 would throw StringIndexOutOfBoundsException)
            map.put(element, "");
        }
    }
    return map;
}
private static CharType charType(final char c) {
if (c >= 'a' && c <= 'z') return CharType.low;
if (c >= '0' && c <= '9') return CharType.number;

@ -240,18 +240,15 @@ public class WorkTables extends Tables {
if (row == null) continue;
String theapicall = UTF8.String(row.get(WorkTables.TABLE_API_COL_URL)) + "&" + WorkTables.TABLE_API_COL_APICALL_PK + "=" + UTF8.String(row.getPK());
try {
MultiProtocolURL url = new MultiProtocolURL("http", host, port, theapicall);
// use 4 param MultiProtocolURL to allow api_row_url with searchpart (like url?p=a&p2=b ) in client.GETbytes()
if (theapicall.length() > 1000) {
// use a POST to execute the call
int ai = theapicall.indexOf('?');
String[] tacs = theapicall.substring(ai + 1).split("&");
Map<String, ContentBody> post = new HashMap<>();
for (String a: tacs) {
int f = a.indexOf('=');
if (f < 0) continue;
post.put(a.substring(0, f), UTF8.StringBody(a.substring(f + 1)));
for (Map.Entry<String, String> a: url.getAttributes().entrySet()) {
post.put(a.getKey(), UTF8.StringBody(a.getValue()));
}
MultiProtocolURL url = new MultiProtocolURL("http", host, port, theapicall.substring(0, ai));
url = new MultiProtocolURL("http", host, port, url.getFileName());
try {
client.POSTbytes(url, "localhost", post, false, false);
} catch (final IOException e) {
@ -260,7 +257,6 @@ public class WorkTables extends Tables {
}
} else {
// use a GET to execute the call
MultiProtocolURL url = new MultiProtocolURL("http", host, port, theapicall);
ConcurrentLog.info("WorkTables", "executing url: " + url.toNormalform(true));
try {
client.GETbytes(url, username, pass, false); // use GETbytes(MultiProtocolURL,..) form to allow url in parameter (&url=path%

Loading…
Cancel
Save