From d052bbdfe1a9d409664a826df67c70387efa5d41 Mon Sep 17 00:00:00 2001
From: reger
Date: Sat, 15 Mar 2014 21:52:42 +0100
Subject: [PATCH 1/7] prevent exception on Site Crawl if no start url is given

---
 htroot/Crawler_p.java | 91 ++++++++++++++++++++++++-------------
 1 file changed, 52 insertions(+), 39 deletions(-)

diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 5f9a238b6..4bb106d23 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -368,47 +368,60 @@ public class Crawler_p {
             prop.putHTML("info_error", e.getMessage());
         }
 
+        boolean hasCrawlstartDataOK = true;
+        // check crawlurl was given in sitecrawl
+        if ("url".equals(crawlingMode) && rootURLs.size() == 0) hasCrawlstartDataOK = false;
+
         // prepare a new crawling profile
-        final CrawlProfile profile = new CrawlProfile(
-                crawlName,
-                newcrawlingMustMatch,
-                newcrawlingMustNotMatch,
-                ipMustMatch,
-                ipMustNotMatch,
-                countryMustMatch,
-                crawlerNoDepthLimitMatch,
-                indexUrlMustMatch,
-                indexUrlMustNotMatch,
-                indexContentMustMatch,
-                indexContentMustNotMatch,
-                newcrawlingdepth,
-                directDocByURL,
-                crawlingIfOlder,
-                crawlingDomMaxPages,
-                crawlingQ, followFrames, obeyHtmlRobotsNoindex,
-                indexText,
-                indexMedia,
-                storeHTCache,
-                crawlOrder,
-                cachePolicy,
-                collection,
-                agentName);
-        byte[] handle = ASCII.getBytes(profile.handle());
-
-        // before we fire up a new crawl, we make sure that another crawl with the same name is not running
-        sb.crawler.removeActive(handle);
-        sb.crawler.removePassive(handle);
-        try {sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);} catch (final SpaceExceededException e1) {}
-
-        // delete all error urls for that domain
-        Set<String> hosthashes = new HashSet<String>();
-        for (DigestURL u: rootURLs) {
-            sb.index.fulltext().remove(u.hash());
-            hosthashes.add(u.hosthash());
+        final CrawlProfile profile;
+        byte[] handle;
+        if (hasCrawlstartDataOK) {
+            profile = new CrawlProfile(
+                    crawlName,
+                    newcrawlingMustMatch,
+                    newcrawlingMustNotMatch,
+                    ipMustMatch,
+                    ipMustNotMatch,
+                    countryMustMatch,
+                    crawlerNoDepthLimitMatch,
+                    indexUrlMustMatch,
+                    indexUrlMustNotMatch,
+                    indexContentMustMatch,
+                    indexContentMustNotMatch,
+                    newcrawlingdepth,
+                    directDocByURL,
+                    crawlingIfOlder,
+                    crawlingDomMaxPages,
+                    crawlingQ, followFrames, obeyHtmlRobotsNoindex,
+                    indexText,
+                    indexMedia,
+                    storeHTCache,
+                    crawlOrder,
+                    cachePolicy,
+                    collection,
+                    agentName);
+            handle = ASCII.getBytes(profile.handle());
+
+            // before we fire up a new crawl, we make sure that another crawl with the same name is not running
+            sb.crawler.removeActive(handle);
+            sb.crawler.removePassive(handle);
+            try {
+                sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);
+            } catch (final SpaceExceededException e1) { }
+
+            // delete all error urls for that domain
+            Set<String> hosthashes = new HashSet<String>();
+            for (DigestURL u : rootURLs) {
+                sb.index.fulltext().remove(u.hash());
+                hosthashes.add(u.hosthash());
+            }
+            sb.crawlQueues.errorURL.removeHosts(hosthashes);
+            sb.index.fulltext().commit(true);
+        } else {
+            profile = null;
+            handle = null;
         }
-        sb.crawlQueues.errorURL.removeHosts(hosthashes);
-        sb.index.fulltext().commit(true);
-
+
         // start the crawl
         if ("url".equals(crawlingMode)) {
             if (rootURLs.size() == 0) {
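
The guard in the patch above follows a simple pattern: build the CrawlProfile, and run the cleanup that dereferences it, only when the submitted crawl start actually carries a start URL; otherwise leave profile and handle null and let the existing "url".equals(crawlingMode) branch report the missing URL. A minimal, self-contained sketch of that pattern follows; Profile, prepareProfile and the start-URL list are hypothetical stand-ins, not YaCy API.

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class CrawlStartGuard {

    /** Hypothetical stand-in for YaCy's CrawlProfile. */
    static final class Profile {
        final String name;
        Profile(final String name) { this.name = name; }
    }

    /**
     * Mirrors the hasCrawlstartDataOK check: a "url" (Site Crawl) request
     * without any start URL yields no profile instead of blowing up later.
     */
    static Profile prepareProfile(final String crawlingMode, final List<String> startUrls) {
        final boolean hasCrawlstartDataOK = !("url".equals(crawlingMode) && startUrls.isEmpty());
        if (!hasCrawlstartDataOK) {
            return null; // caller reports the missing start URL instead of crashing
        }
        return new Profile("crawl of " + startUrls.get(0));
    }

    public static void main(final String[] args) {
        System.out.println(prepareProfile("url", Collections.<String>emptyList())); // null, no exception
        System.out.println(prepareProfile("url", Arrays.asList("http://example.org/")).name);
    }
}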
From 923c9762cd9a894a5c4a1cf296fc475dc81fe131 Mon Sep 17 00:00:00 2001
From: reger
Date: Sat, 15 Mar 2014 23:02:17 +0100
Subject: [PATCH 2/7] upd pom to latest jars

---
 pom.xml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pom.xml b/pom.xml
index 455dde921..b46297b49 100644
--- a/pom.xml
+++ b/pom.xml
@@ -325,7 +325,7 @@
         <dependency>
             <groupId>org.apache.pdfbox</groupId>
             <artifactId>fontbox</artifactId>
-            <version>1.8.3</version>
+            <version>1.8.4</version>
         </dependency>
         <dependency>
             <groupId>org.apache.geronimo.specs</groupId>
@@ -546,7 +546,7 @@
         <dependency>
             <groupId>xerces</groupId>
             <artifactId>xercesImpl</artifactId>
-            <version>2.7.1</version>
+            <version>2.11.0</version>
         </dependency>

From 81a846ec3395afa879c08f420a96a7a454c2cd7a Mon Sep 17 00:00:00 2001
From: reger
Date: Sun, 16 Mar 2014 20:51:32 +0100
Subject: [PATCH 6/7] fix: set YaCy CONNECTION_PROP_HOST Header in ProxyServlet to host incl. port

---
 .../yacy/http/servlets/YaCyProxyServlet.java | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/source/net/yacy/http/servlets/YaCyProxyServlet.java b/source/net/yacy/http/servlets/YaCyProxyServlet.java
index 6517d4bd1..c0ae0a786 100644
--- a/source/net/yacy/http/servlets/YaCyProxyServlet.java
+++ b/source/net/yacy/http/servlets/YaCyProxyServlet.java
@@ -127,14 +127,10 @@ public class YaCyProxyServlet extends ProxyServlet implements Servlet {
             response.sendError(HttpServletResponse.SC_NOT_FOUND,"url parameter missing");
             return;
         }
-        int port = proxyurl.getPort();
-        if (port < 1) {
-            port = 80;
-        }
 
-        String host = proxyurl.getHost();
+        String hostwithport = proxyurl.getHost();
         if (proxyurl.getPort() != -1) {
-            host += ":" + proxyurl.getPort();
+            hostwithport += ":" + proxyurl.getPort();
         }
         RequestHeader yacyRequestHeader = ProxyHandler.convertHeaderFromJetty(request);
         yacyRequestHeader.remove(RequestHeader.KEEP_ALIVE);
@@ -142,12 +138,12 @@
 
         final HashMap<String, Object> prop = new HashMap<String, Object>();
         prop.put(HeaderFramework.CONNECTION_PROP_HTTP_VER, HeaderFramework.HTTP_VERSION_1_1);
-        prop.put(HeaderFramework.CONNECTION_PROP_HOST, proxyurl.getHost());
+        prop.put(HeaderFramework.CONNECTION_PROP_HOST, hostwithport);
         prop.put(HeaderFramework.CONNECTION_PROP_PATH, proxyurl.getFile().replaceAll(" ", "%20"));
         prop.put(HeaderFramework.CONNECTION_PROP_REQUESTLINE, "PROXY");
         prop.put(HeaderFramework.CONNECTION_PROP_CLIENTIP, Domains.LOCALHOST);
-        yacyRequestHeader.put(HeaderFramework.HOST, proxyurl.getHost());
+        yacyRequestHeader.put(HeaderFramework.HOST, hostwithport );
         // temporarily add argument to header to pass it on to augmented browsing
         if (action != null) yacyRequestHeader.put("YACYACTION", action);
@@ -185,7 +181,7 @@
             if (location.startsWith("http")) {
                 location = request.getServletPath() + actioncmdstr + "url=" + location;
             } else {
-                location = request.getServletPath() + actioncmdstr + "url=http://" + proxyurl.getHost() + "/" + location;
+                location = request.getServletPath() + actioncmdstr + "url=http://" + hostwithport + "/" + location;
             }
             response.addHeader(HeaderFramework.LOCATION, location);
         }
@@ -262,14 +258,14 @@
 
                     } else if (url.startsWith("/")) {
                         // absolute path of form href="/absolute/path/to/linked/page"
-                        String newurl = init + servletstub + "http://" + host + url;
+                        String newurl = init + servletstub + "http://" + hostwithport + url;
                         newurl = newurl.replaceAll("\\$", "\\\\\\$");
                         m.appendReplacement(result, newurl);
 
                     } else {
                         // relative path of form href="relative/path"
                         try {
-                            MultiProtocolURL target = new MultiProtocolURL("http://" + host + directory + "/" + url);
+                            MultiProtocolURL target = new MultiProtocolURL("http://" + hostwithport + directory + "/" + url);
                             String newurl = init + servletstub + target.toString();
                             newurl = newurl.replaceAll("\\$", "\\\\\\$");
                             m.appendReplacement(result, newurl);
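
The fix above hinges on the Host value carrying the port whenever the proxied URL names one explicitly. A small sketch of the same host:port handling, using java.net.URL in place of YaCy's URL class (assuming the same convention that getPort() returns -1 when no explicit port is present, which is what java.net.URL guarantees):

import java.net.URL;

public class HostHeaderSketch {

    /** Builds "host" or "host:port" the way the patched servlet does. */
    static String hostWithPort(final URL proxyurl) {
        String hostwithport = proxyurl.getHost();
        if (proxyurl.getPort() != -1) { // java.net.URL.getPort() is -1 when the URL has no explicit port
            hostwithport += ":" + proxyurl.getPort();
        }
        return hostwithport;
    }

    public static void main(final String[] args) throws Exception {
        System.out.println(hostWithPort(new URL("http://example.org/index.html")));      // example.org
        System.out.println(hostWithPort(new URL("http://example.org:8090/index.html"))); // example.org:8090
    }
}

Sending only the bare host would let the proxied request and the rewritten links fall back to the default port, which appears to be what this change guards against.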
From 9f02d2c47b4d4bb802f8f84a5bc2dd7a4826abb4 Mon Sep 17 00:00:00 2001
From: reger
Date: Sun, 16 Mar 2014 22:11:19 +0100
Subject: [PATCH 7/7] fix: remove link to triplestore in Vocabulary_p (the
 triplestore no longer exists)

- should be investigated in more detail to look for additional implications

Remove "yacyaction" from the proxy servlet, as it was only needed for the
removed interaction routines.
---
 htroot/Vocabulary_p.html                             |  2 +-
 source/net/yacy/http/servlets/YaCyProxyServlet.java  | 13 ++-----------
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/htroot/Vocabulary_p.html b/htroot/Vocabulary_p.html
index d3ac606b3..5b1c7b019 100644
--- a/htroot/Vocabulary_p.html
+++ b/htroot/Vocabulary_p.html
@@ -134,7 +134,7 @@ To see a list of all APIs, please visit the
       if set, uses the predicate
       #[objectspacepredicate]# for generated objects. Hint: use 'http://dbpedia.org/resource/' as default.#(/editable)#
-      This produces the following triples in the triplestore if a term or synonym matches in a document:
+      This produces the following triples in the triplestore if a term or synonym matches in a document:
       Triple #1
       #[triple1]#
       Triple #2
       #[triple2]#
diff --git a/source/net/yacy/http/servlets/YaCyProxyServlet.java b/source/net/yacy/http/servlets/YaCyProxyServlet.java
index c0ae0a786..b1c93f00f 100644
--- a/source/net/yacy/http/servlets/YaCyProxyServlet.java
+++ b/source/net/yacy/http/servlets/YaCyProxyServlet.java
@@ -93,7 +93,6 @@ public class YaCyProxyServlet extends ProxyServlet implements Servlet {
         if ("CONNECT".equalsIgnoreCase(request.getMethod())) {
             handleConnect(request, response);
         } else {
-            String action = null;
 
             final Continuation continuation = ContinuationSupport.getContinuation(request);
 
@@ -108,11 +107,6 @@ public class YaCyProxyServlet extends ProxyServlet implements Servlet {
                 return;
             }
 
-            if (strARGS.startsWith("action=")) {
-                int detectnextargument = strARGS.indexOf("&");
-                action = strARGS.substring(7, detectnextargument);
-                strARGS = strARGS.substring(detectnextargument + 1);
-            }
             if (strARGS.startsWith("url=")) {
                 final String strUrl = strARGS.substring(4); // strip "url="
 
@@ -144,8 +138,6 @@ public class YaCyProxyServlet extends ProxyServlet implements Servlet {
             prop.put(HeaderFramework.CONNECTION_PROP_CLIENTIP, Domains.LOCALHOST);
             yacyRequestHeader.put(HeaderFramework.HOST, hostwithport );
 
-            // temporarily add argument to header to pass it on to augmented browsing
-            if (action != null) yacyRequestHeader.put("YACYACTION", action);
             final ByteArrayOutputStream tmpproxyout = new ByteArrayOutputStream();
             HTTPDProxyHandler.doGet(prop, yacyRequestHeader, tmpproxyout, ClientIdentification.yacyProxyAgent);
 
@@ -177,11 +169,10 @@ public class YaCyProxyServlet extends ProxyServlet implements Servlet {
             if (response.getHeader(HeaderFramework.LOCATION) != null) {
                 // rewrite location header
                 String location = response.getHeader(HeaderFramework.LOCATION);
-                final String actioncmdstr = (action != null) ? "?action=" + action + "&" : "?";
                 if (location.startsWith("http")) {
-                    location = request.getServletPath() + actioncmdstr + "url=" + location;
+                    location = request.getServletPath() + "?url=" + location;
                 } else {
-                    location = request.getServletPath() + actioncmdstr + "url=http://" + hostwithport + "/" + location;
+                    location = request.getServletPath() + "?url=http://" + hostwithport + "/" + location;
                 }
                 response.addHeader(HeaderFramework.LOCATION, location);
             }
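
With the action parameter and the YACYACTION header gone, the Location rewrite in the last hunk reduces to prefixing the servlet path and a plain ?url= parameter. A minimal sketch of that rewrite rule; the servlet path and host:port values below are made-up examples, not taken from the YaCy configuration.

public class LocationRewriteSketch {

    /** Rewrites an upstream redirect target so it stays behind the proxy servlet. */
    static String rewriteLocation(final String servletPath, final String hostwithport, final String location) {
        if (location.startsWith("http")) {
            // absolute redirect target: proxy it as-is
            return servletPath + "?url=" + location;
        }
        // relative redirect target: anchor it at the proxied host (including its port)
        return servletPath + "?url=http://" + hostwithport + "/" + location;
    }

    public static void main(final String[] args) {
        System.out.println(rewriteLocation("/proxy.html", "example.org:8090", "http://example.org:8090/new"));
        System.out.println(rewriteLocation("/proxy.html", "example.org:8090", "new/location.html"));
    }
}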