added a regular expression test servlet which is linked within the

parser/crawler error page whenever a problem with a regular expression
occurs.
This makes it easy to correct and enhance the must-match and
must-not-match patterns just by trying out which pattern could be
correct.
pull/1/head
Michael Peter Christen 13 years ago
parent 0504b01bdc
commit a30653a864

@ -30,6 +30,7 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.peers.Seed;
import net.yacy.search.Switchboard;
import de.anomic.crawler.CrawlStacker;
import de.anomic.crawler.ZURL;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -89,7 +90,17 @@ public class IndexCreateParserErrors_p {
prop.putHTML("rejected_list_"+j+"_initiator", ((initiatorSeed == null) ? "proxy" : initiatorSeed.getName()));
prop.putHTML("rejected_list_"+j+"_executor", ((executorSeed == null) ? "proxy" : executorSeed.getName()));
prop.putHTML("rejected_list_"+j+"_url", url.toNormalform(false, true));
prop.putHTML("rejected_list_"+j+"_failreason", entry.anycause());
String cause = entry.anycause();
if (cause.startsWith(CrawlStacker.ERROR_NO_MATCH_MUST_MATCH_FILTER)) {
prop.put("rejected_list_"+j+"_failreason", "(<a href=\"/RegexTest.html?text=" + url.toNormalform(false, true) +
"&regex=" + cause.substring(CrawlStacker.ERROR_NO_MATCH_MUST_MATCH_FILTER.length()) + "\">test</a>) " + cause);
} else if (cause.startsWith(CrawlStacker.ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER)) {
prop.put("rejected_list_"+j+"_failreason", "(<a href=\"/RegexTest.html?text=" + url.toNormalform(false, true) +
"&regex=" + cause.substring(CrawlStacker.ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER.length()) + "\">test</a>) " + cause);
} else {
prop.putHTML("rejected_list_"+j+"_failreason", cause);
}
prop.put("rejected_list_"+j+"_dark", dark ? "1" : "0");
dark = !dark;
j++;

@ -0,0 +1,39 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >
<head>
<title>YaCy '#[clientname]#': RegexTest</title>
#%env/templates/metas.template%#
</head>
<body id="IndexCreate">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>RegexTest</h2>
<fieldset>
<form method="get" action="RegexTest.html" accept-charset="UTF-8">
<dl>
<dt>
<label>Test String</label>
</dt>
<dd>
<input name="text" id="text" type="text" size="80" maxlength="256" value="#[text]#" style="font-size:16px"/>
</dd>
<dt>
<label>Regular Expression</label>
</dt>
<dd>
<textarea name="regex" id="regex" cols="80" rows="5" style="font-size:16px">#[regex]#</textarea>
</dd>
<dt>
<label>Result</label>
</dt>
#(match)#<dd><img src="/env/grafics/nok.png" alt="no match" /> no match</dd>::<dd><img src="/env/grafics/ok.png" alt="match" /> match</dd>::<dd><img src="/env/grafics/bad.png" alt="error" /> error in expression: #[error]#</dd>#(/match)#
<dt></dt>
<dd>
<input name="submit" id="submit" type="submit" />
</dd>
</dl>
</form>
</fieldset>
#%env/templates/footer.template%#
</body>
</html>

@ -0,0 +1,51 @@
/**
* RegexTest
* Copyright 2012 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 14.09.2011 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.protocol.RequestHeader;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
/**
 * Servlet behind the RegexTest page: matches a test string against a
 * regular expression and reports the outcome to the page template.
 */
public class RegexTest {

    /**
     * Evaluates the submitted regular expression against the submitted text.
     *
     * @param header request header; not used
     * @param post   request parameters; may be {@code null}, in which case the
     *               text defaults to the empty string and the expression to
     *               {@code .*}
     * @param env    server environment; not used
     * @return template properties {@code text}, {@code regex}, {@code match}
     *         (0 = no match, 1 = match, 2 = invalid expression) and
     *         {@code match_error} (the syntax error message, or empty)
     */
    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) {
        final String text = (post == null) ? "" : post.get("text", "");
        final String regex = (post == null) ? ".*" : post.get("regex", ".*");

        String error = "";
        Boolean match = null; // stays null when the expression cannot be compiled
        try {
            // String.matches requires the whole text to match the expression
            match = text.matches(regex);
        } catch (final PatternSyntaxException e) {
            error = e.getMessage();
        }

        final serverObjects prop = new serverObjects();
        // text, regex and the error message all originate from the request and
        // are echoed into an attribute value, a textarea and a <dd> of the page,
        // so they are written with putHTML rather than the raw put to keep a
        // crafted query string from injecting markup into the page.
        prop.putHTML("text", text);
        prop.putHTML("regex", regex);
        prop.put("match", match == null ? 2 : (match.booleanValue() ? 1 : 0));
        prop.putHTML("match_error", error);
        return prop;
    }
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 732 B

@ -65,6 +65,10 @@ import de.anomic.crawler.retrieval.SMBLoader;
public final class CrawlStacker {
public static String ERROR_NO_MATCH_MUST_MATCH_FILTER = "url does not match must-match filter ";
public static String ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER = "url matches must-not-match filter ";
private final Log log = new Log("STACKCRAWL");
private final WorkflowProcessor<Request> fastQueue, slowQueue;
@ -415,13 +419,13 @@ public final class CrawlStacker {
// filter with must-match for URLs
if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
return "url does not match must-match filter " + profile.urlMustMatchPattern().toString();
return ERROR_NO_MATCH_MUST_MATCH_FILTER + profile.urlMustMatchPattern().toString();
}
// filter with must-not-match for URLs
if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) {
if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'.");
return "url matches must-not-match filter " + profile.urlMustNotMatchPattern().toString();
return ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER + profile.urlMustNotMatchPattern().toString();
}
// deny cgi

Loading…
Cancel
Save