added more configuration options for search:

- removed configuration button for 'search only for admin' from index.html and added this to ConfigPortal
- added configuration of link verification options (iffresh, cacheonly, nocache, ifexist) to ConfigPortal
- added configuration of navigation options to ConfigPortal
- added an option to switch off automatic index cleaning in case that a link verification method fails


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7613 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent e0c7d490f9
commit ba03ca8620

@ -723,6 +723,30 @@ search.result.show.metadata = true
search.result.show.parser = true
search.result.show.pictures = true
# search navigators: comma-separated list of default values for search navigation.
# can be temporarily different if the search string is given with different navigation values
# assigning no value(s) means that no navigation is shown
search.navigation=hosts,authors,namespace,topics
# search result verification and snippet fetch caching rules
# each search result can be verified by loading the link from the web
# this can be enhanced using a cache. In some cases it may be appropriate
# to not verify the link at all and do not compute a snippet
# the possible cases are:
# nocache: no use of web cache, load all snippets online
# iffresh: use the cache if the cache exists and is fresh otherwise load online
# ifexist: use the cache if the cache exists, otherwise load online
# cacheonly: never go online, use all content from cache. If no cache entry exists,
# consider content nevertheless as available and show result without snippet
# false: no link verification and no snippet generation:
# all search results are valid without verification
search.verify = iffresh
# in case that a link verification fails then the corresponding index reference can be
# deleted to clean up the index. If this property is set then failed index verification in
# the cases of nocache, iffresh and ifexist causes an index deletion
search.verify.delete = true
# remote search details
remotesearch.maxcount = 20
remotesearch.maxtime = 1000

@ -15,7 +15,7 @@
and a link to a home page that is reached when the 'corporate identity'-images are clicked.
To change also colours and styles use the <a href="ConfigAppearance_p.html">Appearance Servlet</a> for different skins and languages.
</p>
<form action="ConfigPortal.html" id="ConfigPortal" accept-charset="UTF-8">
<form action="ConfigPortal.html" method="post" enctype="multipart/form-data" id="ConfigPortal" accept-charset="UTF-8">
<fieldset>
<dl>
<dt>Greeting Line</dt>
@ -30,6 +30,12 @@
<dt>URL of a Large Corporate Image</dt>
<dd><input type="text" name="promoteSearchPageGreeting.largeImage" value="#[promoteSearchPageGreeting.largeImage]#" size="60" /></dd>
<dt>Enable Search for Everyone?</dt>
<dd>
<input type="radio" name="publicSearchpage" value="true" #(publicSearchpage)#::checked="checked"#(/publicSearchpage)# />Search is available for everyone&nbsp;
<input type="radio" name="publicSearchpage" value="false" #(publicSearchpage)#checked="checked"::#(/publicSearchpage)# />Only the administator is allowed to search
</dd>
<dt>Show Navigation Bar on Search Page?</dt>
<dd>
<input type="radio" name="publicTopmenu" value="true" #(publicTopmenu)#::checked="checked"#(/publicTopmenu)# />Show Navigation Top-Menu&nbsp;
@ -42,6 +48,16 @@
<input type="radio" name="search.options" value="false" #(search.options)#checked="checked"::#(/search.options)# />do not show Advanced Search
</dd>
<dt>Snippet Fetch Strategy &amp; Link Verification</dt>
<dd>
<input type="radio" name="search.verify" value="nocache" #(search.verify.nocache)#::checked="checked"#(/search.verify.nocache)# onclick="document.getElementById('search_verify_delete').disabled=false;document.getElementById('search_verify_delete').checked=true;"/>NOCACHE: no use of web cache, load all snippets online<br/>
<input type="radio" name="search.verify" value="iffresh" #(search.verify.iffresh)#::checked="checked"#(/search.verify.iffresh)# onclick="document.getElementById('search_verify_delete').disabled=false;document.getElementById('search_verify_delete').checked=true;"/>IFFRESH: use the cache if the cache exists and is fresh otherwise load online<br/>
<input type="radio" name="search.verify" value="ifexist" #(search.verify.ifexist)#::checked="checked"#(/search.verify.ifexist)# onclick="document.getElementById('search_verify_delete').disabled=false;document.getElementById('search_verify_delete').checked=true;"/>IFEXIST: use the cache if the cache exist or load online<br/>
<input type="checkbox" name="search.verify.delete" id="search_verify_delete" value="true" #(search.verify.delete)#::checked="checked"#(/search.verify.delete)# />If verification fails, delete index reference<br/><br/>
<input type="radio" name="search.verify" value="cacheonly" #(search.verify.cacheonly)#::checked="checked"#(/search.verify.cacheonly)# onclick="document.getElementById('search_verify_delete').disabled=true;document.getElementById('search_verify_delete').checked=false;"/>CACHEONLY: never go online, use all content from cache. If no cache entry exist, consider content nevertheless as available and show result without snippet<br/>
<input type="radio" name="search.verify" value="false" #(search.verify.false)#::checked="checked"#(/search.verify.false)# onclick="document.getElementById('search_verify_delete').disabled=true;document.getElementById('search_verify_delete').checked=false;"/>FALSE: no link verification and not snippet generation: all search results are valid without verification
</dd>
<dt>Show Information Links for each Search Result Entry</dt>
<dd>
<input type="checkbox" name="search.result.show.date" value="true" #(search.result.show.date)#::checked="checked"#(/search.result.show.date)# />Date&nbsp;
@ -51,6 +67,14 @@
<input type="checkbox" name="search.result.show.pictures" value="true" #(search.result.show.pictures)#::checked="checked"#(/search.result.show.pictures)# />Pictures
</dd>
<dt>Show Navigation on Side-Bar</dt>
<dd>
<input type="checkbox" name="search.navigation.hosts" value="true" #(search.navigation.hosts)#::checked="checked"#(/search.navigation.hosts)# />Host Navigation&nbsp;
<input type="checkbox" name="search.navigation.authors" value="true" #(search.navigation.authors)#::checked="checked"#(/search.navigation.authors)# />Author Navigation&nbsp;
<input type="checkbox" name="search.navigation.namespace" value="true" #(search.navigation.namespace)#::checked="checked"#(/search.navigation.namespace)# />Wiki Name-Space Navigation&nbsp;
<input type="checkbox" name="search.navigation.topics" value="true" #(search.navigation.topics)#::checked="checked"#(/search.navigation.topics)# />Topics (Tag-Cloud) Navigation&nbsp;
</dd>
<dt>Default Pop-Up Page</dt>
<dd>
<input type="radio" name="popup" value="status" #(popupStatus)#::checked="checked"#(/popupStatus)# />Status Page&nbsp;

@ -71,12 +71,23 @@ public class ConfigPortal {
sb.setConfig(SwitchboardConstants.INDEX_FORWARD, post.get(SwitchboardConstants.INDEX_FORWARD, ""));
HTTPDFileHandler.indexForward = post.get(SwitchboardConstants.INDEX_FORWARD, "");
sb.setConfig("publicTopmenu", post.getBoolean("publicTopmenu", true));
sb.setConfig("publicSearchpage", post.getBoolean("publicSearchpage", true));
sb.setConfig("search.options", post.getBoolean("search.options", false));
sb.setConfig("search.result.show.date", post.getBoolean("search.result.show.date", false));
sb.setConfig("search.result.show.size", post.getBoolean("search.result.show.size", false));
sb.setConfig("search.result.show.metadata", post.getBoolean("search.result.show.metadata", false));
sb.setConfig("search.result.show.parser", post.getBoolean("search.result.show.parser", false));
sb.setConfig("search.result.show.pictures", post.getBoolean("search.result.show.pictures", false));
sb.setConfig("search.verify", post.get("search.verify", "ifexist"));
sb.setConfig("search.verify.delete", post.getBoolean("search.verify.delete", false));
// construct navigation String
String nav = "";
if (post.getBoolean("search.navigation.hosts", false)) nav += "hosts,";
if (post.getBoolean("search.navigation.authors", false)) nav += "authors,";
if (post.getBoolean("search.navigation.namespace", false)) nav += "namespace,";
if (post.getBoolean("search.navigation.topics", false)) nav += "topics,";
if (nav.endsWith(",")) nav = nav.substring(0, nav.length() - 1);
sb.setConfig("search.navigation", nav);
}
if (post.containsKey("searchpage_default")) {
sb.setConfig(SwitchboardConstants.GREETING, "P2P Web Search");
@ -88,12 +99,16 @@ public class ConfigPortal {
HTTPDFileHandler.indexForward = "";
sb.setConfig(SwitchboardConstants.SEARCH_TARGET, "_self");
sb.setConfig("publicTopmenu", true);
sb.setConfig("publicSearchpage", true);
sb.setConfig("search.navigation", "hosts,authors,namespace,topics");
sb.setConfig("search.options", true);
sb.setConfig("search.result.show.date", true);
sb.setConfig("search.result.show.size", true);
sb.setConfig("search.result.show.metadata", true);
sb.setConfig("search.result.show.parser", true);
sb.setConfig("search.result.show.pictures", true);
sb.setConfig("search.verify", "iffresh");
sb.setConfig("search.verify.delete", "true");
}
}
@ -103,13 +118,27 @@ public class ConfigPortal {
prop.putHTML(SwitchboardConstants.GREETING_SMALL_IMAGE, sb.getConfig(SwitchboardConstants.GREETING_SMALL_IMAGE, ""));
prop.putHTML(SwitchboardConstants.INDEX_FORWARD, sb.getConfig(SwitchboardConstants.INDEX_FORWARD, ""));
prop.put("publicTopmenu", sb.getConfigBool("publicTopmenu", false) ? 1 : 0);
prop.put("publicSearchpage", sb.getConfigBool("publicSearchpage", false) ? 1 : 0);
prop.put("search.options", sb.getConfigBool("search.options", false) ? 1 : 0);
prop.put("search.result.show.date", sb.getConfigBool("search.result.show.date", false) ? 1 : 0);
prop.put("search.result.show.size", sb.getConfigBool("search.result.show.size", false) ? 1 : 0);
prop.put("search.result.show.metadata", sb.getConfigBool("search.result.show.metadata", false) ? 1 : 0);
prop.put("search.result.show.parser", sb.getConfigBool("search.result.show.parser", false) ? 1 : 0);
prop.put("search.result.show.pictures", sb.getConfigBool("search.result.show.pictures", false) ? 1 : 0);
prop.put("search.navigation.hosts", sb.getConfig("search.navigation", "").indexOf("hosts") >= 0 ? 1 : 0);
prop.put("search.navigation.authors", sb.getConfig("search.navigation", "").indexOf("authors") >= 0 ? 1 : 0);
prop.put("search.navigation.namespace", sb.getConfig("search.navigation", "").indexOf("namespace") >= 0 ? 1 : 0);
prop.put("search.navigation.topics", sb.getConfig("search.navigation", "").indexOf("topics") >= 0 ? 1 : 0);
prop.put("search.verify.nocache", sb.getConfig("search.verify", "").equals("nocache") ? 1 : 0);
prop.put("search.verify.iffresh", sb.getConfig("search.verify", "").equals("iffresh") ? 1 : 0);
prop.put("search.verify.ifexist", sb.getConfig("search.verify", "").equals("ifexist") ? 1 : 0);
prop.put("search.verify.cacheonly", sb.getConfig("search.verify", "").equals("cacheonly") ? 1 : 0);
prop.put("search.verify.false", sb.getConfig("search.verify", "").equals("false") ? 1 : 0);
prop.put("search.verify.delete", sb.getConfigBool("search.verify.delete", true) ? 1 : 0);
final String browserPopUpPage = sb.getConfig(SwitchboardConstants.BROWSER_POP_UP_PAGE, "ConfigBasic.html");
prop.put("popupFront", 0);
prop.put("popupSearch", 0);

@ -49,7 +49,7 @@
<fieldset class="maininput">
<input name="query" id="search" type="text" size="52" maxlength="80" value="#[former]#" />
<input type="submit" name="Enter" value="Search" />
<input type="hidden" name="verify" value="true" />
<input type="hidden" name="verify" value="#[search.verify]#" />
#(searchdomswitches)#::
<div class="yacysearch">
#(searchtext)#::<input type="radio" id="text" name="contentdom" value="text" #(check)#::checked="checked"#(/check)# /><label for="text">Text</label>&nbsp;&nbsp;#(/searchtext)#
@ -59,7 +59,7 @@
#(searchapp)#::<input type="radio" id="app" name="contentdom" value="app" #(check)#::checked="checked"#(/check)# /><label for="app">Applications</label>#(/searchapp)#
</div>
#(/searchdomswitches)#
<input type="hidden" name="nav" value="all" />
<input type="hidden" name="nav" value="#[search.navigation]#" />
<input type="hidden" name="startRecord" value="0" />
<input type="hidden" name="resource" value="global" />
<input type="hidden" name="urlmaskfilter" value=".*" />
@ -70,7 +70,6 @@
#(searchoptions)#::
</fieldset>
<p><a href="/index.html?searchoptions=2" onclick="this.href='/index.html?searchoptions=2&amp;former='+document.getElementById('searchform').search.value+'&amp;contentdom='+radioValue(document.getElementById('searchform').contentdom)">more options...</a></p>
<p><a href="http://www.yacy-websuche.de/wiki/index.php/En:SearchParameters">advanced parameters</a></p>
::
</fieldset>
@ -132,26 +131,17 @@
<input type="checkbox" id="indexof" name="indexof" #[indexofChecked]# /> <label for="indexof">only index pages</label>
</td>
</tr>
<tr><td></td><td></td></tr>
<tr>
<td>
<a href="http://www.yacy-websuche.de/wiki/index.php/En:SearchParameters">advanced parameters</a>
</td>
<td>
</td>
</tr>
</table>
#(/searchoptions)#
</form>
#(searchoptions)#::
<form action="index.html" method="get" class="search" accept-charset="UTF-8">
<p>
#(publicSearchpage)#
<button type="submit" name="publicPage" value="0">
<img src="/env/grafics/lock.gif" alt="authentication required" />
Disable search function for users without authorization
</button>
::
<button type="submit" name="publicPage" value="1">
<img src="/env/grafics/lock.gif" alt="authentication required" />
Enable web search to everyone
</button>
#(/publicSearchpage)#
</p>
</form>
#(/searchoptions)#
#(topmenu)#
#%env/templates/embeddedfooter.template%#
::

@ -51,15 +51,12 @@ public class index {
}
// access control
boolean publicPage = sb.getConfigBool("publicSearchpage", true);
final boolean authorizedAccess = sb.verifyAuthentication(header, false);
if ((post != null) && (post.containsKey("publicPage"))) {
if (!authorizedAccess) {
prop.put("AUTHENTICATE", "admin log-in"); // force log-in
return prop;
}
publicPage = post.get("publicPage", "0").equals("1");
sb.setConfig("publicSearchpage", publicPage);
}
final boolean global = (post == null) ? true : post.get("resource", "global").equals("global");
@ -114,7 +111,6 @@ public class index {
prop.put("searchoptions_prefermaskoptions", "0");
prop.putHTML("searchoptions_prefermaskoptions_prefermaskfilter", prefermaskfilter);
prop.put("searchoptions_indexofChecked", "");
prop.put("searchoptions_publicSearchpage", (publicPage) ? "0" : "1");
prop.put("results", "");
prop.putHTML("cat", cat);
prop.put("type", type);
@ -132,6 +128,8 @@ public class index {
prop.put("searchdomswitches_searchvideo_check", (contentdom == ContentDomain.VIDEO) ? "1" : "0");
prop.put("searchdomswitches_searchimage_check", (contentdom == ContentDomain.IMAGE) ? "1" : "0");
prop.put("searchdomswitches_searchapp_check", (contentdom == ContentDomain.APP) ? "1" : "0");
prop.put("search.navigation", sb.getConfig("search.navigation", "all") );
prop.put("search.verify", sb.getConfig("search.verify", "iffresh") );
// online caution timing
sb.localSearchLastAccess = System.currentTimeMillis();

@ -121,9 +121,9 @@ $(function() {
<input type="hidden" name="former" value="#[former]#" />
<input type="hidden" name="maximumRecords" value="#[count]#" />
<input type="hidden" name="startRecord" value="#[offset]#" />
<input type="hidden" name="verify" value="#[verify]#" />
<input type="hidden" name="verify" value="#[search.verify]#" />
<input type="hidden" name="resource" value="#[resource]#" />
<input type="hidden" name="nav" value="all" />
<input type="hidden" name="nav" value="#[search.navigation]#" />
<input type="hidden" name="urlmaskfilter" value="#[urlmaskfilter]#" />
<input type="hidden" name="prefermaskfilter" value="#[prefermaskfilter]#" />
<input type="hidden" name="depth" value="#[depth]#" />

@ -131,7 +131,8 @@ public class yacysearch {
prop.put("constraint", "");
prop.put("cat", "href");
prop.put("depth", "0");
prop.put("verify", (post == null) ? "true" : post.get("verify", "true"));
prop.put("search.verify", (post == null) ? sb.getConfig("search.verify", "iffresh") : post.get("verify", "iffresh"));
prop.put("search.navigation", (post == null) ? sb.getConfig("search.navigation", "all") : post.get("nav", "all"));
prop.put("contentdom", "text");
prop.put("contentdomCheckText", "1");
prop.put("contentdomCheckAudio", "0");
@ -403,7 +404,7 @@ public class yacysearch {
}
// navigation
final String navigation = (post == null) ? "" : post.get("nav", "");
final String navigation = (post == null) ? sb.getConfig("search.navigation", "all") : post.get("nav", "");
// the query
final TreeSet<String>[] query = QueryParams.cleanQuery(querystring.trim()); // converts also umlaute
@ -707,7 +708,8 @@ public class yacysearch {
prop.putHTML("prefermaskfilter", prefermask);
prop.put("indexof", (indexof) ? "on" : "off");
prop.put("constraint", (constraint == null) ? "" : constraint.exportB64());
prop.put("verify", snippetFetchStrategy == null ? "false" : snippetFetchStrategy.toName());
prop.put("search.verify", snippetFetchStrategy == null ? sb.getConfig("search.verify", "iffresh") : snippetFetchStrategy.toName());
prop.put("search.navigation", (post == null) ? sb.getConfig("search.navigation", "all") : post.get("nav", "all"));
prop.put("contentdom", (post == null ? "text" : post.get("contentdom", "text")));
prop.put("searchdomswitches", sb.getConfigBool("search.text", true) || sb.getConfigBool("search.audio", true) || sb.getConfigBool("search.video", true) || sb.getConfigBool("search.image", true) || sb.getConfigBool("search.app", true) ? 1 : 0);
prop.put("searchdomswitches_searchtext", sb.getConfigBool("search.text", true) ? 1 : 0);

@ -174,7 +174,7 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
return (WordReferenceVars) word;
}
public boolean hasTextSnippet() {
return (this.textSnippet != null) && (this.textSnippet.getErrorCode() < 11);
return (this.textSnippet != null) && (!this.textSnippet.getErrorCode().fail());
}
public boolean hasMediaSnippets() {
return (this.mediaSnippets != null) && (!this.mediaSnippets.isEmpty());

@ -66,6 +66,7 @@ public class ResultFetcher {
long urlRetrievalAllTime;
long snippetComputationAllTime;
int taketimeout;
private final boolean deleteIfSnippetFail;
public ResultFetcher(
final LoaderDispatcher loader,
@ -73,7 +74,8 @@ public class ResultFetcher {
final QueryParams query,
final yacySeedDB peers,
final WorkTables workTables,
final int taketimeout) {
final int taketimeout,
final boolean deleteIfSnippetFail) {
assert query != null;
this.loader = loader;
this.rankingProcess = rankedCache;
@ -81,6 +83,7 @@ public class ResultFetcher {
this.peers = peers;
this.workTables = workTables;
this.taketimeout = taketimeout;
this.deleteIfSnippetFail = deleteIfSnippetFail;
this.urlRetrievalAllTime = 0;
this.snippetComputationAllTime = 0;
@ -399,9 +402,9 @@ public class ResultFetcher {
Integer.MAX_VALUE,
!query.isLocal());
final long snippetComputationTime = System.currentTimeMillis() - startTime;
Log.logInfo("SEARCH", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
Log.logInfo("SEARCH", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + (!snippet.getErrorCode().fail() ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
if (snippet.getErrorCode() < 11) {
if (!snippet.getErrorCode().fail()) {
// we loaded the file and found the snippet
return new ResultEntry(page, query.getSegment(), peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
} else if (cacheStrategy.mustBeOffline()) {
@ -411,7 +414,7 @@ public class ResultFetcher {
} else {
// problems with snippet fetch
String reason = "no text snippet; errorCode = " + snippet.getErrorCode();
this.workTables.failURLsRegisterMissingWord(query.getSegment().termIndex(), metadata.url(), query.queryHashes, reason);
if (deleteIfSnippetFail) this.workTables.failURLsRegisterMissingWord(query.getSegment().termIndex(), metadata.url(), query.queryHashes, reason);
Log.logInfo("SEARCH", "sorted out url " + metadata.url().toNormalform(true, false) + " during search: " + reason);
return null;
}
@ -430,7 +433,7 @@ public class ResultFetcher {
} else {
// problems with snippet fetch
String reason = "no media snippet";
this.workTables.failURLsRegisterMissingWord(query.getSegment().termIndex(), metadata.url(), query.queryHashes, reason);
if (deleteIfSnippetFail) this.workTables.failURLsRegisterMissingWord(query.getSegment().termIndex(), metadata.url(), query.queryHashes, reason);
Log.logInfo("SEARCH", "sorted out url " + metadata.url().toNormalform(true, false) + " during search: " + reason);
return null;
}

@ -91,7 +91,8 @@ public final class SearchEvent {
final int remote_maxcount,
final long remote_maxtime,
final int burstRobinsonPercent,
final int burstMultiwordPercent) {
final int burstMultiwordPercent,
final boolean deleteIfSnippetFail) {
if (MemoryControl.available() < 1024 * 1024 * 100) SearchEventCache.cleanupEvents(true);
this.eventTime = System.currentTimeMillis(); // for lifetime check
this.peers = peers;
@ -155,7 +156,7 @@ public final class SearchEvent {
}
// start worker threads to fetch urls and snippets
this.resultFetcher = new ResultFetcher(loader, this.rankingProcess, query, this.peers, this.workTables, 3000);
this.resultFetcher = new ResultFetcher(loader, this.rankingProcess, query, this.peers, this.workTables, 3000, deleteIfSnippetFail);
} else {
// do a local search
this.rankingProcess = new RankingProcess(this.query, this.order, max_results_preparation);
@ -199,7 +200,7 @@ public final class SearchEvent {
}
// start worker threads to fetch urls and snippets
this.resultFetcher = new ResultFetcher(loader, this.rankingProcess, query, this.peers, this.workTables, 500);
this.resultFetcher = new ResultFetcher(loader, this.rankingProcess, query, this.peers, this.workTables, 500, deleteIfSnippetFail);
}
// clean up events

@ -130,7 +130,8 @@ public class SearchEventCache {
}
if (event == null) {
// start a new event
event = new SearchEvent(query, peers, workTables, preselectedPeerHashes, generateAbstracts, loader, remote_maxcount, remote_maxtime, burstRobinsonPercent, burstMultiwordPercent);
// Decide whether index references are removed when snippet verification fails.
// Defaults to true when no Switchboard instance is available or the property is unset.
// The short-circuit '||' is required: with the non-short-circuit '|' the right-hand side
// would still be evaluated and dereference a null Switchboard.
boolean delete = Switchboard.getSwitchboard() == null || Switchboard.getSwitchboard().getConfigBool("search.verify.delete", true);
event = new SearchEvent(query, peers, workTables, preselectedPeerHashes, generateAbstracts, loader, remote_maxcount, remote_maxtime, burstRobinsonPercent, burstMultiwordPercent, delete);
}
return event;

@ -1264,7 +1264,7 @@ public final class Switchboard extends serverSwitch {
File infile = new File(this.surrogatesInPath, s);
if (!infile.exists() || !infile.canWrite() || !infile.canRead()) return false;
File outfile = new File(this.surrogatesOutPath, s);
if (outfile.exists()) return false;
//if (outfile.exists()) return false;
boolean moved = false;
if (s.endsWith("xml.zip")) {
// open the zip file with all the xml files in it
@ -1332,7 +1332,7 @@ public final class Switchboard extends serverSwitch {
assert crawlStacker != null;
final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.getIdentifier(true));
if (urlRejectReason != null) {
if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.getIdentifier(true) + "': " + urlRejectReason);
this.log.logWarning("Rejected URL '" + surrogate.getIdentifier(true) + "': " + urlRejectReason);
continue;
}

@ -56,17 +56,6 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
private static final int maxCache = 1000;
public static final int SOURCE_CACHE = 0;
public static final int SOURCE_FILE = 1;
public static final int SOURCE_WEB = 2;
public static final int SOURCE_METADATA = 3;
public static final int ERROR_NO_HASH_GIVEN = 11;
public static final int ERROR_SOURCE_LOADING = 12;
public static final int ERROR_RESOURCE_LOADING = 13;
public static final int ERROR_PARSER_FAILED = 14;
public static final int ERROR_PARSER_NO_LINES = 15;
public static final int ERROR_NO_MATCH = 16;
/**
* <code>\\A[^\\p{L}\\p{N}].+</code>
@ -118,12 +107,32 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
public static final Cache snippetsCache = new Cache();
/**
 * Outcome classification of a text snippet computation.
 * <p>
 * The {@code SOURCE_*} values mark results that carry a usable snippet and
 * name where it was obtained from; the {@code ERROR_*} values mark failed
 * computations. Use {@link #fail()} to distinguish the two groups instead of
 * comparing against numeric codes.
 */
public static enum ResultClass {

    // successful outcomes: fail() == false
    SOURCE_CACHE(false),
    SOURCE_FILE(false),
    SOURCE_WEB(false),
    SOURCE_METADATA(false),

    // failed outcomes: fail() == true
    ERROR_NO_HASH_GIVEN(true),
    ERROR_SOURCE_LOADING(true),
    ERROR_RESOURCE_LOADING(true),
    ERROR_PARSER_FAILED(true),
    ERROR_PARSER_NO_LINES(true),
    ERROR_NO_MATCH(true);

    /** true if this result class denotes a failed snippet computation */
    private final boolean failure;

    ResultClass(final boolean failure) {
        this.failure = failure;
    }

    /**
     * @return true if this result class stands for a failure, false if a snippet source is named
     */
    public boolean fail() {
        return this.failure;
    }
}
private byte[] urlhash;
private String line;
private String error;
private int errorCode;
private ResultClass resultStatus;
public TextSnippet(final byte[] urlhash, final String line, final int errorCode, final String errortext) {
public TextSnippet(final byte[] urlhash, final String line, final ResultClass errorCode, final String errortext) {
init(urlhash, line, errorCode, errortext);
}
@ -132,12 +141,12 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
final DigestURI url = comp.url();
if (queryhashes.isEmpty()) {
//System.out.println("found no queryhashes for URL retrieve " + url);
init(url.hash(), null, ERROR_NO_HASH_GIVEN, "no query hashes given");
init(url.hash(), null, ResultClass.ERROR_NO_HASH_GIVEN, "no query hashes given");
return;
}
// try to get snippet from snippetCache
int source = SOURCE_CACHE;
ResultClass source = ResultClass.SOURCE_CACHE;
final String wordhashes = yacySearch.set2string(queryhashes);
final String urls = UTF8.String(url.hash());
String snippetLine = snippetsCache.get(wordhashes, urls);
@ -161,19 +170,19 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
boolean useMetadata = !objectWasInCache && !cacheStrategy.mustBeOffline();
if (useMetadata && containsAllHashes(loc = comp.dc_title(), queryhashes)) {
// try to create the snippet from information given in the url itself
init(url.hash(), loc, SOURCE_METADATA, null);
init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
return;
} else if (useMetadata && containsAllHashes(loc = comp.dc_creator(), queryhashes)) {
// try to create the snippet from information given in the creator metadata
init(url.hash(), loc, SOURCE_METADATA, null);
init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
return;
} else if (useMetadata && containsAllHashes(loc = comp.dc_subject(), queryhashes)) {
// try to create the snippet from information given in the subject metadata
init(url.hash(), loc, SOURCE_METADATA, null);
init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
return;
} else if (useMetadata && containsAllHashes(loc = comp.url().toNormalform(true, true).replace('-', ' '), queryhashes)) {
// try to create the snippet from information given in the url
init(url.hash(), loc, SOURCE_METADATA, null);
init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
return;
} else {
// try to load the resource from the cache
@ -181,23 +190,23 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
if (response == null) {
// in case that we did not get any result we can still return a success when we are not allowed to go online
if (cacheStrategy.mustBeOffline()) {
init(url.hash(), null, ERROR_SOURCE_LOADING, "omitted network load (not allowed), no cache entry");
init(url.hash(), null, ResultClass.ERROR_SOURCE_LOADING, "omitted network load (not allowed), no cache entry");
return;
}
// if it is still not available, report an error
init(url.hash(), null, ERROR_RESOURCE_LOADING, "error loading resource from net, no cache entry");
init(url.hash(), null, ResultClass.ERROR_RESOURCE_LOADING, "error loading resource from net, no cache entry");
return;
}
if (!objectWasInCache) {
// place entry on indexing queue
Switchboard.getSwitchboard().toIndexer(response);
source = SOURCE_WEB;
source = ResultClass.SOURCE_WEB;
}
}
} catch (final Exception e) {
//Log.logException(e);
init(url.hash(), null, ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage());
init(url.hash(), null, ResultClass.ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage());
return;
}
@ -208,11 +217,11 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
try {
document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
} catch (final Parser.Failure e) {
init(url.hash(), null, ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
init(url.hash(), null, ResultClass.ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
return;
}
if (document == null) {
init(url.hash(), null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
init(url.hash(), null, ResultClass.ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
return;
}
@ -224,7 +233,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
// compute snippet from text
final Collection<StringBuilder> sentences = document.getSentences(pre);
if (sentences == null) {
init(url.hash(), null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
init(url.hash(), null, ResultClass.ERROR_PARSER_NO_LINES, "parser returned no sentences");
return;
}
final SnippetExtractor tsr;
@ -235,7 +244,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
textline = tsr.getSnippet();
remainingHashes = tsr.getRemainingWords();
} catch (UnsupportedOperationException e) {
init(url.hash(), null, ERROR_NO_MATCH, "no matching snippet found");
init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "no matching snippet found");
return;
}
@ -254,7 +263,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
if (textline != null) snippetLine += (snippetLine.length() == 0) ? textline : "<br />" + textline;
if (snippetLine == null || !remainingHashes.isEmpty()) {
init(url.hash(), null, ERROR_NO_MATCH, "no matching snippet found");
init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "no matching snippet found");
return;
}
if (snippetLine.length() > snippetMaxLength) snippetLine = snippetLine.substring(0, snippetMaxLength);
@ -266,10 +275,10 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
init(url.hash(), snippetLine, source, null);
}
private void init(final byte[] urlhash, final String line, final int errorCode, final String errortext) {
private void init(final byte[] urlhash, final String line, final ResultClass errorCode, final String errortext) {
this.urlhash = urlhash;
this.line = line;
this.errorCode = errorCode;
this.resultStatus = errorCode;
this.error = errortext;
}
@ -285,8 +294,8 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
return (error == null) ? "" : error.trim();
}
public int getErrorCode() {
return errorCode;
public ResultClass getErrorCode() {
return resultStatus;
}
public String getLineMarked(final HandleSet queryHashes) {

Loading…
Cancel
Save