Added an option to ViewFile to show all Solr fields that contain text

master
Michael Peter Christen 2 months ago
parent 6db374fdcf
commit c88c30a5c5

@ -84,16 +84,16 @@ function updatepage(str) {
<dd> <dd>
<input type="text" size="60" name="url" id="url" value="#[url]#" /> <input type="text" size="60" name="url" id="url" value="#[url]#" />
<input type="submit" name="show" class="btn btn-primary" value="Show Metadata" /> <input type="submit" name="show" class="btn btn-primary" value="Show Metadata" />
#(moar)#::<input type="button" value="Browse Host" class="btn btn-default" onClick="location.href='IndexBrowser_p.html?path=' + document.getElementById('url').value" />#(/moar)# #(searchindocument)#::<input type="button" value="Browse Host" class="btn btn-default" onClick="location.href='IndexBrowser_p.html?path=' + document.getElementById('url').value" />#(/searchindocument)#
<div id="searchresults"></div> <div id="searchresults"></div>
</dd> </dd>
#(moar)#:: #(searchindocument)#::
<dt>Search in Document:</dt> <dt>Search in Document:</dt>
<dd> <dd>
<input type="text" size="60" name="search" id="search" value="#[search]#" /> <input type="text" size="60" name="search" id="query" value="#[query]#" />
<input type="submit" name="show" class="btn btn-primary" value="Show Snippet" /> <input type="submit" name="show" class="btn btn-primary" value="Show Snippet" />
</dd> </dd>
#(/moar)# #(/searchindocument)#
</dl> </dl>
</fieldset> </fieldset>
</form> </form>
@ -123,6 +123,7 @@ function updatepage(str) {
<option value="sentences"#(vMode-sentences)#:: selected="selected"#(/vMode-sentences)#>Parsed Sentences</option> <option value="sentences"#(vMode-sentences)#:: selected="selected"#(/vMode-sentences)#>Parsed Sentences</option>
<option value="words"#(vMode-words)#:: selected="selected"#(/vMode-words)#>Parsed Tokens/Words</option> <option value="words"#(vMode-words)#:: selected="selected"#(/vMode-words)#>Parsed Tokens/Words</option>
<option value="links"#(vMode-links)#:: selected="selected"#(/vMode-links)#>Link List</option> <option value="links"#(vMode-links)#:: selected="selected"#(/vMode-links)#>Link List</option>
<option value="schema"#(vMode-schema)#:: selected="selected"#(/vMode-schema)#>Schema Fields</option>
<option value="iframeCitations"#(vMode-iframeCitations)#:: selected="selected"#(/vMode-iframeCitations)#>Citation Report</option> <option value="iframeCitations"#(vMode-iframeCitations)#:: selected="selected"#(/vMode-iframeCitations)#>Citation Report</option>
</select> </select>
<noscript><input type="submit" name="show" class="btn btn-primary" value="Show" /></noscript> <noscript><input type="submit" name="show" class="btn btn-primary" value="Show" /></noscript>
@ -166,9 +167,9 @@ function updatepage(str) {
</p> </p>
</fieldset> </fieldset>
:: <!-- 2 --> :: <!-- 2 -->
<form action=""> <form action="">
<fieldset><legend>Parsed Content</legend> <fieldset><legend>Parsed Content</legend>
<dl> <dl>
<dt>dc:title</dt><dd>#[title]#</dd> <dt>dc:title</dt><dd>#[title]#</dd>
<dt>dc:creator</dt><dd>#[creator]#</dd> <dt>dc:creator</dt><dd>#[creator]#</dd>
<dt>dc:subject</dt><dd>#[subject]#</dd> <dt>dc:subject</dt><dd>#[subject]#</dd>
@ -180,7 +181,7 @@ function updatepage(str) {
<dt>geo:lat &amp; geo:long</dt><dd><a href="osm.png?lon=#[lon]#&lat=#[lat]#&zoom=14" onclick="return hs.expand(this)">lat=#[lat]#, lon=#[lon]#</a></dd> <dt>geo:lat &amp; geo:long</dt><dd><a href="osm.png?lon=#[lon]#&lat=#[lat]#&zoom=14" onclick="return hs.expand(this)">lat=#[lat]#, lon=#[lon]#</a></dd>
</dl> </dl>
<p class="tt">#[parsedText]#</p> <p class="tt">#[parsedText]#</p>
</fieldset> </fieldset>
</form> </form>
:: <!-- 3 --> :: <!-- 3 -->
<fieldset><legend>Parsed Sentences</legend> <fieldset><legend>Parsed Sentences</legend>
@ -228,6 +229,14 @@ function updatepage(str) {
</ol> </ol>
</fieldset> </fieldset>
:: <!-- 8 --> :: <!-- 8 -->
<form action="">
<fieldset><legend>Schema Fields</legend>
<dl>#{fields}#
<dt>#[key]#</dt><dd>#[value]#</dd>#{/fields}#
</dl>
</fieldset>
</form>
:: <!-- 9 -->
<fieldset><legend>CitationReport</legend> <fieldset><legend>CitationReport</legend>
<iframe src="api/citation.html?url=#[url]#" width="800" height="400" /> <iframe src="api/citation.html?url=#[url]#" width="800" height="400" />
</fieldset> </fieldset>

@ -40,6 +40,8 @@ import java.util.Date;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import org.apache.solr.common.SolrDocument;
import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.AnchorURL;
@ -79,7 +81,8 @@ public class ViewFile {
public static final int VIEW_MODE_AS_IFRAME_FROM_CACHE = 5; public static final int VIEW_MODE_AS_IFRAME_FROM_CACHE = 5;
public static final int VIEW_MODE_AS_LINKLIST = 6; public static final int VIEW_MODE_AS_LINKLIST = 6;
public static final int VIEW_MODE_AS_PARSED_WORDS = 7; public static final int VIEW_MODE_AS_PARSED_WORDS = 7;
public static final int VIEW_MODE_AS_IFRAME_FROM_CITATION_REPORT = 8; public static final int VIEW_MODE_AS_SCHEMA = 8;
public static final int VIEW_MODE_AS_IFRAME_FROM_CITATION_REPORT = 9;
private static final String HIGHLIGHT_CSS = "searchHighlight"; private static final String HIGHLIGHT_CSS = "searchHighlight";
private static final int MAX_HIGHLIGHTS = 6; private static final int MAX_HIGHLIGHTS = 6;
@ -89,7 +92,7 @@ public class ViewFile {
final serverObjects prop = new serverObjects(); final serverObjects prop = new serverObjects();
final Switchboard sb = (Switchboard)env; final Switchboard sb = (Switchboard)env;
prop.put("topmenu", sb.getConfigBool("publicTopmenu", true) ? 1 : 0); prop.put("topmenu", sb.getConfigBool("publicTopmenu", true) ? 1 : 0);
prop.put("moar", 0); prop.put("searchindocument", 0);
prop.put("viewMode", VIEW_MODE_NO_TEXT); prop.put("viewMode", VIEW_MODE_NO_TEXT);
prop.put("viewModeValue", "sentences"); prop.put("viewModeValue", "sentences");
prop.putHTML("error_words", ""); prop.putHTML("error_words", "");
@ -118,6 +121,7 @@ public class ViewFile {
prop.put("error_vMode-words", "0"); prop.put("error_vMode-words", "0");
prop.put("error_vMode-links", "0"); prop.put("error_vMode-links", "0");
prop.put("error_vMode-iframeCitations", "0"); prop.put("error_vMode-iframeCitations", "0");
prop.put("error_vMode-schema", "0");
final boolean showSnippet = post.get("show", "").equals("Show Snippet"); final boolean showSnippet = post.get("show", "").equals("Show Snippet");
final String viewMode = showSnippet ? "sentences" : post.get("viewMode", "sentences"); final String viewMode = showSnippet ? "sentences" : post.get("viewMode", "sentences");
prop.put("error_vMode-" + viewMode, "1"); prop.put("error_vMode-" + viewMode, "1");
@ -132,6 +136,7 @@ public class ViewFile {
// get the url hash from which the content should be loaded // get the url hash from which the content should be loaded
String urlHash = post.get("urlHash", post.get("urlhash", "")); String urlHash = post.get("urlHash", post.get("urlhash", ""));
// if the user has entered a url string, it overrides any url hash that was given
final String urlString = post.get("url", ""); final String urlString = post.get("url", "");
if (urlString.length() > 0) try { if (urlString.length() > 0) try {
// this call forces the peer to download web pages // this call forces the peer to download web pages
@ -148,11 +153,12 @@ public class ViewFile {
pre = post.getBoolean("pre"); pre = post.getBoolean("pre");
} catch (final MalformedURLException e) {} } catch (final MalformedURLException e) {}
URIMetadataNode urlEntry = null;
// get the urlEntry that belongs to the url hash // get the urlEntry that belongs to the url hash
URIMetadataNode urlEntry = null; // to be overwritten if we succeed in finding the url in the current document index
//boolean ue = urlHash.length() > 0 && indexSegment.exists(ASCII.getBytes(urlHash)); //boolean ue = urlHash.length() > 0 && indexSegment.exists(ASCII.getBytes(urlHash));
//if (ue) Log.logInfo("ViewFile", "exists(" + urlHash + ")"); //if (ue) Log.logInfo("ViewFile", "exists(" + urlHash + ")");
if (urlHash.length() > 0 && (urlEntry = indexSegment.fulltext().getMetadata(ASCII.getBytes(urlHash))) == null) { if (urlHash.length() > 0 && (urlEntry = indexSegment.fulltext().getMetadata(ASCII.getBytes(urlHash))) == null) {
// could not find the url; we try a commit to get the latest data and then try again
indexSegment.fulltext().commit(true); indexSegment.fulltext().commit(true);
} }
if (urlHash.length() > 0 && (urlEntry = indexSegment.fulltext().getMetadata(ASCII.getBytes(urlHash))) != null) { if (urlHash.length() > 0 && (urlEntry = indexSegment.fulltext().getMetadata(ASCII.getBytes(urlHash))) != null) {
@ -167,8 +173,8 @@ public class ViewFile {
//urlEntry.wordCount(); //urlEntry.wordCount();
size = urlEntry.filesize(); size = urlEntry.filesize();
pre = urlEntry.flags().get(Tokenizer.flag_cat_indexof); pre = urlEntry.flags().get(Tokenizer.flag_cat_indexof);
prop.put("moar", 1); prop.put("searchindocument", 1);
prop.putHTML("moar_search", post.get("search","")); prop.putHTML("searchindocument_query", post.get("query",""));
} }
prop.put("error_inurldb", urlEntry == null ? 0 : 1); prop.put("error_inurldb", urlEntry == null ? 0 : 1);
@ -184,6 +190,7 @@ public class ViewFile {
// loading the resource content as byte array // loading the resource content as byte array
prop.put("error_incache", Cache.has(url.hash()) ? 1 : 0); prop.put("error_incache", Cache.has(url.hash()) ? 1 : 0);
// load the resource content; if the user is not authorized, use the cache only
Response response = null; Response response = null;
try { try {
final ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName)); final ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
@ -196,6 +203,7 @@ public class ViewFile {
return prop; return prop;
} }
// no resource available, return an error
if (response == null) { if (response == null) {
prop.put("error", "4"); prop.put("error", "4");
prop.put("error_errorText", "No resource available"); prop.put("error_errorText", "No resource available");
@ -204,25 +212,40 @@ public class ViewFile {
} }
final String[] wordArray = wordArray(post.get("words", null)); final String[] wordArray = wordArray(post.get("words", null));
if (viewMode.equals("plain")) { if (viewMode.equals("iframeWeb")) {
prop.put("viewMode", VIEW_MODE_AS_IFRAME_FROM_WEB);
prop.put("viewMode_url", url.toNormalform(true));
} else if (viewMode.equals("iframeCache")) {
prop.put("viewMode", VIEW_MODE_AS_IFRAME_FROM_CACHE);
prop.put("viewMode_png", 0);
prop.put("viewMode_html", 0);
if (response.docType() == Response.DT_IMAGE) {
prop.put("viewMode_png", 1);
prop.put("viewMode_png_url", url.toNormalform(true));
} else {
prop.put("viewMode_html", 1);
prop.put("viewMode_html_url", url.toNormalform(true));
}
} else if (viewMode.equals("plain")) {
// TODO: how to handle very large files here ? // TODO: how to handle very large files here ?
String content; String content;
try { try {
String charsetName = response.getCharacterEncoding(); String charsetName = response.getCharacterEncoding();
try { try {
if(charsetName == null) { if(charsetName == null) {
/* Encoding is unknown from response headers : default decode using UTF-8 */ /* Encoding is unknown from response headers : default decode using UTF-8 */
charsetName = StandardCharsets.UTF_8.name(); charsetName = StandardCharsets.UTF_8.name();
} else if(!Charset.isSupported(charsetName)) { } else if(!Charset.isSupported(charsetName)) {
/* Encoding is known but not supported on this system : default decode using UTF-8 */ /* Encoding is known but not supported on this system : default decode using UTF-8 */
charsetName = StandardCharsets.UTF_8.name(); charsetName = StandardCharsets.UTF_8.name();
} }
} catch(final IllegalCharsetNameException e) { } catch(final IllegalCharsetNameException e) {
/* Encoding is known but charset name is not valid : default decode using UTF-8 */ /* Encoding is known but charset name is not valid : default decode using UTF-8 */
charsetName = StandardCharsets.UTF_8.name(); charsetName = StandardCharsets.UTF_8.name();
} }
content = new String(response.getContent(), charsetName); content = new String(response.getContent(), charsetName);
} catch (final Exception e) { } catch (final Exception e) {
prop.put("error", "4"); prop.put("error", "4");
prop.putHTML("error_errorText", e.getMessage()); prop.putHTML("error_errorText", e.getMessage());
@ -234,24 +257,6 @@ public class ViewFile {
prop.put("viewMode", VIEW_MODE_AS_PLAIN_TEXT); prop.put("viewMode", VIEW_MODE_AS_PLAIN_TEXT);
prop.put("viewMode_plainText", markup(wordArray, content).replaceAll("\n", "<br />").replaceAll("\t", "&nbsp;&nbsp;&nbsp;&nbsp;")); prop.put("viewMode_plainText", markup(wordArray, content).replaceAll("\n", "<br />").replaceAll("\t", "&nbsp;&nbsp;&nbsp;&nbsp;"));
} else if (viewMode.equals("iframeWeb")) {
prop.put("viewMode", VIEW_MODE_AS_IFRAME_FROM_WEB);
prop.put("viewMode_url", url.toNormalform(true));
} else if (viewMode.equals("iframeCache")) {
prop.put("viewMode", VIEW_MODE_AS_IFRAME_FROM_CACHE);
prop.put("viewMode_png", 0);
prop.put("viewMode_html", 0);
if (response.docType() == Response.DT_IMAGE) {
prop.put("viewMode_png", 1);
prop.put("viewMode_png_url", url.toNormalform(true));
} else {
prop.put("viewMode_html", 1);
prop.put("viewMode_html_url", url.toNormalform(true));
}
} else if (viewMode.equals("iframeCitations")) {
prop.put("viewMode", VIEW_MODE_AS_IFRAME_FROM_CITATION_REPORT);
prop.put("viewMode_url", url.toNormalform(true));
} else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("words") || viewMode.equals("links")) { } else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("words") || viewMode.equals("links")) {
// parsing the resource content // parsing the resource content
Document document = null; Document document = null;
@ -344,29 +349,35 @@ public class ViewFile {
} else if (viewMode.equals("links")) { } else if (viewMode.equals("links")) {
putLinks(prop, wordArray, document, post.get("agentName")); putLinks(prop, wordArray, document, post.get("agentName"));
} }
// optional: generate snippet // optional: generate snippet
if (showSnippet) { if (showSnippet) {
final QueryGoal goal = new QueryGoal(post.get("search", "")); final QueryGoal goal = new QueryGoal(post.get("search", ""));
final TextSnippet snippet = new TextSnippet( try {
sb.loader, final TextSnippet snippet = new TextSnippet(
urlEntry, sb.loader,
goal.getIncludeWordsSet(), urlEntry,
goal.getIncludeHashes(), goal.getIncludeWordsSet(),
CacheStrategy.CACHEONLY, goal.getIncludeHashes(),
false, CacheStrategy.CACHEONLY,
SearchEvent.SNIPPET_MAX_LENGTH, false,
false); SearchEvent.SNIPPET_MAX_LENGTH,
String titlestr = urlEntry.dc_title(); false);
// if title is empty use filename as title String titlestr = urlEntry.dc_title();
if (titlestr.isEmpty()) { // if url has no filename, title is still empty (e.g. "www.host.com/" ) // if title is empty use filename as title
titlestr = urlEntry.url() != null ? urlEntry.url().getFileName() : ""; if (titlestr.isEmpty()) { // if url has no filename, title is still empty (e.g. "www.host.com/" )
titlestr = urlEntry.url() != null ? urlEntry.url().getFileName() : "";
}
final String desc = (snippet == null) ? "" : snippet.descriptionline(goal);
prop.put("showSnippet_headline", titlestr);
prop.put("showSnippet_teasertext", desc);
prop.put("showSnippet", 1);
} catch (UnsupportedOperationException e) {
prop.put("showSnippet_headline", "<no snippet found>");
prop.put("showSnippet_teasertext", "<no snippet found>");
prop.put("showSnippet", 1);
} }
final String desc = (snippet == null) ? "" : snippet.descriptionline(goal);
prop.put("showSnippet_headline", titlestr);
prop.put("showSnippet_teasertext", desc);
prop.put("showSnippet", 1);
} }
// update index with parsed resource if index entry is older or missing // update index with parsed resource if index entry is older or missing
final long responseSize = response.size(); final long responseSize = response.size();
@ -378,6 +389,31 @@ public class ViewFile {
} }
} }
if (document != null) document.close(); if (document != null) document.close();
} else if (viewMode.equals("schema")) {
prop.put("viewMode", VIEW_MODE_AS_SCHEMA);
prop.put("viewMode_url", url.toNormalform(true));
// list all fields in the document which have text or string content
// first we must load the solr document from the index
try {
final SolrDocument solrDocument = indexSegment.fulltext().getDefaultConnector().getDocumentById(ASCII.String(url.hash()));
if (solrDocument != null) {
int c = 0;
for (final String fieldName : solrDocument.getFieldNames()) {
final Object value = solrDocument.getFieldValue(fieldName);
if (value instanceof String || value instanceof Collection) {
prop.put("viewMode_fields_" + c + "_key", fieldName);
prop.put("viewMode_fields_" + c + "_value", value.toString());
c++;
}
}
prop.put("viewMode_fields", c);
}
} catch (IOException e) {
e.printStackTrace();
}
} else if (viewMode.equals("iframeCitations")) {
prop.put("viewMode", VIEW_MODE_AS_IFRAME_FROM_CITATION_REPORT);
prop.put("viewMode_url", url.toNormalform(true));
} }
prop.put("error", "0"); prop.put("error", "0");
prop.put("error_url", url.toNormalform(true)); prop.put("error_url", url.toNormalform(true));

Loading…
Cancel
Save