- extended ViewFile to use it as a debugging tool: you can now use the

post parameter "url" to submit a URL directly
- fixed some bugs in text parser (not all parts had been analysed)
- fixed a bug in remote search interface (could not handle constraints)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3001 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 30888e7a2f
commit 8e7215475b

@ -47,6 +47,7 @@
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLDecoder; import java.net.URLDecoder;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.util.Enumeration; import java.util.Enumeration;
@ -55,6 +56,7 @@ import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.index.indexURLEntry; import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
@ -87,24 +89,21 @@ public class ViewFile {
serverObjects prop = new serverObjects(); serverObjects prop = new serverObjects();
plasmaSwitchboard sb = (plasmaSwitchboard)env; plasmaSwitchboard sb = (plasmaSwitchboard)env;
if (post != null && post.containsKey("words")) if (post != null && post.containsKey("words")) try {
try { prop.put("error_words",URLEncoder.encode((String) post.get("words"), "UTF-8"));
prop.put("error_words",URLEncoder.encode((String) post.get("words"), "UTF-8")); } catch (UnsupportedEncodingException e1) {
} catch (UnsupportedEncodingException e1) { // ignore this. this should not occure
// ignore this. this should not occure }
}
String viewMode = post.get("viewMode","sentences");
URL url = null;
// getting the url hash from which the content should be loaded String descr = "";
String urlHash = post.get("urlHash",""); int wordCount = 0;
if (urlHash.equals("")) { int size = 0;
prop.put("error",1);
prop.put("viewMode",VIEW_MODE_NO_TEXT); // getting the url hash from which the content should be loaded
return prop; String urlHash = post.get("urlHash","");
} if (urlHash.length() > 0) {
String viewMode = post.get("viewMode","sentences");
// getting the urlEntry that belongs to the url hash // getting the urlEntry that belongs to the url hash
indexURLEntry urlEntry = null; indexURLEntry urlEntry = null;
urlEntry = sb.urlPool.loadedURL.load(urlHash, null); urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
@ -113,196 +112,238 @@ public class ViewFile {
prop.put("viewMode",VIEW_MODE_NO_TEXT); prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop; return prop;
} }
// gettin the url that belongs to the entry // gettin the url that belongs to the entry
indexURLEntry.Components comp = urlEntry.comp(); indexURLEntry.Components comp = urlEntry.comp();
if ((comp == null) || (comp.url() == null)) { if ((comp == null) || (comp.url() == null)) {
prop.put("error",3); prop.put("error", 3);
prop.put("viewMode",VIEW_MODE_NO_TEXT); prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop; return prop;
} }
url = comp.url();
descr = comp.descr();
urlEntry.wordCount();
size = urlEntry.size();
}
// alternatively, get the url simply from a url String
// this can be used as a simple tool to test the text parser
String urlString = post.get("url", "");
if (urlString.length() > 0) try {
url = new URL(urlString);
} catch (MalformedURLException e) {}
if (url == null) {
prop.put("error", 1);
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
// loading the resource content as byte array
InputStream resource = null;
long resourceLength = -1;
IResourceInfo resInfo = null;
String resMime = null;
try {
// trying to load the resource body
resource = sb.cacheManager.getResourceContentStream(url);
resourceLength = sb.cacheManager.getResourceContentLength(url);
// if the resource body was not cached we try to load it from web
if (resource == null) {
plasmaHTCache.Entry entry = null;
try {
entry = sb.snippetCache.loadResourceFromWeb(url, 5000, false);
} catch (plasmaCrawlerException e) {
prop.put("error", 4);
prop.put("error_errorText", e.getMessage());
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
// loading the resource content as byte array if (entry != null) {
InputStream resource = null; resInfo = entry.getDocumentInfo();
long resourceLength = -1; resource = sb.cacheManager.getResourceContentStream(url);
IResourceInfo resInfo = null; resourceLength = sb.cacheManager.getResourceContentLength(url);
String resMime = null; }
try {
// trying to load the resource body
resource = sb.cacheManager.getResourceContentStream(comp.url());
resourceLength = sb.cacheManager.getResourceContentLength(comp.url());
// if the resource body was not cached we try to load it from web
if (resource == null) { if (resource == null) {
plasmaHTCache.Entry entry = null; prop.put("error", 4);
try { prop.put("error_errorText", "No resource available");
entry = sb.snippetCache.loadResourceFromWeb(comp.url(), 5000, false); prop.put("viewMode", VIEW_MODE_NO_TEXT);
} catch (plasmaCrawlerException e) { return prop;
prop.put("error",4); }
prop.put("error_errorText",e.getMessage()); }
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
if (entry != null) { // try to load resource metadata
resInfo = entry.getDocumentInfo(); if (resInfo == null) {
resource = sb.cacheManager.getResourceContentStream(comp.url());
resourceLength = sb.cacheManager.getResourceContentLength(comp.url());
}
if (resource == null) { // try to load the metadata from cache
prop.put("error",4); try {
prop.put("error_errorText","No resource available"); resInfo = sb.cacheManager.loadResourceInfo(url);
prop.put("viewMode",VIEW_MODE_NO_TEXT); } catch (Exception e) {
return prop; /* ignore this */
}
} }
// try to load resource metadata // if the metadata where not cached try to load it from web
if (resInfo == null) { if (resInfo == null) {
String protocol = url.getProtocol();
if (!((protocol.equals("http") || protocol.equals("https")))) {
prop.put("error", 6);
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
// try to load the metadata from cache httpHeader responseHeader = httpc.whead(url, url.getHost(), 5000, null, null, sb.remoteProxyConfig);
if (responseHeader == null) {
prop.put("error", 4);
prop.put("error_errorText", "Unable to load resource metadata.");
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
try { try {
resInfo = sb.cacheManager.loadResourceInfo(comp.url()); resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, responseHeader);
} catch (Exception e) { /* ignore this */} } catch (Exception e) {
prop.put("error", 4);
// if the metadata where not cached try to load it from web prop.put("error_errorText", e.getMessage());
if (resInfo == null) { prop.put("viewMode", VIEW_MODE_NO_TEXT);
String protocol = comp.url().getProtocol(); return prop;
if (!((protocol.equals("http") || protocol.equals("https")))) {
prop.put("error",6);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
httpHeader responseHeader = httpc.whead(comp.url(),comp.url().getHost(),5000,null,null,sb.remoteProxyConfig);
if (responseHeader == null) {
prop.put("error",4);
prop.put("error_errorText","Unable to load resource metadata.");
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
try {
resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(comp.url(), responseHeader);
} catch (Exception e) {
prop.put("error",4);
prop.put("error_errorText",e.getMessage());
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
resMime = responseHeader.mime();
} }
} else { resMime = responseHeader.mime();
resMime = resInfo.getMimeType();
} }
} catch (IOException e) { } else {
if (resource != null) try { resource.close(); } catch (Exception ex) {/* ignore this */} resMime = resInfo.getMimeType();
prop.put("error",4); }
prop.put("error_errorText",e.getMessage()); } catch (IOException e) {
prop.put("viewMode",VIEW_MODE_NO_TEXT); if (resource != null)
return prop;
}
if (viewMode.equals("plain")) {
// TODO: how to handle very large files here ?
String content;
try { try {
content = new String(serverFileUtils.read(resource),"UTF-8"); resource.close();
} catch (Exception e) { } catch (Exception ex) {
prop.put("error",4); /* ignore this */
prop.put("error_errorText",e.getMessage());
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
} finally {
if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */}
} }
prop.put("error", 4);
content = content.replaceAll("<","&lt;") prop.put("error_errorText", e.getMessage());
.replaceAll(">","&gt;") prop.put("viewMode", VIEW_MODE_NO_TEXT);
.replaceAll("\"","&quot;") return prop;
.replaceAll("\n","<br>") }
.replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");
if (viewMode.equals("plain")) {
prop.put("error",0);
prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT); // TODO: how to handle very large files here ?
prop.put("viewMode_plainText",content); String content;
} else if (viewMode.equals("iframe")) { try {
prop.put("viewMode",VIEW_MODE_AS_IFRAME); content = new String(serverFileUtils.read(resource), "UTF-8");
prop.put("viewMode_url",comp.url().toNormalform()); } catch (Exception e) {
} else if (viewMode.equals("parsed") || viewMode.equals("sentences")) { prop.put("error", 4);
// parsing the resource content prop.put("error_errorText", e.getMessage());
plasmaParserDocument document = null; prop.put("viewMode", VIEW_MODE_NO_TEXT);
try { return prop;
document = sb.snippetCache.parseDocument(comp.url(), resourceLength, resource,resInfo); } finally {
if (document == null) { if (resource != null)
prop.put("error",5); try {
prop.put("error_errorText","Unknown error"); resource.close();
prop.put("viewMode",VIEW_MODE_NO_TEXT); } catch (Exception e) {
return prop; /* ignore this */
} }
} catch (ParserException e) { }
prop.put("error",5);
prop.put("error_errorText",e.getMessage()); content = content.replaceAll("<", "&lt;").replaceAll(">", "&gt;")
prop.put("viewMode",VIEW_MODE_NO_TEXT); .replaceAll("\"", "&quot;").replaceAll("\n", "<br>")
return prop; .replaceAll("\t", "&nbsp;&nbsp;&nbsp;&nbsp;");
} finally {
if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */} prop.put("error", 0);
prop.put("viewMode", VIEW_MODE_AS_PLAIN_TEXT);
prop.put("viewMode_plainText", content);
} else if (viewMode.equals("iframe")) {
prop.put("viewMode", VIEW_MODE_AS_IFRAME);
prop.put("viewMode_url", url.toNormalform());
} else if (viewMode.equals("parsed") || viewMode.equals("sentences")) {
// parsing the resource content
plasmaParserDocument document = null;
try {
document = sb.snippetCache.parseDocument(url, resourceLength, resource, resInfo);
if (document == null) {
prop.put("error", 5);
prop.put("error_errorText", "Unknown error");
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
} }
} catch (ParserException e) {
resMime = document.getMimeType(); prop.put("error", 5);
prop.put("error_errorText", e.getMessage());
if (viewMode.equals("parsed")) { prop.put("viewMode", VIEW_MODE_NO_TEXT);
String content = new String(document.getTextBytes()); return prop;
content = wikiCode.replaceHTML(content); //added by Marc Nause } finally {
content = content.replaceAll("\n","<br>") if (resource != null)
.replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;"); try {
resource.close();
prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT); } catch (Exception e) {
prop.put("viewMode_parsedText",content); /* ignore this */
} else { }
prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES); }
final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
resMime = document.getMimeType();
boolean dark = true;
int i = 0; if (viewMode.equals("parsed")) {
if (sentences != null) while (sentences.hasMoreElements()) { String content = new String(document.getTextBytes());
String currentSentence = wikiCode.replaceHTML((String) sentences.nextElement()); content = wikiCode.replaceHTML(content); // added by Marc Nause
content = content.replaceAll("\n", "<br>").replaceAll("\t", "&nbsp;&nbsp;&nbsp;&nbsp;");
prop.put("viewMode", VIEW_MODE_AS_PARSED_TEXT);
prop.put("viewMode_parsedText", content);
} else {
prop.put("viewMode", VIEW_MODE_AS_PARSED_SENTENCES);
final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
boolean dark = true;
int i = 0;
if (sentences != null)
while (sentences.hasMoreElements()) {
String currentSentence = wikiCode
.replaceHTML((String) sentences.nextElement());
// Search word highlighting // Search word highlighting
String words = post.get("words",null); String words = post.get("words", null);
if (words != null) { if (words != null) {
try { try {
words = URLDecoder.decode(words,"UTF-8"); words = URLDecoder.decode(words, "UTF-8");
} catch (UnsupportedEncodingException e) {} } catch (UnsupportedEncodingException e) {
}
String[] wordArray = words.substring(1,words.length()-1).split(",");
for (int j=0; j < wordArray.length; j++) { String[] wordArray = words.substring(1,
String currentWord = wordArray[j].trim(); words.length() - 1).split(",");
currentSentence = currentSentence.replaceAll(currentWord, for (int j = 0; j < wordArray.length; j++) {
"<b style=\"color: black; background-color: rgb(" + highlightingColors[j%6] + ");\">" + currentWord + "</b>"); String currentWord = wordArray[j].trim();
currentSentence = currentSentence.replaceAll(
currentWord,
"<b style=\"color: black; background-color: rgb("
+ highlightingColors[j % 6]
+ ");\">" + currentWord
+ "</b>");
} }
} }
prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1)); prop.put("viewMode_sentences_" + i + "_nr", Integer.toString(i + 1));
prop.put("viewMode_sentences_" + i + "_text",currentSentence); prop.put("viewMode_sentences_" + i + "_text", currentSentence);
prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark; prop.put("viewMode_sentences_" + i + "_dark", ((dark) ? 1 : 0));
dark = !dark;
i++; i++;
} }
prop.put("viewMode_sentences", i); prop.put("viewMode_sentences", i);
}
if (document != null) document.close();
} }
prop.put("error", 0); if (document != null) document.close();
prop.put("error_url", comp.url().toNormalform()); }
prop.put("error_hash", urlHash); prop.put("error", 0);
prop.put("error_wordCount", Integer.toString(urlEntry.wordCount())); prop.put("error_url", url.toNormalform());
prop.put("error_desc", comp.descr()); prop.put("error_hash", urlHash);
prop.put("error_size", urlEntry.size()); prop.put("error_wordCount", Integer.toString(wordCount));
prop.put("error_mimeType", resMime); prop.put("error_desc", descr);
prop.put("error_size", size);
return prop; prop.put("error_mimeType", resMime);
return prop;
} }
} }

@ -94,7 +94,7 @@ public final class search {
final String prefer = post.get("prefer", ""); final String prefer = post.get("prefer", "");
final String filter = post.get("filter", ".*"); final String filter = post.get("filter", ".*");
final boolean includesnippet = post.get("includesnippet", "false").equals("true"); final boolean includesnippet = post.get("includesnippet", "false").equals("true");
final kelondroBitfield constraint = kelondroBitfield(4, post.get("constraint", "______")); final kelondroBitfield constraint = new kelondroBitfield(4, post.get("constraint", "______"));
// final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers // final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers
// Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time // Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time
@ -301,9 +301,4 @@ public final class search {
return prop; return prop;
} }
private static kelondroBitfield kelondroBitfield(int i, String string) {
// TODO Auto-generated method stub
return null;
}
} }

@ -250,7 +250,7 @@ public final class plasmaCondenser {
sievedWordsEnum wordenum = new sievedWordsEnum(is, wordminsize); sievedWordsEnum wordenum = new sievedWordsEnum(is, wordminsize);
while (wordenum.hasMoreElements()) { while (wordenum.hasMoreElements()) {
word = ((String) wordenum.nextElement()).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars? word = ((String) wordenum.nextElement()).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars?
// System.out.println("PARSED-WORD " + word); //System.out.println("PARSED-WORD " + word);
// distinguish punctuation and words // distinguish punctuation and words
wordlen = word.length(); wordlen = word.length();
@ -740,6 +740,7 @@ public final class plasmaCondenser {
private Object nextElement0() { private Object nextElement0() {
try { try {
String s = readSentence(raf); String s = readSentence(raf);
//System.out.println(" SENTENCE='" + s + "'"); // DEBUG
if (s == null) { if (s == null) {
raf.close(); raf.close();
return null; return null;
@ -782,7 +783,10 @@ public final class plasmaCondenser {
// find sentence end // find sentence end
for (;;) { for (;;) {
nextChar = reader.read(); nextChar = reader.read();
if (nextChar < 0) return null; //System.out.print((char) nextChar); // DEBUG
if (nextChar < 0) {
if (s.length() == 0) return null; else break;
}
c = (char) nextChar; c = (char) nextChar;
s.append(c); s.append(c);
if (htmlFilterContentScraper.punctuation(c)) break; if (htmlFilterContentScraper.punctuation(c)) break;

Loading…
Cancel
Save