- extended ViewFile so it can be used as a debugging tool: you can now use the

post-parameter 'url' to submit a URL directly
- fixed some bugs in text parser (not all parts had been analysed)
- fixed a bug in remote search interface (could not handle constraints)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3001 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 30888e7a2f
commit 8e7215475b

@ -47,6 +47,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.Enumeration;
@ -55,6 +56,7 @@ import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
@ -87,24 +89,21 @@ public class ViewFile {
serverObjects prop = new serverObjects();
plasmaSwitchboard sb = (plasmaSwitchboard)env;
if (post != null && post.containsKey("words"))
try {
prop.put("error_words",URLEncoder.encode((String) post.get("words"), "UTF-8"));
} catch (UnsupportedEncodingException e1) {
// ignore this. this should not occure
}
// getting the url hash from which the content should be loaded
String urlHash = post.get("urlHash","");
if (urlHash.equals("")) {
prop.put("error",1);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
String viewMode = post.get("viewMode","sentences");
if (post != null && post.containsKey("words")) try {
prop.put("error_words",URLEncoder.encode((String) post.get("words"), "UTF-8"));
} catch (UnsupportedEncodingException e1) {
// ignore this. this should not occure
}
String viewMode = post.get("viewMode","sentences");
URL url = null;
String descr = "";
int wordCount = 0;
int size = 0;
// getting the url hash from which the content should be loaded
String urlHash = post.get("urlHash","");
if (urlHash.length() > 0) {
// getting the urlEntry that belongs to the url hash
indexURLEntry urlEntry = null;
urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
@ -113,196 +112,238 @@ public class ViewFile {
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
// gettin the url that belongs to the entry
// gettin the url that belongs to the entry
indexURLEntry.Components comp = urlEntry.comp();
if ((comp == null) || (comp.url() == null)) {
prop.put("error",3);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
prop.put("error", 3);
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
url = comp.url();
descr = comp.descr();
urlEntry.wordCount();
size = urlEntry.size();
}
// alternatively, get the url simply from a url String
// this can be used as a simple tool to test the text parser
String urlString = post.get("url", "");
if (urlString.length() > 0) try {
url = new URL(urlString);
} catch (MalformedURLException e) {}
if (url == null) {
prop.put("error", 1);
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
// loading the resource content as byte array
InputStream resource = null;
long resourceLength = -1;
IResourceInfo resInfo = null;
String resMime = null;
try {
// trying to load the resource body
resource = sb.cacheManager.getResourceContentStream(url);
resourceLength = sb.cacheManager.getResourceContentLength(url);
// if the resource body was not cached we try to load it from web
if (resource == null) {
plasmaHTCache.Entry entry = null;
try {
entry = sb.snippetCache.loadResourceFromWeb(url, 5000, false);
} catch (plasmaCrawlerException e) {
prop.put("error", 4);
prop.put("error_errorText", e.getMessage());
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
// loading the resource content as byte array
InputStream resource = null;
long resourceLength = -1;
IResourceInfo resInfo = null;
String resMime = null;
try {
// trying to load the resource body
resource = sb.cacheManager.getResourceContentStream(comp.url());
resourceLength = sb.cacheManager.getResourceContentLength(comp.url());
if (entry != null) {
resInfo = entry.getDocumentInfo();
resource = sb.cacheManager.getResourceContentStream(url);
resourceLength = sb.cacheManager.getResourceContentLength(url);
}
// if the resource body was not cached we try to load it from web
if (resource == null) {
plasmaHTCache.Entry entry = null;
try {
entry = sb.snippetCache.loadResourceFromWeb(comp.url(), 5000, false);
} catch (plasmaCrawlerException e) {
prop.put("error",4);
prop.put("error_errorText",e.getMessage());
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
prop.put("error", 4);
prop.put("error_errorText", "No resource available");
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
}
if (entry != null) {
resInfo = entry.getDocumentInfo();
resource = sb.cacheManager.getResourceContentStream(comp.url());
resourceLength = sb.cacheManager.getResourceContentLength(comp.url());
}
// try to load resource metadata
if (resInfo == null) {
if (resource == null) {
prop.put("error",4);
prop.put("error_errorText","No resource available");
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
// try to load the metadata from cache
try {
resInfo = sb.cacheManager.loadResourceInfo(url);
} catch (Exception e) {
/* ignore this */
}
// try to load resource metadata
// if the metadata where not cached try to load it from web
if (resInfo == null) {
String protocol = url.getProtocol();
if (!((protocol.equals("http") || protocol.equals("https")))) {
prop.put("error", 6);
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
// try to load the metadata from cache
httpHeader responseHeader = httpc.whead(url, url.getHost(), 5000, null, null, sb.remoteProxyConfig);
if (responseHeader == null) {
prop.put("error", 4);
prop.put("error_errorText", "Unable to load resource metadata.");
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
try {
resInfo = sb.cacheManager.loadResourceInfo(comp.url());
} catch (Exception e) { /* ignore this */}
// if the metadata where not cached try to load it from web
if (resInfo == null) {
String protocol = comp.url().getProtocol();
if (!((protocol.equals("http") || protocol.equals("https")))) {
prop.put("error",6);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
httpHeader responseHeader = httpc.whead(comp.url(),comp.url().getHost(),5000,null,null,sb.remoteProxyConfig);
if (responseHeader == null) {
prop.put("error",4);
prop.put("error_errorText","Unable to load resource metadata.");
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
try {
resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(comp.url(), responseHeader);
} catch (Exception e) {
prop.put("error",4);
prop.put("error_errorText",e.getMessage());
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
resMime = responseHeader.mime();
resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, responseHeader);
} catch (Exception e) {
prop.put("error", 4);
prop.put("error_errorText", e.getMessage());
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
} else {
resMime = resInfo.getMimeType();
resMime = responseHeader.mime();
}
} catch (IOException e) {
if (resource != null) try { resource.close(); } catch (Exception ex) {/* ignore this */}
prop.put("error",4);
prop.put("error_errorText",e.getMessage());
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
if (viewMode.equals("plain")) {
// TODO: how to handle very large files here ?
String content;
} else {
resMime = resInfo.getMimeType();
}
} catch (IOException e) {
if (resource != null)
try {
content = new String(serverFileUtils.read(resource),"UTF-8");
} catch (Exception e) {
prop.put("error",4);
prop.put("error_errorText",e.getMessage());
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
} finally {
if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */}
resource.close();
} catch (Exception ex) {
/* ignore this */
}
content = content.replaceAll("<","&lt;")
.replaceAll(">","&gt;")
.replaceAll("\"","&quot;")
.replaceAll("\n","<br>")
.replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");
prop.put("error",0);
prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT);
prop.put("viewMode_plainText",content);
} else if (viewMode.equals("iframe")) {
prop.put("viewMode",VIEW_MODE_AS_IFRAME);
prop.put("viewMode_url",comp.url().toNormalform());
} else if (viewMode.equals("parsed") || viewMode.equals("sentences")) {
// parsing the resource content
plasmaParserDocument document = null;
try {
document = sb.snippetCache.parseDocument(comp.url(), resourceLength, resource,resInfo);
if (document == null) {
prop.put("error",5);
prop.put("error_errorText","Unknown error");
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
prop.put("error", 4);
prop.put("error_errorText", e.getMessage());
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
if (viewMode.equals("plain")) {
// TODO: how to handle very large files here ?
String content;
try {
content = new String(serverFileUtils.read(resource), "UTF-8");
} catch (Exception e) {
prop.put("error", 4);
prop.put("error_errorText", e.getMessage());
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
} finally {
if (resource != null)
try {
resource.close();
} catch (Exception e) {
/* ignore this */
}
} catch (ParserException e) {
prop.put("error",5);
prop.put("error_errorText",e.getMessage());
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
} finally {
if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */}
}
content = content.replaceAll("<", "&lt;").replaceAll(">", "&gt;")
.replaceAll("\"", "&quot;").replaceAll("\n", "<br>")
.replaceAll("\t", "&nbsp;&nbsp;&nbsp;&nbsp;");
prop.put("error", 0);
prop.put("viewMode", VIEW_MODE_AS_PLAIN_TEXT);
prop.put("viewMode_plainText", content);
} else if (viewMode.equals("iframe")) {
prop.put("viewMode", VIEW_MODE_AS_IFRAME);
prop.put("viewMode_url", url.toNormalform());
} else if (viewMode.equals("parsed") || viewMode.equals("sentences")) {
// parsing the resource content
plasmaParserDocument document = null;
try {
document = sb.snippetCache.parseDocument(url, resourceLength, resource, resInfo);
if (document == null) {
prop.put("error", 5);
prop.put("error_errorText", "Unknown error");
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
resMime = document.getMimeType();
if (viewMode.equals("parsed")) {
String content = new String(document.getTextBytes());
content = wikiCode.replaceHTML(content); //added by Marc Nause
content = content.replaceAll("\n","<br>")
.replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");
prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT);
prop.put("viewMode_parsedText",content);
} else {
prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES);
final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
boolean dark = true;
int i = 0;
if (sentences != null) while (sentences.hasMoreElements()) {
String currentSentence = wikiCode.replaceHTML((String) sentences.nextElement());
} catch (ParserException e) {
prop.put("error", 5);
prop.put("error_errorText", e.getMessage());
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
} finally {
if (resource != null)
try {
resource.close();
} catch (Exception e) {
/* ignore this */
}
}
resMime = document.getMimeType();
if (viewMode.equals("parsed")) {
String content = new String(document.getTextBytes());
content = wikiCode.replaceHTML(content); // added by Marc Nause
content = content.replaceAll("\n", "<br>").replaceAll("\t", "&nbsp;&nbsp;&nbsp;&nbsp;");
prop.put("viewMode", VIEW_MODE_AS_PARSED_TEXT);
prop.put("viewMode_parsedText", content);
} else {
prop.put("viewMode", VIEW_MODE_AS_PARSED_SENTENCES);
final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
boolean dark = true;
int i = 0;
if (sentences != null)
while (sentences.hasMoreElements()) {
String currentSentence = wikiCode
.replaceHTML((String) sentences.nextElement());
// Search word highlighting
String words = post.get("words",null);
String words = post.get("words", null);
if (words != null) {
try {
words = URLDecoder.decode(words,"UTF-8");
} catch (UnsupportedEncodingException e) {}
String[] wordArray = words.substring(1,words.length()-1).split(",");
for (int j=0; j < wordArray.length; j++) {
String currentWord = wordArray[j].trim();
currentSentence = currentSentence.replaceAll(currentWord,
"<b style=\"color: black; background-color: rgb(" + highlightingColors[j%6] + ");\">" + currentWord + "</b>");
words = URLDecoder.decode(words, "UTF-8");
} catch (UnsupportedEncodingException e) {
}
String[] wordArray = words.substring(1,
words.length() - 1).split(",");
for (int j = 0; j < wordArray.length; j++) {
String currentWord = wordArray[j].trim();
currentSentence = currentSentence.replaceAll(
currentWord,
"<b style=\"color: black; background-color: rgb("
+ highlightingColors[j % 6]
+ ");\">" + currentWord
+ "</b>");
}
}
prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1));
prop.put("viewMode_sentences_" + i + "_text",currentSentence);
prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark;
prop.put("viewMode_sentences_" + i + "_nr", Integer.toString(i + 1));
prop.put("viewMode_sentences_" + i + "_text", currentSentence);
prop.put("viewMode_sentences_" + i + "_dark", ((dark) ? 1 : 0));
dark = !dark;
i++;
}
prop.put("viewMode_sentences", i);
prop.put("viewMode_sentences", i);
}
if (document != null) document.close();
}
prop.put("error", 0);
prop.put("error_url", comp.url().toNormalform());
prop.put("error_hash", urlHash);
prop.put("error_wordCount", Integer.toString(urlEntry.wordCount()));
prop.put("error_desc", comp.descr());
prop.put("error_size", urlEntry.size());
prop.put("error_mimeType", resMime);
return prop;
if (document != null) document.close();
}
prop.put("error", 0);
prop.put("error_url", url.toNormalform());
prop.put("error_hash", urlHash);
prop.put("error_wordCount", Integer.toString(wordCount));
prop.put("error_desc", descr);
prop.put("error_size", size);
prop.put("error_mimeType", resMime);
return prop;
}
}

@ -94,7 +94,7 @@ public final class search {
final String prefer = post.get("prefer", "");
final String filter = post.get("filter", ".*");
final boolean includesnippet = post.get("includesnippet", "false").equals("true");
final kelondroBitfield constraint = kelondroBitfield(4, post.get("constraint", "______"));
final kelondroBitfield constraint = new kelondroBitfield(4, post.get("constraint", "______"));
// final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers
// Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time
@ -301,9 +301,4 @@ public final class search {
return prop;
}
private static kelondroBitfield kelondroBitfield(int i, String string) {
// TODO Auto-generated method stub
return null;
}
}

@ -250,7 +250,7 @@ public final class plasmaCondenser {
sievedWordsEnum wordenum = new sievedWordsEnum(is, wordminsize);
while (wordenum.hasMoreElements()) {
word = ((String) wordenum.nextElement()).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars?
// System.out.println("PARSED-WORD " + word);
//System.out.println("PARSED-WORD " + word);
// distinguish punctuation and words
wordlen = word.length();
@ -740,6 +740,7 @@ public final class plasmaCondenser {
private Object nextElement0() {
try {
String s = readSentence(raf);
//System.out.println(" SENTENCE='" + s + "'"); // DEBUG
if (s == null) {
raf.close();
return null;
@ -782,7 +783,10 @@ public final class plasmaCondenser {
// find sentence end
for (;;) {
nextChar = reader.read();
if (nextChar < 0) return null;
//System.out.print((char) nextChar); // DEBUG
if (nextChar < 0) {
if (s.length() == 0) return null; else break;
}
c = (char) nextChar;
s.append(c);
if (htmlFilterContentScraper.punctuation(c)) break;

Loading…
Cancel
Save