- extended ViewFile to use it as a debugging tool: you can now use the

post-parameter url to submit a URL directly
- fixed some bugs in text parser (not all parts had been analysed)
- fixed a bug in remote search interface (could not handle constraints)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3001 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 30888e7a2f
commit 8e7215475b

@ -47,6 +47,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.Enumeration;
@ -55,6 +56,7 @@ import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
@ -87,24 +89,21 @@ public class ViewFile {
serverObjects prop = new serverObjects();
plasmaSwitchboard sb = (plasmaSwitchboard)env;
if (post != null && post.containsKey("words"))
try {
if (post != null && post.containsKey("words")) try {
prop.put("error_words",URLEncoder.encode((String) post.get("words"), "UTF-8"));
} catch (UnsupportedEncodingException e1) {
// ignore this. this should not occure
}
String viewMode = post.get("viewMode","sentences");
URL url = null;
String descr = "";
int wordCount = 0;
int size = 0;
// getting the url hash from which the content should be loaded
String urlHash = post.get("urlHash","");
if (urlHash.equals("")) {
prop.put("error",1);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
String viewMode = post.get("viewMode","sentences");
if (urlHash.length() > 0) {
// getting the urlEntry that belongs to the url hash
indexURLEntry urlEntry = null;
urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
@ -121,6 +120,25 @@ public class ViewFile {
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
url = comp.url();
descr = comp.descr();
urlEntry.wordCount();
size = urlEntry.size();
}
// alternatively, get the url simply from a url String
// this can be used as a simple tool to test the text parser
String urlString = post.get("url", "");
if (urlString.length() > 0) try {
url = new URL(urlString);
} catch (MalformedURLException e) {}
if (url == null) {
prop.put("error", 1);
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
// loading the resource content as byte array
InputStream resource = null;
@ -129,14 +147,14 @@ public class ViewFile {
String resMime = null;
try {
// trying to load the resource body
resource = sb.cacheManager.getResourceContentStream(comp.url());
resourceLength = sb.cacheManager.getResourceContentLength(comp.url());
resource = sb.cacheManager.getResourceContentStream(url);
resourceLength = sb.cacheManager.getResourceContentLength(url);
// if the resource body was not cached we try to load it from web
if (resource == null) {
plasmaHTCache.Entry entry = null;
try {
entry = sb.snippetCache.loadResourceFromWeb(comp.url(), 5000, false);
entry = sb.snippetCache.loadResourceFromWeb(url, 5000, false);
} catch (plasmaCrawlerException e) {
prop.put("error", 4);
prop.put("error_errorText", e.getMessage());
@ -146,8 +164,8 @@ public class ViewFile {
if (entry != null) {
resInfo = entry.getDocumentInfo();
resource = sb.cacheManager.getResourceContentStream(comp.url());
resourceLength = sb.cacheManager.getResourceContentLength(comp.url());
resource = sb.cacheManager.getResourceContentStream(url);
resourceLength = sb.cacheManager.getResourceContentLength(url);
}
if (resource == null) {
@ -163,19 +181,21 @@ public class ViewFile {
// try to load the metadata from cache
try {
resInfo = sb.cacheManager.loadResourceInfo(comp.url());
} catch (Exception e) { /* ignore this */}
resInfo = sb.cacheManager.loadResourceInfo(url);
} catch (Exception e) {
/* ignore this */
}
// if the metadata where not cached try to load it from web
if (resInfo == null) {
String protocol = comp.url().getProtocol();
String protocol = url.getProtocol();
if (!((protocol.equals("http") || protocol.equals("https")))) {
prop.put("error", 6);
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
httpHeader responseHeader = httpc.whead(comp.url(),comp.url().getHost(),5000,null,null,sb.remoteProxyConfig);
httpHeader responseHeader = httpc.whead(url, url.getHost(), 5000, null, null, sb.remoteProxyConfig);
if (responseHeader == null) {
prop.put("error", 4);
prop.put("error_errorText", "Unable to load resource metadata.");
@ -183,7 +203,7 @@ public class ViewFile {
return prop;
}
try {
resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(comp.url(), responseHeader);
resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, responseHeader);
} catch (Exception e) {
prop.put("error", 4);
prop.put("error_errorText", e.getMessage());
@ -196,7 +216,12 @@ public class ViewFile {
resMime = resInfo.getMimeType();
}
} catch (IOException e) {
if (resource != null) try { resource.close(); } catch (Exception ex) {/* ignore this */}
if (resource != null)
try {
resource.close();
} catch (Exception ex) {
/* ignore this */
}
prop.put("error", 4);
prop.put("error_errorText", e.getMessage());
prop.put("viewMode", VIEW_MODE_NO_TEXT);
@ -215,13 +240,16 @@ public class ViewFile {
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
} finally {
if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */}
if (resource != null)
try {
resource.close();
} catch (Exception e) {
/* ignore this */
}
}
content = content.replaceAll("<","&lt;")
.replaceAll(">","&gt;")
.replaceAll("\"","&quot;")
.replaceAll("\n","<br>")
content = content.replaceAll("<", "&lt;").replaceAll(">", "&gt;")
.replaceAll("\"", "&quot;").replaceAll("\n", "<br>")
.replaceAll("\t", "&nbsp;&nbsp;&nbsp;&nbsp;");
prop.put("error", 0);
@ -229,12 +257,12 @@ public class ViewFile {
prop.put("viewMode_plainText", content);
} else if (viewMode.equals("iframe")) {
prop.put("viewMode", VIEW_MODE_AS_IFRAME);
prop.put("viewMode_url",comp.url().toNormalform());
prop.put("viewMode_url", url.toNormalform());
} else if (viewMode.equals("parsed") || viewMode.equals("sentences")) {
// parsing the resource content
plasmaParserDocument document = null;
try {
document = sb.snippetCache.parseDocument(comp.url(), resourceLength, resource,resInfo);
document = sb.snippetCache.parseDocument(url, resourceLength, resource, resInfo);
if (document == null) {
prop.put("error", 5);
prop.put("error_errorText", "Unknown error");
@ -247,7 +275,12 @@ public class ViewFile {
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
} finally {
if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */}
if (resource != null)
try {
resource.close();
} catch (Exception e) {
/* ignore this */
}
}
resMime = document.getMimeType();
@ -255,8 +288,7 @@ public class ViewFile {
if (viewMode.equals("parsed")) {
String content = new String(document.getTextBytes());
content = wikiCode.replaceHTML(content); // added by Marc Nause
content = content.replaceAll("\n","<br>")
.replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");
content = content.replaceAll("\n", "<br>").replaceAll("\t", "&nbsp;&nbsp;&nbsp;&nbsp;");
prop.put("viewMode", VIEW_MODE_AS_PARSED_TEXT);
prop.put("viewMode_parsedText", content);
@ -266,27 +298,36 @@ public class ViewFile {
boolean dark = true;
int i = 0;
if (sentences != null) while (sentences.hasMoreElements()) {
String currentSentence = wikiCode.replaceHTML((String) sentences.nextElement());
if (sentences != null)
while (sentences.hasMoreElements()) {
String currentSentence = wikiCode
.replaceHTML((String) sentences.nextElement());
// Search word highlighting
String words = post.get("words", null);
if (words != null) {
try {
words = URLDecoder.decode(words, "UTF-8");
} catch (UnsupportedEncodingException e) {}
} catch (UnsupportedEncodingException e) {
}
String[] wordArray = words.substring(1,words.length()-1).split(",");
String[] wordArray = words.substring(1,
words.length() - 1).split(",");
for (int j = 0; j < wordArray.length; j++) {
String currentWord = wordArray[j].trim();
currentSentence = currentSentence.replaceAll(currentWord,
"<b style=\"color: black; background-color: rgb(" + highlightingColors[j%6] + ");\">" + currentWord + "</b>");
currentSentence = currentSentence.replaceAll(
currentWord,
"<b style=\"color: black; background-color: rgb("
+ highlightingColors[j % 6]
+ ");\">" + currentWord
+ "</b>");
}
}
prop.put("viewMode_sentences_" + i + "_nr", Integer.toString(i + 1));
prop.put("viewMode_sentences_" + i + "_text", currentSentence);
prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark;
prop.put("viewMode_sentences_" + i + "_dark", ((dark) ? 1 : 0));
dark = !dark;
i++;
}
prop.put("viewMode_sentences", i);
@ -295,11 +336,11 @@ public class ViewFile {
if (document != null) document.close();
}
prop.put("error", 0);
prop.put("error_url", comp.url().toNormalform());
prop.put("error_url", url.toNormalform());
prop.put("error_hash", urlHash);
prop.put("error_wordCount", Integer.toString(urlEntry.wordCount()));
prop.put("error_desc", comp.descr());
prop.put("error_size", urlEntry.size());
prop.put("error_wordCount", Integer.toString(wordCount));
prop.put("error_desc", descr);
prop.put("error_size", size);
prop.put("error_mimeType", resMime);
return prop;

@ -94,7 +94,7 @@ public final class search {
final String prefer = post.get("prefer", "");
final String filter = post.get("filter", ".*");
final boolean includesnippet = post.get("includesnippet", "false").equals("true");
final kelondroBitfield constraint = kelondroBitfield(4, post.get("constraint", "______"));
final kelondroBitfield constraint = new kelondroBitfield(4, post.get("constraint", "______"));
// final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers
// Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time
@ -301,9 +301,4 @@ public final class search {
return prop;
}
/**
 * Creates a {@code kelondroBitfield} from the given arguments.
 *
 * This used to be an auto-generated stub that always returned
 * {@code null}, so the search constraint built from the "constraint"
 * post parameter was never parsed. It now delegates to the two-argument
 * {@code kelondroBitfield} constructor.
 *
 * @param i      first constructor argument; the visible caller passes 4
 *               (NOTE(review): presumably the bitfield size in bytes - confirm)
 * @param string second constructor argument; the visible caller passes the
 *               "constraint" post parameter
 * @return a newly constructed bitfield, never {@code null}
 */
private static kelondroBitfield kelondroBitfield(int i, String string) {
return new kelondroBitfield(i, string);
}
}

@ -740,6 +740,7 @@ public final class plasmaCondenser {
private Object nextElement0() {
try {
String s = readSentence(raf);
//System.out.println(" SENTENCE='" + s + "'"); // DEBUG
if (s == null) {
raf.close();
return null;
@ -782,7 +783,10 @@ public final class plasmaCondenser {
// find sentence end
for (;;) {
nextChar = reader.read();
if (nextChar < 0) return null;
//System.out.print((char) nextChar); // DEBUG
if (nextChar < 0) {
if (s.length() == 0) return null; else break;
}
c = (char) nextChar;
s.append(c);
if (htmlFilterContentScraper.punctuation(c)) break;

Loading…
Cancel
Save