From 8e7215475b2ed79837c4fd45911c621870f4993b Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 23 Nov 2006 15:47:19 +0000 Subject: [PATCH] - extended ViewFile to use it as a debugging tool: you can now use the post-parameter url to submit a URL directly - fixed some bugs in text parser (not all parts had been analysed) - fixed a bug in remote search interface (could not handle constraints) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3001 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/ViewFile.java | 395 ++++++++++--------- htroot/yacy/search.java | 7 +- source/de/anomic/plasma/plasmaCondenser.java | 8 +- 3 files changed, 225 insertions(+), 185 deletions(-) diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 47d428683..5077a0d06 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -47,6 +47,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; import java.net.URLDecoder; import java.net.URLEncoder; import java.util.Enumeration; @@ -55,6 +56,7 @@ import de.anomic.data.wikiCode; import de.anomic.http.httpHeader; import de.anomic.http.httpc; import de.anomic.index.indexURLEntry; +import de.anomic.net.URL; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaSwitchboard; @@ -87,24 +89,21 @@ public class ViewFile { serverObjects prop = new serverObjects(); plasmaSwitchboard sb = (plasmaSwitchboard)env; - if (post != null && post.containsKey("words")) - try { - prop.put("error_words",URLEncoder.encode((String) post.get("words"), "UTF-8")); - } catch (UnsupportedEncodingException e1) { - // ignore this. 
this should not occure - } - - - // getting the url hash from which the content should be loaded - String urlHash = post.get("urlHash",""); - if (urlHash.equals("")) { - prop.put("error",1); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; - } - - String viewMode = post.get("viewMode","sentences"); - + if (post != null && post.containsKey("words")) try { + prop.put("error_words",URLEncoder.encode((String) post.get("words"), "UTF-8")); + } catch (UnsupportedEncodingException e1) { + // ignore this. this should not occure + } + + String viewMode = post.get("viewMode","sentences"); + URL url = null; + String descr = ""; + int wordCount = 0; + int size = 0; + + // getting the url hash from which the content should be loaded + String urlHash = post.get("urlHash",""); + if (urlHash.length() > 0) { // getting the urlEntry that belongs to the url hash indexURLEntry urlEntry = null; urlEntry = sb.urlPool.loadedURL.load(urlHash, null); @@ -113,196 +112,238 @@ public class ViewFile { prop.put("viewMode",VIEW_MODE_NO_TEXT); return prop; } - - // gettin the url that belongs to the entry + + // gettin the url that belongs to the entry indexURLEntry.Components comp = urlEntry.comp(); if ((comp == null) || (comp.url() == null)) { - prop.put("error",3); - prop.put("viewMode",VIEW_MODE_NO_TEXT); + prop.put("error", 3); + prop.put("viewMode", VIEW_MODE_NO_TEXT); return prop; } + url = comp.url(); + descr = comp.descr(); + urlEntry.wordCount(); + size = urlEntry.size(); + } + + // alternatively, get the url simply from a url String + // this can be used as a simple tool to test the text parser + String urlString = post.get("url", ""); + if (urlString.length() > 0) try { + url = new URL(urlString); + } catch (MalformedURLException e) {} + + + if (url == null) { + prop.put("error", 1); + prop.put("viewMode", VIEW_MODE_NO_TEXT); + return prop; + } + + // loading the resource content as byte array + InputStream resource = null; + long resourceLength = -1; + IResourceInfo resInfo = 
null; + String resMime = null; + try { + // trying to load the resource body + resource = sb.cacheManager.getResourceContentStream(url); + resourceLength = sb.cacheManager.getResourceContentLength(url); + + // if the resource body was not cached we try to load it from web + if (resource == null) { + plasmaHTCache.Entry entry = null; + try { + entry = sb.snippetCache.loadResourceFromWeb(url, 5000, false); + } catch (plasmaCrawlerException e) { + prop.put("error", 4); + prop.put("error_errorText", e.getMessage()); + prop.put("viewMode", VIEW_MODE_NO_TEXT); + return prop; + } - // loading the resource content as byte array - InputStream resource = null; - long resourceLength = -1; - IResourceInfo resInfo = null; - String resMime = null; - try { - // trying to load the resource body - resource = sb.cacheManager.getResourceContentStream(comp.url()); - resourceLength = sb.cacheManager.getResourceContentLength(comp.url()); + if (entry != null) { + resInfo = entry.getDocumentInfo(); + resource = sb.cacheManager.getResourceContentStream(url); + resourceLength = sb.cacheManager.getResourceContentLength(url); + } - // if the resource body was not cached we try to load it from web if (resource == null) { - plasmaHTCache.Entry entry = null; - try { - entry = sb.snippetCache.loadResourceFromWeb(comp.url(), 5000, false); - } catch (plasmaCrawlerException e) { - prop.put("error",4); - prop.put("error_errorText",e.getMessage()); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; - } + prop.put("error", 4); + prop.put("error_errorText", "No resource available"); + prop.put("viewMode", VIEW_MODE_NO_TEXT); + return prop; + } + } - if (entry != null) { - resInfo = entry.getDocumentInfo(); - resource = sb.cacheManager.getResourceContentStream(comp.url()); - resourceLength = sb.cacheManager.getResourceContentLength(comp.url()); - } + // try to load resource metadata + if (resInfo == null) { - if (resource == null) { - prop.put("error",4); - prop.put("error_errorText","No resource 
available"); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; - } + // try to load the metadata from cache + try { + resInfo = sb.cacheManager.loadResourceInfo(url); + } catch (Exception e) { + /* ignore this */ } - // try to load resource metadata + // if the metadata where not cached try to load it from web if (resInfo == null) { + String protocol = url.getProtocol(); + if (!((protocol.equals("http") || protocol.equals("https")))) { + prop.put("error", 6); + prop.put("viewMode", VIEW_MODE_NO_TEXT); + return prop; + } - // try to load the metadata from cache + httpHeader responseHeader = httpc.whead(url, url.getHost(), 5000, null, null, sb.remoteProxyConfig); + if (responseHeader == null) { + prop.put("error", 4); + prop.put("error_errorText", "Unable to load resource metadata."); + prop.put("viewMode", VIEW_MODE_NO_TEXT); + return prop; + } try { - resInfo = sb.cacheManager.loadResourceInfo(comp.url()); - } catch (Exception e) { /* ignore this */} - - // if the metadata where not cached try to load it from web - if (resInfo == null) { - String protocol = comp.url().getProtocol(); - if (!((protocol.equals("http") || protocol.equals("https")))) { - prop.put("error",6); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; - } - - httpHeader responseHeader = httpc.whead(comp.url(),comp.url().getHost(),5000,null,null,sb.remoteProxyConfig); - if (responseHeader == null) { - prop.put("error",4); - prop.put("error_errorText","Unable to load resource metadata."); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; - } - try { - resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(comp.url(), responseHeader); - } catch (Exception e) { - prop.put("error",4); - prop.put("error_errorText",e.getMessage()); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; - } - resMime = responseHeader.mime(); + resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, responseHeader); + } catch (Exception e) { + prop.put("error", 
4); + prop.put("error_errorText", e.getMessage()); + prop.put("viewMode", VIEW_MODE_NO_TEXT); + return prop; } - } else { - resMime = resInfo.getMimeType(); + resMime = responseHeader.mime(); } - } catch (IOException e) { - if (resource != null) try { resource.close(); } catch (Exception ex) {/* ignore this */} - prop.put("error",4); - prop.put("error_errorText",e.getMessage()); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; - } - - if (viewMode.equals("plain")) { - - // TODO: how to handle very large files here ? - String content; + } else { + resMime = resInfo.getMimeType(); + } + } catch (IOException e) { + if (resource != null) try { - content = new String(serverFileUtils.read(resource),"UTF-8"); - } catch (Exception e) { - prop.put("error",4); - prop.put("error_errorText",e.getMessage()); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; - } finally { - if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */} + resource.close(); + } catch (Exception ex) { + /* ignore this */ } - - content = content.replaceAll("<","<") - .replaceAll(">",">") - .replaceAll("\"",""") - .replaceAll("\n","
") - .replaceAll("\t","    "); - - prop.put("error",0); - prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT); - prop.put("viewMode_plainText",content); - } else if (viewMode.equals("iframe")) { - prop.put("viewMode",VIEW_MODE_AS_IFRAME); - prop.put("viewMode_url",comp.url().toNormalform()); - } else if (viewMode.equals("parsed") || viewMode.equals("sentences")) { - // parsing the resource content - plasmaParserDocument document = null; - try { - document = sb.snippetCache.parseDocument(comp.url(), resourceLength, resource,resInfo); - if (document == null) { - prop.put("error",5); - prop.put("error_errorText","Unknown error"); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; + prop.put("error", 4); + prop.put("error_errorText", e.getMessage()); + prop.put("viewMode", VIEW_MODE_NO_TEXT); + return prop; + } + + if (viewMode.equals("plain")) { + + // TODO: how to handle very large files here ? + String content; + try { + content = new String(serverFileUtils.read(resource), "UTF-8"); + } catch (Exception e) { + prop.put("error", 4); + prop.put("error_errorText", e.getMessage()); + prop.put("viewMode", VIEW_MODE_NO_TEXT); + return prop; + } finally { + if (resource != null) + try { + resource.close(); + } catch (Exception e) { + /* ignore this */ } - } catch (ParserException e) { - prop.put("error",5); - prop.put("error_errorText",e.getMessage()); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; - } finally { - if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */} + } + + content = content.replaceAll("<", "<").replaceAll(">", ">") + .replaceAll("\"", """).replaceAll("\n", "
") + .replaceAll("\t", "    "); + + prop.put("error", 0); + prop.put("viewMode", VIEW_MODE_AS_PLAIN_TEXT); + prop.put("viewMode_plainText", content); + } else if (viewMode.equals("iframe")) { + prop.put("viewMode", VIEW_MODE_AS_IFRAME); + prop.put("viewMode_url", url.toNormalform()); + } else if (viewMode.equals("parsed") || viewMode.equals("sentences")) { + // parsing the resource content + plasmaParserDocument document = null; + try { + document = sb.snippetCache.parseDocument(url, resourceLength, resource, resInfo); + if (document == null) { + prop.put("error", 5); + prop.put("error_errorText", "Unknown error"); + prop.put("viewMode", VIEW_MODE_NO_TEXT); + return prop; } - - resMime = document.getMimeType(); - - if (viewMode.equals("parsed")) { - String content = new String(document.getTextBytes()); - content = wikiCode.replaceHTML(content); //added by Marc Nause - content = content.replaceAll("\n","
") - .replaceAll("\t","    "); - - prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT); - prop.put("viewMode_parsedText",content); - } else { - prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES); - final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset - - boolean dark = true; - int i = 0; - if (sentences != null) while (sentences.hasMoreElements()) { - String currentSentence = wikiCode.replaceHTML((String) sentences.nextElement()); + } catch (ParserException e) { + prop.put("error", 5); + prop.put("error_errorText", e.getMessage()); + prop.put("viewMode", VIEW_MODE_NO_TEXT); + return prop; + } finally { + if (resource != null) + try { + resource.close(); + } catch (Exception e) { + /* ignore this */ + } + } + + resMime = document.getMimeType(); + + if (viewMode.equals("parsed")) { + String content = new String(document.getTextBytes()); + content = wikiCode.replaceHTML(content); // added by Marc Nause + content = content.replaceAll("\n", "
").replaceAll("\t", "    "); + + prop.put("viewMode", VIEW_MODE_AS_PARSED_TEXT); + prop.put("viewMode_parsedText", content); + } else { + prop.put("viewMode", VIEW_MODE_AS_PARSED_SENTENCES); + final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset + + boolean dark = true; + int i = 0; + if (sentences != null) + while (sentences.hasMoreElements()) { + String currentSentence = wikiCode + .replaceHTML((String) sentences.nextElement()); // Search word highlighting - String words = post.get("words",null); + String words = post.get("words", null); if (words != null) { try { - words = URLDecoder.decode(words,"UTF-8"); - } catch (UnsupportedEncodingException e) {} - - String[] wordArray = words.substring(1,words.length()-1).split(","); - for (int j=0; j < wordArray.length; j++) { - String currentWord = wordArray[j].trim(); - currentSentence = currentSentence.replaceAll(currentWord, - "" + currentWord + ""); + words = URLDecoder.decode(words, "UTF-8"); + } catch (UnsupportedEncodingException e) { + } + + String[] wordArray = words.substring(1, + words.length() - 1).split(","); + for (int j = 0; j < wordArray.length; j++) { + String currentWord = wordArray[j].trim(); + currentSentence = currentSentence.replaceAll( + currentWord, + "" + currentWord + + ""); } } - prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1)); - prop.put("viewMode_sentences_" + i + "_text",currentSentence); - prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark; + prop.put("viewMode_sentences_" + i + "_nr", Integer.toString(i + 1)); + prop.put("viewMode_sentences_" + i + "_text", currentSentence); + prop.put("viewMode_sentences_" + i + "_dark", ((dark) ? 
1 : 0)); + dark = !dark; i++; } - prop.put("viewMode_sentences", i); + prop.put("viewMode_sentences", i); - } - if (document != null) document.close(); } - prop.put("error", 0); - prop.put("error_url", comp.url().toNormalform()); - prop.put("error_hash", urlHash); - prop.put("error_wordCount", Integer.toString(urlEntry.wordCount())); - prop.put("error_desc", comp.descr()); - prop.put("error_size", urlEntry.size()); - prop.put("error_mimeType", resMime); - - return prop; + if (document != null) document.close(); + } + prop.put("error", 0); + prop.put("error_url", url.toNormalform()); + prop.put("error_hash", urlHash); + prop.put("error_wordCount", Integer.toString(wordCount)); + prop.put("error_desc", descr); + prop.put("error_size", size); + prop.put("error_mimeType", resMime); + + return prop; } } diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 7cc7ef885..66a40fe0c 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -94,7 +94,7 @@ public final class search { final String prefer = post.get("prefer", ""); final String filter = post.get("filter", ".*"); final boolean includesnippet = post.get("includesnippet", "false").equals("true"); - final kelondroBitfield constraint = kelondroBitfield(4, post.get("constraint", "______")); + final kelondroBitfield constraint = new kelondroBitfield(4, post.get("constraint", "______")); // final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers // Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time @@ -301,9 +301,4 @@ public final class search { return prop; } - private static kelondroBitfield kelondroBitfield(int i, String string) { - // TODO Auto-generated method stub - return null; - } - } \ No newline at end of file diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index 352e2b5b9..048f6067c 
100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -250,7 +250,7 @@ public final class plasmaCondenser { sievedWordsEnum wordenum = new sievedWordsEnum(is, wordminsize); while (wordenum.hasMoreElements()) { word = ((String) wordenum.nextElement()).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars? - // System.out.println("PARSED-WORD " + word); + //System.out.println("PARSED-WORD " + word); // distinguish punctuation and words wordlen = word.length(); @@ -740,6 +740,7 @@ public final class plasmaCondenser { private Object nextElement0() { try { String s = readSentence(raf); + //System.out.println(" SENTENCE='" + s + "'"); // DEBUG if (s == null) { raf.close(); return null; @@ -782,7 +783,10 @@ public final class plasmaCondenser { // find sentence end for (;;) { nextChar = reader.read(); - if (nextChar < 0) return null; + //System.out.print((char) nextChar); // DEBUG + if (nextChar < 0) { + if (s.length() == 0) return null; else break; + } c = (char) nextChar; s.append(c); if (htmlFilterContentScraper.punctuation(c)) break;