From 8e7215475b2ed79837c4fd45911c621870f4993b Mon Sep 17 00:00:00 2001
From: orbiter <orbiter@6c8d7289-2bf4-0310-a012-ef5d649a1542>
Date: Thu, 23 Nov 2006 15:47:19 +0000
Subject: [PATCH] - extended ViewFile to use it as a debugging tool: you can now
 use the post-parameter url to submit a URL directly - fixed some bugs in
 text parser (not all parts had been analysed) - fixed a bug in remote search
 interface (could not handle constraints)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3001 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/ViewFile.java                         | 395 ++++++++++---------
 htroot/yacy/search.java                      |   7 +-
 source/de/anomic/plasma/plasmaCondenser.java |   8 +-
 3 files changed, 225 insertions(+), 185 deletions(-)

diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index 47d428683..5077a0d06 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -47,6 +47,7 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.UnsupportedEncodingException;
+import java.net.MalformedURLException;
 import java.net.URLDecoder;
 import java.net.URLEncoder;
 import java.util.Enumeration;
@@ -55,6 +56,7 @@ import de.anomic.data.wikiCode;
 import de.anomic.http.httpHeader;
 import de.anomic.http.httpc;
 import de.anomic.index.indexURLEntry;
+import de.anomic.net.URL;
 import de.anomic.plasma.plasmaHTCache;
 import de.anomic.plasma.plasmaParserDocument;
 import de.anomic.plasma.plasmaSwitchboard;
@@ -87,24 +89,21 @@ public class ViewFile {
         serverObjects prop = new serverObjects();
         plasmaSwitchboard sb = (plasmaSwitchboard)env;     
 
-        if (post != null && post.containsKey("words"))
-            try {
-                prop.put("error_words",URLEncoder.encode((String) post.get("words"), "UTF-8"));
-            } catch (UnsupportedEncodingException e1) {
-                // ignore this. this should not occure
-            }
-
-
-            // getting the url hash from which the content should be loaded
-            String urlHash = post.get("urlHash","");       
-            if (urlHash.equals("")) {
-                prop.put("error",1);
-                prop.put("viewMode",VIEW_MODE_NO_TEXT);
-                return prop;
-            }
-
-            String viewMode = post.get("viewMode","sentences");
-
+        if (post != null && post.containsKey("words")) try {
+            prop.put("error_words",URLEncoder.encode((String) post.get("words"), "UTF-8"));
+        } catch (UnsupportedEncodingException e1) {
+            // ignore this. this should not occur
+        }
+
+        String viewMode = post.get("viewMode","sentences");
+        URL url = null;
+        String descr = "";
+        int wordCount = 0;
+        int size = 0;    
+        
+        // getting the url hash from which the content should be loaded
+        String urlHash = post.get("urlHash","");       
+        if (urlHash.length() > 0) {
             // getting the urlEntry that belongs to the url hash
             indexURLEntry urlEntry = null;
             urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
@@ -113,196 +112,238 @@ public class ViewFile {
                 prop.put("viewMode",VIEW_MODE_NO_TEXT);
                 return prop;
             }            
-
-            // gettin the url that belongs to the entry
+            
+                // getting the url that belongs to the entry
             indexURLEntry.Components comp = urlEntry.comp();
             if ((comp == null) || (comp.url() == null)) {
-                prop.put("error",3);
-                prop.put("viewMode",VIEW_MODE_NO_TEXT);
+                prop.put("error", 3);
+                prop.put("viewMode", VIEW_MODE_NO_TEXT);
                 return prop;
             }
+            url = comp.url();
+            descr = comp.descr();
+            wordCount = urlEntry.wordCount(); // keep the count; it is reported as error_wordCount below
+            size = urlEntry.size();
+        }
+
+        // alternatively, get the url simply from a url String
+        // this can be used as a simple tool to test the text parser
+        String urlString = post.get("url", "");
+        if (urlString.length() > 0) try {
+            url = new URL(urlString);
+        } catch (MalformedURLException e) {}
+        
+        
+        if (url == null) {
+            prop.put("error", 1);
+            prop.put("viewMode", VIEW_MODE_NO_TEXT);
+            return prop;
+        }
+
+        // loading the resource content as byte array
+        InputStream resource = null;
+        long resourceLength = -1;
+        IResourceInfo resInfo = null;
+        String resMime = null;
+        try {
+            // trying to load the resource body
+            resource = sb.cacheManager.getResourceContentStream(url);
+            resourceLength = sb.cacheManager.getResourceContentLength(url);
+
+            // if the resource body was not cached we try to load it from web
+            if (resource == null) {
+                plasmaHTCache.Entry entry = null;
+                try {
+                    entry = sb.snippetCache.loadResourceFromWeb(url, 5000, false);
+                } catch (plasmaCrawlerException e) {
+                    prop.put("error", 4);
+                    prop.put("error_errorText", e.getMessage());
+                    prop.put("viewMode", VIEW_MODE_NO_TEXT);
+                    return prop;
+                }
 
-            // loading the resource content as byte array
-            InputStream resource = null;
-            long resourceLength = -1;
-            IResourceInfo resInfo = null;
-            String resMime = null;
-            try {
-                // trying to load the resource body
-                resource = sb.cacheManager.getResourceContentStream(comp.url());
-                resourceLength = sb.cacheManager.getResourceContentLength(comp.url());
+                if (entry != null) {
+                    resInfo = entry.getDocumentInfo();
+                    resource = sb.cacheManager.getResourceContentStream(url);
+                    resourceLength = sb.cacheManager.getResourceContentLength(url);
+                }
 
-                // if the resource body was not cached we try to load it from web
                 if (resource == null) {
-                    plasmaHTCache.Entry entry = null;
-                    try {
-                        entry = sb.snippetCache.loadResourceFromWeb(comp.url(), 5000, false);
-                    } catch (plasmaCrawlerException e) {
-                        prop.put("error",4);
-                        prop.put("error_errorText",e.getMessage());
-                        prop.put("viewMode",VIEW_MODE_NO_TEXT);                        
-                        return prop;
-                    }
+                    prop.put("error", 4);
+                    prop.put("error_errorText", "No resource available");
+                    prop.put("viewMode", VIEW_MODE_NO_TEXT);
+                    return prop;
+                }
+            }
 
-                    if (entry != null) {
-                        resInfo = entry.getDocumentInfo();
-                        resource = sb.cacheManager.getResourceContentStream(comp.url());
-                        resourceLength = sb.cacheManager.getResourceContentLength(comp.url());
-                    }
+            // try to load resource metadata
+            if (resInfo == null) {
 
-                    if (resource == null) {
-                        prop.put("error",4);
-                        prop.put("error_errorText","No resource available");
-                        prop.put("viewMode",VIEW_MODE_NO_TEXT);
-                        return prop;
-                    } 
+                // try to load the metadata from cache
+                try {
+                    resInfo = sb.cacheManager.loadResourceInfo(url);
+                } catch (Exception e) {
+                    /* ignore this */
                 }
 
-                // try to load resource metadata
+                // if the metadata was not cached, try to load it from the web
                 if (resInfo == null) {
+                    String protocol = url.getProtocol();
+                    if (!((protocol.equals("http") || protocol.equals("https")))) {
+                        prop.put("error", 6);
+                        prop.put("viewMode", VIEW_MODE_NO_TEXT);
+                        return prop;
+                    }
 
-                    // try to load the metadata from cache
+                    httpHeader responseHeader = httpc.whead(url, url.getHost(), 5000, null, null, sb.remoteProxyConfig);
+                    if (responseHeader == null) {
+                        prop.put("error", 4);
+                        prop.put("error_errorText", "Unable to load resource metadata.");
+                        prop.put("viewMode", VIEW_MODE_NO_TEXT);
+                        return prop;
+                    }
                     try {
-                        resInfo = sb.cacheManager.loadResourceInfo(comp.url());
-                    } catch (Exception e) { /* ignore this */}
-
-                    // if the metadata where not cached try to load it from web
-                    if (resInfo == null) {
-                        String protocol = comp.url().getProtocol();
-                        if (!((protocol.equals("http") || protocol.equals("https")))) {
-                            prop.put("error",6);
-                            prop.put("viewMode",VIEW_MODE_NO_TEXT);
-                            return prop;                                
-                        }
-
-                        httpHeader responseHeader = httpc.whead(comp.url(),comp.url().getHost(),5000,null,null,sb.remoteProxyConfig);
-                        if (responseHeader == null) {
-                            prop.put("error",4);
-                            prop.put("error_errorText","Unable to load resource metadata.");
-                            prop.put("viewMode",VIEW_MODE_NO_TEXT);
-                            return prop;
-                        } 
-                        try {
-                            resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(comp.url(), responseHeader);
-                        } catch (Exception e) {
-                            prop.put("error",4);
-                            prop.put("error_errorText",e.getMessage());
-                            prop.put("viewMode",VIEW_MODE_NO_TEXT);
-                            return prop;
-                        }
-                        resMime = responseHeader.mime();
+                        resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, responseHeader);
+                    } catch (Exception e) {
+                        prop.put("error", 4);
+                        prop.put("error_errorText", e.getMessage());
+                        prop.put("viewMode", VIEW_MODE_NO_TEXT);
+                        return prop;
                     }
-                } else {
-                    resMime = resInfo.getMimeType();
+                    resMime = responseHeader.mime();
                 }
-            } catch (IOException e) {
-                if (resource != null) try { resource.close(); } catch (Exception ex) {/* ignore this */}
-                prop.put("error",4);
-                prop.put("error_errorText",e.getMessage());
-                prop.put("viewMode",VIEW_MODE_NO_TEXT);
-                return prop; 
-            } 
-            
-            if (viewMode.equals("plain")) {
-                
-                // TODO: how to handle very large files here ?
-                String content;
+            } else {
+                resMime = resInfo.getMimeType();
+            }
+        } catch (IOException e) {
+            if (resource != null)
                 try {
-                    content = new String(serverFileUtils.read(resource),"UTF-8");
-                } catch (Exception e) {
-                    prop.put("error",4);
-                    prop.put("error_errorText",e.getMessage());
-                    prop.put("viewMode",VIEW_MODE_NO_TEXT);
-                    return prop;                     
-                } finally {
-                    if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */}
+                    resource.close();
+                } catch (Exception ex) {
+                    /* ignore this */
                 }
-                
-                content = content.replaceAll("<","&lt;")
-                .replaceAll(">","&gt;")
-                .replaceAll("\"","&quot;")
-                .replaceAll("\n","<br>")
-                .replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");
-
-                prop.put("error",0);
-                prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT);
-                prop.put("viewMode_plainText",content); 
-            } else if (viewMode.equals("iframe")) {
-                prop.put("viewMode",VIEW_MODE_AS_IFRAME);
-                prop.put("viewMode_url",comp.url().toNormalform());                
-            } else if (viewMode.equals("parsed") || viewMode.equals("sentences")) {
-                // parsing the resource content
-                plasmaParserDocument document = null;
-                try {
-                    document = sb.snippetCache.parseDocument(comp.url(), resourceLength, resource,resInfo);
-                    if (document == null) {
-                        prop.put("error",5);
-                        prop.put("error_errorText","Unknown error");
-                        prop.put("viewMode",VIEW_MODE_NO_TEXT);
-                        return prop;                
+            prop.put("error", 4);
+            prop.put("error_errorText", e.getMessage());
+            prop.put("viewMode", VIEW_MODE_NO_TEXT);
+            return prop;
+        }
+
+        if (viewMode.equals("plain")) {
+
+            // TODO: how to handle very large files here ?
+            String content;
+            try {
+                content = new String(serverFileUtils.read(resource), "UTF-8");
+            } catch (Exception e) {
+                prop.put("error", 4);
+                prop.put("error_errorText", e.getMessage());
+                prop.put("viewMode", VIEW_MODE_NO_TEXT);
+                return prop;
+            } finally {
+                if (resource != null)
+                    try {
+                        resource.close();
+                    } catch (Exception e) {
+                        /* ignore this */
                     }
-                } catch (ParserException e) {
-                    prop.put("error",5);
-                    prop.put("error_errorText",e.getMessage());
-                    prop.put("viewMode",VIEW_MODE_NO_TEXT);
-                    return prop;     
-                } finally {
-                    if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */}
+            }
+
+            content = content.replaceAll("<", "&lt;").replaceAll(">", "&gt;")
+                    .replaceAll("\"", "&quot;").replaceAll("\n", "<br>")
+                    .replaceAll("\t", "&nbsp;&nbsp;&nbsp;&nbsp;");
+
+            prop.put("error", 0);
+            prop.put("viewMode", VIEW_MODE_AS_PLAIN_TEXT);
+            prop.put("viewMode_plainText", content);
+        } else if (viewMode.equals("iframe")) {
+            prop.put("viewMode", VIEW_MODE_AS_IFRAME);
+            prop.put("viewMode_url", url.toNormalform());
+        } else if (viewMode.equals("parsed") || viewMode.equals("sentences")) {
+            // parsing the resource content
+            plasmaParserDocument document = null;
+            try {
+                document = sb.snippetCache.parseDocument(url, resourceLength, resource, resInfo);
+                if (document == null) {
+                    prop.put("error", 5);
+                    prop.put("error_errorText", "Unknown error");
+                    prop.put("viewMode", VIEW_MODE_NO_TEXT);
+                    return prop;
                 }
-                
-                resMime = document.getMimeType();
-
-                if (viewMode.equals("parsed")) {
-                    String content = new String(document.getTextBytes());
-                    content = wikiCode.replaceHTML(content); //added by Marc Nause
-                    content = content.replaceAll("\n","<br>")
-                    .replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");
-
-                    prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT);
-                    prop.put("viewMode_parsedText",content);
-                } else {
-                    prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES);
-                    final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
-
-                    boolean dark = true;
-                    int i = 0;
-                    if (sentences != null) while (sentences.hasMoreElements()) {
-                        String currentSentence = wikiCode.replaceHTML((String) sentences.nextElement());
+            } catch (ParserException e) {
+                prop.put("error", 5);
+                prop.put("error_errorText", e.getMessage());
+                prop.put("viewMode", VIEW_MODE_NO_TEXT);
+                return prop;
+            } finally {
+                if (resource != null)
+                    try {
+                        resource.close();
+                    } catch (Exception e) {
+                        /* ignore this */
+                    }
+            }
+
+            resMime = document.getMimeType();
+
+            if (viewMode.equals("parsed")) {
+                String content = new String(document.getTextBytes());
+                content = wikiCode.replaceHTML(content); // added by Marc Nause
+                content = content.replaceAll("\n", "<br>").replaceAll("\t", "&nbsp;&nbsp;&nbsp;&nbsp;");
+
+                prop.put("viewMode", VIEW_MODE_AS_PARSED_TEXT);
+                prop.put("viewMode_parsedText", content);
+            } else {
+                prop.put("viewMode", VIEW_MODE_AS_PARSED_SENTENCES);
+                final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
+
+                boolean dark = true;
+                int i = 0;
+                if (sentences != null)
+                    while (sentences.hasMoreElements()) {
+                        String currentSentence = wikiCode
+                                .replaceHTML((String) sentences.nextElement());
 
                         // Search word highlighting
-                        String words = post.get("words",null);
+                        String words = post.get("words", null);
                         if (words != null) {
                             try {
-                                words = URLDecoder.decode(words,"UTF-8");
-                            } catch (UnsupportedEncodingException e) {}
-
-                            String[] wordArray = words.substring(1,words.length()-1).split(",");
-                            for (int j=0; j < wordArray.length; j++) {
-                                String currentWord = wordArray[j].trim(); 
-                                currentSentence = currentSentence.replaceAll(currentWord,
-                                        "<b style=\"color: black; background-color: rgb(" + highlightingColors[j%6] + ");\">" + currentWord + "</b>");
+                                words = URLDecoder.decode(words, "UTF-8");
+                            } catch (UnsupportedEncodingException e) {
+                            }
+
+                            String[] wordArray = words.substring(1,
+                                    words.length() - 1).split(",");
+                            for (int j = 0; j < wordArray.length; j++) {
+                                String currentWord = wordArray[j].trim();
+                                currentSentence = currentSentence.replaceAll(
+                                        currentWord,
+                                        "<b style=\"color: black; background-color: rgb("
+                                                + highlightingColors[j % 6]
+                                                + ");\">" + currentWord
+                                                + "</b>");
                             }
                         }
 
-                        prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1)); 
-                        prop.put("viewMode_sentences_" + i + "_text",currentSentence);   
-                        prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark;
+                        prop.put("viewMode_sentences_" + i + "_nr", Integer.toString(i + 1));
+                        prop.put("viewMode_sentences_" + i + "_text", currentSentence);
+                        prop.put("viewMode_sentences_" + i + "_dark", ((dark) ? 1 : 0));
+                        dark = !dark;
                         i++;
                     }
-                    prop.put("viewMode_sentences", i);
+                prop.put("viewMode_sentences", i);
 
-                } 
-                if (document != null) document.close();
             }
-            prop.put("error", 0);
-            prop.put("error_url", comp.url().toNormalform());                
-            prop.put("error_hash", urlHash);
-            prop.put("error_wordCount", Integer.toString(urlEntry.wordCount()));
-            prop.put("error_desc", comp.descr());
-            prop.put("error_size", urlEntry.size());
-            prop.put("error_mimeType", resMime);
-
-            return prop;
+            if (document != null) document.close();
+        }
+        prop.put("error", 0);
+        prop.put("error_url", url.toNormalform());
+        prop.put("error_hash", urlHash);
+        prop.put("error_wordCount", Integer.toString(wordCount));
+        prop.put("error_desc", descr);
+        prop.put("error_size", size);
+        prop.put("error_mimeType", resMime);
+
+        return prop;
     }
 
 }
diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java
index 7cc7ef885..66a40fe0c 100644
--- a/htroot/yacy/search.java
+++ b/htroot/yacy/search.java
@@ -94,7 +94,7 @@ public final class search {
         final String  prefer = post.get("prefer", "");
         final String  filter = post.get("filter", ".*");
         final boolean includesnippet = post.get("includesnippet", "false").equals("true");
-        final kelondroBitfield constraint = kelondroBitfield(4, post.get("constraint", "______"));
+        final kelondroBitfield constraint = new kelondroBitfield(4, post.get("constraint", "______"));
 //      final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers
 //      Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME));        // read remote time
 
@@ -301,9 +301,4 @@ public final class search {
         return prop;
     }
 
-    private static kelondroBitfield kelondroBitfield(int i, String string) {
-        // TODO Auto-generated method stub
-        return null;
-    }
-
 }
\ No newline at end of file
diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java
index 352e2b5b9..048f6067c 100644
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@@ -250,7 +250,7 @@ public final class plasmaCondenser {
         sievedWordsEnum wordenum = new sievedWordsEnum(is, wordminsize);
         while (wordenum.hasMoreElements()) {
             word = ((String) wordenum.nextElement()).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars?
-            // System.out.println("PARSED-WORD " + word);
+            //System.out.println("PARSED-WORD " + word);
             
             // distinguish punctuation and words
             wordlen = word.length();
@@ -740,6 +740,7 @@ public final class plasmaCondenser {
         private Object nextElement0() {
             try {
                 String s = readSentence(raf);
+                //System.out.println(" SENTENCE='" + s + "'"); // DEBUG 
                 if (s == null) {
                     raf.close();
                     return null;
@@ -782,7 +783,10 @@ public final class plasmaCondenser {
         // find sentence end
         for (;;) {
             nextChar = reader.read();
-            if (nextChar < 0) return null;
+            //System.out.print((char) nextChar); // DEBUG    
+            if (nextChar < 0) {
+                if (s.length() == 0) return null; else break;
+            }
             c = (char) nextChar;
             s.append(c);
             if (htmlFilterContentScraper.punctuation(c)) break;