diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index 47d428683..5077a0d06 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -47,6 +47,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
+import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.Enumeration;
@@ -55,6 +56,7 @@ import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.index.indexURLEntry;
+import de.anomic.net.URL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
@@ -87,24 +89,21 @@ public class ViewFile {
serverObjects prop = new serverObjects();
plasmaSwitchboard sb = (plasmaSwitchboard)env;
- if (post != null && post.containsKey("words"))
- try {
- prop.put("error_words",URLEncoder.encode((String) post.get("words"), "UTF-8"));
- } catch (UnsupportedEncodingException e1) {
- // ignore this. this should not occure
- }
-
-
- // getting the url hash from which the content should be loaded
- String urlHash = post.get("urlHash","");
- if (urlHash.equals("")) {
- prop.put("error",1);
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
- }
-
- String viewMode = post.get("viewMode","sentences");
-
+ if (post != null && post.containsKey("words")) try {
+ prop.put("error_words",URLEncoder.encode((String) post.get("words"), "UTF-8"));
+ } catch (UnsupportedEncodingException e1) {
+ // ignore this; it should not occur, since UTF-8 is a charset every Java platform must support
+ }
+
+ String viewMode = post.get("viewMode","sentences");
+ URL url = null;
+ String descr = "";
+ int wordCount = 0;
+ int size = 0;
+
+ // getting the url hash from which the content should be loaded
+ String urlHash = post.get("urlHash","");
+ if (urlHash.length() > 0) {
// getting the urlEntry that belongs to the url hash
indexURLEntry urlEntry = null;
urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
@@ -113,196 +112,238 @@ public class ViewFile {
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
-
- // gettin the url that belongs to the entry
+
+ // getting the url that belongs to the entry
indexURLEntry.Components comp = urlEntry.comp();
if ((comp == null) || (comp.url() == null)) {
- prop.put("error",3);
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
+ prop.put("error", 3);
+ prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
+ url = comp.url(); // remember the entry's data: urlEntry and comp are scoped to this if-block
+ descr = comp.descr();
+ wordCount = urlEntry.wordCount(); // must be assigned: the value is reported as "error_wordCount" below and was previously discarded (always 0)
+ size = urlEntry.size();
+ }
+
+ // alternatively, get the url simply from a url String
+ // this can be used as a simple tool to test the text parser
+ String urlString = post.get("url", "");
+ if (urlString.length() > 0) try {
+ url = new URL(urlString); // a given "url" parameter overrides the url taken from the urlHash entry
+ } catch (MalformedURLException e) {} // deliberately ignored: url keeps its previous value; if that is still null, the check below answers with error 1
+
+
+ if (url == null) {
+ prop.put("error", 1);
+ prop.put("viewMode", VIEW_MODE_NO_TEXT);
+ return prop;
+ }
+
+ // loading the resource content as byte array
+ InputStream resource = null;
+ long resourceLength = -1;
+ IResourceInfo resInfo = null;
+ String resMime = null;
+ try {
+ // trying to load the resource body
+ resource = sb.cacheManager.getResourceContentStream(url);
+ resourceLength = sb.cacheManager.getResourceContentLength(url);
+
+ // if the resource body was not cached we try to load it from web
+ if (resource == null) {
+ plasmaHTCache.Entry entry = null;
+ try {
+ entry = sb.snippetCache.loadResourceFromWeb(url, 5000, false);
+ } catch (plasmaCrawlerException e) {
+ prop.put("error", 4);
+ prop.put("error_errorText", e.getMessage());
+ prop.put("viewMode", VIEW_MODE_NO_TEXT);
+ return prop;
+ }
- // loading the resource content as byte array
- InputStream resource = null;
- long resourceLength = -1;
- IResourceInfo resInfo = null;
- String resMime = null;
- try {
- // trying to load the resource body
- resource = sb.cacheManager.getResourceContentStream(comp.url());
- resourceLength = sb.cacheManager.getResourceContentLength(comp.url());
+ if (entry != null) {
+ resInfo = entry.getDocumentInfo();
+ resource = sb.cacheManager.getResourceContentStream(url);
+ resourceLength = sb.cacheManager.getResourceContentLength(url);
+ }
- // if the resource body was not cached we try to load it from web
if (resource == null) {
- plasmaHTCache.Entry entry = null;
- try {
- entry = sb.snippetCache.loadResourceFromWeb(comp.url(), 5000, false);
- } catch (plasmaCrawlerException e) {
- prop.put("error",4);
- prop.put("error_errorText",e.getMessage());
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
- }
+ prop.put("error", 4);
+ prop.put("error_errorText", "No resource available");
+ prop.put("viewMode", VIEW_MODE_NO_TEXT);
+ return prop;
+ }
+ }
- if (entry != null) {
- resInfo = entry.getDocumentInfo();
- resource = sb.cacheManager.getResourceContentStream(comp.url());
- resourceLength = sb.cacheManager.getResourceContentLength(comp.url());
- }
+ // try to load resource metadata
+ if (resInfo == null) {
- if (resource == null) {
- prop.put("error",4);
- prop.put("error_errorText","No resource available");
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
- }
+ // try to load the metadata from cache
+ try {
+ resInfo = sb.cacheManager.loadResourceInfo(url);
+ } catch (Exception e) {
+ /* ignore this */
}
- // try to load resource metadata
+ // if the metadata was not cached, try to load it from the web
if (resInfo == null) {
+ String protocol = url.getProtocol();
+ if (!((protocol.equals("http") || protocol.equals("https")))) {
+ prop.put("error", 6);
+ prop.put("viewMode", VIEW_MODE_NO_TEXT);
+ return prop;
+ }
- // try to load the metadata from cache
+ httpHeader responseHeader = httpc.whead(url, url.getHost(), 5000, null, null, sb.remoteProxyConfig);
+ if (responseHeader == null) {
+ prop.put("error", 4);
+ prop.put("error_errorText", "Unable to load resource metadata.");
+ prop.put("viewMode", VIEW_MODE_NO_TEXT);
+ return prop;
+ }
try {
- resInfo = sb.cacheManager.loadResourceInfo(comp.url());
- } catch (Exception e) { /* ignore this */}
-
- // if the metadata where not cached try to load it from web
- if (resInfo == null) {
- String protocol = comp.url().getProtocol();
- if (!((protocol.equals("http") || protocol.equals("https")))) {
- prop.put("error",6);
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
- }
-
- httpHeader responseHeader = httpc.whead(comp.url(),comp.url().getHost(),5000,null,null,sb.remoteProxyConfig);
- if (responseHeader == null) {
- prop.put("error",4);
- prop.put("error_errorText","Unable to load resource metadata.");
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
- }
- try {
- resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(comp.url(), responseHeader);
- } catch (Exception e) {
- prop.put("error",4);
- prop.put("error_errorText",e.getMessage());
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
- }
- resMime = responseHeader.mime();
+ resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, responseHeader);
+ } catch (Exception e) {
+ prop.put("error", 4);
+ prop.put("error_errorText", e.getMessage());
+ prop.put("viewMode", VIEW_MODE_NO_TEXT);
+ return prop;
}
- } else {
- resMime = resInfo.getMimeType();
+ resMime = responseHeader.mime();
}
- } catch (IOException e) {
- if (resource != null) try { resource.close(); } catch (Exception ex) {/* ignore this */}
- prop.put("error",4);
- prop.put("error_errorText",e.getMessage());
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
- }
-
- if (viewMode.equals("plain")) {
-
- // TODO: how to handle very large files here ?
- String content;
+ } else {
+ resMime = resInfo.getMimeType();
+ }
+ } catch (IOException e) {
+ if (resource != null)
try {
- content = new String(serverFileUtils.read(resource),"UTF-8");
- } catch (Exception e) {
- prop.put("error",4);
- prop.put("error_errorText",e.getMessage());
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
- } finally {
- if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */}
+ resource.close();
+ } catch (Exception ex) {
+ /* ignore this */
}
-
- content = content.replaceAll("<","<")
- .replaceAll(">",">")
- .replaceAll("\"",""")
- .replaceAll("\n","
")
- .replaceAll("\t"," ");
-
- prop.put("error",0);
- prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT);
- prop.put("viewMode_plainText",content);
- } else if (viewMode.equals("iframe")) {
- prop.put("viewMode",VIEW_MODE_AS_IFRAME);
- prop.put("viewMode_url",comp.url().toNormalform());
- } else if (viewMode.equals("parsed") || viewMode.equals("sentences")) {
- // parsing the resource content
- plasmaParserDocument document = null;
- try {
- document = sb.snippetCache.parseDocument(comp.url(), resourceLength, resource,resInfo);
- if (document == null) {
- prop.put("error",5);
- prop.put("error_errorText","Unknown error");
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
+ prop.put("error", 4);
+ prop.put("error_errorText", e.getMessage());
+ prop.put("viewMode", VIEW_MODE_NO_TEXT);
+ return prop;
+ }
+
+ if (viewMode.equals("plain")) {
+
+ // TODO: how to handle very large files here ?
+ String content;
+ try {
+ content = new String(serverFileUtils.read(resource), "UTF-8");
+ } catch (Exception e) {
+ prop.put("error", 4);
+ prop.put("error_errorText", e.getMessage());
+ prop.put("viewMode", VIEW_MODE_NO_TEXT);
+ return prop;
+ } finally {
+ if (resource != null)
+ try {
+ resource.close();
+ } catch (Exception e) {
+ /* ignore this */
}
- } catch (ParserException e) {
- prop.put("error",5);
- prop.put("error_errorText",e.getMessage());
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
- } finally {
- if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */}
+ }
+
+ content = content.replaceAll("<", "&lt;").replaceAll(">", "&gt;") // escape HTML meta characters so the raw source is shown as text
+ .replaceAll("\"", "&quot;").replaceAll("\n", "<br>") // keep line breaks visible in the HTML output
+ .replaceAll("\t", " ");
+
+ prop.put("error", 0);
+ prop.put("viewMode", VIEW_MODE_AS_PLAIN_TEXT);
+ prop.put("viewMode_plainText", content);
+ } else if (viewMode.equals("iframe")) {
+ prop.put("viewMode", VIEW_MODE_AS_IFRAME);
+ prop.put("viewMode_url", url.toNormalform());
+ } else if (viewMode.equals("parsed") || viewMode.equals("sentences")) {
+ // parsing the resource content
+ plasmaParserDocument document = null;
+ try {
+ document = sb.snippetCache.parseDocument(url, resourceLength, resource, resInfo);
+ if (document == null) {
+ prop.put("error", 5);
+ prop.put("error_errorText", "Unknown error");
+ prop.put("viewMode", VIEW_MODE_NO_TEXT);
+ return prop;
}
-
- resMime = document.getMimeType();
-
- if (viewMode.equals("parsed")) {
- String content = new String(document.getTextBytes());
- content = wikiCode.replaceHTML(content); //added by Marc Nause
- content = content.replaceAll("\n","
")
- .replaceAll("\t"," ");
-
- prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT);
- prop.put("viewMode_parsedText",content);
- } else {
- prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES);
- final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
-
- boolean dark = true;
- int i = 0;
- if (sentences != null) while (sentences.hasMoreElements()) {
- String currentSentence = wikiCode.replaceHTML((String) sentences.nextElement());
+ } catch (ParserException e) {
+ prop.put("error", 5);
+ prop.put("error_errorText", e.getMessage());
+ prop.put("viewMode", VIEW_MODE_NO_TEXT);
+ return prop;
+ } finally {
+ if (resource != null)
+ try {
+ resource.close();
+ } catch (Exception e) {
+ /* ignore this */
+ }
+ }
+
+ resMime = document.getMimeType();
+
+ if (viewMode.equals("parsed")) {
+ String content = new String(document.getTextBytes());
+ content = wikiCode.replaceHTML(content); // added by Marc Nause
+ content = content.replaceAll("\n", "<br>").replaceAll("\t", " "); // keep line breaks visible in the HTML output
+
+ prop.put("viewMode", VIEW_MODE_AS_PARSED_TEXT);
+ prop.put("viewMode_parsedText", content);
+ } else {
+ prop.put("viewMode", VIEW_MODE_AS_PARSED_SENTENCES);
+ final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
+
+ boolean dark = true;
+ int i = 0;
+ if (sentences != null)
+ while (sentences.hasMoreElements()) {
+ String currentSentence = wikiCode
+ .replaceHTML((String) sentences.nextElement());
// Search word highlighting
- String words = post.get("words",null);
+ String words = post.get("words", null);
if (words != null) {
try {
- words = URLDecoder.decode(words,"UTF-8");
- } catch (UnsupportedEncodingException e) {}
-
- String[] wordArray = words.substring(1,words.length()-1).split(",");
- for (int j=0; j < wordArray.length; j++) {
- String currentWord = wordArray[j].trim();
- currentSentence = currentSentence.replaceAll(currentWord,
- "" + currentWord + "");
+ words = URLDecoder.decode(words, "UTF-8");
+ } catch (UnsupportedEncodingException e) { // not expected for UTF-8; if it happens, the undecoded string is used as-is
+ }
+
+ String[] wordArray = words.substring(1,
+ words.length() - 1).split(",");
+ for (int j = 0; j < wordArray.length; j++) {
+ String currentWord = wordArray[j].trim();
+ currentSentence = currentSentence.replaceAll(
+ currentWord,
+ "" + currentWord
+ + "");
}
}
- prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1));
- prop.put("viewMode_sentences_" + i + "_text",currentSentence);
- prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark;
+ prop.put("viewMode_sentences_" + i + "_nr", Integer.toString(i + 1));
+ prop.put("viewMode_sentences_" + i + "_text", currentSentence);
+ prop.put("viewMode_sentences_" + i + "_dark", ((dark) ? 1 : 0));
+ dark = !dark;
i++;
}
- prop.put("viewMode_sentences", i);
+ prop.put("viewMode_sentences", i);
- }
- if (document != null) document.close();
}
- prop.put("error", 0);
- prop.put("error_url", comp.url().toNormalform());
- prop.put("error_hash", urlHash);
- prop.put("error_wordCount", Integer.toString(urlEntry.wordCount()));
- prop.put("error_desc", comp.descr());
- prop.put("error_size", urlEntry.size());
- prop.put("error_mimeType", resMime);
-
- return prop;
+ if (document != null) document.close();
+ }
+ prop.put("error", 0);
+ prop.put("error_url", url.toNormalform());
+ prop.put("error_hash", urlHash);
+ prop.put("error_wordCount", Integer.toString(wordCount));
+ prop.put("error_desc", descr);
+ prop.put("error_size", size);
+ prop.put("error_mimeType", resMime);
+
+ return prop;
}
}
diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java
index 7cc7ef885..66a40fe0c 100644
--- a/htroot/yacy/search.java
+++ b/htroot/yacy/search.java
@@ -94,7 +94,7 @@ public final class search {
final String prefer = post.get("prefer", "");
final String filter = post.get("filter", ".*");
final boolean includesnippet = post.get("includesnippet", "false").equals("true");
- final kelondroBitfield constraint = kelondroBitfield(4, post.get("constraint", "______"));
+ final kelondroBitfield constraint = new kelondroBitfield(4, post.get("constraint", "______"));
// final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers
// Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time
@@ -301,9 +301,4 @@ public final class search {
return prop;
}
- private static kelondroBitfield kelondroBitfield(int i, String string) {
- // TODO Auto-generated method stub
- return null;
- }
-
}
\ No newline at end of file
diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java
index 352e2b5b9..048f6067c 100644
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@@ -250,7 +250,7 @@ public final class plasmaCondenser {
sievedWordsEnum wordenum = new sievedWordsEnum(is, wordminsize);
while (wordenum.hasMoreElements()) {
word = ((String) wordenum.nextElement()).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars?
- // System.out.println("PARSED-WORD " + word);
+ //System.out.println("PARSED-WORD " + word);
// distinguish punctuation and words
wordlen = word.length();
@@ -740,6 +740,7 @@ public final class plasmaCondenser {
private Object nextElement0() {
try {
String s = readSentence(raf);
+ //System.out.println(" SENTENCE='" + s + "'"); // DEBUG
if (s == null) {
raf.close();
return null;
@@ -782,7 +783,10 @@ public final class plasmaCondenser {
// find sentence end
for (;;) {
nextChar = reader.read();
- if (nextChar < 0) return null;
+ //System.out.print((char) nextChar); // DEBUG
+ if (nextChar < 0) { // negative read result: presumably end of stream (confirm against the reader's contract)
+ if (s.length() == 0) return null; else break; // nothing buffered: report no sentence; otherwise stop scanning and keep the text read so far
+ }
c = (char) nextChar;
s.append(c);
if (htmlFilterContentScraper.punctuation(c)) break;