diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java
index f25acaa4e..96b7b0d15 100644
--- a/htroot/Bookmarks.java
+++ b/htroot/Bookmarks.java
@@ -51,11 +51,11 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
-import de.anomic.net.URL;
import de.anomic.data.bookmarksDB;
import de.anomic.data.listManager;
import de.anomic.data.bookmarksDB.Tag;
import de.anomic.http.httpHeader;
+import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
diff --git a/htroot/ViewFile.html b/htroot/ViewFile.html
index 87830d891..9cccb1fdc 100644
--- a/htroot/ViewFile.html
+++ b/htroot/ViewFile.html
@@ -53,9 +53,11 @@ Unable to find URL Entry in DB
::
Invalid URL
::
-Unable to download resource content.
+Unable to download resource content.
+#[errorText]#
::
-Unable to parse resource content.
+Unable to parse resource content.
+#[errorText]#
::
Unsupported protocol.
#(/error)#
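Note on the template mechanics: the #(error)# switch selects one of the "::"-separated cases by the numeric "error" property, and the new #[errorText]# placeholders are nested fields of the download and parser failure cases. A minimal sketch of the servlet side (property names as used in ViewFile.java below):

    prop.put("error", 4);                          // selects the "Unable to download" case
    prop.put("error_errorText", e.getMessage());   // fills the nested #[errorText]# field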
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index d28daa23a..ca76bf10a 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -57,6 +57,8 @@ import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo;
+import de.anomic.plasma.crawler.plasmaCrawlerException;
+import de.anomic.plasma.parser.ParserException;
import de.anomic.plasma.plasmaCrawlLURL.Entry;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -83,174 +85,185 @@ public class ViewFile {
serverObjects prop = new serverObjects();
plasmaSwitchboard sb = (plasmaSwitchboard)env;
-
-
- if (post.containsKey("words"))
+ if (post != null && post.containsKey("words"))
try {
prop.put("error_words",URLEncoder.encode((String) post.get("words"), "UTF-8"));
} catch (UnsupportedEncodingException e1) {
- // TODO Auto-generated catch block
- e1.printStackTrace();
+ // ignore this. this should not occur
}
- if (post != null) {
- // getting the url hash from which the content should be loaded
- String urlHash = post.get("urlHash","");
- if (urlHash.equals("")) {
- prop.put("error",1);
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
- }
- String viewMode = post.get("viewMode","sentences");
-
- // getting the urlEntry that belongs to the url hash
- Entry urlEntry = null;
- urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
- if (urlEntry == null) {
- prop.put("error",2);
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
- }
+ // getting the url hash from which the content should be loaded
+ String urlHash = post.get("urlHash","");
+ if (urlHash.equals("")) {
+ prop.put("error",1);
+ prop.put("viewMode",VIEW_MODE_NO_TEXT);
+ return prop;
+ }
- // gettin the url that belongs to the entry
- URL url = urlEntry.url();
- if (url == null) {
- prop.put("error",3);
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
- }
+ String viewMode = post.get("viewMode","sentences");
+
+ // getting the urlEntry that belongs to the url hash
+ Entry urlEntry = null;
+ urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
+ if (urlEntry == null) {
+ prop.put("error",2);
+ prop.put("viewMode",VIEW_MODE_NO_TEXT);
+ return prop;
+ }
+
+ // getting the url that belongs to the entry
+ URL url = urlEntry.url();
+ if (url == null) {
+ prop.put("error",3);
+ prop.put("viewMode",VIEW_MODE_NO_TEXT);
+ return prop;
+ }
+
+ // loading the resource content as byte array
+ byte[] resource = null;
+ IResourceInfo resInfo = null;
+ String resMime = null;
+ try {
+ // trying to load the resource body
+ resource = sb.cacheManager.loadResourceContent(url);
+
+ // if the resource body was not cached we try to load it from web
+ if (resource == null) {
+ plasmaHTCache.Entry entry = null;
+ try {
+ entry = sb.snippetCache.loadResourceFromWeb(url, 5000);
+ } catch (plasmaCrawlerException e) {
+ prop.put("error",4);
+ prop.put("error_errorText",e.getMessage());
+ prop.put("viewMode",VIEW_MODE_NO_TEXT);
+ return prop;
+ }
- // loading the resource content as byte array
- byte[] resource = null;
- IResourceInfo resInfo = null;
- String resMime = null;
- try {
- // trying to load the resource body
- resource = sb.cacheManager.loadResourceContent(url);
+ if (entry != null) {
+ resInfo = entry.getDocumentInfo();
+ resource = sb.cacheManager.loadResourceContent(url);
+ }
- // if the resource body was not cached we try to load it from web
if (resource == null) {
- plasmaHTCache.Entry entry = sb.snippetCache.loadResourceFromWeb(url, 5000);
+ prop.put("error",4);
+ prop.put("viewMode",VIEW_MODE_NO_TEXT);
+ return prop;
+ }
+ }
- if (entry != null) {
- resInfo = entry.getDocumentInfo();
- resource = sb.cacheManager.loadResourceContent(url);
+ // try to load resource metadata
+ if (resInfo == null) {
+
+ // try to load the metadata from cache
+ try {
+ resInfo = sb.cacheManager.loadResourceInfo(urlEntry.url());
+ } catch (Exception e) { /* ignore this */}
+
+ // if the metadata was not cached try to load it from web
+ if (resInfo == null) {
+ String protocol = url.getProtocol();
+ if (!((protocol.equals("http") || protocol.equals("https")))) {
+ prop.put("error",6);
+ prop.put("viewMode",VIEW_MODE_NO_TEXT);
+ return prop;
}
- if (resource == null) {
+ httpHeader responseHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig);
+ if (responseHeader == null) {
prop.put("error",4);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
+ resMime = responseHeader.mime();
}
+ } else {
+ resMime = resInfo.getMimeType();
+ }
+ } catch (IOException e) {
+ prop.put("error",4);
+ prop.put("viewMode",VIEW_MODE_NO_TEXT);
+ return prop;
+ }
+ if (viewMode.equals("plain")) {
+ String content = new String(resource);
+ content = content.replaceAll("<","&lt;")
+                  .replaceAll(">","&gt;")
+                  .replaceAll("\"","&quot;")
+                  .replaceAll("\n","<br>")
+                  .replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");
- // try to load resource metadata
- if (resInfo == null) {
-
- // try to load the metadata from cache
- try {
- resInfo = sb.cacheManager.loadResourceInfo(urlEntry.url());
- } catch (Exception e) { /* ignore this */}
-
- // if the metadata where not cached try to load it from web
- if (resInfo == null) {
- String protocol = url.getProtocol();
- if (!((protocol.equals("http") || protocol.equals("https")))) {
- prop.put("error",6);
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
- }
-
- httpHeader responseHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig);
- if (responseHeader == null) {
- prop.put("error",4);
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
- }
- resMime = responseHeader.mime();
- }
- } else {
- resMime = resInfo.getMimeType();
- }
- } catch (IOException e) {
- if (url == null) {
- prop.put("error",4);
- prop.put("viewMode",VIEW_MODE_NO_TEXT);
- return prop;
- }
- }
- if (viewMode.equals("plain")) {
- String content = new String(resource);
- content = content.replaceAll("<","&lt;")
-                  .replaceAll(">","&gt;")
-                  .replaceAll("\"","&quot;")
-                  .replaceAll("\n","<br>")
-                  .replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");
-
- prop.put("error",0);
- prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT);
- prop.put("viewMode_plainText",content);
- } else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("iframe")) {
- // parsing the resource content
- plasmaParserDocument document = sb.snippetCache.parseDocument(url, resource,resInfo);
+ prop.put("error",0);
+ prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT);
+ prop.put("viewMode_plainText",content);
+ } else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("iframe")) {
+ // parsing the resource content
+ plasmaParserDocument document = null;
+ try {
+ document = sb.snippetCache.parseDocument(url, resource,resInfo);
if (document == null) {
prop.put("error",5);
+ prop.put("error_errorText","Unknown error");
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;
}
- resMime = document.getMimeType();
-
- if (viewMode.equals("parsed")) {
- String content = new String(document.getText());
- content = wikiCode.replaceHTML(content); //added by Marc Nause
- content = content.replaceAll("\n","
")
- .replaceAll("\t"," ");
-
- prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT);
- prop.put("viewMode_parsedText",content);
- } else if (viewMode.equals("iframe")) {
- prop.put("viewMode",VIEW_MODE_AS_IFRAME);
- prop.put("viewMode_url",url.toString());
- } else {
- prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES);
- String[] sentences = document.getSentences();
-
- boolean dark = true;
- for (int i=0; i < sentences.length; i++) {
- String currentSentence = wikiCode.replaceHTML(sentences[i]);
-
- // Search word highlighting
- String words = post.get("words",null);
- if (words != null) {
- try {
- words = URLDecoder.decode(words,"UTF-8");
- } catch (UnsupportedEncodingException e) {}
-
- String[] wordArray = words.substring(1,words.length()-1).split(",");
- for (int j=0; j < wordArray.length; j++) {
- String currentWord = wordArray[j].trim();
- currentSentence = currentSentence.replaceAll(currentWord,
-     "<b>" + currentWord + "</b>");
- }
- }
+ } catch (ParserException e) {
+ prop.put("error",5);
+ prop.put("error_errorText",e.getMessage());
+ prop.put("viewMode",VIEW_MODE_NO_TEXT);
+ return prop;
+ }
+ resMime = document.getMimeType();
- prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1));
- prop.put("viewMode_sentences_" + i + "_text",currentSentence);
- prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark;
+ if (viewMode.equals("parsed")) {
+ String content = new String(document.getText());
+ content = wikiCode.replaceHTML(content); //added by Marc Nause
+ content = content.replaceAll("\n","
")
+ .replaceAll("\t"," ");
+
+ prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT);
+ prop.put("viewMode_parsedText",content);
+ } else if (viewMode.equals("iframe")) {
+ prop.put("viewMode",VIEW_MODE_AS_IFRAME);
+ prop.put("viewMode_url",url.toString());
+ } else {
+ prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES);
+ String[] sentences = document.getSentences();
+
+ boolean dark = true;
+ for (int i=0; i < sentences.length; i++) {
+ String currentSentence = wikiCode.replaceHTML(sentences[i]);
+
+ // Search word highlighting
+ String words = post.get("words",null);
+ if (words != null) {
+ try {
+ words = URLDecoder.decode(words,"UTF-8");
+ } catch (UnsupportedEncodingException e) {}
+
+ String[] wordArray = words.substring(1,words.length()-1).split(",");
+ for (int j=0; j < wordArray.length; j++) {
+ String currentWord = wordArray[j].trim();
+ currentSentence = currentSentence.replaceAll(currentWord,
+     "<b>" + currentWord + "</b>");
+ }
}
- prop.put("viewMode_sentences",sentences.length);
- }
- }
- prop.put("error",0);
- prop.put("error_url",url.toString());
- prop.put("error_hash",urlHash);
- prop.put("error_wordCount",Integer.toString(urlEntry.wordCount()));
- prop.put("error_desc",urlEntry.descr());
- prop.put("error_size",urlEntry.size());
- prop.put("error_mimeType",resMime);
- }
+ prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1));
+ prop.put("viewMode_sentences_" + i + "_text",currentSentence);
+ prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark;
+ }
+ prop.put("viewMode_sentences",sentences.length);
+
+ }
+ }
+ prop.put("error",0);
+ prop.put("error_url",url.toString());
+ prop.put("error_hash",urlHash);
+ prop.put("error_wordCount",Integer.toString(urlEntry.wordCount()));
+ prop.put("error_desc",urlEntry.descr());
+ prop.put("error_size",urlEntry.size());
+ prop.put("error_mimeType",resMime);
return prop;
}
diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java
index 3d56f9eb4..72adb9831 100644
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@@ -56,6 +56,7 @@ import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroNaturalOrder;
+import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSearchImages;
@@ -64,7 +65,6 @@ import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSearchTimingProfile;
import de.anomic.plasma.plasmaSwitchboard;
-import de.anomic.net.URL;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
import de.anomic.server.serverObjects;
@@ -192,13 +192,15 @@ public class yacysearch {
plasmaCrawlLURL.Entry urlentry = sb.urlPool.loadedURL.load(recommendHash, null);
if (urlentry != null) {
plasmaParserDocument document = sb.snippetCache.retrieveDocument(urlentry.url(), true);
- // create a news message
- HashMap map = new HashMap();
- map.put("url", urlentry.url().toNormalform().replace(',', '|'));
- map.put("title", urlentry.descr().replace(',', ' '));
- map.put("description", ((document == null) ? urlentry.descr() : document.getMainLongTitle()).replace(',', ' '));
- map.put("tags", ((document == null) ? "" : document.getKeywords(' ')));
- yacyCore.newsPool.publishMyNews(new yacyNewsRecord("stippadd", map));
+ if (document != null) {
+ // create a news message
+ HashMap map = new HashMap();
+ map.put("url", urlentry.url().toNormalform().replace(',', '|'));
+ map.put("title", urlentry.descr().replace(',', ' '));
+ map.put("description", ((document == null) ? urlentry.descr() : document.getMainLongTitle()).replace(',', ' '));
+ map.put("tags", ((document == null) ? "" : document.getKeywords(' ')));
+ yacyCore.newsPool.publishMyNews(new yacyNewsRecord("stippadd", map));
+ }
}
}
diff --git a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java
index 7889df481..2df4f4d4b 100644
--- a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java
+++ b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java
@@ -93,6 +93,8 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
protected plasmaCrawlProfile.entry profile;
protected boolean acceptAllContent;
+ protected String errorMessage;
+
/**
* The crawler thread pool
*/
@@ -186,6 +188,8 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
}
public void execute() {
+
+ plasmaHTCache.Entry loadedResource = null;
try {
// setting threadname
this.setName(plasmaCrawlWorker.threadBaseName + "_" + this.url);
@@ -194,15 +198,23 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
init();
// loading resource
- plasmaHTCache.Entry resource = load();
+ loadedResource = load();
+ } catch (IOException e) {
+ //throw e;
+ } finally {
+ // setting the error message (if available)
+ if (this.errorMessage != null) {
+ this.theMsg.setError(this.errorMessage);
+ }
// store a reference to the result in the message object
// this is e.g. needed by the snippet fetcher
- this.theMsg.setResult(resource);
-
- } catch (IOException e) {
- //throw e;
- } finally {
+ //
+ // Note: this is always called, even on empty results.
+ // Otherwise the caller will block forever
+ this.theMsg.setResult(loadedResource);
+
+ // signal that this worker thread has finished the job
this.done = true;
}
}
@@ -256,9 +268,13 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
this.startdate = 0;
this.profile = null;
this.acceptAllContent = false;
+ this.errorMessage = null;
}
- protected void addURLtoErrorDB(String failreason) {
+ protected void addURLtoErrorDB(String failreason) {
+ // remember error message
+ this.errorMessage = failreason;
+
// convert the referrer URL into a hash value
String referrerHash = (this.refererURLString==null)?null:indexURL.urlHash(this.refererURLString);
diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlerException.java b/source/de/anomic/plasma/crawler/plasmaCrawlerException.java
new file mode 100644
index 000000000..165dd4e78
--- /dev/null
+++ b/source/de/anomic/plasma/crawler/plasmaCrawlerException.java
@@ -0,0 +1,9 @@
+package de.anomic.plasma.crawler;
+
+import java.io.IOException;
+
+public class plasmaCrawlerException extends IOException {
+ public plasmaCrawlerException(String errorMsg) {
+ super(errorMsg);
+ }
+}
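Deriving the new exception from IOException keeps existing catch (IOException) paths compiling, while callers that want the crawler's failure reason can catch the subtype first. A sketch of the intended usage, mirroring the ViewFile.java hunk above:

    try {
        entry = sb.snippetCache.loadResourceFromWeb(url, 5000);
    } catch (plasmaCrawlerException e) {
        // the message carries the failreason set via plasmaCrawlLoaderMessage.setError(...)
        prop.put("error_errorText", e.getMessage());
    } catch (IOException e) {
        // any other I/O problem still lands here
    }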
diff --git a/source/de/anomic/plasma/parser/AbstractParser.java b/source/de/anomic/plasma/parser/AbstractParser.java
index c69c60496..9507e5ca5 100644
--- a/source/de/anomic/plasma/parser/AbstractParser.java
+++ b/source/de/anomic/plasma/parser/AbstractParser.java
@@ -49,6 +49,7 @@ import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
+import java.io.IOException;
import java.io.InputStream;
import de.anomic.net.URL;
@@ -93,12 +94,35 @@ public abstract class AbstractParser implements Parser{
this.libxDependencies = libxDependencies;
}
+ /**
+ * Check if the parser was interrupted.
+ * @throws InterruptedException if the parser was interrupted
+ */
public static final void checkInterruption() throws InterruptedException {
Thread currentThread = Thread.currentThread();
if ((currentThread instanceof serverThread) && ((serverThread)currentThread).shutdownInProgress()) throw new InterruptedException("Shutdown in progress ...");
if (currentThread.isInterrupted()) throw new InterruptedException("Shutdown in progress ...");
}
+ public final File createTempFile(String name) throws IOException {
+ String parserClassName = this.getClass().getName();
+ int idx = parserClassName.lastIndexOf(".");
+ if (idx != -1) {
+ parserClassName = parserClassName.substring(idx+1);
+ }
+
+ // getting the file extension
+ idx = name.lastIndexOf("/");
+ String fileName = (idx != -1) ? name.substring(idx+1) : name;
+
+ idx = fileName.lastIndexOf(".");
+ String fileExt = (idx > -1) ? fileName.substring(idx+1) : "";
+
+ // create the temp file
+ File tempFile = File.createTempFile(
+         parserClassName + "_" + ((idx > -1) ? fileName.substring(0,idx) : fileName),
+         (fileExt.length() > 0) ? "." + fileExt : fileExt);
+ return tempFile;
+ }
+
/**
* Parsing a document available as byte array.
* @param location the origin of the document
@@ -119,14 +143,17 @@ public abstract class AbstractParser implements Parser{
) throws ParserException, InterruptedException {
ByteArrayInputStream contentInputStream = null;
try {
+ // convert the byte array into a stream
contentInputStream = new ByteArrayInputStream(source);
+
+ // parse the stream
return this.parse(location,mimeType,charset,contentInputStream);
} finally {
if (contentInputStream != null) {
try {
contentInputStream.close();
contentInputStream = null;
- } catch (Exception e){}
+ } catch (Exception e){ /* ignore this */}
}
}
}
@@ -151,12 +178,15 @@ public abstract class AbstractParser implements Parser{
) throws ParserException, InterruptedException {
BufferedInputStream contentInputStream = null;
try {
+ // create a stream from the file
contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
+
+ // parse the stream
return this.parse(location, mimeType, charset, contentInputStream);
} catch (FileNotFoundException e) {
- throw new ParserException(e.getMessage());
+ throw new ParserException("Unexpected error while parsing file. " + e.getMessage(),location);
} finally {
- if (contentInputStream != null) try{contentInputStream.close();}catch(Exception e){}
+ if (contentInputStream != null) try{contentInputStream.close();}catch(Exception e){/* ignore this */}
}
}
@@ -201,6 +231,6 @@ public abstract class AbstractParser implements Parser{
* Return the name of the parser
*/
public String getName() {
- return parserName;
+ return this.parserName;
}
}
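The new createTempFile(String) helper centralizes the naming scheme that tarParser previously inlined: it prefixes the temp file with the concrete parser class name and keeps the entry's extension, so mime detection by file extension still works on the copy. An illustrative call from within a subclass (the entry name is hypothetical):

    // called from e.g. the zipParser for the archive entry "docs/readme.txt"
    File tmp = createTempFile("docs/readme.txt");
    // tmp.getName() is of the form zipParser_readme<random>.txt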
diff --git a/source/de/anomic/plasma/parser/ParserException.java b/source/de/anomic/plasma/parser/ParserException.java
index cdb730ec6..c05d9a484 100644
--- a/source/de/anomic/plasma/parser/ParserException.java
+++ b/source/de/anomic/plasma/parser/ParserException.java
@@ -44,24 +44,45 @@
package de.anomic.plasma.parser;
+import de.anomic.net.URL;
+import de.anomic.plasma.plasmaCrawlEURL;
+
public class ParserException extends Exception
{
-
+ private String errorCode = null;
+ private URL url = null;
+
private static final long serialVersionUID = 1L;
public ParserException() {
super();
}
- public ParserException(String message) {
+ public ParserException(String message, URL url) {
+ this(message,url,plasmaCrawlEURL.DENIED_PARSER_ERROR);
+ }
+
+ public ParserException(String message, URL url, String errorCode) {
super(message);
+ this.errorCode = errorCode;
+ this.url = url;
}
- public ParserException(String message, Throwable cause) {
+ public ParserException(String message, URL url, Throwable cause) {
+ this(message,url,cause,plasmaCrawlEURL.DENIED_PARSER_ERROR);
+ }
+
+ public ParserException(String message, URL url, Throwable cause, String errorCode) {
super(message, cause);
+ this.errorCode = errorCode;
+ this.url = url;
}
- public ParserException(Throwable cause) {
- super(cause);
+ public String getErrorCode() {
+ return this.errorCode;
+ }
+
+ public URL getURL() {
+ return this.url;
}
}
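Every constructor now records the failed URL and an EURL failure category, defaulting to plasmaCrawlEURL.DENIED_PARSER_ERROR when none is given. A short sketch of both constructor forms, using the messages and error codes that appear in the plasmaParser hunk below:

    // default category (DENIED_PARSER_ERROR)
    throw new ParserException("Binary data found in resource", location);

    // explicit category
    throw new ParserException("No parser available to parse mimetype",
            location, plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT);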
diff --git a/source/de/anomic/plasma/parser/bzip/bzipParser.java b/source/de/anomic/plasma/parser/bzip/bzipParser.java
index 8b2020c81..53aa52e40 100644
--- a/source/de/anomic/plasma/parser/bzip/bzipParser.java
+++ b/source/de/anomic/plasma/parser/bzip/bzipParser.java
@@ -80,7 +80,7 @@ public class bzipParser extends AbstractParser implements Parser {
public bzipParser() {
super(LIBX_DEPENDENCIES);
- parserName = "Bzip 2 UNIX Compressed File Parser";
+ this.parserName = "Bzip 2 UNIX Compressed File Parser";
}
public Hashtable getSupportedMimeTypes() {
@@ -129,7 +129,9 @@ public class bzipParser extends AbstractParser implements Parser {
return theParser.parseSource(location,null,null,tempFile);
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
- throw new ParserException("Unable to parse the gzip content. " + e.getMessage());
+ if (e instanceof ParserException) throw (ParserException) e;
+
+ throw new ParserException("Unexpected error while parsing bzip file. " + e.getMessage(),location);
} finally {
if (tempFile != null) tempFile.delete();
}
diff --git a/source/de/anomic/plasma/parser/doc/docParser.java b/source/de/anomic/plasma/parser/doc/docParser.java
index 46aa1196a..2a89dbfee 100644
--- a/source/de/anomic/plasma/parser/doc/docParser.java
+++ b/source/de/anomic/plasma/parser/doc/docParser.java
@@ -75,7 +75,7 @@ implements Parser {
public docParser() {
super(LIBX_DEPENDENCIES);
- parserName = "Word Document Parser";
+ this.parserName = "Word Document Parser";
}
public plasmaParserDocument parse(URL location, String mimeType, String charset,
@@ -99,14 +99,16 @@ implements Parser {
null,
null,
null,
- contents.getBytes(),
+ contents.getBytes("UTF-8"),
null,
null);
return theDoc;
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
- throw new ParserException("Unable to parse the doc content. " + e.getMessage());
+ if (e instanceof ParserException) throw (ParserException) e;
+
+ throw new ParserException("Unexpected error while parsing doc file. " + e.getMessage(),location);
}
}
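Switching from getBytes() to getBytes("UTF-8") pins the byte encoding instead of relying on the JVM's platform default, matching the "UTF-8" charset the plasmaParserDocument is constructed with; rpmParser below receives the same fix. A minimal illustration (getBytes(String) declares UnsupportedEncodingException):

    String contents = "Grüße";
    byte[] platformDependent = contents.getBytes();        // varies with file.encoding
    byte[] pinned            = contents.getBytes("UTF-8"); // always the same bytes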
diff --git a/source/de/anomic/plasma/parser/gzip/gzipParser.java b/source/de/anomic/plasma/parser/gzip/gzipParser.java
index abc58e26e..389795372 100644
--- a/source/de/anomic/plasma/parser/gzip/gzipParser.java
+++ b/source/de/anomic/plasma/parser/gzip/gzipParser.java
@@ -76,7 +76,7 @@ public class gzipParser extends AbstractParser implements Parser {
public gzipParser() {
super(LIBX_DEPENDENCIES);
- parserName = "GNU Zip Compressed Archive Parser";
+ this.parserName = "GNU Zip Compressed Archive Parser";
}
public Hashtable getSupportedMimeTypes() {
@@ -113,7 +113,9 @@ public class gzipParser extends AbstractParser implements Parser {
return theParser.parseSource(location,null,null,tempFile);
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
- throw new ParserException("Unable to parse the gzip content. " + e.getMessage());
+ if (e instanceof ParserException) throw (ParserException) e;
+
+ throw new ParserException("Unexpected error while parsing gzip file. " + e.getMessage(),location);
} finally {
if (tempFile != null) tempFile.delete();
}
diff --git a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
index f2b86124f..6d5eabc33 100644
--- a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
+++ b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java
@@ -44,6 +44,7 @@
package de.anomic.plasma.parser.mimeType;
import java.io.File;
+import java.io.IOException;
import java.io.InputStream;
import de.anomic.net.URL;
import java.util.Collection;
@@ -99,7 +100,7 @@ implements Parser {
public mimeTypeParser() {
super(LIBX_DEPENDENCIES);
- parserName = "MimeType Parser";
+ this.parserName = "MimeType Parser";
}
public String getMimeType (File sourceFile) {
@@ -142,8 +143,8 @@ implements Parser {
threadLoopDetection.put(Thread.currentThread(),new Integer(loopDepth.intValue()+1));
// deactivating the logging for jMimeMagic
- Logger theLogger = Logger.getLogger("net.sf.jmimemagic");
- theLogger.setLevel(Level.OFF);
+ Logger jmimeMagicLogger = Logger.getLogger("net.sf.jmimemagic");
+ jmimeMagicLogger.setLevel(Level.OFF);
Magic theMagic = new Magic();
MagicMatch match = theMagic.getMagicMatch(sourceFile);
@@ -160,8 +161,8 @@ implements Parser {
}
// to avoid loops we have to test if the mimetype has changed ...
- if (this.getSupportedMimeTypes().containsKey(mimeType)) return null;
- if (orgMimeType.equals(mimeType)) return null;
+ if (this.getSupportedMimeTypes().containsKey(mimeType)) throw new ParserException("Unable to detect mimetype of resource.",location);
+ if (orgMimeType.equals(mimeType)) throw new ParserException("Unable to detect mimetype of resource.",location);
// check for interruption
checkInterruption();
@@ -170,11 +171,13 @@ implements Parser {
plasmaParser theParser = new plasmaParser();
return theParser.parseSource(location,mimeType,charset,sourceFile);
}
- return null;
+ throw new ParserException("Unable to detect mimetype of resource.",location);
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
- return null;
+ if (e instanceof ParserException) throw (ParserException) e;
+
+ throw new ParserException("Unexpected error while detect mimetype of resource. " + e.getMessage(),location);
} finally {
Integer loopDepth = (Integer) threadLoopDetection.get(Thread.currentThread());
if (loopDepth.intValue() <= 1) {
@@ -186,14 +189,14 @@ implements Parser {
}
public plasmaParserDocument parse(URL location, String mimeType,String charset,
- InputStream source) throws ParserException {
+ InputStream source) throws ParserException, InterruptedException {
File dstFile = null;
try {
dstFile = File.createTempFile("mimeTypeParser",".tmp");
serverFileUtils.copy(source,dstFile);
return parse(location,mimeType,charset,dstFile);
- } catch (Exception e) {
- return null;
+ } catch (IOException e) {
+ throw new ParserException("Unexpected error while detect mimetype of resource. " + e.getMessage(),location);
} finally {
if (dstFile != null) {dstFile.delete();}
}
diff --git a/source/de/anomic/plasma/parser/odt/odtParser.java b/source/de/anomic/plasma/parser/odt/odtParser.java
index f8a9a10be..a2b1b8cbd 100644
--- a/source/de/anomic/plasma/parser/odt/odtParser.java
+++ b/source/de/anomic/plasma/parser/odt/odtParser.java
@@ -84,7 +84,7 @@ public class odtParser extends AbstractParser implements Parser {
public odtParser() {
super(LIBX_DEPENDENCIES);
- parserName = "OASIS OpenDocument V2 Text Document Parser";
+ this.parserName = "OASIS OpenDocument V2 Text Document Parser";
}
public Hashtable getSupportedMimeTypes() {
@@ -96,7 +96,7 @@ public class odtParser extends AbstractParser implements Parser {
try {
byte[] docContent = null;
String docDescription = null;
- String docKeywords = null;
+ String docKeywordStr = null;
String docShortTitle = null;
String docLongTitle = null;
@@ -125,7 +125,7 @@ public class odtParser extends AbstractParser implements Parser {
ODFMetaFileAnalyzer metaAnalyzer = new ODFMetaFileAnalyzer();
OpenDocumentMetadata metaData = metaAnalyzer.analyzeMetaData(zipFileEntryStream);
docDescription = metaData.getDescription();
- docKeywords = metaData.getKeyword();
+ docKeywordStr = metaData.getKeyword();
docShortTitle = metaData.getTitle();
docLongTitle = metaData.getSubject();
@@ -149,11 +149,16 @@ public class odtParser extends AbstractParser implements Parser {
}
}
+ // split the keywords
+ String[] docKeywords = null;
+ if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
+
+ // create the parser document
return new plasmaParserDocument(
location,
mimeType,
"UTF-8",
- docKeywords.split(" |,"),
+ docKeywords,
docShortTitle,
docLongTitle,
null,
@@ -163,13 +168,13 @@ public class odtParser extends AbstractParser implements Parser {
null);
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
- throw new ParserException("Unable to parse the odt content. " + e.getMessage());
- } catch (Error e) {
- throw new ParserException("Unable to parse the odt content. " + e.getMessage());
+ if (e instanceof ParserException) throw (ParserException) e;
+
+ throw new ParserException("Unexpected error while parsing odt file. " + e.getMessage(),location);
}
}
- public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException {
+ public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
File dest = null;
try {
// creating a tempfile
@@ -182,9 +187,12 @@ public class odtParser extends AbstractParser implements Parser {
// parsing the content
return parse(location, mimeType, charset, dest);
} catch (Exception e) {
- throw new ParserException("Unable to parse the odt document. " + e.getMessage());
+ if (e instanceof InterruptedException) throw (InterruptedException) e;
+ if (e instanceof ParserException) throw (ParserException) e;
+
+ throw new ParserException("Unexpected error while parsing odt file. " + e.getMessage(),location);
} finally {
- if (dest != null) try { dest.delete(); } catch (Exception e){}
+ if (dest != null) try { dest.delete(); } catch (Exception e){/* ignore this */}
}
}
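The null guard around the keyword split fixes a NullPointerException for documents whose metadata declares no keywords; pdfParser below gets the identical treatment:

    String docKeywordStr = null;             // e.g. an ODT file without meta keywords
    String[] docKeywords = null;
    if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
    // before the patch, docKeywordStr.split(" |,") threw a NullPointerException here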
diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java
index 298a87f41..1b67fceb4 100644
--- a/source/de/anomic/plasma/parser/pdf/pdfParser.java
+++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java
@@ -78,7 +78,7 @@ public class pdfParser extends AbstractParser implements Parser {
public pdfParser() {
super(LIBX_DEPENDENCIES);
- parserName = "Acrobat Portable Document Parser";
+ this.parserName = "Acrobat Portable Document Parser";
}
public Hashtable getSupportedMimeTypes() {
@@ -98,7 +98,7 @@ public class pdfParser extends AbstractParser implements Parser {
// Logger theLogger = Logger.getLogger("org.pdfbox");
// theLogger.setLevel(Level.INFO);
- String docTitle = null, docSubject = null, /*docAuthor = null,*/ docKeyWords = null;
+ String docTitle = null, docSubject = null, /*docAuthor = null,*/ docKeywordStr = null;
// check for interruption
checkInterruption();
@@ -120,7 +120,7 @@ public class pdfParser extends AbstractParser implements Parser {
docTitle = theDocInfo.getTitle();
docSubject = theDocInfo.getSubject();
//docAuthor = theDocInfo.getAuthor();
- docKeyWords = theDocInfo.getKeywords();
+ docKeywordStr = theDocInfo.getKeywords();
}
serverByteBuffer out = new serverByteBuffer();
@@ -142,18 +142,14 @@ public class pdfParser extends AbstractParser implements Parser {
replaceAll("\t"," ");
}
- /*
- * public document(URL location, String mimeType,
- String keywords, String shortTitle, String longTitle,
- String[] sections, String abstrct,
- byte[] text, Map anchors, Map images) {
- *
- */
+ String[] docKeywords = null;
+ if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
+
plasmaParserDocument theDoc = new plasmaParserDocument(
location,
mimeType,
"UTF-8",
- docKeyWords.split(" |,"),
+ docKeywords,
docSubject,
docTitle,
null,
@@ -166,10 +162,12 @@ public class pdfParser extends AbstractParser implements Parser {
}
catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
- throw new ParserException("Unable to parse the pdf content. " + e.getMessage(),e);
+ if (e instanceof ParserException) throw (ParserException) e;
+
+ throw new ParserException("Unexpected error while parsing pdf file. " + e.getMessage(),location);
} finally {
- if (theDocument != null) try { theDocument.close(); } catch (Exception e) {}
- if (writer != null) try { writer.close(); } catch (Exception e) {}
+ if (theDocument != null) try { theDocument.close(); } catch (Exception e) {/* ignore this */}
+ if (writer != null) try { writer.close(); } catch (Exception e) {/* ignore this */}
Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
}
}
diff --git a/source/de/anomic/plasma/parser/rpm/rpmParser.java b/source/de/anomic/plasma/parser/rpm/rpmParser.java
index 7e117f4f5..eef4ca2fb 100644
--- a/source/de/anomic/plasma/parser/rpm/rpmParser.java
+++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java
@@ -84,7 +84,7 @@ public class rpmParser extends AbstractParser implements Parser {
public rpmParser() {
super(LIBX_DEPENDENCIES);
- parserName = "rpm Parser";
+ this.parserName = "rpm Parser";
}
public Hashtable getSupportedMimeTypes() {
@@ -126,12 +126,12 @@ public class rpmParser extends AbstractParser implements Parser {
// getting the next tag
DataTypeIf tag = rpmFile.getTag(headerNames[i]);
- if (tag != null) {
- content.append(headerNames[i])
- .append(": ")
- .append(tag.toString())
- .append("\n");
- }
+ if (tag == null) continue;
+
+ content.append(headerNames[i])
+ .append(": ")
+ .append(tag.toString())
+ .append("\n");
if (headerNames[i].equals("N")) name = tag.toString();
else if (headerNames[i].equals("SUMMARY")) summary = tag.toString();
@@ -153,16 +153,18 @@ public class rpmParser extends AbstractParser implements Parser {
summary,
null,
description,
- content.toString().getBytes(),
+ content.toString().getBytes("UTF-8"),
anchors,
null);
return theDoc;
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
- throw new ParserException("Unable to parse the rpm file. " + e.getMessage());
+ if (e instanceof ParserException) throw (ParserException) e;
+
+ throw new ParserException("Unexpected error while parsing rpm file. " + e.getMessage(),location);
} finally {
- if (rpmFile != null) try { rpmFile.close(); } catch (Exception e) {}
+ if (rpmFile != null) try { rpmFile.close(); } catch (Exception e) {/* ignore this */}
}
}
diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java
index 299e3f865..41cf8573b 100644
--- a/source/de/anomic/plasma/parser/rss/rssParser.java
+++ b/source/de/anomic/plasma/parser/rss/rssParser.java
@@ -98,7 +98,7 @@ public class rssParser extends AbstractParser implements Parser {
public rssParser() {
super(LIBX_DEPENDENCIES);
- parserName = "Rich Site Summary/Atom Feed Parser";
+ this.parserName = "Rich Site Summary/Atom Feed Parser";
}
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
@@ -149,7 +149,7 @@ public class rssParser extends AbstractParser implements Parser {
anchors.put(itemURL.toString(),itemTitle);
if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append((byte) 32);
- text.append(new serverCharBuffer(htmlFilterAbstractScraper.stripAll(new serverCharBuffer(itemDescr.toCharArray()))).trim()).append(' '); // TODO: this does not work for utf-8
+ text.append(new serverCharBuffer(htmlFilterAbstractScraper.stripAll(new serverCharBuffer(itemDescr.toCharArray()))).trim().toString()).append(' ');
String itemContent = item.getElementValue("content");
if ((itemContent != null) && (itemContent.length() > 0)) {
@@ -183,11 +183,6 @@ public class rssParser extends AbstractParser implements Parser {
}
}
- /* (URL location, String mimeType,
- String keywords, String shortTitle, String longTitle,
- String[] sections, String abstrct,
- byte[] text, Map anchors, Map images)
- */
plasmaParserDocument theDoc = new plasmaParserDocument(
location,
mimeType,
@@ -205,7 +200,9 @@ public class rssParser extends AbstractParser implements Parser {
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
- throw new ParserException("Unable to parse the rss file. " + e.getMessage());
+ if (e instanceof ParserException) throw (ParserException) e;
+
+ throw new ParserException("Unexpected error while parsing rss file." + e.getMessage(),location);
}
}
diff --git a/source/de/anomic/plasma/parser/rtf/rtfParser.java b/source/de/anomic/plasma/parser/rtf/rtfParser.java
index fdef82b99..4fa5d3028 100644
--- a/source/de/anomic/plasma/parser/rtf/rtfParser.java
+++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java
@@ -77,7 +77,7 @@ implements Parser {
public rtfParser() {
super(LIBX_DEPENDENCIES);
- parserName = "Rich Text Format Parser";
+ this.parserName = "Rich Text Format Parser";
}
public plasmaParserDocument parse(URL location, String mimeType, String charset,
@@ -113,7 +113,9 @@ implements Parser {
}
catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
- throw new ParserException("Unable to parse the rdf content. " + e.getMessage());
+ if (e instanceof ParserException) throw (ParserException) e;
+
+ throw new ParserException("Unexpected error while parsing rtf resource." + e.getMessage(),location);
}
}
diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java
index ba30acc91..c70c4e26c 100644
--- a/source/de/anomic/plasma/parser/tar/tarParser.java
+++ b/source/de/anomic/plasma/parser/tar/tarParser.java
@@ -87,7 +87,7 @@ public class tarParser extends AbstractParser implements Parser {
public tarParser() {
super(LIBX_DEPENDENCIES);
- parserName = "Tape Archive File Parser";
+ this.parserName = "Tape Archive File Parser";
}
public Hashtable getSupportedMimeTypes() {
@@ -128,12 +128,11 @@ public class tarParser extends AbstractParser implements Parser {
// skip directories
if (entry.isDirectory()) continue;
- // Get the entry name
- int idx = -1;
+ // Get the short entry name
String entryName = entry.getName();
- idx = entryName.lastIndexOf("/");
- if (idx != -1) entryName = entryName.substring(idx+1);
- idx = entryName.lastIndexOf(".");
+
+ // getting the entry file extension
+ int idx = entryName.lastIndexOf(".");
String entryExt = (idx > -1) ? entryName.substring(idx+1) : "";
// trying to determine the mimeType per file extension
@@ -143,19 +142,21 @@ public class tarParser extends AbstractParser implements Parser {
plasmaParserDocument theDoc = null;
File tempFile = null;
try {
- byte[] buf = new byte[(int) entry.getSize()];
- /*int bytesRead =*/ tin.read(buf);
-
- tempFile = File.createTempFile("tarParser_" + ((idx>-1)?entryName.substring(0,idx):entryName), (entryExt.length()>0)?"."+entryExt:entryExt);
- serverFileUtils.write(buf, tempFile);
+ // create the temp file
+ tempFile = createTempFile(entryName);
+
+ // copy the data into the file
+ serverFileUtils.copy(tin,tempFile,entry.getSize());
// check for interruption
checkInterruption();
// parsing the content
- theDoc = theParser.parseSource(new URL(tempFile),entryMime,null,tempFile);
+ theDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null,tempFile);
+ } catch (ParserException e) {
+ this.theLogger.logInfo("Unable to parse tar file entry '" + entryName + "'. " + e.getErrorCode());
} finally {
- if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){}
+ if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){/* ignore this */}
}
if (theDoc == null) continue;
@@ -200,7 +201,9 @@ public class tarParser extends AbstractParser implements Parser {
docImages);
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
- throw new ParserException("Unable to parse the zip content. " + e.getMessage());
+ if (e instanceof ParserException) throw (ParserException) e;
+
+ throw new ParserException("Unexpected error while parsing tar resource. " + e.getMessage(),location);
}
}
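Archive entries are no longer attributed to a URL built from the temp file; instead a pseudo-URL is derived from the archive's own location plus the entry name as a fragment. Assuming de.anomic.net.URL resolves a bare fragment against its base the way java.net.URL does, the result looks like this (archive URL is hypothetical):

    URL base     = new URL("http://example.net/archive.tar");
    URL entryUrl = new URL(base, "#" + "docs/readme.txt");
    // entryUrl identifies the entry inside the archive:
    // http://example.net/archive.tar#docs/readme.txt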
diff --git a/source/de/anomic/plasma/parser/vcf/vcfParser.java b/source/de/anomic/plasma/parser/vcf/vcfParser.java
index 1dc963e95..f92835236 100644
--- a/source/de/anomic/plasma/parser/vcf/vcfParser.java
+++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java
@@ -215,7 +215,7 @@ public class vcfParser extends AbstractParser implements Parser {
URL newURL = new URL(value);
anchors.put(newURL.toString(),newURL.toString());
//parsedData.put(key,value);
- } catch (MalformedURLException ex) {}
+ } catch (MalformedURLException ex) {/* ignore this */}
} else if (
!key.equalsIgnoreCase("BEGIN") &&
!key.equalsIgnoreCase("END") &&
@@ -255,12 +255,10 @@ public class vcfParser extends AbstractParser implements Parser {
return theDoc;
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
+ if (e instanceof ParserException) throw (ParserException) e;
- String errorMsg = "Unable to parse the vcard content. " + e.getMessage();
- this.theLogger.logSevere(errorMsg);
- throw new ParserException(errorMsg);
- } finally {
- }
+ throw new ParserException("Unexpected error while parsing vcf resource. " + e.getMessage(),location);
+ }
}
public void reset() {
diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java
index c6d07a66e..7b55085d8 100644
--- a/source/de/anomic/plasma/parser/zip/zipParser.java
+++ b/source/de/anomic/plasma/parser/zip/zipParser.java
@@ -43,9 +43,8 @@
package de.anomic.plasma.parser.zip;
-import java.io.ByteArrayOutputStream;
+import java.io.File;
import java.io.InputStream;
-import de.anomic.net.URL;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Hashtable;
@@ -55,12 +54,14 @@ import java.util.TreeSet;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
+import de.anomic.net.URL;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.AbstractParser;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException;
import de.anomic.server.serverByteBuffer;
+import de.anomic.server.serverFileUtils;
public class zipParser extends AbstractParser implements Parser {
@@ -84,7 +85,7 @@ public class zipParser extends AbstractParser implements Parser {
public zipParser() {
super(LIBX_DEPENDENCIES);
- parserName = "Compressed Archive File Parser";
+ this.parserName = "Compressed Archive File Parser";
}
public Hashtable getSupportedMimeTypes() {
@@ -110,29 +111,39 @@ public class zipParser extends AbstractParser implements Parser {
ZipEntry entry;
ZipInputStream zippedContent = new ZipInputStream(source);
while ((entry = zippedContent.getNextEntry()) !=null) {
+ // check for interruption
+ checkInterruption();
+
// skip directories
if (entry.isDirectory()) continue;
// Get the entry name
String entryName = entry.getName();
int idx = entryName.lastIndexOf(".");
- String entryExt = (idx > -1) ? entryName.substring(idx+1) : null;
-
- // trying to determine the mimeType per file extension
- String entryMime = plasmaParser.getMimeTypeByFileExt(entryExt);
- // getting the entry content
- ByteArrayOutputStream bos = new ByteArrayOutputStream();
- byte[] buf = new byte[(int) entry.getSize()];
- /*int bytesRead =*/ zippedContent.read(buf);
- bos.write(buf);
- byte[] ut = bos.toByteArray();
+ // getting the file extension
+ String entryExt = (idx > -1) ? entryName.substring(idx+1) : "";
- // check for interruption
- checkInterruption();
+ // trying to determine the mimeType per file extension
+ String entryMime = plasmaParser.getMimeTypeByFileExt(entryExt);
// parsing the content
- plasmaParserDocument theDoc = theParser.parseSource(location,entryMime,null, ut);
+ plasmaParserDocument theDoc = null;
+ File tempFile = null;
+ try {
+ // create the temp file
+ tempFile = createTempFile(entryName);
+
+ // copy the data into the file
+ serverFileUtils.copy(zippedContent,tempFile,entry.getSize());
+
+ // parsing the zip file entry
+ theDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null, tempFile);
+ } catch (ParserException e) {
+ this.theLogger.logInfo("Unable to parse zip file entry '" + entryName + "'. " + e.getErrorCode());
+ } finally {
+ if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){/* ignore this */}
+ }
if (theDoc == null) continue;
// merging all documents together
@@ -157,11 +168,7 @@ public class zipParser extends AbstractParser implements Parser {
docImages.addAll(theDoc.getImages());
}
- /* (URL location, String mimeType,
- String keywords, String shortTitle, String longTitle,
- String[] sections, String abstrct,
- byte[] text, Map anchors, Map images)
- */
+
return new plasmaParserDocument(
location,
mimeType,
@@ -176,9 +183,9 @@ public class zipParser extends AbstractParser implements Parser {
docImages);
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
- throw new ParserException("Unable to parse the zip content. " + e.getMessage());
- } catch (Error e) {
- throw new ParserException("Unable to parse the zip content. " + e.getMessage());
+ if (e instanceof ParserException) throw (ParserException) e;
+
+ throw new ParserException("Unexpected error while parsing zip resource. " + e.getMessage(),location);
}
}
diff --git a/source/de/anomic/plasma/plasmaCrawlLoader.java b/source/de/anomic/plasma/plasmaCrawlLoader.java
index d8b8fdca7..e349da8bf 100644
--- a/source/de/anomic/plasma/plasmaCrawlLoader.java
+++ b/source/de/anomic/plasma/plasmaCrawlLoader.java
@@ -52,6 +52,7 @@ import org.apache.commons.pool.impl.GenericObjectPool;
import de.anomic.net.URL;
import de.anomic.plasma.crawler.plasmaCrawlWorker;
+import de.anomic.plasma.crawler.plasmaCrawlerException;
import de.anomic.plasma.crawler.plasmaCrawlerFactory;
import de.anomic.plasma.crawler.plasmaCrawlerMsgQueue;
import de.anomic.plasma.crawler.plasmaCrawlerPool;
@@ -83,7 +84,7 @@ public final class plasmaCrawlLoader extends Thread {
// supported protocols
// TODO: change this, e.g. by loading settings from file
- this.supportedProtocols = new HashSet(Arrays.asList(new String[]{"http","https" /* ,"ftp" */}));
+ this.supportedProtocols = new HashSet(Arrays.asList(new String[]{"http","https"/* ,"ftp" */}));
// configuring the crawler messagequeue
this.theQueue = new plasmaCrawlerMsgQueue();
@@ -99,6 +100,8 @@ public final class plasmaCrawlLoader extends Thread {
// The maximum number of idle connections connections in the pool
// 0 = no limit.
this.crawlerPoolConfig.maxIdle = Integer.parseInt(switchboard.getConfig("crawler.MaxIdleThreads","7"));
+
+ // minIdle configuration not possible for keyedObjectPools
//this.crawlerPoolConfig.minIdle = Integer.parseInt(switchboard.getConfig("crawler.MinIdleThreads","5"));
// block indefinitely
@@ -216,7 +219,7 @@ public final class plasmaCrawlLoader extends Thread {
int depth,
plasmaCrawlProfile.entry profile,
int timeout
- ) {
+ ) throws plasmaCrawlerException {
plasmaHTCache.Entry result = null;
if (!this.crawlwerPool.isClosed) {
@@ -241,11 +244,17 @@ public final class plasmaCrawlLoader extends Thread {
this.execute(theMsg);
// wait for the crawl job result
- result = theMsg.waitForResult();
-
+ result = theMsg.waitForResult();
} catch (Exception e) {
- this.log.logSevere("plasmaCrawlLoader.loadSync", e);
+ this.log.logSevere("plasmaCrawlLoader.loadSync: Unexpected error", e);
+ throw new plasmaCrawlerException("Unexpected error: " + e.getMessage());
}
+
+ // check if an error has occurred
+ if (result == null) {
+ String errorMsg = theMsg.getError();
+ throw new plasmaCrawlerException(errorMsg);
+ }
}
// return the result
diff --git a/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java b/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java
index d79674b19..b3d678c67 100644
--- a/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java
+++ b/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java
@@ -59,6 +59,7 @@ public final class plasmaCrawlLoaderMessage {
private serverSemaphore resultSync = null;
private plasmaHTCache.Entry result;
+ private String errorMessage;
// loadParallel(URL url, String referer, String initiator, int depth, plasmaCrawlProfile.entry profile) {
public plasmaCrawlLoaderMessage(
@@ -86,6 +87,14 @@ public final class plasmaCrawlLoaderMessage {
this.result = null;
}
+ public void setError(String errorMessage) {
+ this.errorMessage = errorMessage;
+ }
+
+ public String getError() {
+ return this.errorMessage;
+ }
+
public void setResult(plasmaHTCache.Entry theResult) {
// store the result
this.result = theResult;
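setError/getError and setResult/waitForResult form a one-shot handoff between the crawl worker and loadSync(): waitForResult() blocks on a semaphore until setResult() fires, which is why AbstractCrawlWorker.execute() above must call setResult() from its finally block even when loading failed. A self-contained sketch of the protocol, with java.util.concurrent.Semaphore standing in for serverSemaphore (an assumption):

    import java.util.concurrent.Semaphore;

    final class ResultHandoff<T> {
        private final Semaphore resultSync = new Semaphore(0);
        private volatile T result;
        private volatile String errorMessage;

        void setError(String msg)   { this.errorMessage = msg; }
        String getError()           { return this.errorMessage; }

        void setResult(T r) {          // must run even if r == null,
            this.result = r;           // otherwise waitForResult() blocks forever
            this.resultSync.release();
        }

        T waitForResult() throws InterruptedException {
            this.resultSync.acquire();
            return this.result;
        }
    }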
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index 9c22a93ca..0e5933193 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -73,12 +73,14 @@ import de.anomic.htmlFilter.htmlFilterInputStream;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
+import de.anomic.index.indexURL;
import de.anomic.net.URL;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException;
import de.anomic.plasma.parser.ParserInfo;
import de.anomic.server.serverFileUtils;
import de.anomic.server.logging.serverLog;
+import de.anomic.tools.bitfield;
public final class plasmaParser {
public static final String PARSER_MODE_PROXY = "PROXY";
@@ -407,7 +409,7 @@ public final class plasmaParser {
if (neededLibx != null) {
for (int libxId=0; libxId < neededLibx.length; libxId++) {
if (javaClassPath.indexOf(neededLibx[libxId]) == -1) {
- throw new ParserException("Missing dependency detected: '" + neededLibx[libxId] + "'.");
+ throw new Exception("Missing dependency detected: '" + neededLibx[libxId] + "'.");
}
neededLibxBuf.append(neededLibx[libxId])
.append(",");
@@ -464,42 +466,67 @@ public final class plasmaParser {
// closing the parser object pool
try {
theParserPool.close();
- } catch (Exception e) { }
+ } catch (Exception e) {/* ignore this */}
}
- public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] source) throws InterruptedException {
+ public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] source)
+ throws InterruptedException, ParserException {
File tempFile = null;
try {
+ // creating a temp file to store the byte array
tempFile = File.createTempFile("parseSource", ".tmp");
serverFileUtils.write(source, tempFile);
+
+ // parsing the temp file
return parseSource(location, mimeType, charset, tempFile);
+
} catch (Exception e) {
+ // InterruptedException and ParserException should pass through
if (e instanceof InterruptedException) throw (InterruptedException) e;
- serverLog.logSevere("PARSER", "parseSource1: " + e.getMessage(), e);
- return null;
+ if (e instanceof ParserException) throw (ParserException) e;
+
+ // log unexpected error
+ this.theLogger.logSevere("Unexpected exception in parseSource1: " + e.getMessage(), e);
+ throw new ParserException("Unexpected exception while parsing " + location,location, e);
} finally {
- if (tempFile != null) try { tempFile.delete(); } catch (Exception ex){}
+ if (tempFile != null) try { tempFile.delete(); } catch (Exception ex){/* ignore this */}
}
}
- public plasmaParserDocument parseSource(URL location, String mimeType, String documentCharset, File sourceFile) throws InterruptedException {
+ public plasmaParserDocument parseSource(URL location, String theMimeType, String theDocumentCharset, File sourceFile)
+ throws InterruptedException, ParserException {
Parser theParser = null;
+ String mimeType = null;
try {
// getting the mimetype of the document
- mimeType = getRealMimeType(mimeType);
+ mimeType = getRealMimeType(theMimeType);
// getting the file extension of the document
String fileExt = getFileExt(location);
// getting the charset of the document
- if (documentCharset == null)
- // TODO: do a charset detection here ....
- documentCharset = "ISO-8859-1";
+ // TODO: do a charset detection here ....
+ String documentCharset = (theDocumentCharset == null) ? "ISO-8859-1" : theDocumentCharset;
+
+ // testing if parsing is supported for this resource
+ if (!plasmaParser.supportedContent(location,mimeType)) {
+ String errorMsg = "No parser available to parse mimetype";
+ this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
+ throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT);
+ }
+
+ // testing if the resource is not empty
+ if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) {
+ String errorMsg = "No resource content available.";
+ this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
+ throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
+ }
+
if (this.theLogger.isFine())
- this.theLogger.logFine("Parsing " + location + " with mimeType '" + mimeType +
+ this.theLogger.logInfo("Parsing " + location + " with mimeType '" + mimeType +
"' and file extension '" + fileExt + "'.");
/*
@@ -555,26 +582,43 @@ public final class plasmaParser {
theParser = this.getParser(mimeType);
// if a parser was found we use it ...
+ plasmaParserDocument doc = null;
if (theParser != null) {
- return theParser.parse(location, mimeType,documentCharset,sourceFile);
+ doc = theParser.parse(location, mimeType,documentCharset,sourceFile);
} else if (realtimeParsableMimeTypesContains(mimeType)) {
- return parseHtml(location, mimeType, documentCharset, sourceFile);
+ doc = parseHtml(location, mimeType, documentCharset, sourceFile);
} else {
- serverLog.logWarning("PARSER", "parseSource2: wrong mime type");
- return null;
+ String errorMsg = "No parser available to parse mimetype";
+ this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
+ throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT);
+ }
+
+ // check result
+ if (doc == null) {
+ String errorMsg = "Unexpected error. Parser returned null.";
+ this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
+ throw new ParserException(errorMsg,location);
}
+ return doc;
+
} catch (Exception e) {
+ // InterruptedException and ParserException should pass through
if (e instanceof InterruptedException) throw (InterruptedException) e;
- serverLog.logSevere("PARSER", "parseSource2: " + e.getMessage(), e);
- return null;
+ if (e instanceof ParserException) throw (ParserException) e;
+
+ // log unexpected error
+ String errorMsg = "Unexpected exception. " + e.getMessage();
+ this.theLogger.logSevere("Unable to parse '" + location + "'. " + errorMsg, e);
+ throw new ParserException(errorMsg,location,e);
+
} finally {
if (theParser != null) {
- try { plasmaParser.theParserPool.returnObject(mimeType, theParser); } catch (Exception e) { }
+ try { plasmaParser.theParserPool.returnObject(mimeType, theParser); } catch (Exception e) { /* ignore this */}
}
}
}
- private plasmaParserDocument parseHtml(URL location, String mimeType, String documentCharset, File sourceFile) throws IOException {
+ private plasmaParserDocument parseHtml(URL location, String mimeType, String documentCharset, File sourceFile) throws IOException, ParserException {
// ...otherwise we make a scraper and transformer
FileInputStream fileIn = new FileInputStream(sourceFile);
@@ -596,8 +640,9 @@ public final class plasmaParser {
//serverFileUtils.copy(sourceFile, hfos);
//hfos.close();
if (writer.binarySuspect()) {
- this.theLogger.logInfo("Binary data found in URL " + location);
- return null;
+ String errorMsg = "Binary data found in resource";
+ this.theLogger.logSevere("Unable to parse '" + location + "'. " + errorMsg);
+ throw new ParserException(errorMsg,location);
}
return transformScraper(location, mimeType, documentCharset, scraper);
}
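parseSource() no longer returns null on failure, so callers must switch from null checks to catching ParserException, as the plasmaSearchImages and plasmaSnippetCache hunks below do. A hedged caller sketch (the logger variable is illustrative):

    plasmaParserDocument document = null;
    try {
        document = parser.parseSource(location, mimeType, charset, sourceFile);
    } catch (ParserException e) {
        // e.getURL() and e.getErrorCode() identify the failed resource and category,
        // e.g. plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT
        log.logInfo("Unable to parse '" + e.getURL() + "': " + e.getMessage());
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();   // shutdown in progress
    }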
diff --git a/source/de/anomic/plasma/plasmaSearchImages.java b/source/de/anomic/plasma/plasmaSearchImages.java
index 3782ff752..a7387604b 100644
--- a/source/de/anomic/plasma/plasmaSearchImages.java
+++ b/source/de/anomic/plasma/plasmaSearchImages.java
@@ -43,6 +43,8 @@ package de.anomic.plasma;
import java.net.MalformedURLException;
import de.anomic.net.URL;
+import de.anomic.plasma.parser.ParserException;
+
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
@@ -60,10 +62,16 @@ public final class plasmaSearchImages {
if (maxTime > 10) {
byte[] res = sc.getResource(url, true, (int) maxTime);
if (res != null) {
- plasmaParserDocument document = sc.parseDocument(url, res);
-
+ plasmaParserDocument document = null;
+ try {
+ document = sc.parseDocument(url, res);
+ } catch (ParserException e) {
+ // parsing failed; this resource contributes no images
+ }
+ if (document == null) return;
+
// add the image links
- if (document != null) this.addAll(document.getImages());
+ this.addAll(document.getImages());
// add also links from pages one step deeper, if depth > 0
if (depth > 0) {
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index 60e4f3e60..efed8fbba 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -45,6 +45,8 @@ package de.anomic.plasma;
import java.io.IOException;
import de.anomic.net.URL;
import de.anomic.plasma.cache.IResourceInfo;
+import de.anomic.plasma.crawler.plasmaCrawlerException;
+import de.anomic.plasma.parser.ParserException;
import java.util.Enumeration;
import java.util.HashMap;
@@ -164,30 +166,51 @@ public class plasmaSnippetCache {
return new Snippet(line, source, null);
}
+ /* ===========================================================================
+ * LOADING RESOURCE DATA
+ * =========================================================================== */
// if the snippet is not in the cache, we can try to get it from the htcache
byte[] resource = null;
IResourceInfo docInfo = null;
try {
+ // trying to load the resource from the cache
resource = this.cacheManager.loadResourceContent(url);
- if ((fetchOnline) && (resource == null)) {
+ docInfo = this.cacheManager.loadResourceInfo(url);
+
+ // if not found try to download it
+ if ((resource == null) && (fetchOnline)) {
+ // download resource using the crawler
plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000);
+
+ // getting resource metadata (e.g. the http headers for http resources)
if (entry != null) {
docInfo = entry.getDocumentInfo();
}
+
+ // now the resource should be stored in the cache, load body
resource = this.cacheManager.loadResourceContent(url);
+ if (resource == null) {
+ //System.out.println("cannot load document for URL " + url);
+ return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource from web, cacheManager returned NULL");
+ }
source = SOURCE_WEB;
}
- } catch (IOException e) {
- e.printStackTrace();
+ } catch (Exception e) {
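+ // a crawler exception is an expected condition here (e.g. the resource
+ // could not be fetched); only unexpected exceptions get a stack trace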
+ if (!(e instanceof plasmaCrawlerException)) e.printStackTrace();
return new Snippet(null, ERROR_SOURCE_LOADING, "error loading resource from web: " + e.getMessage());
}
- if (resource == null) {
- //System.out.println("cannot load document for URL " + url);
- return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource from web, cacheManager returned NULL");
- }
- plasmaParserDocument document = parseDocument(url, resource, docInfo);
+ /* ===========================================================================
+ * PARSING RESOURCE
+ * =========================================================================== */
+ plasmaParserDocument document = null;
+ try {
+ document = parseDocument(url, resource, docInfo);
+ } catch (ParserException e) {
+ return new Snippet(null, ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
+ }
if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
+
//System.out.println("loaded document for URL " + url);
String[] sentences = document.getSentences();
//System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
@@ -196,6 +219,9 @@ public class plasmaSnippetCache {
return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
}
+ /* ===========================================================================
+ * COMPUTE SNIPPET
+ * =========================================================================== */
// we have found a parseable non-empty file: use the lines
line = computeSnippet(sentences, queryhashes, 8 + 6 * queryhashes.size(), snippetMaxLength);
//System.out.println("loaded snippet for URL " + url + ": " + line);
@@ -207,22 +233,48 @@ public class plasmaSnippetCache {
return new Snippet(line, source, null);
}
+ /**
+ * Tries to load and parse a resource specified by its URL.
+ * If the resource is not stored in the cache and fetchOnline is set,
+ * this function tries to download the resource from the web.
+ *
+ * @param url the URL of the resource
+ * @param fetchOnline specifies whether the resource should be loaded from the web if it is not available in the cache
+ * @return the parsed document as {@link plasmaParserDocument}, or <code>null</code> if the resource could not be loaded or parsed
+ */
public plasmaParserDocument retrieveDocument(URL url, boolean fetchOnline) {
byte[] resource = null;
IResourceInfo docInfo = null;
try {
+ // trying to load the resource body from cache
resource = this.cacheManager.loadResourceContent(url);
+
+ // if not available try to load resource from web
if ((fetchOnline) && (resource == null)) {
+ // download resource using crawler
plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000);
+
+ // fetching metadata of the resource (e.g. http headers for http resource)
if (entry != null) docInfo = entry.getDocumentInfo();
+
+ // getting the resource body from the cache
resource = this.cacheManager.loadResourceContent(url);
+ } else {
+ // trying to load resource metadata
+ docInfo = this.cacheManager.loadResourceInfo(url);
}
- } catch (IOException e) {
- e.printStackTrace();
+
+ // parsing document
+ if (resource == null) return null;
+ return parseDocument(url, resource, docInfo);
+ } catch (ParserException e) {
+ this.log.logWarning("Unable to parse resource. " + e.getMessage());
+ return null;
+ } catch (Exception e) {
+ this.log.logWarning("Unexpected error while retrieving document. " + e.getMessage(),e);
return null;
}
- if (resource == null) return null;
- return parseDocument(url, resource, docInfo);
+
}
public void storeToCache(String wordhashes, String urlhash, String snippet) {
@@ -374,11 +426,11 @@ public class plasmaSnippetCache {
return map;
}
- public plasmaParserDocument parseDocument(URL url, byte[] resource) {
+ public plasmaParserDocument parseDocument(URL url, byte[] resource) throws ParserException {
return parseDocument(url, resource, null);
}
- public plasmaParserDocument parseDocument(URL url, byte[] resource, IResourceInfo docInfo) {
+ public plasmaParserDocument parseDocument(URL url, byte[] resource, IResourceInfo docInfo) throws ParserException {
try {
if (resource == null) return null;
@@ -425,9 +477,15 @@ public class plasmaSnippetCache {
public byte[] getResource(URL url, boolean fetchOnline, int socketTimeout) {
// load the url as resource from the web
try {
+ // trying to load the resource body from cache
byte[] resource = cacheManager.loadResourceContent(url);
+
+ // if the content is not available in cache try to download it from web
if ((fetchOnline) && (resource == null)) {
+ // try to download the resource using a crawler
loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout);
+
+ // get the content from cache
resource = cacheManager.loadResourceContent(url);
}
return resource;
@@ -436,7 +494,7 @@ public class plasmaSnippetCache {
}
}
- public plasmaHTCache.Entry loadResourceFromWeb(URL url, int socketTimeout) throws IOException {
+ public plasmaHTCache.Entry loadResourceFromWeb(URL url, int socketTimeout) throws plasmaCrawlerException {
plasmaHTCache.Entry result = this.sb.cacheLoader.loadSync(
url,
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index d68ce3a0e..87c86f10e 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -144,6 +144,7 @@ import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroMapTable;
import de.anomic.plasma.dbImport.dbImportManager;
+import de.anomic.plasma.parser.ParserException;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverAbstractSwitch;
import de.anomic.server.serverCodings;
@@ -1392,7 +1393,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
}
- private plasmaParserDocument parseResource(plasmaSwitchboardQueue.Entry entry, String initiatorHash) throws InterruptedException {
+ private plasmaParserDocument parseResource(plasmaSwitchboardQueue.Entry entry, String initiatorHash) throws InterruptedException, ParserException {
plasmaParserDocument document = null;
// the mimetype of this entry
@@ -1402,29 +1403,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// the parser logger
serverLog parserLogger = parser.getLogger();
- // if the document content is supported we can start to parse the content
- if (plasmaParser.supportedContent(
- entry.url(),
- mimeType)
- ){
- if ((entry.cacheFile().exists()) && (entry.cacheFile().length() > 0)) {
- parserLogger.logFine("'" + entry.normalizedURLString() + "' is not parsed yet, parsing now from File");
- document = parser.parseSource(entry.url(), mimeType, charset, entry.cacheFile());
- } else {
- parserLogger.logFine("'" + entry.normalizedURLString() + "' cannot be parsed, no resource available");
- addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT, new bitfield(indexURL.urlFlagLength));
- }
- if (document == null) {
- parserLogger.logSevere("'" + entry.normalizedURLString() + "' parse failure");
- addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_PARSER_ERROR, new bitfield(indexURL.urlFlagLength));
- }
- } else {
- parserLogger.logFine("'" + entry.normalizedURLString() + "'. Unsupported mimeType '" + ((mimeType == null) ? "null" : mimeType) + "'.");
- addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT, new bitfield(indexURL.urlFlagLength));
- }
-
- checkInterruption();
- return document;
+ // parse the document
+ return parseResource(entry.url(), mimeType, charset, entry.cacheFile());
+ }
+
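+ /**
+ * Parses a resource file using the global parser. In contrast to the old
+ * behaviour this method never returns <code>null</code>: if no parser is
+ * available or parsing fails, a {@link ParserException} carrying the
+ * error code for the error-DB is thrown instead.
+ */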
+ public plasmaParserDocument parseResource(URL location, String mimeType, String documentCharset, File sourceFile) throws InterruptedException, ParserException {
+ plasmaParserDocument doc = parser.parseSource(location, mimeType, documentCharset, sourceFile);
+ assert doc != null : "Unexpected error. Parser returned null.";
+ return doc;
}
private void processResourceStack(plasmaSwitchboardQueue.Entry entry) throws InterruptedException {
@@ -1471,8 +1457,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaParserDocument document = null;
parsingStartTime = System.currentTimeMillis();
+ try {
document = this.parseResource(entry, initiatorPeerHash);
if (document == null) return;
+ } catch (ParserException e) {
+ this.log.logInfo("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage());
+ addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorPeerHash, entry.anchorName(), e.getErrorCode(), new bitfield(indexURL.urlFlagLength));
+ return;
+ }
parsingEndTime = System.currentTimeMillis();
@@ -2172,16 +2164,22 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// determine the url string
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.load(urlhash, null);
if (entry == null) return 0;
+
URL url = entry.url();
if (url == null) return 0;
- // get set of words
- // Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
- Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getText());
- // delete all word references
- int count = removeReferences(urlhash, witer);
- // finally delete the url entry itself
- urlPool.loadedURL.remove(urlhash);
- return count;
+
+ try {
+ // get set of words
+ // Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
+ Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getText());
+ // delete all word references
+ int count = removeReferences(urlhash, witer);
+ // finally delete the url entry itself
+ urlPool.loadedURL.remove(urlhash);
+ return count;
+ } catch (ParserException e) {
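+ // the resource could not be parsed, thus no word references can be extracted or removed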
+ return 0;
+ }
}
public int removeReferences(URL url, Set words) {
diff --git a/source/de/anomic/server/serverByteBuffer.java b/source/de/anomic/server/serverByteBuffer.java
index 295583d6a..9030ee036 100644
--- a/source/de/anomic/server/serverByteBuffer.java
+++ b/source/de/anomic/server/serverByteBuffer.java
@@ -188,6 +188,10 @@ public final class serverByteBuffer extends OutputStream {
public serverByteBuffer append(String s) {
return append(s.getBytes());
}
+
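+ /**
+ * Appends the given string to the buffer, encoded using the specified
+ * charset, e.g. <code>buffer.append(text, "UTF-8")</code>.
+ * @throws UnsupportedEncodingException if the named charset is not supported
+ */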
+ public serverByteBuffer append(String s, String charset) throws UnsupportedEncodingException {
+ return append(s.getBytes(charset));
+ }
public serverByteBuffer append(serverByteBuffer bb) {
return append(bb.buffer, bb.offset, bb.length);
diff --git a/source/de/anomic/server/serverFileUtils.java b/source/de/anomic/server/serverFileUtils.java
index a90db5f01..974994b24 100644
--- a/source/de/anomic/server/serverFileUtils.java
+++ b/source/de/anomic/server/serverFileUtils.java
@@ -73,24 +73,39 @@ import de.anomic.kelondro.kelondroRowSet;
public final class serverFileUtils {
+ private static final int DEFAULT_BUFFER_SIZE = 4096;
+
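+ /**
+ * Copies an InputStream to an OutputStream without a length limit.
+ * Convenience wrapper that delegates to {@link #copy(InputStream, OutputStream, long)}
+ * with <code>count = -1</code>.
+ */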
+ public static long copy(InputStream source, OutputStream dest) throws IOException {
+ return copy(source, dest, -1);
+ }
+
/**
* Copies an InputStream to an OutputStream.
- * @param source InputStream
- * @param dest OutputStream
+ * @param source InputStream
+ * @param dest OutputStream
+ * @param count the maximum number of bytes to copy; a value <= 0 copies the whole stream
* @return Total number of bytes copied.
+ *
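+ * <p>Example: <code>long copied = copy(in, out, 1024);</code> copies at most 1024 bytes.
+ *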
* @see copy(InputStream source, File dest)
* @see copyRange(File source, OutputStream dest, int start)
* @see copy(File source, OutputStream dest)
* @see copy(File source, File dest)
*/
- public static int copy(InputStream source, OutputStream dest) throws IOException {
- byte[] buffer = new byte[4096];
+ public static long copy(InputStream source, OutputStream dest, long count) throws IOException {
+ byte[] buffer = new byte[DEFAULT_BUFFER_SIZE];
+ int chunkSize = (int) ((count > 0) ? Math.min(count, DEFAULT_BUFFER_SIZE) : DEFAULT_BUFFER_SIZE);
- int c, total = 0;
- while ((c = source.read(buffer)) > 0) {
+ int c; long total = 0;
+ while ((c = source.read(buffer,0,chunkSize)) > 0) {
dest.write(buffer, 0, c);
dest.flush();
total += c;
+
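+ // shrink the next read so that no more than 'count' bytes are copied in total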
+ if (count > 0) {
+ chunkSize = (int)Math.min(count-total,DEFAULT_BUFFER_SIZE);
+ if (chunkSize == 0) break;
+ }
+
}
dest.flush();
@@ -165,21 +180,26 @@ public final class serverFileUtils {
}
return count;
}
+
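+ /**
+ * Copies an InputStream to a File completely (no length limit); delegates
+ * to {@link #copy(InputStream, File, long)} with <code>count = -1</code>.
+ */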
+ public static void copy(InputStream source, File dest) throws IOException {
+ copy(source,dest,-1);
+ }
/**
* Copies an InputStream to a File.
* @param source InputStream
* @param dest File
+ * @param count the number of bytes to copy; a value <= 0 copies the whole stream
* @see copy(InputStream source, OutputStream dest)
* @see copyRange(File source, OutputStream dest, int start)
* @see copy(File source, OutputStream dest)
* @see copy(File source, File dest)
*/
- public static void copy(InputStream source, File dest) throws IOException {
+ public static void copy(InputStream source, File dest, long count) throws IOException {
FileOutputStream fos = null;
try {
fos = new FileOutputStream(dest);
- copy(source, fos);
+ copy(source, fos, count);
} finally {
if (fos != null) try {fos.close();} catch (Exception e) {}
}
@@ -201,7 +221,7 @@ public final class serverFileUtils {
fis = new FileInputStream(source);
long skipped = fis.skip(start);
if (skipped != start) throw new IllegalStateException("Unable to skip '" + start + "' bytes. Only '" + skipped + "' bytes skipped.");
- copy(fis, dest);
+ copy(fis, dest,-1);
} finally {
if (fis != null) try { fis.close(); } catch (Exception e) {}
}
@@ -220,28 +240,33 @@ public final class serverFileUtils {
InputStream fis = null;
try {
fis = new FileInputStream(source);
- copy(fis, dest);
+ copy(fis, dest, -1);
} finally {
if (fis != null) try { fis.close(); } catch (Exception e) {}
}
}
+ public static void copy(File source, File dest) throws IOException {
+ copy(source,dest,-1);
+ }
+
/**
* Copies a File to a File.
* @param source File
* @param dest File
+ * @param count the amount of bytes to copy
* @see copy(InputStream source, OutputStream dest)
* @see copy(InputStream source, File dest)
* @see copyRange(File source, OutputStream dest, int start)
* @see copy(File source, OutputStream dest)
*/
- public static void copy(File source, File dest) throws IOException {
+ public static void copy(File source, File dest, long count) throws IOException {
FileInputStream fis = null;
FileOutputStream fos = null;
try {
fis = new FileInputStream(source);
fos = new FileOutputStream(dest);
- copy(fis, fos);
+ copy(fis, fos, count);
} finally {
if (fis != null) try {fis.close();} catch (Exception e) {}
if (fos != null) try {fos.close();} catch (Exception e) {}
@@ -250,7 +275,7 @@ public final class serverFileUtils {
public static byte[] read(InputStream source) throws IOException {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
- copy(source, baos);
+ copy(source, baos, -1);
baos.close();
return baos.toByteArray();
}
@@ -309,7 +334,7 @@ public final class serverFileUtils {
}
public static void write(byte[] source, OutputStream dest) throws IOException {
- copy(new ByteArrayInputStream(source), dest);
+ copy(new ByteArrayInputStream(source), dest, -1);
}
public static void write(byte[] source, File dest) throws IOException {