From b6c7b915827edf6b61e8c506e22326312f2762db Mon Sep 17 00:00:00 2001 From: theli Date: Wed, 20 Sep 2006 12:25:07 +0000 Subject: [PATCH] *) Parser now throws a ParserException instead of returning null on parsing errors (e.g. needed by snippet fetcher) *) better logging of parser failures *) simplified usage of plasmaparser through switchboard *) restructuring of crawler - crawler now returns an error message if it is used in sync mode (e.g. by snippet fetcher) *) snippet-fetcher: more verbose error messages *) serverByteBuffer.java: adding new function append(String,encoding) *) serverFileUtils.java: adding functions to copy only a given number of bytes between streams git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2641 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Bookmarks.java | 2 +- htroot/ViewFile.html | 6 +- htroot/ViewFile.java | 295 +++++++++--------- htroot/yacysearch.java | 18 +- .../plasma/crawler/AbstractCrawlWorker.java | 30 +- .../crawler/plasmaCrawlerException.java | 9 + .../anomic/plasma/parser/AbstractParser.java | 38 ++- .../anomic/plasma/parser/ParserException.java | 31 +- .../anomic/plasma/parser/bzip/bzipParser.java | 6 +- .../anomic/plasma/parser/doc/docParser.java | 8 +- .../anomic/plasma/parser/gzip/gzipParser.java | 6 +- .../parser/mimeType/mimeTypeParser.java | 23 +- .../anomic/plasma/parser/odt/odtParser.java | 28 +- .../anomic/plasma/parser/pdf/pdfParser.java | 26 +- .../anomic/plasma/parser/rpm/rpmParser.java | 22 +- .../anomic/plasma/parser/rss/rssParser.java | 13 +- .../anomic/plasma/parser/rtf/rtfParser.java | 6 +- .../anomic/plasma/parser/tar/tarParser.java | 31 +- .../anomic/plasma/parser/vcf/vcfParser.java | 10 +- .../anomic/plasma/parser/zip/zipParser.java | 55 ++-- .../de/anomic/plasma/plasmaCrawlLoader.java | 19 +- .../plasma/plasmaCrawlLoaderMessage.java | 9 + source/de/anomic/plasma/plasmaParser.java | 89 ++++-- .../de/anomic/plasma/plasmaSearchImages.java | 14 +- .../de/anomic/plasma/plasmaSnippetCache.java 
| 88 +++++- .../de/anomic/plasma/plasmaSwitchboard.java | 62 ++-- source/de/anomic/server/serverByteBuffer.java | 4 + source/de/anomic/server/serverFileUtils.java | 53 +++- 28 files changed, 637 insertions(+), 364 deletions(-) create mode 100644 source/de/anomic/plasma/crawler/plasmaCrawlerException.java diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index f25acaa4e..96b7b0d15 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -51,11 +51,11 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; -import de.anomic.net.URL; import de.anomic.data.bookmarksDB; import de.anomic.data.listManager; import de.anomic.data.bookmarksDB.Tag; import de.anomic.http.httpHeader; +import de.anomic.net.URL; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaSwitchboard; diff --git a/htroot/ViewFile.html b/htroot/ViewFile.html index 87830d891..9cccb1fdc 100644 --- a/htroot/ViewFile.html +++ b/htroot/ViewFile.html @@ -53,9 +53,11 @@ Unable to find URL Entry in DB :: Invalid URL :: -Unable to download resource content. +Unable to download resource content.
+#[errorText]# :: -Unable to parse resource content. +Unable to parse resource content.
+#[errorText]# :: Unsupported protocol. #(/error)# diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index d28daa23a..ca76bf10a 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -57,6 +57,8 @@ import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.cache.IResourceInfo; +import de.anomic.plasma.crawler.plasmaCrawlerException; +import de.anomic.plasma.parser.ParserException; import de.anomic.plasma.plasmaCrawlLURL.Entry; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -83,174 +85,185 @@ public class ViewFile { serverObjects prop = new serverObjects(); plasmaSwitchboard sb = (plasmaSwitchboard)env; - - - if (post.containsKey("words")) + if (post != null && post.containsKey("words")) try { prop.put("error_words",URLEncoder.encode((String) post.get("words"), "UTF-8")); } catch (UnsupportedEncodingException e1) { - // TODO Auto-generated catch block - e1.printStackTrace(); + // ignore this. 
this should not occure } - if (post != null) { - // getting the url hash from which the content should be loaded - String urlHash = post.get("urlHash",""); - if (urlHash.equals("")) { - prop.put("error",1); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; - } - String viewMode = post.get("viewMode","sentences"); - - // getting the urlEntry that belongs to the url hash - Entry urlEntry = null; - urlEntry = sb.urlPool.loadedURL.load(urlHash, null); - if (urlEntry == null) { - prop.put("error",2); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; - } + // getting the url hash from which the content should be loaded + String urlHash = post.get("urlHash",""); + if (urlHash.equals("")) { + prop.put("error",1); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } - // gettin the url that belongs to the entry - URL url = urlEntry.url(); - if (url == null) { - prop.put("error",3); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; - } + String viewMode = post.get("viewMode","sentences"); + + // getting the urlEntry that belongs to the url hash + Entry urlEntry = null; + urlEntry = sb.urlPool.loadedURL.load(urlHash, null); + if (urlEntry == null) { + prop.put("error",2); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } + + // gettin the url that belongs to the entry + URL url = urlEntry.url(); + if (url == null) { + prop.put("error",3); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } + + // loading the resource content as byte array + byte[] resource = null; + IResourceInfo resInfo = null; + String resMime = null; + try { + // trying to load the resource body + resource = sb.cacheManager.loadResourceContent(url); + + // if the resource body was not cached we try to load it from web + if (resource == null) { + plasmaHTCache.Entry entry = null; + try { + entry = sb.snippetCache.loadResourceFromWeb(url, 5000); + } catch (plasmaCrawlerException e) { + prop.put("error",4); + prop.put("error_errorText",e.getMessage()); + 
prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } - // loading the resource content as byte array - byte[] resource = null; - IResourceInfo resInfo = null; - String resMime = null; - try { - // trying to load the resource body - resource = sb.cacheManager.loadResourceContent(url); + if (entry != null) { + resInfo = entry.getDocumentInfo(); + resource = sb.cacheManager.loadResourceContent(url); + } - // if the resource body was not cached we try to load it from web if (resource == null) { - plasmaHTCache.Entry entry = sb.snippetCache.loadResourceFromWeb(url, 5000); + prop.put("error",4); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } + } - if (entry != null) { - resInfo = entry.getDocumentInfo(); - resource = sb.cacheManager.loadResourceContent(url); + // try to load resource metadata + if (resInfo == null) { + + // try to load the metadata from cache + try { + resInfo = sb.cacheManager.loadResourceInfo(urlEntry.url()); + } catch (Exception e) { /* ignore this */} + + // if the metadata where not cached try to load it from web + if (resInfo == null) { + String protocol = url.getProtocol(); + if (!((protocol.equals("http") || protocol.equals("https")))) { + prop.put("error",6); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; } - if (resource == null) { + httpHeader responseHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig); + if (responseHeader == null) { prop.put("error",4); prop.put("viewMode",VIEW_MODE_NO_TEXT); return prop; } + resMime = responseHeader.mime(); } + } else { + resMime = resInfo.getMimeType(); + } + } catch (IOException e) { + prop.put("error",4); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } + if (viewMode.equals("plain")) { + String content = new String(resource); + content = content.replaceAll("<","<") + .replaceAll(">",">") + .replaceAll("\"",""") + .replaceAll("\n","
") + .replaceAll("\t","    "); - // try to load resource metadata - if (resInfo == null) { - - // try to load the metadata from cache - try { - resInfo = sb.cacheManager.loadResourceInfo(urlEntry.url()); - } catch (Exception e) { /* ignore this */} - - // if the metadata where not cached try to load it from web - if (resInfo == null) { - String protocol = url.getProtocol(); - if (!((protocol.equals("http") || protocol.equals("https")))) { - prop.put("error",6); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; - } - - httpHeader responseHeader = httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig); - if (responseHeader == null) { - prop.put("error",4); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; - } - resMime = responseHeader.mime(); - } - } else { - resMime = resInfo.getMimeType(); - } - } catch (IOException e) { - if (url == null) { - prop.put("error",4); - prop.put("viewMode",VIEW_MODE_NO_TEXT); - return prop; - } - } - if (viewMode.equals("plain")) { - String content = new String(resource); - content = content.replaceAll("<","<") - .replaceAll(">",">") - .replaceAll("\"",""") - .replaceAll("\n","
") - .replaceAll("\t","    "); - - prop.put("error",0); - prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT); - prop.put("viewMode_plainText",content); - } else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("iframe")) { - // parsing the resource content - plasmaParserDocument document = sb.snippetCache.parseDocument(url, resource,resInfo); + prop.put("error",0); + prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT); + prop.put("viewMode_plainText",content); + } else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("iframe")) { + // parsing the resource content + plasmaParserDocument document = null; + try { + document = sb.snippetCache.parseDocument(url, resource,resInfo); if (document == null) { prop.put("error",5); + prop.put("error_errorText","Unknown error"); prop.put("viewMode",VIEW_MODE_NO_TEXT); return prop; } - resMime = document.getMimeType(); - - if (viewMode.equals("parsed")) { - String content = new String(document.getText()); - content = wikiCode.replaceHTML(content); //added by Marc Nause - content = content.replaceAll("\n","
") - .replaceAll("\t","    "); - - prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT); - prop.put("viewMode_parsedText",content); - } else if (viewMode.equals("iframe")) { - prop.put("viewMode",VIEW_MODE_AS_IFRAME); - prop.put("viewMode_url",url.toString()); - } else { - prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES); - String[] sentences = document.getSentences(); - - boolean dark = true; - for (int i=0; i < sentences.length; i++) { - String currentSentence = wikiCode.replaceHTML(sentences[i]); - - // Search word highlighting - String words = post.get("words",null); - if (words != null) { - try { - words = URLDecoder.decode(words,"UTF-8"); - } catch (UnsupportedEncodingException e) {} - - String[] wordArray = words.substring(1,words.length()-1).split(","); - for (int j=0; j < wordArray.length; j++) { - String currentWord = wordArray[j].trim(); - currentSentence = currentSentence.replaceAll(currentWord, - "" + currentWord + ""); - } - } + } catch (ParserException e) { + prop.put("error",5); + prop.put("error_errorText",e.getMessage()); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } + resMime = document.getMimeType(); - prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1)); - prop.put("viewMode_sentences_" + i + "_text",currentSentence); - prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark; + if (viewMode.equals("parsed")) { + String content = new String(document.getText()); + content = wikiCode.replaceHTML(content); //added by Marc Nause + content = content.replaceAll("\n","
") + .replaceAll("\t","    "); + + prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT); + prop.put("viewMode_parsedText",content); + } else if (viewMode.equals("iframe")) { + prop.put("viewMode",VIEW_MODE_AS_IFRAME); + prop.put("viewMode_url",url.toString()); + } else { + prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES); + String[] sentences = document.getSentences(); + + boolean dark = true; + for (int i=0; i < sentences.length; i++) { + String currentSentence = wikiCode.replaceHTML(sentences[i]); + + // Search word highlighting + String words = post.get("words",null); + if (words != null) { + try { + words = URLDecoder.decode(words,"UTF-8"); + } catch (UnsupportedEncodingException e) {} + + String[] wordArray = words.substring(1,words.length()-1).split(","); + for (int j=0; j < wordArray.length; j++) { + String currentWord = wordArray[j].trim(); + currentSentence = currentSentence.replaceAll(currentWord, + "" + currentWord + ""); + } } - prop.put("viewMode_sentences",sentences.length); - } - } - prop.put("error",0); - prop.put("error_url",url.toString()); - prop.put("error_hash",urlHash); - prop.put("error_wordCount",Integer.toString(urlEntry.wordCount())); - prop.put("error_desc",urlEntry.descr()); - prop.put("error_size",urlEntry.size()); - prop.put("error_mimeType",resMime); - } + prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1)); + prop.put("viewMode_sentences_" + i + "_text",currentSentence); + prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 
1 : 0) ); dark=!dark; + } + prop.put("viewMode_sentences",sentences.length); + + } + } + prop.put("error",0); + prop.put("error_url",url.toString()); + prop.put("error_hash",urlHash); + prop.put("error_wordCount",Integer.toString(urlEntry.wordCount())); + prop.put("error_desc",urlEntry.descr()); + prop.put("error_size",urlEntry.size()); + prop.put("error_mimeType",resMime); return prop; } diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 3d56f9eb4..72adb9831 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -56,6 +56,7 @@ import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.http.httpHeader; import de.anomic.kelondro.kelondroMSetTools; import de.anomic.kelondro.kelondroNaturalOrder; +import de.anomic.net.URL; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaSearchImages; @@ -64,7 +65,6 @@ import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSearchTimingProfile; import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.net.URL; import de.anomic.server.serverCore; import de.anomic.server.serverDate; import de.anomic.server.serverObjects; @@ -192,13 +192,15 @@ public class yacysearch { plasmaCrawlLURL.Entry urlentry = sb.urlPool.loadedURL.load(recommendHash, null); if (urlentry != null) { plasmaParserDocument document = sb.snippetCache.retrieveDocument(urlentry.url(), true); - // create a news message - HashMap map = new HashMap(); - map.put("url", urlentry.url().toNormalform().replace(',', '|')); - map.put("title", urlentry.descr().replace(',', ' ')); - map.put("description", ((document == null) ? urlentry.descr() : document.getMainLongTitle()).replace(',', ' ')); - map.put("tags", ((document == null) ? 
"" : document.getKeywords(' '))); - yacyCore.newsPool.publishMyNews(new yacyNewsRecord("stippadd", map)); + if (document != null) { + // create a news message + HashMap map = new HashMap(); + map.put("url", urlentry.url().toNormalform().replace(',', '|')); + map.put("title", urlentry.descr().replace(',', ' ')); + map.put("description", ((document == null) ? urlentry.descr() : document.getMainLongTitle()).replace(',', ' ')); + map.put("tags", ((document == null) ? "" : document.getKeywords(' '))); + yacyCore.newsPool.publishMyNews(new yacyNewsRecord("stippadd", map)); + } } } diff --git a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java index 7889df481..2df4f4d4b 100644 --- a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java +++ b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java @@ -93,6 +93,8 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW protected plasmaCrawlProfile.entry profile; protected boolean acceptAllContent; + protected String errorMessage; + /** * The crawler thread pool */ @@ -186,6 +188,8 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW } public void execute() { + + plasmaHTCache.Entry loadedResource = null; try { // setting threadname this.setName(plasmaCrawlWorker.threadBaseName + "_" + this.url); @@ -194,15 +198,23 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW init(); // loading resource - plasmaHTCache.Entry resource = load(); + loadedResource = load(); + } catch (IOException e) { + //throw e; + } finally { + // setting the error message (if available) + if (this.errorMessage != null) { + this.theMsg.setError(this.errorMessage); + } // store a reference to the result in the message object // this is e.g. 
needed by the snippet fetcher - this.theMsg.setResult(resource); - - } catch (IOException e) { - //throw e; - } finally { + // + // Note: this is always called, even on empty results. + // Otherwise the caller will block forever + this.theMsg.setResult(loadedResource); + + // signal that this worker thread has finished the job this.done = true; } } @@ -256,9 +268,13 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW this.startdate = 0; this.profile = null; this.acceptAllContent = false; + this.errorMessage = null; } - protected void addURLtoErrorDB(String failreason) { + protected void addURLtoErrorDB(String failreason) { + // remember error message + this.errorMessage = failreason; + // convert the referrer URL into a hash value String referrerHash = (this.refererURLString==null)?null:indexURL.urlHash(this.refererURLString); diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlerException.java b/source/de/anomic/plasma/crawler/plasmaCrawlerException.java new file mode 100644 index 000000000..165dd4e78 --- /dev/null +++ b/source/de/anomic/plasma/crawler/plasmaCrawlerException.java @@ -0,0 +1,9 @@ +package de.anomic.plasma.crawler; + +import java.io.IOException; + +public class plasmaCrawlerException extends IOException { + public plasmaCrawlerException(String errorMsg) { + super(errorMsg); + } +} diff --git a/source/de/anomic/plasma/parser/AbstractParser.java b/source/de/anomic/plasma/parser/AbstractParser.java index c69c60496..9507e5ca5 100644 --- a/source/de/anomic/plasma/parser/AbstractParser.java +++ b/source/de/anomic/plasma/parser/AbstractParser.java @@ -49,6 +49,7 @@ import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; +import java.io.IOException; import java.io.InputStream; import de.anomic.net.URL; @@ -93,12 +94,35 @@ public abstract class AbstractParser implements Parser{ this.libxDependencies = libxDependencies; } + /** + * Check if the parser 
was interrupted. + * @throws InterruptedException if the parser was interrupted + */ public static final void checkInterruption() throws InterruptedException { Thread currentThread = Thread.currentThread(); if ((currentThread instanceof serverThread) && ((serverThread)currentThread).shutdownInProgress()) throw new InterruptedException("Shutdown in progress ..."); if (currentThread.isInterrupted()) throw new InterruptedException("Shutdown in progress ..."); } + public final File createTempFile(String name) throws IOException { + String parserClassName = this.getClass().getName(); + int idx = parserClassName.lastIndexOf("."); + if (idx != -1) { + parserClassName = parserClassName.substring(idx+1); + } + + // getting the file extension + idx = name.lastIndexOf("/"); + String fileName = (idx != -1) ? name.substring(idx+1) : name; + + idx = fileName.lastIndexOf("."); + String fileExt = (idx > -1) ? fileName.substring(idx+1) : ""; + + // creates the temp file + File tempFile = File.createTempFile(parserClassName + "_" + ((idx>-1)?fileName.substring(0,idx):fileName), (fileExt.length()>0)?"."+fileExt:fileExt); + return tempFile; + } + /** * Parsing a document available as byte array. 
* @param location the origin of the document @@ -119,14 +143,17 @@ public abstract class AbstractParser implements Parser{ ) throws ParserException, InterruptedException { ByteArrayInputStream contentInputStream = null; try { + // convert the byte array into a stream contentInputStream = new ByteArrayInputStream(source); + + // parse the stream return this.parse(location,mimeType,charset,contentInputStream); } finally { if (contentInputStream != null) { try { contentInputStream.close(); contentInputStream = null; - } catch (Exception e){} + } catch (Exception e){ /* ignore this */} } } } @@ -151,12 +178,15 @@ public abstract class AbstractParser implements Parser{ ) throws ParserException, InterruptedException { BufferedInputStream contentInputStream = null; try { + // create a stream from the file contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile)); + + // parse the stream return this.parse(location, mimeType, charset, contentInputStream); } catch (FileNotFoundException e) { - throw new ParserException(e.getMessage()); + throw new ParserException("Unexpected error while parsing file. 
" + e.getMessage(),location); } finally { - if (contentInputStream != null) try{contentInputStream.close();}catch(Exception e){} + if (contentInputStream != null) try{contentInputStream.close();}catch(Exception e){/* ignore this */} } } @@ -201,6 +231,6 @@ public abstract class AbstractParser implements Parser{ * Return the name of the parser */ public String getName() { - return parserName; + return this.parserName; } } diff --git a/source/de/anomic/plasma/parser/ParserException.java b/source/de/anomic/plasma/parser/ParserException.java index cdb730ec6..c05d9a484 100644 --- a/source/de/anomic/plasma/parser/ParserException.java +++ b/source/de/anomic/plasma/parser/ParserException.java @@ -44,24 +44,45 @@ package de.anomic.plasma.parser; +import de.anomic.net.URL; +import de.anomic.plasma.plasmaCrawlEURL; + public class ParserException extends Exception { - + private String errorCode = null; + private URL url = null; + private static final long serialVersionUID = 1L; public ParserException() { super(); } - public ParserException(String message) { + public ParserException(String message, URL url) { + this(message,url,plasmaCrawlEURL.DENIED_PARSER_ERROR); + } + + public ParserException(String message, URL url, String errorCode) { super(message); + this.errorCode = errorCode; + this.url = url; } - public ParserException(String message, Throwable cause) { + public ParserException(String message, URL url, Throwable cause) { + this(message,url,cause,plasmaCrawlEURL.DENIED_PARSER_ERROR); + } + + public ParserException(String message, URL url, Throwable cause, String errorCode) { super(message, cause); + this.errorCode = errorCode; + this.url = url; } - public ParserException(Throwable cause) { - super(cause); + public String getErrorCode() { + return this.errorCode; + } + + public URL getURL() { + return this.url; } } diff --git a/source/de/anomic/plasma/parser/bzip/bzipParser.java b/source/de/anomic/plasma/parser/bzip/bzipParser.java index 8b2020c81..53aa52e40 100644 --- 
a/source/de/anomic/plasma/parser/bzip/bzipParser.java +++ b/source/de/anomic/plasma/parser/bzip/bzipParser.java @@ -80,7 +80,7 @@ public class bzipParser extends AbstractParser implements Parser { public bzipParser() { super(LIBX_DEPENDENCIES); - parserName = "Bzip 2 UNIX Compressed File Parser"; + this.parserName = "Bzip 2 UNIX Compressed File Parser"; } public Hashtable getSupportedMimeTypes() { @@ -129,7 +129,9 @@ public class bzipParser extends AbstractParser implements Parser { return theParser.parseSource(location,null,null,tempFile); } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; - throw new ParserException("Unable to parse the gzip content. " + e.getMessage()); + if (e instanceof ParserException) throw (ParserException) e; + + throw new ParserException("Unexpected error while parsing bzip file. " + e.getMessage(),location); } finally { if (tempFile != null) tempFile.delete(); } diff --git a/source/de/anomic/plasma/parser/doc/docParser.java b/source/de/anomic/plasma/parser/doc/docParser.java index 46aa1196a..2a89dbfee 100644 --- a/source/de/anomic/plasma/parser/doc/docParser.java +++ b/source/de/anomic/plasma/parser/doc/docParser.java @@ -75,7 +75,7 @@ implements Parser { public docParser() { super(LIBX_DEPENDENCIES); - parserName = "Word Document Parser"; + this.parserName = "Word Document Parser"; } public plasmaParserDocument parse(URL location, String mimeType, String charset, @@ -99,14 +99,16 @@ implements Parser { null, null, null, - contents.getBytes(), + contents.getBytes("UTF-8"), null, null); return theDoc; } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; - throw new ParserException("Unable to parse the doc content. " + e.getMessage()); + if (e instanceof ParserException) throw (ParserException) e; + + throw new ParserException("Unexpected error while parsing doc file. 
" + e.getMessage(),location); } } diff --git a/source/de/anomic/plasma/parser/gzip/gzipParser.java b/source/de/anomic/plasma/parser/gzip/gzipParser.java index abc58e26e..389795372 100644 --- a/source/de/anomic/plasma/parser/gzip/gzipParser.java +++ b/source/de/anomic/plasma/parser/gzip/gzipParser.java @@ -76,7 +76,7 @@ public class gzipParser extends AbstractParser implements Parser { public gzipParser() { super(LIBX_DEPENDENCIES); - parserName = "GNU Zip Compressed Archive Parser"; + this.parserName = "GNU Zip Compressed Archive Parser"; } public Hashtable getSupportedMimeTypes() { @@ -113,7 +113,9 @@ public class gzipParser extends AbstractParser implements Parser { return theParser.parseSource(location,null,null,tempFile); } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; - throw new ParserException("Unable to parse the gzip content. " + e.getMessage()); + if (e instanceof ParserException) throw (ParserException) e; + + throw new ParserException("Unexpected error while parsing gzip file. 
" + e.getMessage(),location); } finally { if (tempFile != null) tempFile.delete(); } diff --git a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java index f2b86124f..6d5eabc33 100644 --- a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java +++ b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java @@ -44,6 +44,7 @@ package de.anomic.plasma.parser.mimeType; import java.io.File; +import java.io.IOException; import java.io.InputStream; import de.anomic.net.URL; import java.util.Collection; @@ -99,7 +100,7 @@ implements Parser { public mimeTypeParser() { super(LIBX_DEPENDENCIES); - parserName = "MimeType Parser"; + this.parserName = "MimeType Parser"; } public String getMimeType (File sourceFile) { @@ -142,8 +143,8 @@ implements Parser { threadLoopDetection.put(Thread.currentThread(),new Integer(loopDepth.intValue()+1)); // deactivating the logging for jMimeMagic - Logger theLogger = Logger.getLogger("net.sf.jmimemagic"); - theLogger.setLevel(Level.OFF); + Logger jmimeMagicLogger = Logger.getLogger("net.sf.jmimemagic"); + jmimeMagicLogger.setLevel(Level.OFF); Magic theMagic = new Magic(); MagicMatch match = theMagic.getMagicMatch(sourceFile); @@ -160,8 +161,8 @@ implements Parser { } // to avoid loops we have to test if the mimetype has changed ... 
- if (this.getSupportedMimeTypes().containsKey(mimeType)) return null; - if (orgMimeType.equals(mimeType)) return null; + if (this.getSupportedMimeTypes().containsKey(mimeType)) throw new ParserException("Unable to detect mimetype of resource.",location); + if (orgMimeType.equals(mimeType)) throw new ParserException("Unable to detect mimetype of resource.",location); // check for interruption checkInterruption(); @@ -170,11 +171,13 @@ implements Parser { plasmaParser theParser = new plasmaParser(); return theParser.parseSource(location,mimeType,charset,sourceFile); } - return null; + throw new ParserException("Unable to detect mimetype of resource.",location); } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; - return null; + if (e instanceof ParserException) throw (ParserException) e; + + throw new ParserException("Unexpected error while detect mimetype of resource. " + e.getMessage(),location); } finally { Integer loopDepth = (Integer) threadLoopDetection.get(Thread.currentThread()); if (loopDepth.intValue() <= 1) { @@ -186,14 +189,14 @@ implements Parser { } public plasmaParserDocument parse(URL location, String mimeType,String charset, - InputStream source) throws ParserException { + InputStream source) throws ParserException, InterruptedException { File dstFile = null; try { dstFile = File.createTempFile("mimeTypeParser",".tmp"); serverFileUtils.copy(source,dstFile); return parse(location,mimeType,charset,dstFile); - } catch (Exception e) { - return null; + } catch (IOException e) { + throw new ParserException("Unexpected error while detect mimetype of resource. 
" + e.getMessage(),location); } finally { if (dstFile != null) {dstFile.delete();} } diff --git a/source/de/anomic/plasma/parser/odt/odtParser.java b/source/de/anomic/plasma/parser/odt/odtParser.java index f8a9a10be..a2b1b8cbd 100644 --- a/source/de/anomic/plasma/parser/odt/odtParser.java +++ b/source/de/anomic/plasma/parser/odt/odtParser.java @@ -84,7 +84,7 @@ public class odtParser extends AbstractParser implements Parser { public odtParser() { super(LIBX_DEPENDENCIES); - parserName = "OASIS OpenDocument V2 Text Document Parser"; + this.parserName = "OASIS OpenDocument V2 Text Document Parser"; } public Hashtable getSupportedMimeTypes() { @@ -96,7 +96,7 @@ public class odtParser extends AbstractParser implements Parser { try { byte[] docContent = null; String docDescription = null; - String docKeywords = null; + String docKeywordStr = null; String docShortTitle = null; String docLongTitle = null; @@ -125,7 +125,7 @@ public class odtParser extends AbstractParser implements Parser { ODFMetaFileAnalyzer metaAnalyzer = new ODFMetaFileAnalyzer(); OpenDocumentMetadata metaData = metaAnalyzer.analyzeMetaData(zipFileEntryStream); docDescription = metaData.getDescription(); - docKeywords = metaData.getKeyword(); + docKeywordStr = metaData.getKeyword(); docShortTitle = metaData.getTitle(); docLongTitle = metaData.getSubject(); @@ -149,11 +149,16 @@ public class odtParser extends AbstractParser implements Parser { } } + // split the keywords + String[] docKeywords = null; + if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,"); + + // create the parser document return new plasmaParserDocument( location, mimeType, "UTF-8", - docKeywords.split(" |,"), + docKeywords, docShortTitle, docLongTitle, null, @@ -163,13 +168,13 @@ public class odtParser extends AbstractParser implements Parser { null); } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; - throw new ParserException("Unable to parse the odt content. 
" + e.getMessage()); - } catch (Error e) { - throw new ParserException("Unable to parse the odt content. " + e.getMessage()); + if (e instanceof ParserException) throw (ParserException) e; + + throw new ParserException("Unexpected error while parsing odt file. " + e.getMessage(),location); } } - public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException { + public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { File dest = null; try { // creating a tempfile @@ -182,9 +187,12 @@ public class odtParser extends AbstractParser implements Parser { // parsing the content return parse(location, mimeType, charset, dest); } catch (Exception e) { - throw new ParserException("Unable to parse the odt document. " + e.getMessage()); + if (e instanceof InterruptedException) throw (InterruptedException) e; + if (e instanceof ParserException) throw (ParserException) e; + + throw new ParserException("Unexpected error while parsing odt file. 
" + e.getMessage(),location); } finally { - if (dest != null) try { dest.delete(); } catch (Exception e){} + if (dest != null) try { dest.delete(); } catch (Exception e){/* ignore this */} } } diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java index 298a87f41..1b67fceb4 100644 --- a/source/de/anomic/plasma/parser/pdf/pdfParser.java +++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java @@ -78,7 +78,7 @@ public class pdfParser extends AbstractParser implements Parser { public pdfParser() { super(LIBX_DEPENDENCIES); - parserName = "Acrobat Portable Document Parser"; + this.parserName = "Acrobat Portable Document Parser"; } public Hashtable getSupportedMimeTypes() { @@ -98,7 +98,7 @@ public class pdfParser extends AbstractParser implements Parser { // Logger theLogger = Logger.getLogger("org.pdfbox"); // theLogger.setLevel(Level.INFO); - String docTitle = null, docSubject = null, /*docAuthor = null,*/ docKeyWords = null; + String docTitle = null, docSubject = null, /*docAuthor = null,*/ docKeywordStr = null; // check for interruption checkInterruption(); @@ -120,7 +120,7 @@ public class pdfParser extends AbstractParser implements Parser { docTitle = theDocInfo.getTitle(); docSubject = theDocInfo.getSubject(); //docAuthor = theDocInfo.getAuthor(); - docKeyWords = theDocInfo.getKeywords(); + docKeywordStr = theDocInfo.getKeywords(); } serverByteBuffer out = new serverByteBuffer(); @@ -142,18 +142,14 @@ public class pdfParser extends AbstractParser implements Parser { replaceAll("\t"," "); } - /* - * public document(URL location, String mimeType, - String keywords, String shortTitle, String longTitle, - String[] sections, String abstrct, - byte[] text, Map anchors, Map images) { - * - */ + String[] docKeywords = null; + if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,"); + plasmaParserDocument theDoc = new plasmaParserDocument( location, mimeType, "UTF-8", - docKeyWords.split(" |,"), + 
docKeywords, docSubject, docTitle, null, @@ -166,10 +162,12 @@ public class pdfParser extends AbstractParser implements Parser { } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; - throw new ParserException("Unable to parse the pdf content. " + e.getMessage(),e); + if (e instanceof ParserException) throw (ParserException) e; + + throw new ParserException("Unexpected error while parsing pdf file. " + e.getMessage(),location); } finally { - if (theDocument != null) try { theDocument.close(); } catch (Exception e) {} - if (writer != null) try { writer.close(); } catch (Exception e) {} + if (theDocument != null) try { theDocument.close(); } catch (Exception e) {/* ignore this */} + if (writer != null) try { writer.close(); } catch (Exception e) {/* ignore this */} Thread.currentThread().setPriority(Thread.NORM_PRIORITY); } } diff --git a/source/de/anomic/plasma/parser/rpm/rpmParser.java b/source/de/anomic/plasma/parser/rpm/rpmParser.java index 7e117f4f5..eef4ca2fb 100644 --- a/source/de/anomic/plasma/parser/rpm/rpmParser.java +++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java @@ -84,7 +84,7 @@ public class rpmParser extends AbstractParser implements Parser { public rpmParser() { super(LIBX_DEPENDENCIES); - parserName = "rpm Parser"; + this.parserName = "rpm Parser"; } public Hashtable getSupportedMimeTypes() { @@ -126,12 +126,12 @@ public class rpmParser extends AbstractParser implements Parser { // getting the next tag DataTypeIf tag = rpmFile.getTag(headerNames[i]); - if (tag != null) { - content.append(headerNames[i]) - .append(": ") - .append(tag.toString()) - .append("\n"); - } + if (tag == null) continue; + + content.append(headerNames[i]) + .append(": ") + .append(tag.toString()) + .append("\n"); if (headerNames[i].equals("N")) name = tag.toString(); else if (headerNames[i].equals("SUMMARY")) summary = tag.toString(); @@ -153,16 +153,18 @@ public class rpmParser extends AbstractParser implements Parser { summary, 
null, description, - content.toString().getBytes(), + content.toString().getBytes("UTF-8"), anchors, null); return theDoc; } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; - throw new ParserException("Unable to parse the rpm file. " + e.getMessage()); + if (e instanceof ParserException) throw (ParserException) e; + + throw new ParserException("Unexpected error while parsing rpm file. " + e.getMessage(),location); } finally { - if (rpmFile != null) try { rpmFile.close(); } catch (Exception e) {} + if (rpmFile != null) try { rpmFile.close(); } catch (Exception e) {/* ignore this */} } } diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java index 299e3f865..41cf8573b 100644 --- a/source/de/anomic/plasma/parser/rss/rssParser.java +++ b/source/de/anomic/plasma/parser/rss/rssParser.java @@ -98,7 +98,7 @@ public class rssParser extends AbstractParser implements Parser { public rssParser() { super(LIBX_DEPENDENCIES); - parserName = "Rich Site Summary/Atom Feed Parser"; + this.parserName = "Rich Site Summary/Atom Feed Parser"; } public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { @@ -149,7 +149,7 @@ public class rssParser extends AbstractParser implements Parser { anchors.put(itemURL.toString(),itemTitle); if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append((byte) 32); - text.append(new serverCharBuffer(htmlFilterAbstractScraper.stripAll(new serverCharBuffer(itemDescr.toCharArray()))).trim()).append(' '); // TODO: this does not work for utf-8 + text.append(new serverCharBuffer(htmlFilterAbstractScraper.stripAll(new serverCharBuffer(itemDescr.toCharArray()))).trim().toString()).append(' '); String itemContent = item.getElementValue("content"); if ((itemContent != null) && (itemContent.length() > 0)) { @@ -183,11 +183,6 @@ public class rssParser 
extends AbstractParser implements Parser { } } - /* (URL location, String mimeType, - String keywords, String shortTitle, String longTitle, - String[] sections, String abstrct, - byte[] text, Map anchors, Map images) - */ plasmaParserDocument theDoc = new plasmaParserDocument( location, mimeType, @@ -205,7 +200,9 @@ public class rssParser extends AbstractParser implements Parser { } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; - throw new ParserException("Unable to parse the rss file. " + e.getMessage()); + if (e instanceof ParserException) throw (ParserException) e; + + throw new ParserException("Unexpected error while parsing rss file." + e.getMessage(),location); } } diff --git a/source/de/anomic/plasma/parser/rtf/rtfParser.java b/source/de/anomic/plasma/parser/rtf/rtfParser.java index fdef82b99..4fa5d3028 100644 --- a/source/de/anomic/plasma/parser/rtf/rtfParser.java +++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java @@ -77,7 +77,7 @@ implements Parser { public rtfParser() { super(LIBX_DEPENDENCIES); - parserName = "Rich Text Format Parser"; + this.parserName = "Rich Text Format Parser"; } public plasmaParserDocument parse(URL location, String mimeType, String charset, @@ -113,7 +113,9 @@ implements Parser { } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; - throw new ParserException("Unable to parse the rdf content. " + e.getMessage()); + if (e instanceof ParserException) throw (ParserException) e; + + throw new ParserException("Unexpected error while parsing rtf resource." 
+ e.getMessage(),location); } } diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java index ba30acc91..c70c4e26c 100644 --- a/source/de/anomic/plasma/parser/tar/tarParser.java +++ b/source/de/anomic/plasma/parser/tar/tarParser.java @@ -87,7 +87,7 @@ public class tarParser extends AbstractParser implements Parser { public tarParser() { super(LIBX_DEPENDENCIES); - parserName = "Tape Archive File Parser"; + this.parserName = "Tape Archive File Parser"; } public Hashtable getSupportedMimeTypes() { @@ -128,12 +128,11 @@ public class tarParser extends AbstractParser implements Parser { // skip directories if (entry.isDirectory()) continue; - // Get the entry name - int idx = -1; + // Get the short entry name String entryName = entry.getName(); - idx = entryName.lastIndexOf("/"); - if (idx != -1) entryName = entryName.substring(idx+1); - idx = entryName.lastIndexOf("."); + + // getting the entry file extension + int idx = entryName.lastIndexOf("."); String entryExt = (idx > -1) ? 
entryName.substring(idx+1) : ""; // trying to determine the mimeType per file extension @@ -143,19 +142,21 @@ public class tarParser extends AbstractParser implements Parser { plasmaParserDocument theDoc = null; File tempFile = null; try { - byte[] buf = new byte[(int) entry.getSize()]; - /*int bytesRead =*/ tin.read(buf); - - tempFile = File.createTempFile("tarParser_" + ((idx>-1)?entryName.substring(0,idx):entryName), (entryExt.length()>0)?"."+entryExt:entryExt); - serverFileUtils.write(buf, tempFile); + // create the temp file + tempFile = createTempFile(entryName); + + // copy the data into the file + serverFileUtils.copy(tin,tempFile,entry.getSize()); // check for interruption checkInterruption(); // parsing the content - theDoc = theParser.parseSource(new URL(tempFile),entryMime,null,tempFile); + theDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null,tempFile); + } catch (ParserException e) { + this.theLogger.logInfo("Unable to parse tar file entry '" + entryName + "'. " + e.getErrorCode()); } finally { - if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){} + if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){/* ignore this */} } if (theDoc == null) continue; @@ -200,7 +201,9 @@ public class tarParser extends AbstractParser implements Parser { docImages); } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; - throw new ParserException("Unable to parse the zip content. " + e.getMessage()); + if (e instanceof ParserException) throw (ParserException) e; + + throw new ParserException("Unexpected error while parsing tar resource. 
" + e.getMessage(),location); } } diff --git a/source/de/anomic/plasma/parser/vcf/vcfParser.java b/source/de/anomic/plasma/parser/vcf/vcfParser.java index 1dc963e95..f92835236 100644 --- a/source/de/anomic/plasma/parser/vcf/vcfParser.java +++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java @@ -215,7 +215,7 @@ public class vcfParser extends AbstractParser implements Parser { URL newURL = new URL(value); anchors.put(newURL.toString(),newURL.toString()); //parsedData.put(key,value); - } catch (MalformedURLException ex) {} + } catch (MalformedURLException ex) {/* ignore this */} } else if ( !key.equalsIgnoreCase("BEGIN") && !key.equalsIgnoreCase("END") && @@ -255,12 +255,10 @@ public class vcfParser extends AbstractParser implements Parser { return theDoc; } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; + if (e instanceof ParserException) throw (ParserException) e; - String errorMsg = "Unable to parse the vcard content. " + e.getMessage(); - this.theLogger.logSevere(errorMsg); - throw new ParserException(errorMsg); - } finally { - } + throw new ParserException("Unexpected error while parsing vcf resource. 
" + e.getMessage(),location); + } } public void reset() { diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java index c6d07a66e..7b55085d8 100644 --- a/source/de/anomic/plasma/parser/zip/zipParser.java +++ b/source/de/anomic/plasma/parser/zip/zipParser.java @@ -43,9 +43,8 @@ package de.anomic.plasma.parser.zip; -import java.io.ByteArrayOutputStream; +import java.io.File; import java.io.InputStream; -import de.anomic.net.URL; import java.util.Arrays; import java.util.HashMap; import java.util.Hashtable; @@ -55,12 +54,14 @@ import java.util.TreeSet; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; +import de.anomic.net.URL; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.parser.AbstractParser; import de.anomic.plasma.parser.Parser; import de.anomic.plasma.parser.ParserException; import de.anomic.server.serverByteBuffer; +import de.anomic.server.serverFileUtils; public class zipParser extends AbstractParser implements Parser { @@ -84,7 +85,7 @@ public class zipParser extends AbstractParser implements Parser { public zipParser() { super(LIBX_DEPENDENCIES); - parserName = "Compressed Archive File Parser"; + this.parserName = "Compressed Archive File Parser"; } public Hashtable getSupportedMimeTypes() { @@ -110,29 +111,39 @@ public class zipParser extends AbstractParser implements Parser { ZipEntry entry; ZipInputStream zippedContent = new ZipInputStream(source); while ((entry = zippedContent.getNextEntry()) !=null) { + // check for interruption + checkInterruption(); + // skip directories if (entry.isDirectory()) continue; // Get the entry name String entryName = entry.getName(); int idx = entryName.lastIndexOf("."); - String entryExt = (idx > -1) ? 
entryName.substring(idx+1) : null; - - // trying to determine the mimeType per file extension - String entryMime = plasmaParser.getMimeTypeByFileExt(entryExt); - // getting the entry content - ByteArrayOutputStream bos = new ByteArrayOutputStream(); - byte[] buf = new byte[(int) entry.getSize()]; - /*int bytesRead =*/ zippedContent.read(buf); - bos.write(buf); - byte[] ut = bos.toByteArray(); + // getting the file extension + String entryExt = (idx > -1) ? entryName.substring(idx+1) : ""; - // check for interruption - checkInterruption(); + // trying to determine the mimeType per file extension + String entryMime = plasmaParser.getMimeTypeByFileExt(entryExt); // parsing the content - plasmaParserDocument theDoc = theParser.parseSource(location,entryMime,null, ut); + plasmaParserDocument theDoc = null; + File tempFile = null; + try { + // create the temp file + tempFile = createTempFile(entryName); + + // copy the data into the file + serverFileUtils.copy(zippedContent,tempFile,entry.getSize()); + + // parsing the zip file entry + theDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null, tempFile); + } catch (ParserException e) { + this.theLogger.logInfo("Unable to parse zip file entry '" + entryName + "'. 
" + e.getErrorCode()); + } finally { + if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){/* ignore this */} + } if (theDoc == null) continue; // merging all documents together @@ -157,11 +168,7 @@ public class zipParser extends AbstractParser implements Parser { docImages.addAll(theDoc.getImages()); } - /* (URL location, String mimeType, - String keywords, String shortTitle, String longTitle, - String[] sections, String abstrct, - byte[] text, Map anchors, Map images) - */ + return new plasmaParserDocument( location, mimeType, @@ -176,9 +183,9 @@ public class zipParser extends AbstractParser implements Parser { docImages); } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; - throw new ParserException("Unable to parse the zip content. " + e.getMessage()); - } catch (Error e) { - throw new ParserException("Unable to parse the zip content. " + e.getMessage()); + if (e instanceof ParserException) throw (ParserException) e; + + throw new ParserException("Unexpected error while parsing zip resource. " + e.getMessage(),location); } } diff --git a/source/de/anomic/plasma/plasmaCrawlLoader.java b/source/de/anomic/plasma/plasmaCrawlLoader.java index d8b8fdca7..e349da8bf 100644 --- a/source/de/anomic/plasma/plasmaCrawlLoader.java +++ b/source/de/anomic/plasma/plasmaCrawlLoader.java @@ -52,6 +52,7 @@ import org.apache.commons.pool.impl.GenericObjectPool; import de.anomic.net.URL; import de.anomic.plasma.crawler.plasmaCrawlWorker; +import de.anomic.plasma.crawler.plasmaCrawlerException; import de.anomic.plasma.crawler.plasmaCrawlerFactory; import de.anomic.plasma.crawler.plasmaCrawlerMsgQueue; import de.anomic.plasma.crawler.plasmaCrawlerPool; @@ -83,7 +84,7 @@ public final class plasmaCrawlLoader extends Thread { // supported protocols // TODO: change this, e.g. 
by loading settings from file - this.supportedProtocols = new HashSet(Arrays.asList(new String[]{"http","https" /* ,"ftp" */})); + this.supportedProtocols = new HashSet(Arrays.asList(new String[]{"http","https"/* ,"ftp" */})); // configuring the crawler messagequeue this.theQueue = new plasmaCrawlerMsgQueue(); @@ -99,6 +100,8 @@ public final class plasmaCrawlLoader extends Thread { // The maximum number of idle connections connections in the pool // 0 = no limit. this.crawlerPoolConfig.maxIdle = Integer.parseInt(switchboard.getConfig("crawler.MaxIdleThreads","7")); + + // minIdle configuration not possible for keyedObjectPools //this.crawlerPoolConfig.minIdle = Integer.parseInt(switchboard.getConfig("crawler.MinIdleThreads","5")); // block undefinitely @@ -216,7 +219,7 @@ public final class plasmaCrawlLoader extends Thread { int depth, plasmaCrawlProfile.entry profile, int timeout - ) { + ) throws plasmaCrawlerException { plasmaHTCache.Entry result = null; if (!this.crawlwerPool.isClosed) { @@ -241,11 +244,17 @@ public final class plasmaCrawlLoader extends Thread { this.execute(theMsg); // wait for the crawl job result - result = theMsg.waitForResult(); - + result = theMsg.waitForResult(); } catch (Exception e) { - this.log.logSevere("plasmaCrawlLoader.loadSync", e); + this.log.logSevere("plasmaCrawlLoader.loadSync: Unexpected error", e); + throw new plasmaCrawlerException("Unexpected error: " + e.getMessage()); } + + // check if an error has occured + if (result == null) { + String errorMsg = theMsg.getError(); + throw new plasmaCrawlerException(errorMsg); + } } // return the result diff --git a/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java b/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java index d79674b19..b3d678c67 100644 --- a/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java +++ b/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java @@ -59,6 +59,7 @@ public final class plasmaCrawlLoaderMessage { private serverSemaphore resultSync = null; 
private plasmaHTCache.Entry result; + private String errorMessage; // loadParallel(URL url, String referer, String initiator, int depth, plasmaCrawlProfile.entry profile) { public plasmaCrawlLoaderMessage( @@ -86,6 +87,14 @@ public final class plasmaCrawlLoaderMessage { this.result = null; } + public void setError(String errorMessage) { + this.errorMessage = errorMessage; + } + + public String getError() { + return this.errorMessage; + } + public void setResult(plasmaHTCache.Entry theResult) { // store the result this.result = theResult; diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 9c22a93ca..0e5933193 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -73,12 +73,14 @@ import de.anomic.htmlFilter.htmlFilterInputStream; import de.anomic.htmlFilter.htmlFilterWriter; import de.anomic.http.httpHeader; import de.anomic.http.httpc; +import de.anomic.index.indexURL; import de.anomic.net.URL; import de.anomic.plasma.parser.Parser; import de.anomic.plasma.parser.ParserException; import de.anomic.plasma.parser.ParserInfo; import de.anomic.server.serverFileUtils; import de.anomic.server.logging.serverLog; +import de.anomic.tools.bitfield; public final class plasmaParser { public static final String PARSER_MODE_PROXY = "PROXY"; @@ -407,7 +409,7 @@ public final class plasmaParser { if (neededLibx != null) { for (int libxId=0; libxId < neededLibx.length; libxId++) { if (javaClassPath.indexOf(neededLibx[libxId]) == -1) { - throw new ParserException("Missing dependency detected: '" + neededLibx[libxId] + "'."); + throw new Exception("Missing dependency detected: '" + neededLibx[libxId] + "'."); } neededLibxBuf.append(neededLibx[libxId]) .append(","); @@ -464,42 +466,67 @@ public final class plasmaParser { // closing the parser object pool try { theParserPool.close(); - } catch (Exception e) { } + } catch (Exception e) {/* ignore this */} } - public 
plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] source) throws InterruptedException { + public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] source) + throws InterruptedException, ParserException { File tempFile = null; try { + // creating a temp file to store the byte array tempFile = File.createTempFile("parseSource", ".tmp"); serverFileUtils.write(source, tempFile); + + // parsing the temp file return parseSource(location, mimeType, charset, tempFile); + } catch (Exception e) { + // Interrupted- and Parser-Exceptions should pass through if (e instanceof InterruptedException) throw (InterruptedException) e; - serverLog.logSevere("PARSER", "parseSource1: " + e.getMessage(), e); - return null; + if (e instanceof ParserException) throw (ParserException) e; + + // log unexpected error + this.theLogger.logSevere("Unexpected exception in parseSource1: " + e.getMessage(), e); + throw new ParserException("Unexpected exception while parsing " + location,location, e); } finally { - if (tempFile != null) try { tempFile.delete(); } catch (Exception ex){} + if (tempFile != null) try { tempFile.delete(); } catch (Exception ex){/* ignore this */} } } - public plasmaParserDocument parseSource(URL location, String mimeType, String documentCharset, File sourceFile) throws InterruptedException { + public plasmaParserDocument parseSource(URL location, String theMimeType, String theDocumentCharset, File sourceFile) + throws InterruptedException, ParserException { Parser theParser = null; + String mimeType = null; try { // getting the mimetype of the document - mimeType = getRealMimeType(mimeType); + mimeType = getRealMimeType(theMimeType); // getting the file extension of the document String fileExt = getFileExt(location); // getting the charset of the document - if (documentCharset == null) - // TODO: do a charset detection here .... 
- documentCharset = "ISO-8859-1"; + // TODO: do a charset detection here .... + String documentCharset = (theDocumentCharset == null) ? "ISO-8859-1" : theDocumentCharset; + + // testing if parsing is supported for this resource + if (!plasmaParser.supportedContent(location,mimeType)) { + String errorMsg = "No parser available to parse mimetype"; + this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); + throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT); + } + + // testing if the resource is not empty + if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) { + String errorMsg = "No resource content available."; + this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); + throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT); + } + if (this.theLogger.isFine()) - this.theLogger.logFine("Parsing " + location + " with mimeType '" + mimeType + + this.theLogger.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'."); /* @@ -555,26 +582,43 @@ public final class plasmaParser { theParser = this.getParser(mimeType); // if a parser was found we use it ... + plasmaParserDocument doc = null; if (theParser != null) { - return theParser.parse(location, mimeType,documentCharset,sourceFile); + doc = theParser.parse(location, mimeType,documentCharset,sourceFile); } else if (realtimeParsableMimeTypesContains(mimeType)) { - return parseHtml(location, mimeType, documentCharset, sourceFile); + doc = parseHtml(location, mimeType, documentCharset, sourceFile); } else { - serverLog.logWarning("PARSER", "parseSource2: wrong mime type"); - return null; + String errorMsg = "No parser available to parse mimetype"; + this.theLogger.logInfo("Unable to parse '" + location + "'. 
" + errorMsg); + throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT); + } + + // check result + if (doc == null) { + String errorMsg = "Unexpected error. Parser returned null."; + this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); + throw new ParserException(errorMsg,location); } + return doc; + } catch (Exception e) { + // Interrupted- and Parser-Exceptions should pass through if (e instanceof InterruptedException) throw (InterruptedException) e; - serverLog.logSevere("PARSER", "parseSource2: " + e.getMessage(), e); - return null; + if (e instanceof ParserException) throw (ParserException) e; + + // log unexpected error + String errorMsg = "Unexpected exception. " + e.getMessage(); + this.theLogger.logSevere("Unable to parse '" + location + "'. " + errorMsg, e); + throw new ParserException(errorMsg,location,e); + } finally { if (theParser != null) { - try { plasmaParser.theParserPool.returnObject(mimeType, theParser); } catch (Exception e) { } + try { plasmaParser.theParserPool.returnObject(mimeType, theParser); } catch (Exception e) { /* ignore this */} } } } - private plasmaParserDocument parseHtml(URL location, String mimeType, String documentCharset, File sourceFile) throws IOException { + private plasmaParserDocument parseHtml(URL location, String mimeType, String documentCharset, File sourceFile) throws IOException, ParserException { // ...otherwise we make a scraper and transformer FileInputStream fileIn = new FileInputStream(sourceFile); @@ -596,8 +640,9 @@ public final class plasmaParser { //serverFileUtils.copy(sourceFile, hfos); //hfos.close(); if (writer.binarySuspect()) { - this.theLogger.logInfo("Binary data found in URL " + location); - return null; + String errorMsg = "Binary data found in resource"; + this.theLogger.logSevere("Unable to parse '" + location + "'. 
" + errorMsg); + throw new ParserException(errorMsg,location); } return transformScraper(location, mimeType, documentCharset, scraper); } diff --git a/source/de/anomic/plasma/plasmaSearchImages.java b/source/de/anomic/plasma/plasmaSearchImages.java index 3782ff752..a7387604b 100644 --- a/source/de/anomic/plasma/plasmaSearchImages.java +++ b/source/de/anomic/plasma/plasmaSearchImages.java @@ -43,6 +43,8 @@ package de.anomic.plasma; import java.net.MalformedURLException; import de.anomic.net.URL; +import de.anomic.plasma.parser.ParserException; + import java.util.Iterator; import java.util.Map; import java.util.TreeSet; @@ -60,10 +62,16 @@ public final class plasmaSearchImages { if (maxTime > 10) { byte[] res = sc.getResource(url, true, (int) maxTime); if (res != null) { - plasmaParserDocument document = sc.parseDocument(url, res); - + plasmaParserDocument document = null; + try { + document = sc.parseDocument(url, res); + } catch (ParserException e) { + // parsing failed + } + if (document == null) return; + // add the image links - if (document != null) this.addAll(document.getImages()); + this.addAll(document.getImages()); // add also links from pages one step deeper, if depth > 0 if (depth > 0) { diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 60e4f3e60..efed8fbba 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -45,6 +45,8 @@ package de.anomic.plasma; import java.io.IOException; import de.anomic.net.URL; import de.anomic.plasma.cache.IResourceInfo; +import de.anomic.plasma.crawler.plasmaCrawlerException; +import de.anomic.plasma.parser.ParserException; import java.util.Enumeration; import java.util.HashMap; @@ -164,30 +166,51 @@ public class plasmaSnippetCache { return new Snippet(line, source, null); } + /* =========================================================================== + * LOADING RESOURCE DATA + * 
=========================================================================== */ // if the snippet is not in the cache, we can try to get it from the htcache byte[] resource = null; IResourceInfo docInfo = null; try { + // trying to load the resource from the cache resource = this.cacheManager.loadResourceContent(url); - if ((fetchOnline) && (resource == null)) { + docInfo = this.cacheManager.loadResourceInfo(url); + + // if not found try to download it + if ((resource == null) && (fetchOnline)) { + // download resource using the crawler plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000); + + // getting resource metadata (e.g. the http headers for http resources) if (entry != null) { docInfo = entry.getDocumentInfo(); } + + // now the resource should be stored in the cache, load body resource = this.cacheManager.loadResourceContent(url); + if (resource == null) { + //System.out.println("cannot load document for URL " + url); + return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource from web, cacheManager returned NULL"); + } source = SOURCE_WEB; } - } catch (IOException e) { - e.printStackTrace(); + } catch (Exception e) { + if (!(e instanceof plasmaCrawlerException)) e.printStackTrace(); return new Snippet(null, ERROR_SOURCE_LOADING, "error loading resource from web: " + e.getMessage()); } - if (resource == null) { - //System.out.println("cannot load document for URL " + url); - return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource from web, cacheManager returned NULL"); - } - plasmaParserDocument document = parseDocument(url, resource, docInfo); + /* =========================================================================== + * PARSING RESOURCE + * =========================================================================== */ + plasmaParserDocument document = null; + try { + document = parseDocument(url, resource, docInfo); + } catch (ParserException e) { + return new Snippet(null, ERROR_PARSER_FAILED, e.getMessage()); 
// cannot be parsed + } if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed + //System.out.println("loaded document for URL " + url); String[] sentences = document.getSentences(); //System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]); @@ -196,6 +219,9 @@ public class plasmaSnippetCache { return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences"); } + /* =========================================================================== + * COMPUTE SNIPPET + * =========================================================================== */ // we have found a parseable non-empty file: use the lines line = computeSnippet(sentences, queryhashes, 8 + 6 * queryhashes.size(), snippetMaxLength); //System.out.println("loaded snippet for URL " + url + ": " + line); @@ -207,22 +233,48 @@ public class plasmaSnippetCache { return new Snippet(line, source, null); } + /** + * Tries to load and parse a resource specified by it's URL. + * If the resource is not stored in cache and if fetchOnline is set the + * this function tries to download the resource from web. + * + * @param url the URL of the resource + * @param fetchOnline specifies if the resource should be loaded from web if it'as not available in the cache + * @return the parsed document as {@link plasmaParserDocument} + */ public plasmaParserDocument retrieveDocument(URL url, boolean fetchOnline) { byte[] resource = null; IResourceInfo docInfo = null; try { + // trying to load the resource body from cache resource = this.cacheManager.loadResourceContent(url); + + // if not available try to load resource from web if ((fetchOnline) && (resource == null)) { + // download resource using crawler plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000); + + // fetching metadata of the resource (e.g. 
http headers for http resource) if (entry != null) docInfo = entry.getDocumentInfo(); + + // getting the resource body from the cache resource = this.cacheManager.loadResourceContent(url); + } else { + // trying to load resource metadata + docInfo = this.cacheManager.loadResourceInfo(url); } - } catch (IOException e) { - e.printStackTrace(); + + // parsing document + if (resource == null) return null; + return parseDocument(url, resource, docInfo); + } catch (ParserException e) { + this.log.logWarning("Unable to parse resource. " + e.getMessage()); + return null; + } catch (Exception e) { + this.log.logWarning("Unexpected error while retrieving document. " + e.getMessage(),e); return null; } - if (resource == null) return null; - return parseDocument(url, resource, docInfo); + } public void storeToCache(String wordhashes, String urlhash, String snippet) { @@ -374,11 +426,11 @@ public class plasmaSnippetCache { return map; } - public plasmaParserDocument parseDocument(URL url, byte[] resource) { + public plasmaParserDocument parseDocument(URL url, byte[] resource) throws ParserException { return parseDocument(url, resource, null); } - public plasmaParserDocument parseDocument(URL url, byte[] resource, IResourceInfo docInfo) { + public plasmaParserDocument parseDocument(URL url, byte[] resource, IResourceInfo docInfo) throws ParserException { try { if (resource == null) return null; @@ -425,9 +477,15 @@ public class plasmaSnippetCache { public byte[] getResource(URL url, boolean fetchOnline, int socketTimeout) { // load the url as resource from the web try { + // trying to load the resource body from cache byte[] resource = cacheManager.loadResourceContent(url); + + // if the content is not available in cache try to download it from web if ((fetchOnline) && (resource == null)) { + // try to download the resource using a crawler loadResourceFromWeb(url, (socketTimeout < 0) ? 
-1 : socketTimeout); + + // get the content from cache resource = cacheManager.loadResourceContent(url); } return resource; @@ -436,7 +494,7 @@ public class plasmaSnippetCache { } } - public plasmaHTCache.Entry loadResourceFromWeb(URL url, int socketTimeout) throws IOException { + public plasmaHTCache.Entry loadResourceFromWeb(URL url, int socketTimeout) throws plasmaCrawlerException { plasmaHTCache.Entry result = this.sb.cacheLoader.loadSync( url, diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index d68ce3a0e..87c86f10e 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -144,6 +144,7 @@ import de.anomic.kelondro.kelondroMSetTools; import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroMapTable; import de.anomic.plasma.dbImport.dbImportManager; +import de.anomic.plasma.parser.ParserException; import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.server.serverAbstractSwitch; import de.anomic.server.serverCodings; @@ -1392,7 +1393,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } } - private plasmaParserDocument parseResource(plasmaSwitchboardQueue.Entry entry, String initiatorHash) throws InterruptedException { + private plasmaParserDocument parseResource(plasmaSwitchboardQueue.Entry entry, String initiatorHash) throws InterruptedException, ParserException { plasmaParserDocument document = null; // the mimetype of this entry @@ -1402,29 +1403,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // the parser logger serverLog parserLogger = parser.getLogger(); - // if the document content is supported we can start to parse the content - if (plasmaParser.supportedContent( - entry.url(), - mimeType) - ){ - if ((entry.cacheFile().exists()) && (entry.cacheFile().length() > 0)) { - parserLogger.logFine("'" + 
entry.normalizedURLString() + "' is not parsed yet, parsing now from File"); - document = parser.parseSource(entry.url(), mimeType, charset, entry.cacheFile()); - } else { - parserLogger.logFine("'" + entry.normalizedURLString() + "' cannot be parsed, no resource available"); - addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT, new bitfield(indexURL.urlFlagLength)); - } - if (document == null) { - parserLogger.logSevere("'" + entry.normalizedURLString() + "' parse failure"); - addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_PARSER_ERROR, new bitfield(indexURL.urlFlagLength)); - } - } else { - parserLogger.logFine("'" + entry.normalizedURLString() + "'. Unsupported mimeType '" + ((mimeType == null) ? "null" : mimeType) + "'."); - addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorHash, entry.anchorName(), plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT, new bitfield(indexURL.urlFlagLength)); - } - - checkInterruption(); - return document; + // parse the document + return parseResource(entry.url(), mimeType, charset, entry.cacheFile()); + } + + public plasmaParserDocument parseResource(URL location, String mimeType, String documentCharset, File sourceFile) throws InterruptedException, ParserException { + plasmaParserDocument doc = parser.parseSource(location, mimeType, documentCharset, sourceFile); + assert(doc != null) : "Unexpected error. 
Parser returned null."; + return doc; } private void processResourceStack(plasmaSwitchboardQueue.Entry entry) throws InterruptedException { @@ -1471,8 +1457,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser plasmaParserDocument document = null; parsingStartTime = System.currentTimeMillis(); + try { document = this.parseResource(entry, initiatorPeerHash); if (document == null) return; + } catch (ParserException e) { + this.log.logInfo("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage()); + addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorPeerHash, entry.anchorName(), e.getErrorCode(), new bitfield(indexURL.urlFlagLength)); + return; + } parsingEndTime = System.currentTimeMillis(); @@ -2172,16 +2164,22 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // determine the url string plasmaCrawlLURL.Entry entry = urlPool.loadedURL.load(urlhash, null); if (entry == null) return 0; + URL url = entry.url(); if (url == null) return 0; - // get set of words - // Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline))); - Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getText()); - // delete all word references - int count = removeReferences(urlhash, witer); - // finally delete the url entry itself - urlPool.loadedURL.remove(urlhash); - return count; + + try { + // get set of words + // Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline))); + Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getText()); + // delete all word references + int count = removeReferences(urlhash, witer); + // finally delete the url entry itself + urlPool.loadedURL.remove(urlhash); + return count; + } catch (ParserException e) { + return 0; + } } public int removeReferences(URL url, Set words) { diff --git 
a/source/de/anomic/server/serverByteBuffer.java b/source/de/anomic/server/serverByteBuffer.java index 295583d6a..9030ee036 100644 --- a/source/de/anomic/server/serverByteBuffer.java +++ b/source/de/anomic/server/serverByteBuffer.java @@ -188,6 +188,10 @@ public final class serverByteBuffer extends OutputStream { public serverByteBuffer append(String s) { return append(s.getBytes()); } + + public serverByteBuffer append(String s, String charset) throws UnsupportedEncodingException { + return append(s.getBytes(charset)); + } public serverByteBuffer append(serverByteBuffer bb) { return append(bb.buffer, bb.offset, bb.length); diff --git a/source/de/anomic/server/serverFileUtils.java b/source/de/anomic/server/serverFileUtils.java index a90db5f01..974994b24 100644 --- a/source/de/anomic/server/serverFileUtils.java +++ b/source/de/anomic/server/serverFileUtils.java @@ -73,24 +73,39 @@ import de.anomic.kelondro.kelondroRowSet; public final class serverFileUtils { + private static final int DEFAULT_BUFFER_SIZE = 4096; + + public static long copy(InputStream source, OutputStream dest) throws IOException { + return copy(source,dest); + } + /** * Copies an InputStream to an OutputStream. - * @param source InputStream - * @param dest OutputStream + * @param source InputStream + * @param dest OutputStream + * @param count the total amount of bytes to copy * @return Total number of bytes copied. + * * @see copy(InputStream source, File dest) * @see copyRange(File source, OutputStream dest, int start) * @see copy(File source, OutputStream dest) * @see copy(File source, File dest) */ - public static int copy(InputStream source, OutputStream dest) throws IOException { - byte[] buffer = new byte[4096]; + public static long copy(InputStream source, OutputStream dest, long count) throws IOException { + byte[] buffer = new byte[DEFAULT_BUFFER_SIZE]; + int chunkSize = (int) ((count > 0) ? 
Math.min(count, DEFAULT_BUFFER_SIZE) : DEFAULT_BUFFER_SIZE); - int c, total = 0; - while ((c = source.read(buffer)) > 0) { + int c; long total = 0; + while ((c = source.read(buffer,0,chunkSize)) > 0) { dest.write(buffer, 0, c); dest.flush(); total += c; + + if (count > 0) { + chunkSize = (int)Math.min(count-total,DEFAULT_BUFFER_SIZE); + if (chunkSize == 0) break; + } + } dest.flush(); @@ -165,21 +180,26 @@ public final class serverFileUtils { } return count; } + + public static void copy(InputStream source, File dest) throws IOException { + copy(source,dest,-1); + } /** * Copies an InputStream to a File. * @param source InputStream * @param dest File + * @param the amount of bytes to copy * @see copy(InputStream source, OutputStream dest) * @see copyRange(File source, OutputStream dest, int start) * @see copy(File source, OutputStream dest) * @see copy(File source, File dest) */ - public static void copy(InputStream source, File dest) throws IOException { + public static void copy(InputStream source, File dest, long count) throws IOException { FileOutputStream fos = null; try { fos = new FileOutputStream(dest); - copy(source, fos); + copy(source, fos, count); } finally { if (fos != null) try {fos.close();} catch (Exception e) {} } @@ -201,7 +221,7 @@ public final class serverFileUtils { fis = new FileInputStream(source); long skipped = fis.skip(start); if (skipped != start) throw new IllegalStateException("Unable to skip '" + start + "' bytes. Only '" + skipped + "' bytes skipped."); - copy(fis, dest); + copy(fis, dest,-1); } finally { if (fis != null) try { fis.close(); } catch (Exception e) {} } @@ -220,28 +240,33 @@ public final class serverFileUtils { InputStream fis = null; try { fis = new FileInputStream(source); - copy(fis, dest); + copy(fis, dest, -1); } finally { if (fis != null) try { fis.close(); } catch (Exception e) {} } } + public static void copy(File source, File dest) throws IOException { + copy(source,dest,-1); + } + /** * Copies a File to a File. 
* @param source File * @param dest File + * @param count the amount of bytes to copy * @see copy(InputStream source, OutputStream dest) * @see copy(InputStream source, File dest) * @see copyRange(File source, OutputStream dest, int start) * @see copy(File source, OutputStream dest) */ - public static void copy(File source, File dest) throws IOException { + public static void copy(File source, File dest, long count) throws IOException { FileInputStream fis = null; FileOutputStream fos = null; try { fis = new FileInputStream(source); fos = new FileOutputStream(dest); - copy(fis, fos); + copy(fis, fos, count); } finally { if (fis != null) try {fis.close();} catch (Exception e) {} if (fos != null) try {fos.close();} catch (Exception e) {} @@ -250,7 +275,7 @@ public final class serverFileUtils { public static byte[] read(InputStream source) throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); - copy(source, baos); + copy(source, baos, -1); baos.close(); return baos.toByteArray(); } @@ -309,7 +334,7 @@ public final class serverFileUtils { } public static void write(byte[] source, OutputStream dest) throws IOException { - copy(new ByteArrayInputStream(source), dest); + copy(new ByteArrayInputStream(source), dest, -1); } public static void write(byte[] source, File dest) throws IOException {